[PASS] More robust UnrollLoop configuration (#576)

0f1e0ff0 · Tianqi Chen · GitHub · 69759c0c · 0f1e0ff0 · 0f1e0ff0
Commit 0f1e0ff0 authored Oct 22, 2017 by Tianqi Chen Committed by GitHub Oct 22, 2017
10 changed files
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,8 +3,11 @@ TVM Change Log

 This file records the changes in TVM library in reverse chronological order.

-## 0.1rc

+## Ongoing version
+- UnrollLoop: more robust loop unrolling that counts the maximum number of steps that can be unrolled.
+
+## 0.1rc
 - Language runtime
    - python
    - javascript

--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -28,7 +28,7 @@ class BuildConfig(object):
    current = None
    defaults = {
        "auto_unroll_max_step": 0,
-        "auto_unroll_min_depth": 1,
+        "auto_unroll_max_depth": 4,
        "unroll_explicit": True,
        "detect_global_barrier": False,
        "offset_factor": 0,
@@ -72,10 +72,11 @@ def build_config(**kwargs):
    Parameters
    ----------
    auto_unroll_max_step: int, default=0
-        Threshold of loop extent to be automatically unrolled.
+        Threshold of number of steps in the loop to be automatically unrolled.
+        This takes inner loop count into consideration.

-    auto_unroll_min_depth: int, default=1
-        The minimum loop nest level before the loop can be automatically unrolled.
+    auto_unroll_max_depth: int, default=4
+        The maximum nested level of loops that can be automatically unrolled.

    unroll_explicit: bool, default=True
        Whether explicitly unroll the loop, if set false, the unroll hint will
@@ -221,7 +222,7 @@ def lower(sch,
    stmt = ir_pass.UnrollLoop(
        stmt,
        cfg.auto_unroll_max_step,
-        cfg.auto_unroll_min_depth,
+        cfg.auto_unroll_max_depth,
        cfg.unroll_explicit)
    for f in lower_phase1:
        stmt = f(stmt)

--- a/src/pass/unroll_loop.cc
+++ b/src/pass/unroll_loop.cc
@@ -18,15 +18,16 @@ namespace ir {
 class LoopUnroller : public IRMutator {
 public:
  explicit LoopUnroller(int auto_max_step,
-                        int auto_min_depth,
+                        int auto_max_depth,
                        bool explicit_unroll)
      : auto_max_step_(auto_max_step),
-        auto_min_depth_(auto_min_depth),
+        auto_max_depth_(auto_max_depth),
        explicit_unroll_(explicit_unroll) {
  }

  Stmt Mutate_(const For* op, const Stmt& s) {
-    Stmt stmt = s;
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    op = stmt.as<For>();
    // constant folding.
    Expr extent = ir::Simplify(op->extent);
    const IntImm* v1 = extent.as<IntImm>();
@@ -38,15 +39,27 @@ class LoopUnroller : public IRMutator {
    if (v2 != nullptr) {
      value = static_cast<int>(v2->value);
    }
-    bool auto_unroll = (op->for_type == ForType::Serial &&
-                        value >= 0 && value <= auto_max_step_ &&
-                        loop_depth_ >= auto_min_depth_);
+    // condition for auto unroll
+    bool auto_unroll = (
+        op->for_type == ForType::Serial &&
+        normal_loop_depth_ == 0 &&
+        value >= 0 &&
+        unroll_depth_ <= auto_max_depth_ &&
+        value * step_count_ <= auto_max_step_);
+
    if (op->for_type == ForType::Unrolled) {
      CHECK_GE(value, 0)
          << "Cannot unroll non-constant loop";
      auto_unroll = true;
    }

+    if (auto_unroll) {
+      step_count_  *=  value;
+      unroll_depth_ += 1;
+    } else {
+      normal_loop_depth_ += 1;
+    }
+
    if (auto_unroll && explicit_unroll_) {
      using arith::ComputeExpr;
      if (value == 0) return Evaluate::make(0);
@@ -65,42 +78,72 @@ class LoopUnroller : public IRMutator {
          unrolled = step;
        }
      }
-      ++loop_depth_;
-      Stmt ret = this->Mutate(unrolled);
-      --loop_depth_;
-      return ret;
+      return unrolled;
    } else {
-      ++loop_depth_;
-      Stmt ret = IRMutator::Mutate_(op, stmt);
      if (auto_unroll) {
-        op = ret.as<For>();
        if (op->for_type != ForType::Unrolled) {
-          ret = For::make(
+          return For::make(
              op->loop_var, op->min, op->extent,
              ForType::Unrolled, op->device_api, op->body);
        }
      }
-      --loop_depth_;
-      return ret;
+      return stmt;
+    }
+  }
+
+  Stmt Mutate_(const Store* op, const Stmt& stmt) final {
+    ++step_count_;
+    return IRMutator::Mutate_(op, stmt);
+  }
+
+  Stmt Mutate_(const Evaluate* op, const Stmt& stmt) final {
+    ++step_count_;
+    return IRMutator::Mutate_(op, stmt);
+  }
+
+  Stmt Mutate_(const Block* op, const Stmt& stmt) final {
+    Stmt first = this->Mutate(op->first);
+    // cleanup state
+    int step_count = step_count_;
+    int unroll_depth = unroll_depth_;
+    int normal_loop_depth = normal_loop_depth_;
+    step_count_ = 0;
+    unroll_depth_ = 0;
+    normal_loop_depth_ = 0;
+    // work on rest part
+    Stmt rest = this->Mutate(op->rest);
+    step_count_ += step_count;
+    normal_loop_depth_ = std::max(normal_loop_depth, normal_loop_depth_);
+    unroll_depth_ = std::max(unroll_depth_, unroll_depth);
+    if (first.same_as(op->first) &&
+        rest.same_as(op->rest)) {
+      return stmt;
+    } else {
+      return Block::make(first, rest);
    }
  }

 private:
  // maximum number of step to perform auto unroll.
  int auto_max_step_;
-  int auto_min_depth_;
+  int auto_max_depth_;
  bool explicit_unroll_;
-  int loop_depth_{0};
+  // Number of normal loops in scope
+  int normal_loop_depth_{0};
+  // number of unrolled cases in current scope.
+  int unroll_depth_{0};
+  // Number of total steps unrolled
+  int step_count_{0};
 };


 Stmt UnrollLoop(Stmt stmt,
                int auto_max_step,
-                int auto_min_depth,
+                int auto_max_depth,
                bool explicit_unroll) {
  Stmt ret = LoopUnroller(
      auto_max_step,
-      auto_min_depth,
+      auto_max_depth,
      explicit_unroll).Mutate(stmt);
  if (!ret.same_as(stmt)) {
    return ConvertSSA(ret);

--- a/tests/python/unittest/test_pass_unroll.py
+++ b/tests/python/unittest/test_pass_unroll.py
@@ -14,11 +14,14 @@ def test_unroll_loop():
                                    tvm.make.Load(dtype, Ab.data, i) + 1,
                                    j + 1)))
    assert isinstance(stmt, tvm.stmt.For)
-    ret = tvm.ir_pass.UnrollLoop(stmt, 2, 0, True)
+    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, True)
    assert not isinstance(ret, tvm.stmt.For)
-    ret = tvm.ir_pass.UnrollLoop(stmt, 4, 0, False)
+    ret = tvm.ir_pass.UnrollLoop(stmt, 15, 8, True)
+    assert isinstance(ret, tvm.stmt.For)
+    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, False)
    assert isinstance(ret, tvm.stmt.For)
    assert ret.for_type == tvm.stmt.For.Unrolled

+
 if __name__ == "__main__":
    test_unroll_loop()
--- a/topi/recipe/conv/depthwise_conv2d_test.py
+++ b/topi/recipe/conv/depthwise_conv2d_test.py
@@ -112,8 +112,7 @@ def test_depthwise_conv2d_nchw():
        print("success")

    for device in ['cuda', 'opencl', 'rocm']:
-        with tvm.build_config(auto_unroll_max_step=32,
-                              auto_unroll_min_depth=0,
+        with tvm.build_config(auto_unroll_max_step=128,
                              unroll_explicit=device == 'rocm',
                              detect_global_barrier=False,
                              restricted_func=True):
@@ -202,9 +201,7 @@ def test_depthwise_conv2d_nhwc():
        print("success")

    for device in ['cuda', 'opencl', 'rocm']:
-        with tvm.build_config(auto_unroll_max_step=32,
-                              auto_unroll_min_depth=0,
-                              unroll_explicit=device == 'rocm',
+        with tvm.build_config(auto_unroll_max_step=128,
                              detect_global_barrier=False,
                              restricted_func=True):
            check_device(device)

--- a/topi/recipe/conv/test_conv2d_hwcn_map.py
+++ b/topi/recipe/conv/test_conv2d_hwcn_map.py
@@ -60,8 +60,7 @@ def test_conv2d_hwcn_map():
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        with tvm.build_config(auto_unroll_max_step=32,
-                              auto_unroll_min_depth=0,
+        with tvm.build_config(auto_unroll_max_step=128,
                              unroll_explicit=device == 'rocm'):
            func1 = tvm.build(s1, [A, W, B], device)
            func1(a, w, b)

--- a/topi/recipe/gemm/cuda_gemm_square.py
+++ b/topi/recipe/gemm/cuda_gemm_square.py
@@ -80,6 +80,7 @@ def test_gemm():
    s[CC].reorder(ko, kt, ki, yo, xo)
    s[AA].compute_at(s[CC], ko)
    s[BB].compute_at(s[CC], ko)
+    s[CC].unroll(kt)
    s[AL].compute_at(s[CC], kt)
    s[BL].compute_at(s[CC], kt)
    # Schedule for A's shared memory load
@@ -125,9 +126,8 @@ def test_gemm():
        GFLOPS = num_flops / (t * 1e3) / 1e6
        print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS))

-    for device in ["cuda", "opencl", "rocm"]:
-        with tvm.build_config(auto_unroll_max_step=32,
-                              auto_unroll_min_depth=0,
+    for device in ["cuda", "opencl", "rocm", "nvptx"]:
+        with tvm.build_config(auto_unroll_max_step=128,
                              unroll_explicit=(device != "cuda")):
            check_device(device)


--- a/topi/recipe/rnn/matexp.py
+++ b/topi/recipe/rnn/matexp.py
@@ -112,7 +112,6 @@ def rnn_matexp():
    def check_device(target):
        with tvm.build_config(
                detect_global_barrier=detect_global_barrier,
-                auto_unroll_min_depth=2,
                auto_unroll_max_step=128,
                unroll_explicit=False):
            f = tvm.build(s, [s_scan, Whh], target)

--- a/topi/tests/python/test_topi_conv2d_hwcn.py
+++ b/topi/tests/python/test_topi_conv2d_hwcn.py
@@ -39,9 +39,8 @@ def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, p
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        with tvm.build_config(auto_unroll_max_step=32,
-                              auto_unroll_min_depth=0,
-                              unroll_explicit=device == 'rocm'):
+        with tvm.build_config(auto_unroll_max_step=128,
+                              unroll_explicit=(device != "cuda")):
            func1 = tvm.build(s1, [A, W, B], device)
            func2 = tvm.build(s2, [A, W, C], device)
            func1(a, w, b)

--- a/topi/tests/python/test_topi_conv2d_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -41,9 +41,8 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        with tvm.build_config(auto_unroll_max_step=32,
-                              auto_unroll_min_depth=0,
-                              unroll_explicit=device == 'rocm'):
+        with tvm.build_config(auto_unroll_max_step=128,
+                              unroll_explicit=(device != "cuda")):
            func1 = tvm.build(s1, [A, W, B], device)
            func2 = tvm.build(s2, [A, W, C], device)
            func1(a, w, b)