Commit 0f1e0ff0 by Tianqi Chen Committed by GitHub

[PASS] More robust UnrollLoop configuration (#576)

parent 69759c0c
...@@ -3,8 +3,11 @@ TVM Change Log ...@@ -3,8 +3,11 @@ TVM Change Log
This file records the changes in TVM library in reverse chronological order. This file records the changes in TVM library in reverse chronological order.
## 0.1rc
## Ongoing version
- UnrollLoop : more robust version of unroll loop, count maximum steps that can be unrolled.
## 0.1rc
- Language runtime - Language runtime
- python - python
- javascript - javascript
......
...@@ -28,7 +28,7 @@ class BuildConfig(object): ...@@ -28,7 +28,7 @@ class BuildConfig(object):
current = None current = None
defaults = { defaults = {
"auto_unroll_max_step": 0, "auto_unroll_max_step": 0,
"auto_unroll_min_depth": 1, "auto_unroll_max_depth": 4,
"unroll_explicit": True, "unroll_explicit": True,
"detect_global_barrier": False, "detect_global_barrier": False,
"offset_factor": 0, "offset_factor": 0,
...@@ -72,10 +72,11 @@ def build_config(**kwargs): ...@@ -72,10 +72,11 @@ def build_config(**kwargs):
Parameters Parameters
---------- ----------
auto_unroll_max_step: int, default=0 auto_unroll_max_step: int, default=0
Threshold of loop extent to be automatically unrolled. Threshold of number of steps in the loop to be automatically unrolled.
This takes inner loop count into consideration.
auto_unroll_min_depth: int, default=1 auto_unroll_max_depth: int, default=4
The minimum loop nest level before the loop can be automatically unrolled. The maximum nested level of loops that can be automatically unrolled.
unroll_explicit: bool, default=True unroll_explicit: bool, default=True
Whether explicitly unroll the loop, if set false, the unroll hint will Whether explicitly unroll the loop, if set false, the unroll hint will
...@@ -221,7 +222,7 @@ def lower(sch, ...@@ -221,7 +222,7 @@ def lower(sch,
stmt = ir_pass.UnrollLoop( stmt = ir_pass.UnrollLoop(
stmt, stmt,
cfg.auto_unroll_max_step, cfg.auto_unroll_max_step,
cfg.auto_unroll_min_depth, cfg.auto_unroll_max_depth,
cfg.unroll_explicit) cfg.unroll_explicit)
for f in lower_phase1: for f in lower_phase1:
stmt = f(stmt) stmt = f(stmt)
......
...@@ -18,15 +18,16 @@ namespace ir { ...@@ -18,15 +18,16 @@ namespace ir {
class LoopUnroller : public IRMutator { class LoopUnroller : public IRMutator {
public: public:
explicit LoopUnroller(int auto_max_step, explicit LoopUnroller(int auto_max_step,
int auto_min_depth, int auto_max_depth,
bool explicit_unroll) bool explicit_unroll)
: auto_max_step_(auto_max_step), : auto_max_step_(auto_max_step),
auto_min_depth_(auto_min_depth), auto_max_depth_(auto_max_depth),
explicit_unroll_(explicit_unroll) { explicit_unroll_(explicit_unroll) {
} }
Stmt Mutate_(const For* op, const Stmt& s) { Stmt Mutate_(const For* op, const Stmt& s) {
Stmt stmt = s; Stmt stmt = IRMutator::Mutate_(op, s);
op = stmt.as<For>();
// constant folding. // constant folding.
Expr extent = ir::Simplify(op->extent); Expr extent = ir::Simplify(op->extent);
const IntImm* v1 = extent.as<IntImm>(); const IntImm* v1 = extent.as<IntImm>();
...@@ -38,15 +39,27 @@ class LoopUnroller : public IRMutator { ...@@ -38,15 +39,27 @@ class LoopUnroller : public IRMutator {
if (v2 != nullptr) { if (v2 != nullptr) {
value = static_cast<int>(v2->value); value = static_cast<int>(v2->value);
} }
bool auto_unroll = (op->for_type == ForType::Serial && // condition for auto unroll
value >= 0 && value <= auto_max_step_ && bool auto_unroll = (
loop_depth_ >= auto_min_depth_); op->for_type == ForType::Serial &&
normal_loop_depth_ == 0 &&
value >= 0 &&
unroll_depth_ <= auto_max_depth_ &&
value * step_count_ <= auto_max_step_);
if (op->for_type == ForType::Unrolled) { if (op->for_type == ForType::Unrolled) {
CHECK_GE(value, 0) CHECK_GE(value, 0)
<< "Cannot unroll non-constant loop"; << "Cannot unroll non-constant loop";
auto_unroll = true; auto_unroll = true;
} }
if (auto_unroll) {
step_count_ *= value;
unroll_depth_ += 1;
} else {
normal_loop_depth_ += 1;
}
if (auto_unroll && explicit_unroll_) { if (auto_unroll && explicit_unroll_) {
using arith::ComputeExpr; using arith::ComputeExpr;
if (value == 0) return Evaluate::make(0); if (value == 0) return Evaluate::make(0);
...@@ -65,42 +78,72 @@ class LoopUnroller : public IRMutator { ...@@ -65,42 +78,72 @@ class LoopUnroller : public IRMutator {
unrolled = step; unrolled = step;
} }
} }
++loop_depth_; return unrolled;
Stmt ret = this->Mutate(unrolled);
--loop_depth_;
return ret;
} else { } else {
++loop_depth_;
Stmt ret = IRMutator::Mutate_(op, stmt);
if (auto_unroll) { if (auto_unroll) {
op = ret.as<For>();
if (op->for_type != ForType::Unrolled) { if (op->for_type != ForType::Unrolled) {
ret = For::make( return For::make(
op->loop_var, op->min, op->extent, op->loop_var, op->min, op->extent,
ForType::Unrolled, op->device_api, op->body); ForType::Unrolled, op->device_api, op->body);
} }
} }
--loop_depth_; return stmt;
return ret; }
}
Stmt Mutate_(const Store* op, const Stmt& stmt) final {
++step_count_;
return IRMutator::Mutate_(op, stmt);
}
Stmt Mutate_(const Evaluate* op, const Stmt& stmt) final {
++step_count_;
return IRMutator::Mutate_(op, stmt);
}
Stmt Mutate_(const Block* op, const Stmt& stmt) final {
Stmt first = this->Mutate(op->first);
// cleanup state
int step_count = step_count_;
int unroll_depth = unroll_depth_;
int normal_loop_depth = normal_loop_depth_;
step_count_ = 0;
unroll_depth_ = 0;
normal_loop_depth_ = 0;
// work on rest part
Stmt rest = this->Mutate(op->rest);
step_count_ += step_count;
normal_loop_depth_ = std::max(normal_loop_depth, normal_loop_depth_);
unroll_depth_ = std::max(unroll_depth_, unroll_depth);
if (first.same_as(op->first) &&
rest.same_as(op->rest)) {
return stmt;
} else {
return Block::make(first, rest);
} }
} }
private: private:
// maximum number of step to perform auto unroll. // maximum number of step to perform auto unroll.
int auto_max_step_; int auto_max_step_;
int auto_min_depth_; int auto_max_depth_;
bool explicit_unroll_; bool explicit_unroll_;
int loop_depth_{0}; // Number of normal loops in scope
int normal_loop_depth_{0};
// number of unrolled cases in current scope.
int unroll_depth_{0};
// Number of total steps unrolled
int step_count_{0};
}; };
Stmt UnrollLoop(Stmt stmt, Stmt UnrollLoop(Stmt stmt,
int auto_max_step, int auto_max_step,
int auto_min_depth, int auto_max_depth,
bool explicit_unroll) { bool explicit_unroll) {
Stmt ret = LoopUnroller( Stmt ret = LoopUnroller(
auto_max_step, auto_max_step,
auto_min_depth, auto_max_depth,
explicit_unroll).Mutate(stmt); explicit_unroll).Mutate(stmt);
if (!ret.same_as(stmt)) { if (!ret.same_as(stmt)) {
return ConvertSSA(ret); return ConvertSSA(ret);
......
...@@ -14,11 +14,14 @@ def test_unroll_loop(): ...@@ -14,11 +14,14 @@ def test_unroll_loop():
tvm.make.Load(dtype, Ab.data, i) + 1, tvm.make.Load(dtype, Ab.data, i) + 1,
j + 1))) j + 1)))
assert isinstance(stmt, tvm.stmt.For) assert isinstance(stmt, tvm.stmt.For)
ret = tvm.ir_pass.UnrollLoop(stmt, 2, 0, True) ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, True)
assert not isinstance(ret, tvm.stmt.For) assert not isinstance(ret, tvm.stmt.For)
ret = tvm.ir_pass.UnrollLoop(stmt, 4, 0, False) ret = tvm.ir_pass.UnrollLoop(stmt, 15, 8, True)
assert isinstance(ret, tvm.stmt.For)
ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, False)
assert isinstance(ret, tvm.stmt.For) assert isinstance(ret, tvm.stmt.For)
assert ret.for_type == tvm.stmt.For.Unrolled assert ret.for_type == tvm.stmt.For.Unrolled
if __name__ == "__main__": if __name__ == "__main__":
test_unroll_loop() test_unroll_loop()
...@@ -112,8 +112,7 @@ def test_depthwise_conv2d_nchw(): ...@@ -112,8 +112,7 @@ def test_depthwise_conv2d_nchw():
print("success") print("success")
for device in ['cuda', 'opencl', 'rocm']: for device in ['cuda', 'opencl', 'rocm']:
with tvm.build_config(auto_unroll_max_step=32, with tvm.build_config(auto_unroll_max_step=128,
auto_unroll_min_depth=0,
unroll_explicit=device == 'rocm', unroll_explicit=device == 'rocm',
detect_global_barrier=False, detect_global_barrier=False,
restricted_func=True): restricted_func=True):
...@@ -202,9 +201,7 @@ def test_depthwise_conv2d_nhwc(): ...@@ -202,9 +201,7 @@ def test_depthwise_conv2d_nhwc():
print("success") print("success")
for device in ['cuda', 'opencl', 'rocm']: for device in ['cuda', 'opencl', 'rocm']:
with tvm.build_config(auto_unroll_max_step=32, with tvm.build_config(auto_unroll_max_step=128,
auto_unroll_min_depth=0,
unroll_explicit=device == 'rocm',
detect_global_barrier=False, detect_global_barrier=False,
restricted_func=True): restricted_func=True):
check_device(device) check_device(device)
......
...@@ -60,8 +60,7 @@ def test_conv2d_hwcn_map(): ...@@ -60,8 +60,7 @@ def test_conv2d_hwcn_map():
w = tvm.nd.array(w_np, ctx) w = tvm.nd.array(w_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
with tvm.build_config(auto_unroll_max_step=32, with tvm.build_config(auto_unroll_max_step=128,
auto_unroll_min_depth=0,
unroll_explicit=device == 'rocm'): unroll_explicit=device == 'rocm'):
func1 = tvm.build(s1, [A, W, B], device) func1 = tvm.build(s1, [A, W, B], device)
func1(a, w, b) func1(a, w, b)
......
...@@ -80,6 +80,7 @@ def test_gemm(): ...@@ -80,6 +80,7 @@ def test_gemm():
s[CC].reorder(ko, kt, ki, yo, xo) s[CC].reorder(ko, kt, ki, yo, xo)
s[AA].compute_at(s[CC], ko) s[AA].compute_at(s[CC], ko)
s[BB].compute_at(s[CC], ko) s[BB].compute_at(s[CC], ko)
s[CC].unroll(kt)
s[AL].compute_at(s[CC], kt) s[AL].compute_at(s[CC], kt)
s[BL].compute_at(s[CC], kt) s[BL].compute_at(s[CC], kt)
# Schedule for A's shared memory load # Schedule for A's shared memory load
...@@ -125,9 +126,8 @@ def test_gemm(): ...@@ -125,9 +126,8 @@ def test_gemm():
GFLOPS = num_flops / (t * 1e3) / 1e6 GFLOPS = num_flops / (t * 1e3) / 1e6
print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS)) print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS))
for device in ["cuda", "opencl", "rocm"]: for device in ["cuda", "opencl", "rocm", "nvptx"]:
with tvm.build_config(auto_unroll_max_step=32, with tvm.build_config(auto_unroll_max_step=128,
auto_unroll_min_depth=0,
unroll_explicit=(device != "cuda")): unroll_explicit=(device != "cuda")):
check_device(device) check_device(device)
......
...@@ -112,7 +112,6 @@ def rnn_matexp(): ...@@ -112,7 +112,6 @@ def rnn_matexp():
def check_device(target): def check_device(target):
with tvm.build_config( with tvm.build_config(
detect_global_barrier=detect_global_barrier, detect_global_barrier=detect_global_barrier,
auto_unroll_min_depth=2,
auto_unroll_max_step=128, auto_unroll_max_step=128,
unroll_explicit=False): unroll_explicit=False):
f = tvm.build(s, [s_scan, Whh], target) f = tvm.build(s, [s_scan, Whh], target)
......
...@@ -39,9 +39,8 @@ def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, p ...@@ -39,9 +39,8 @@ def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, p
w = tvm.nd.array(w_np, ctx) w = tvm.nd.array(w_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
with tvm.build_config(auto_unroll_max_step=32, with tvm.build_config(auto_unroll_max_step=128,
auto_unroll_min_depth=0, unroll_explicit=(device != "cuda")):
unroll_explicit=device == 'rocm'):
func1 = tvm.build(s1, [A, W, B], device) func1 = tvm.build(s1, [A, W, B], device)
func2 = tvm.build(s2, [A, W, C], device) func2 = tvm.build(s2, [A, W, C], device)
func1(a, w, b) func1(a, w, b)
......
...@@ -41,9 +41,8 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p ...@@ -41,9 +41,8 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p
w = tvm.nd.array(w_np, ctx) w = tvm.nd.array(w_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
with tvm.build_config(auto_unroll_max_step=32, with tvm.build_config(auto_unroll_max_step=128,
auto_unroll_min_depth=0, unroll_explicit=(device != "cuda")):
unroll_explicit=device == 'rocm'):
func1 = tvm.build(s1, [A, W, B], device) func1 = tvm.build(s1, [A, W, B], device)
func2 = tvm.build(s2, [A, W, C], device) func2 = tvm.build(s2, [A, W, C], device)
func1(a, w, b) func1(a, w, b)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment