Commit 0f1e0ff0 by Tianqi Chen (committed by GitHub)

[PASS] More robust UnrollLoop configuration (#576)

parent 69759c0c
@@ -3,8 +3,11 @@ TVM Change Log
 This file records the changes in TVM library in reverse chronological order.
 
-## 0.1rc
+## On-going version
+- UnrollLoop: more robust version of the unroll-loop pass; counts the maximum number of steps that can be unrolled.
+
+## 0.1rc
 - Language runtime
 - python
 - javascript
......
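For intuition about the "maximum steps" wording above, here is a minimal sketch (my illustration, not code from this commit) of the counting rule: each Store/Evaluate in a loop body is one step, and a loop of extent v multiplies the step count of its body by v.

    # illustration only: step counting for a loop nest (extents are made up)
    def nest_steps(extents, body_steps=1):
        steps = body_steps
        for v in extents:
            steps *= v
        return steps

    # a 4 x 8 nest around one store has 32 steps, so it is auto-unrolled
    # only when auto_unroll_max_step >= 32
    assert nest_steps([4, 8]) == 32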
@@ -28,7 +28,7 @@ class BuildConfig(object):
     current = None
     defaults = {
         "auto_unroll_max_step": 0,
-        "auto_unroll_min_depth": 1,
+        "auto_unroll_max_depth": 4,
         "unroll_explicit": True,
         "detect_global_barrier": False,
         "offset_factor": 0,
@@ -72,10 +72,11 @@ def build_config(**kwargs):
     Parameters
     ----------
     auto_unroll_max_step: int, default=0
-        Threshold of loop extent to be automatically unrolled.
+        Threshold on the total number of steps in the loop to be automatically unrolled.
+        This takes the inner loop count into consideration.
-    auto_unroll_min_depth: int, default=1
-        The minimum loop nest level before the loop can be automatically unrolled.
+    auto_unroll_max_depth: int, default=4
+        The maximum nesting level of loops that can be automatically unrolled.
     unroll_explicit: bool, default=True
         Whether to explicitly unroll the loop; if set to False, the unroll hint will
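As a usage sketch of the renamed knobs (the schedule s and tensors A, B here are hypothetical placeholders, not part of this diff):

    import tvm
    # hypothetical build under the new configuration
    with tvm.build_config(auto_unroll_max_step=128,   # total steps allowed per nest
                          auto_unroll_max_depth=4,    # across at most 4 nested loops
                          unroll_explicit=True):      # emit the unrolled body directly
        f = tvm.build(s, [A, B], "cuda")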
@@ -221,7 +222,7 @@ def lower(sch,
         stmt = ir_pass.UnrollLoop(
             stmt,
             cfg.auto_unroll_max_step,
-            cfg.auto_unroll_min_depth,
+            cfg.auto_unroll_max_depth,
             cfg.unroll_explicit)
         for f in lower_phase1:
             stmt = f(stmt)
......
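The pass can also be invoked directly with the new four-argument order used in the lowering code above; a minimal sketch, assuming stmt is a For node with constant extent:

    # UnrollLoop(stmt, auto_max_step, auto_max_depth, explicit_unroll)
    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, True)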
@@ -18,15 +18,16 @@ namespace ir {
 class LoopUnroller : public IRMutator {
  public:
   explicit LoopUnroller(int auto_max_step,
-                        int auto_min_depth,
+                        int auto_max_depth,
                         bool explicit_unroll)
       : auto_max_step_(auto_max_step),
-        auto_min_depth_(auto_min_depth),
+        auto_max_depth_(auto_max_depth),
         explicit_unroll_(explicit_unroll) {
   }
 
   Stmt Mutate_(const For* op, const Stmt& s) {
-    Stmt stmt = s;
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    op = stmt.as<For>();
     // constant folding.
     Expr extent = ir::Simplify(op->extent);
     const IntImm* v1 = extent.as<IntImm>();
@@ -38,15 +39,27 @@ class LoopUnroller : public IRMutator {
     if (v2 != nullptr) {
       value = static_cast<int>(v2->value);
     }
-    bool auto_unroll = (op->for_type == ForType::Serial &&
-                        value >= 0 && value <= auto_max_step_ &&
-                        loop_depth_ >= auto_min_depth_);
+    // condition for auto unroll
+    bool auto_unroll = (
+        op->for_type == ForType::Serial &&
+        normal_loop_depth_ == 0 &&
+        value >= 0 &&
+        unroll_depth_ <= auto_max_depth_ &&
+        value * step_count_ <= auto_max_step_);
 
     if (op->for_type == ForType::Unrolled) {
       CHECK_GE(value, 0)
           << "Cannot unroll non-constant loop";
       auto_unroll = true;
     }
 
+    if (auto_unroll) {
+      step_count_ *= value;
+      unroll_depth_ += 1;
+    } else {
+      normal_loop_depth_ += 1;
+    }
     if (auto_unroll && explicit_unroll_) {
       using arith::ComputeExpr;
       if (value == 0) return Evaluate::make(0);
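To make the new gate concrete: since the mutator now visits children first, inner loops are measured before outer ones. A toy Python walk of value * step_count_ <= auto_max_step_ (my illustration, assuming a body with a single Store, so step_count_ is 1 when the inner loop is reached; extents are made up):

    # toy model of the gate, innermost loop first
    auto_max_step, step_count = 8, 1
    for value in [4, 2]:              # inner extent 4, then outer extent 2
        assert value * step_count <= auto_max_step
        step_count *= value           # mirrors `step_count_ *= value;`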
@@ -65,42 +78,72 @@ class LoopUnroller : public IRMutator {
           unrolled = step;
         }
       }
-      ++loop_depth_;
-      Stmt ret = this->Mutate(unrolled);
-      --loop_depth_;
-      return ret;
+      return unrolled;
     } else {
-      ++loop_depth_;
-      Stmt ret = IRMutator::Mutate_(op, stmt);
       if (auto_unroll) {
-        op = ret.as<For>();
         if (op->for_type != ForType::Unrolled) {
-          ret = For::make(
+          return For::make(
               op->loop_var, op->min, op->extent,
               ForType::Unrolled, op->device_api, op->body);
         }
       }
-      --loop_depth_;
-      return ret;
+      return stmt;
     }
   }
 
+  Stmt Mutate_(const Store* op, const Stmt& stmt) final {
+    ++step_count_;
+    return IRMutator::Mutate_(op, stmt);
+  }
+
+  Stmt Mutate_(const Evaluate* op, const Stmt& stmt) final {
+    ++step_count_;
+    return IRMutator::Mutate_(op, stmt);
+  }
+
+  Stmt Mutate_(const Block* op, const Stmt& stmt) final {
+    Stmt first = this->Mutate(op->first);
+    // cleanup state
+    int step_count = step_count_;
+    int unroll_depth = unroll_depth_;
+    int normal_loop_depth = normal_loop_depth_;
+    step_count_ = 0;
+    unroll_depth_ = 0;
+    normal_loop_depth_ = 0;
+    // work on the rest part
+    Stmt rest = this->Mutate(op->rest);
+    step_count_ += step_count;
+    normal_loop_depth_ = std::max(normal_loop_depth, normal_loop_depth_);
+    unroll_depth_ = std::max(unroll_depth_, unroll_depth);
+    if (first.same_as(op->first) &&
+        rest.same_as(op->rest)) {
+      return stmt;
+    } else {
+      return Block::make(first, rest);
+    }
+  }
+
  private:
+  // maximum number of steps to perform auto unroll.
   int auto_max_step_;
-  int auto_min_depth_;
+  int auto_max_depth_;
   bool explicit_unroll_;
-  int loop_depth_{0};
+  // number of normal loops in scope.
+  int normal_loop_depth_{0};
+  // number of unrolled cases in current scope.
+  int unroll_depth_{0};
+  // number of total steps unrolled.
+  int step_count_{0};
 };
 
 Stmt UnrollLoop(Stmt stmt,
                 int auto_max_step,
-                int auto_min_depth,
+                int auto_max_depth,
                 bool explicit_unroll) {
   Stmt ret = LoopUnroller(
       auto_max_step,
-      auto_min_depth,
+      auto_max_depth,
       explicit_unroll).Mutate(stmt);
   if (!ret.same_as(stmt)) {
     return ConvertSSA(ret);
......
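A note on the Block handler above: both halves of a Block execute, so their step counts add, while the depth counters describe sibling scopes and therefore take the maximum. A toy model of that bookkeeping (my illustration; values are made up):

    # toy model of combining the two halves of a Block
    first = dict(steps=3, unroll_depth=1, normal_depth=0)
    rest = dict(steps=5, unroll_depth=2, normal_depth=1)
    combined = dict(
        steps=first["steps"] + rest["steps"],                           # sums
        unroll_depth=max(first["unroll_depth"], rest["unroll_depth"]),  # max
        normal_depth=max(first["normal_depth"], rest["normal_depth"]))  # max
    assert combined == dict(steps=8, unroll_depth=2, normal_depth=1)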
@@ -14,11 +14,14 @@ def test_unroll_loop():
             tvm.make.Load(dtype, Ab.data, i) + 1,
             j + 1)))
     assert isinstance(stmt, tvm.stmt.For)
-    ret = tvm.ir_pass.UnrollLoop(stmt, 2, 0, True)
+    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, True)
     assert not isinstance(ret, tvm.stmt.For)
-    ret = tvm.ir_pass.UnrollLoop(stmt, 4, 0, False)
+    ret = tvm.ir_pass.UnrollLoop(stmt, 15, 8, True)
     assert isinstance(ret, tvm.stmt.For)
+    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, False)
+    assert isinstance(ret, tvm.stmt.For)
+    assert ret.for_type == tvm.stmt.For.Unrolled
 
 if __name__ == "__main__":
     test_unroll_loop()
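Reading the asserts, the 16/15 pair evidently brackets the nest's total step count: with auto_unroll_max_step=16 the whole nest fits and the For disappears, while 15 is one step short and the loop survives. The third call exercises explicit_unroll=False: the loop is kept but re-tagged as For.Unrolled so the backend can emit its own unroll hint.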
@@ -112,8 +112,7 @@ def test_depthwise_conv2d_nchw():
     print("success")
 
     for device in ['cuda', 'opencl', 'rocm']:
-        with tvm.build_config(auto_unroll_max_step=32,
-                              auto_unroll_min_depth=0,
+        with tvm.build_config(auto_unroll_max_step=128,
                               unroll_explicit=device == 'rocm',
                               detect_global_barrier=False,
                               restricted_func=True):
@@ -202,9 +201,7 @@ def test_depthwise_conv2d_nhwc():
     print("success")
 
     for device in ['cuda', 'opencl', 'rocm']:
-        with tvm.build_config(auto_unroll_max_step=32,
-                              auto_unroll_min_depth=0,
-                              unroll_explicit=device == 'rocm',
+        with tvm.build_config(auto_unroll_max_step=128,
                               detect_global_barrier=False,
                               restricted_func=True):
             check_device(device)
......
@@ -60,8 +60,7 @@ def test_conv2d_hwcn_map():
         w = tvm.nd.array(w_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        with tvm.build_config(auto_unroll_max_step=32,
-                              auto_unroll_min_depth=0,
+        with tvm.build_config(auto_unroll_max_step=128,
                               unroll_explicit=device == 'rocm'):
             func1 = tvm.build(s1, [A, W, B], device)
             func1(a, w, b)
......
@@ -80,6 +80,7 @@ def test_gemm():
         s[CC].reorder(ko, kt, ki, yo, xo)
         s[AA].compute_at(s[CC], ko)
         s[BB].compute_at(s[CC], ko)
+        s[CC].unroll(kt)
         s[AL].compute_at(s[CC], kt)
         s[BL].compute_at(s[CC], kt)
         # Schedule for A's shared memory load
@@ -125,9 +126,8 @@ def test_gemm():
     GFLOPS = num_flops / (t * 1e3) / 1e6
     print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS))
 
-    for device in ["cuda", "opencl", "rocm"]:
-        with tvm.build_config(auto_unroll_max_step=32,
-                              auto_unroll_min_depth=0,
+    for device in ["cuda", "opencl", "rocm", "nvptx"]:
+        with tvm.build_config(auto_unroll_max_step=128,
                               unroll_explicit=(device != "cuda")):
             check_device(device)
......
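The added s[CC].unroll(kt) marks kt as ForType::Unrolled at schedule time, which the pass always honors regardless of the auto thresholds. A minimal standalone sketch of the same primitive on a toy compute (names are illustrative, using the tvm API of this era):

    import tvm
    n = 16
    A = tvm.placeholder((n,), name="A")
    B = tvm.compute((n,), lambda i: A[i] * 2.0, name="B")
    s = tvm.create_schedule(B.op)
    io, ii = s[B].split(B.op.axis[0], factor=4)
    s[B].unroll(ii)  # ii is lowered as ForType::Unrolled
    print(tvm.lower(s, [A, B], simple_mode=True))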
@@ -112,7 +112,6 @@ def rnn_matexp():
     def check_device(target):
         with tvm.build_config(
                 detect_global_barrier=detect_global_barrier,
-                auto_unroll_min_depth=2,
                 auto_unroll_max_step=128,
                 unroll_explicit=False):
             f = tvm.build(s, [s_scan, Whh], target)
......
@@ -39,9 +39,8 @@ def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, p
     w = tvm.nd.array(w_np, ctx)
     b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
     c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-    with tvm.build_config(auto_unroll_max_step=32,
-                          auto_unroll_min_depth=0,
-                          unroll_explicit=device == 'rocm'):
+    with tvm.build_config(auto_unroll_max_step=128,
+                          unroll_explicit=(device != "cuda")):
         func1 = tvm.build(s1, [A, W, B], device)
         func2 = tvm.build(s2, [A, W, C], device)
         func1(a, w, b)
......
@@ -41,9 +41,8 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p
     w = tvm.nd.array(w_np, ctx)
     b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
     c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-    with tvm.build_config(auto_unroll_max_step=32,
-                          auto_unroll_min_depth=0,
-                          unroll_explicit=device == 'rocm'):
+    with tvm.build_config(auto_unroll_max_step=128,
+                          unroll_explicit=(device != "cuda")):
         func1 = tvm.build(s1, [A, W, B], device)
         func2 = tvm.build(s2, [A, W, C], device)
         func1(a, w, b)
......