Commit 12218358 authored by masahi, committed by Tianqi Chen

Update topi/cuda schedules to use target.max_num_threads (#577)

* update topi/cuda schedules to use target.max_num_threads

* allow num_thread to be larger than cuda.max_num_threads

* remove get_max_num_threads and make it inline
parent 0f1e0ff0
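Every hunk below follows the same recipe: a hardcoded thread count (usually 512) is replaced by a query of the target currently in scope. As a rough illustration of that recipe (this is not code from the commit; the toy compute, sizes, and names are made up), a schedule can split a fused axis by the target's thread limit and bind the pieces to blockIdx.x / threadIdx.x:

import tvm

# Toy elementwise compute, only to have something to schedule.
n = 1024
A = tvm.placeholder((n,), name="A")
B = tvm.compute((n,), lambda i: A[i] + 1, name="B")
s = tvm.create_schedule(B.op)

# max_num_threads is a property of the target, so the query only works
# while a target is in scope; allow_none=False turns a missing target
# into an explicit error instead of a silent fallback.
with tvm.target.create("cuda"):
    num_thread = tvm.target.current_target(allow_none=False).max_num_threads
    bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(tx, tvm.thread_axis((0, num_thread), "threadIdx.x"))

Inside the with block, current_target() returns the cuda target, so the thread count reflects that target's limit rather than a number baked into the schedule. The same query is what the hunks below drop into the topi/cuda conv2d, depthwise_conv2d, injective, pooling, and reduction schedules.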
@@ -36,7 +36,7 @@ def conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L):
     s[temp_S].compute_at(s[Out_L], ic)
     s[Filter_S].compute_at(s[Out_L], w)
-    num_thread1 = 512
+    num_thread1 = tvm.target.current_target(allow_none=False).max_num_threads
     thread_xx = tvm.thread_axis((0, num_thread1), "threadIdx.x")
     block_xx = tvm.thread_axis("blockIdx.x")
@@ -116,7 +116,7 @@ def conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag):
     s[temp_S].compute_at(s[Out_L], oic)
     s[Filter_S].compute_at(s[Out_L], dw)
-    num_thread = 512
+    num_thread = tvm.target.current_target(allow_none=False).max_num_threads
     thread_xx = tvm.thread_axis((0, num_thread), "threadIdx.x")
     block_xx = tvm.thread_axis("blockIdx.x")
@@ -186,15 +186,18 @@ def conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag):
     ic, dh, dw = s[Out_L].op.reduce_axis
     oic, iic = s[Out_L].split(ic, factor=ifactor)
     s[Out_L].reorder(oic, dh, dw, iic, h, w)
+    max_num_thread = tvm.target.current_target(allow_none=False).max_num_threads
     if util.get_const_int(Filter_S.shape[1]) == 128:
         oic = s[Out_L].fuse(dh, oic)
         s[temp_S].compute_at(s[Out_L], oic)
         s[Filter_S].compute_at(s[Out_L], oic)
-        num_thread = 512
+        num_thread = max_num_thread
     else:
         s[temp_S].compute_at(s[Out_L], oic)
         s[Filter_S].compute_at(s[Out_L], dw)
         num_thread = 456
+        if max_num_thread < num_thread:
+            num_thread = max_num_thread
     thread_xx = tvm.thread_axis((0, num_thread), "threadIdx.x")
     block_xx = tvm.thread_axis("blockIdx.x")
@@ -300,7 +303,7 @@ def conv2d_14_256_256(s, temp, temp_R, temp_S, Filter, Filter_S, Out, Out_L):
     s[temp_S].compute_at(s[Out_L], oic)
     s[Filter_S].compute_at(s[Out_L], oic)
-    num_thread = 512
+    num_thread = tvm.target.current_target(allow_none=False).max_num_threads
     thread_xx = tvm.thread_axis((0, num_thread), "threadIdx.x")
     block_xx = tvm.thread_axis("blockIdx.x")
@@ -119,6 +119,7 @@ def schedule_depthwise_conv2d_nchw(outs):
     traverse(outs[0].op)
     return s

+@generic.schedule_depthwise_conv2d_nhwc.register(["cuda", "gpu"])
 def schedule_depthwise_conv2d_nhwc(outs):
     """Schedule for depthwise_conv2d nhwc forward.
@@ -151,8 +152,12 @@ def schedule_depthwise_conv2d_nhwc(outs):
         b, h, w, c = s[Output].op.axis
-        ic_val = tvm.ir_pass.Simplify(temp.shape[3]).value
-        xoc, xic = s[Output].split(c, factor=ic_val)
+        # num_thread here could be 728, which is larger than cuda.max_num_threads
+        num_thread = tvm.ir_pass.Simplify(temp.shape[3]).value
+        target = tvm.target.current_target()
+        if target and target.target_name != "cuda":
+            num_thread = target.max_num_threads
+        xoc, xic = s[Output].split(c, factor=num_thread)
         s[Output].reorder(xoc, b, h, w, xic)
         xo, yo, _, _ = s[Output].tile(h, w, x_factor=2, y_factor=2)
         fused = s[Output].fuse(yo, xo)
@@ -6,9 +6,7 @@ from .. import generic
 def _schedule_injective(op, sch):
     x = op.output(0)
     fused = sch[x].fuse(*sch[x].op.axis)
-    target = tvm.target.current_target()
-    target = target if target else tvm.target.cuda()
-    num_thread = target.max_num_threads
+    num_thread = tvm.target.current_target(allow_none=False).max_num_threads
     bx, tx = sch[x].split(fused, factor=num_thread)
     sch[x].bind(bx, tvm.thread_axis("blockIdx.x"))
     sch[x].bind(tx, tvm.thread_axis("threadIdx.x"))
@@ -84,7 +84,7 @@ def schedule_pool(outs):
     s = tvm.create_schedule([x.op for x in outs])
     def _schedule(PaddedInput, Pool):
         s[PaddedInput].compute_inline()
-        num_thread = 512
+        num_thread = tvm.target.current_target(allow_none=False).max_num_threads
         if Pool.op in s.outputs:
             Out = Pool
             OL = s.cache_write(Pool, "local")
@@ -16,12 +16,17 @@ def _schedule_reduce(op, sch, is_idx_reduce=False):
     if len(sch[data_out].op.axis) > 0:
         all_reduce = False
         num_thread = 32
+        target = tvm.target.current_target()
+        if target and target.target_name == "opencl":
+            # without this, CL_INVALID_WORK_GROUP_SIZE occurred when running test_topi_reduce.py;
+            # the root cause is not known
+            num_thread = 16
         block_x = tvm.thread_axis("blockIdx.x")
         thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
         thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y")
     else:
         all_reduce = True
-        num_thread = 512
+        num_thread = tvm.target.current_target(allow_none=False).max_num_threads
         thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
     # Fuse and refactor the reduce axis
@@ -54,6 +54,23 @@ def schedule_depthwise_conv2d_nchw(outs):

+@tvm.target.generic_func
+def schedule_depthwise_conv2d_nhwc(outs):
+    """Schedule for depthwise_conv2d nhwc.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of depthwise_conv2d in the format
+        of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
 @tvm.target.generic_func
 def schedule_reduce(outs):
     """Schedule for reduction
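The two depthwise changes above, the @generic.schedule_depthwise_conv2d_nhwc.register(["cuda", "gpu"]) decorator in topi/cuda and the new @tvm.target.generic_func fallback in topi/generic, are the two halves of TVM's generic-function dispatch. A minimal, self-contained sketch of that mechanism, with made-up function names and assuming the 0.x-era tvm.target API used throughout this diff:

import tvm

@tvm.target.generic_func
def schedule_example(outs):
    # Default schedule, used when no specialized version is registered
    # for the target currently in scope.
    return tvm.create_schedule([x.op for x in outs])

@schedule_example.register(["cuda", "gpu"])
def schedule_example_cuda(outs):
    # Specialized version; it may query the target, e.g. for max_num_threads.
    s = tvm.create_schedule([x.op for x in outs])
    num_thread = tvm.target.current_target(allow_none=False).max_num_threads
    # ... split/bind axes using num_thread ...
    return s

A = tvm.placeholder((1024,), name="A")
B = tvm.compute((1024,), lambda i: A[i] + 1, name="B")
with tvm.target.create("cuda"):
    s = schedule_example([B])  # dispatches to the cuda/gpu override

This is also why the test changes below build their schedules inside with tvm.target.create(device): both the dispatch and the max_num_threads lookup depend on the target scope.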
@@ -32,7 +32,6 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu
             s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
             s2 = topi.generic.schedule_depthwise_conv2d_nchw(ScaleShift)
             s3 = topi.generic.schedule_depthwise_conv2d_nchw(Relu)
-        ctx = tvm.context(device, 0)
         # build the kernels
         f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
@@ -107,14 +106,16 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu
     ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift)
     Relu = topi.nn.relu(ScaleShift)
-    # schedule
-    s1 = schedule_depthwise_conv2d_nhwc(DepthwiseConv2d)
-    s2 = schedule_depthwise_conv2d_nhwc(ScaleShift)
-    s3 = schedule_depthwise_conv2d_nhwc(Relu)

     def check_device(device):
         if not tvm.module.enabled(device):
             print("Skip because %s is not enabled" % device)
             return
+        with tvm.target.create(device):
+            s1 = topi.generic.schedule_depthwise_conv2d_nhwc(DepthwiseConv2d)
+            s2 = topi.generic.schedule_depthwise_conv2d_nhwc(ScaleShift)
+            s3 = topi.generic.schedule_depthwise_conv2d_nhwc(Relu)
         ctx = tvm.context(device, 0)
         # build the kernels
         f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
@@ -8,7 +8,6 @@ from topi.util import get_const_tuple
 def verify_relu(m, n):
     A = tvm.placeholder((m, n), name='A')
     B = topi.nn.relu(A)
-    s = topi.cuda.schedule_elemwise(B)

     a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
     b_np = a_np * (a_np > 0)
@@ -17,6 +16,8 @@ def verify_relu(m, n):
         if not tvm.module.enabled(device):
             print("Skip because %s is not enabled" % device)
             return
+        with tvm.target.create(device):
+            s = topi.generic.schedule_elemwise(B)
         ctx = tvm.context(device, 0)
         a = tvm.nd.array(a_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
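For completeness, a sketch of the full test flow the last two hunks move to, assembled from the pieces shown above (the array sizes and device list here are arbitrary, and this is an illustration rather than the exact test file):

import numpy as np
import tvm
import topi
from topi.util import get_const_tuple

def verify_relu(m, n):
    A = tvm.placeholder((m, n), name='A')
    B = topi.nn.relu(A)
    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = a_np * (a_np > 0)

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        # The schedule is created inside the target scope so that
        # topi.generic dispatches to the backend-specific implementation.
        with tvm.target.create(device):
            s = topi.generic.schedule_elemwise(B)
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        f = tvm.build(s, [A, B], device)
        f(a, b)
        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)

    for device in ['cuda', 'opencl']:
        check_device(device)

verify_relu(32, 64)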