Commit 8f56949b by Ruizhe Zhao (Vincent) Committed by Wuwei Lin

Fixed issue #3069 by checking op tag (#3070)

* Fixed issue #3069 by adding in_channels

* Registered group_conv2d_nchw as topi compute

* Improved by checking tag value

* Removed group_conv2d_nchw topi registration

* Added test for relay group_conv2d_nchw

* Added assertions to forbid small group size

* Removed hard-coded oc_block_factor

* Added explanatory comments to group_conv2d_nchw_cuda

* Updated group_conv2d_nchw_cuda schedule

Removed 'direct' CUDA tests

* Reverted an accidental change in a conv2d test

* Fixed indentation problems

* Fixed a mis-commented line

* Reverted change in group_conv2d_nchw tag

* Removed commented int8 group_conv2d test

* Fixed group size assertions in group_conv2d_nchw_cuda
parent 7e68d63f
@@ -169,7 +169,6 @@ class Conv(OnnxOpConverter):
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
         # get number of channels
         out = AttrCvt(op_name=dimension_picker('conv'),
                       transforms={
                           'kernel_shape': 'kernel_size',
......
@@ -296,7 +296,7 @@ def override_native_generic_func(func_name):
 def generic_func(fdefault):
     """Wrap a target generic function.
-    Generic function allows registeration of further functions
+    Generic function allows registration of further functions
     that can be dispatched on current target context.
     If no registered dispatch is matched, the fdefault will be called.
......
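The docstring above describes TVM's target generic function mechanism: a default implementation plus per-target overrides dispatched on the current target context. A minimal sketch of that registration pattern, assuming the tvm.target API of this TVM version (function names here are made up, not from the patch):

import tvm

@tvm.target.generic_func
def my_schedule(outs):
    # fdefault: used when no target-specific override matches
    return "generic"

@my_schedule.register(["cuda", "gpu"])
def my_schedule_cuda(outs):
    # dispatched when the current target context is cuda/gpu
    return "cuda"

with tvm.target.create("cuda"):
    assert my_schedule(None) == "cuda"   # override wins under a cuda target
assert my_schedule(None) == "generic"    # no target context -> fdefault is called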
@@ -86,9 +86,13 @@ def test_conv2d_run():
                         fref=None,
                         groups=1,
                         dilation=(1, 1),
+                        except_targets=None,
                         **attrs):
-        x = relay.var("x", shape=dshape)
-        w = relay.var("w")
+        if except_targets is None:
+            except_targets = []
+
+        x = relay.var("x", shape=dshape, dtype=dtype)
+        w = relay.var("w", dtype=dtype)
         y = relay.nn.conv2d(x, w,
                             padding=padding,
                             dilation=dilation,
@@ -100,11 +104,15 @@ def test_conv2d_run():
         dkernel = topi.testing.dilate_python(kernel, (1, 1) + dilation)
         if fref is None:
             ref_res = topi.testing.conv2d_nchw_python(
-                data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding)
+                data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding,
+                groups=groups)
         else:
             ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype))
 
         for target, ctx in ctx_list():
+            if target in except_targets:
+                continue
             intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
             op_res1 = intrp1.evaluate(func)(data, kernel)
             tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -117,6 +125,21 @@ def test_conv2d_run():
                     fref=lambda x, w: topi.testing.depthwise_conv2d_python_nchw(
                         x, w, (1, 1), "SAME"))
 
+    # CUDA is disabled for 'direct' schedule:
+    # https://github.com/dmlc/tvm/pull/3070#issuecomment-486597553
+    # group conv2d
+    dshape = (1, 32, 18, 18)
+    kshape = (32, 4, 3, 3)
+    run_test_conv2d("float32", "float32", 1, dshape, kshape,
+                    padding=(1, 1), channels=32, groups=8, kernel_size=(3, 3),
+                    except_targets=['cuda'])
+
+    # also group conv2d
+    dshape = (1, 32, 18, 18)
+    kshape = (64, 1, 3, 3)
+    run_test_conv2d("float32", "float32", 1, dshape, kshape,
+                    padding=(1, 1), channels=64, groups=32, kernel_size=(3, 3),
+                    except_targets=['cuda'])
+
     # normal conv2d
     dshape = (1, 3, 224, 224)
     kshape = (10, 3, 3, 3)
......
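For reference, in the grouped layout exercised by these new tests, kshape[0] is the total number of output channels and kshape[1] is in_channels // groups, so (32, 4, 3, 3) with groups=8 gives 4 input and 4 output channels per group. The channel slicing can be made explicit with a small NumPy reference (an illustrative sketch only, stride 1 and no padding; this is not the topi.testing helper used by the test):

import numpy as np

def group_conv2d_nchw_ref(data, weight, groups):
    # data: (N, C_in, H, W), weight: (C_out, C_in // groups, KH, KW)
    n, cin, h, w = data.shape
    cout, cin_per_group, kh, kw = weight.shape
    assert cin == cin_per_group * groups and cout % groups == 0
    cout_per_group = cout // groups
    oh, ow = h - kh + 1, w - kw + 1
    out = np.zeros((n, cout, oh, ow), dtype=data.dtype)
    for g in range(groups):
        # each group sees only its own slice of input and output channels
        ds = data[:, g * cin_per_group:(g + 1) * cin_per_group]
        ws = weight[g * cout_per_group:(g + 1) * cout_per_group]
        for oc in range(cout_per_group):
            for y in range(oh):
                for x in range(ow):
                    out[:, g * cout_per_group + oc, y, x] = np.sum(
                        ds[:, :, y:y + kh, x:x + kw] * ws[oc], axis=(1, 2, 3))
    return out

# e.g. data (1, 32, 18, 18) and weight (32, 4, 3, 3) with groups=8
# produce an output of shape (1, 32, 16, 16) without padding.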
@@ -27,10 +27,13 @@ from ..util import traverse_inline, get_const_tuple, get_const_int
 from .. import nn, generic
 
-@autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], ['direct', 'int8'])
+autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], 'direct',
+                              nn.group_conv2d_nchw.fdefault)
+
+@autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], ['int8'])
 def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
                            out_dtype='float32'):
-    """Group convolution operator in NCHW layout.
+    """Group convolution operator for 'group_conv2d_NCHWc_int8'.
 
     Parameters
     ----------
@@ -76,7 +79,7 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
     assert out_channels % groups == 0, "output channels must divide group size"
     assert channels % ic_block_factor == 0, \
         "Number of input channels per group must divide {}".format(ic_block_factor)
-    assert out_channels % 4 == 0, \
+    assert out_channels % oc_block_factor == 0, \
         "Number of output channels per group must divide {}".format(oc_block_factor)
 
     packed_data = tvm.compute((batch, channels // ic_block_factor, height, width,
@@ -99,6 +102,17 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
     oc_chunk, _, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
         packed_kernel.shape)
 
+    # TODO(kumasento): these assertions ensure that the number of groups is
+    # smaller than or equal to the number of blocks, so that each group has
+    # at least one block.
+    # Shall we pad the channels to avoid raising these assertions?
+    assert groups <= oc_chunk, \
+        ('Number of groups {} should be less than '
+         'output channel chunk size {}'.format(groups, oc_chunk))
+    assert groups <= ic_chunk, \
+        ('Number of groups {} should be less than '
+         'input channel chunk size {}'.format(groups, ic_chunk))
+
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
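The arithmetic behind these assertions, with illustrative numbers (not taken from the patch): channels are packed into blocks of ic_block_factor / oc_block_factor lanes, and every group must own at least one whole block.

# Assumed example sizes; only meant to show when the assertions would fire.
channels, out_channels = 32, 32
ic_block_factor, oc_block_factor = 4, 4
ic_chunk = channels // ic_block_factor       # 8 input-channel blocks
oc_chunk = out_channels // oc_block_factor   # 8 output-channel blocks
for groups in (4, 8, 16):
    ok = groups <= oc_chunk and groups <= ic_chunk
    print(groups, "ok" if ok else "would trip the new assertions")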
@@ -109,9 +123,9 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
     else:
         dilation_h, dilation_w = dilation
 
     # pad the input data
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
         padding, (kernel_h, kernel_w))
     # compute graph
     pad_before = [0, 0, pad_top, pad_left, 0]
     pad_after = [0, 0, pad_down, pad_right, 0]
     pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")
@@ -129,6 +143,17 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
     kh = tvm.reduce_axis((0, kernel_h), name='kh')
     kw = tvm.reduce_axis((0, kernel_w), name='kw')
 
+    # NOTE(kumasento): explanation of this snippet -
+    # oc_chunk//groups and ic_chunk//groups give the number of blocks,
+    # i.e. chunks, per group.
+    # occ is the ID of the output channel block, so occ//(oc_chunk//groups)
+    # gives the ID of the group it belongs to.
+    # Multiplying that result by ic_chunk//groups gives the ID of the first
+    # block of the corresponding input group.
+    # Adding the block offset (icc) then gives the exact block ID.
+    #
+    # Compared with a normal convolution, group convolution only sums over
+    # the input channels of the group that an output channel resides in.
     conv = tvm.compute(oshape, lambda n, occ, oh, ow, ocb:
         tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc,
                          oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb]
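A worked example of the index arithmetic described in that comment, with illustrative sizes (8 output blocks, 8 input blocks, 4 groups; not values from the patch):

oc_chunk, ic_chunk, groups = 8, 8, 4
ic_per_group = ic_chunk // groups   # input blocks owned by each group
for occ in range(oc_chunk):
    group = occ // (oc_chunk // groups)   # group of this output block
    first_icc = group * ic_per_group      # first input block of that group
    summed = [first_icc + icc for icc in range(ic_per_group)]
    print("occ=%d -> group %d sums input blocks %s" % (occ, group, summed))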
@@ -138,8 +163,10 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
                 .astype('int32'),
                 axis=[icc, kh, kw, icb]))
 
+    # Type conversion
     output = tvm.compute(oshape, lambda *index: conv(*index).astype(out_dtype),
                          tag='group_conv2d_NCHWc_int8')
+
     num_flop = batch * oc_chunk * oc_block * out_height * out_width * \
         ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups
     cfg.add_flop(num_flop)
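The num_flop expression counts one multiply and one add per reduction element; a quick arithmetic check with illustrative sizes (not taken from the patch):

batch, groups = 1, 8
oc_chunk, oc_block = 8, 4            # 32 output channels in total
ic_chunk, ic_block = 8, 4            # 32 input channels in total
out_height = out_width = 18
kernel_h = kernel_w = 3

num_flop = batch * oc_chunk * oc_block * out_height * out_width * \
    ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups

# Equivalent view: every output element reduces over
# (in_channels // groups) * kernel_h * kernel_w multiply-add pairs.
per_output = (ic_chunk * ic_block // groups) * kernel_h * kernel_w * 2
assert num_flop == batch * (oc_chunk * oc_block) * out_height * out_width * per_output
print(num_flop)   # 746496 for these sizes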
@@ -295,7 +322,7 @@ def schedule_group_conv2d_NCHWc_int8(cfg, s, output):
 @autotvm.register_topi_schedule(generic.schedule_group_conv2d_nchw,
-                                ["cuda", "gpu"], ["direct", "int8"])
+                                ["cuda", "gpu"], ["int8"])
 def schedule_conv2d_nchw_cuda(cfg, outs):
     """TOPI schedule callback of group conv2d for cuda gpu
......
@@ -242,7 +242,7 @@ def schedule_depthwise_conv2d_NCHWc(outs):
 @tvm.target.generic_func
 def schedule_group_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw
+    """Schedule for group_conv2d_nchw
 
     Parameters
     ----------
......
@@ -603,4 +603,4 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp
                  yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w].astype(out_dtype) *
             Filter[ff, rc, ry, rx].astype(out_dtype),
-            axis=[rc, ry, rx]), tag="conv2d_nchw")
+            axis=[rc, ry, rx]), tag='group_conv2d_nchw')
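For context on why this tag matters (the "checking op tag" fix in the commit title): TOPI schedules typically walk the op graph and match on op.tag to find the convolution to schedule, so the generic compute must carry the 'group_conv2d_nchw' tag rather than 'conv2d_nchw'. A minimal sketch of that dispatch pattern, assuming the TVM/TOPI API of this era (a hypothetical schedule, not the one in this patch):

import tvm
from topi.util import traverse_inline

def schedule_group_conv2d_sketch(outs):
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        # Dispatch purely on the tag attached by the compute definition.
        if 'group_conv2d_NCHWc_int8' in op.tag:
            pass  # apply the int8 NCHWc schedule to op.output(0) here
        elif 'group_conv2d_nchw' in op.tag:
            pass  # apply the plain NCHW schedule to op.output(0) here

    traverse_inline(s, outs[0].op, _callback)
    return s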