Commit 8f56949b by Ruizhe Zhao (Vincent) Committed by Wuwei Lin

Fixed issue #3069 by checking op tag (#3070)

* Fixed issue #3069 by adding in_channels

* Registered group_conv2d_nchw as topi compute

* Improved by checking tag value

* Removed group_conv2d_nchw topi registration

* Added test for relay group_conv2d_nchw

* Added assertions to forbid small group size

* Removed hard-coded oc_block_factor

* Added explanatory comments to group_conv2d_nchw_cuda

* Updated group_conv2d_nchw_cuda schedule

Removed 'direct' CUDA tests

* Reverted an accidental change in a conv2d test

* Fixed indentation problems

* Fixed a mis-commented line

* Reverted change in group_conv2d_nchw tag

* Removed commented int8 group_conv2d test

* Fixed group size assertions in group_conv2d_nchw_cuda
parent 7e68d63f
......@@ -169,7 +169,6 @@ class Conv(OnnxOpConverter):
def _impl_v1(cls, inputs, attr, params):
# get number of channels
out = AttrCvt(op_name=dimension_picker('conv'),
'kernel_shape': 'kernel_size',
......@@ -296,7 +296,7 @@ def override_native_generic_func(func_name):
def generic_func(fdefault):
"""Wrap a target generic function.
Generic function allows registeration of further functions
Generic function allows registration of further functions
that can be dispatched on current target context.
If no registered dispatch is matched, the fdefault will be called.
......@@ -86,9 +86,13 @@ def test_conv2d_run():
dilation=(1, 1),
x = relay.var("x", shape=dshape)
w = relay.var("w")
if except_targets is None:
except_targets = []
x = relay.var("x", shape=dshape, dtype=dtype)
w = relay.var("w", dtype=dtype)
y = relay.nn.conv2d(x, w,
......@@ -100,11 +104,15 @@ def test_conv2d_run():
dkernel = topi.testing.dilate_python(kernel, (1, 1) + dilation)
if fref is None:
ref_res = topi.testing.conv2d_nchw_python(
data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding)
data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding,
ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype))
for target, ctx in ctx_list():
if target in except_targets:
intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
op_res1 = intrp1.evaluate(func)(data, kernel)
tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
......@@ -117,6 +125,21 @@ def test_conv2d_run():
fref=lambda x, w: topi.testing.depthwise_conv2d_python_nchw(
x, w, (1, 1), "SAME"))
# CUDA is disabled for 'direct' schedule:
# group conv2d
dshape = (1, 32, 18, 18)
kshape = (32, 4, 3, 3)
run_test_conv2d("float32", "float32", 1, dshape, kshape,
padding=(1, 1), channels=32, groups=8, kernel_size=(3 ,3),
# also group conv2d
dshape = (1, 32, 18, 18)
kshape = (64, 1, 3, 3)
run_test_conv2d("float32", "float32", 1, dshape, kshape,
padding=(1, 1), channels=64, groups=32, kernel_size=(3 ,3),
# normal conv2d
dshape = (1, 3, 224, 224)
kshape = (10, 3, 3, 3)
......@@ -27,10 +27,13 @@ from ..util import traverse_inline, get_const_tuple, get_const_int
from .. import nn, generic
@autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], ['direct', 'int8'])
autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], 'direct',
@autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], ['int8'])
def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
"""Group convolution operator in NCHW layout.
"""Group convolution operator for 'group_conv2d_NCHWc_int8'.
......@@ -76,7 +79,7 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
assert out_channels % groups == 0, "output channels must divide group size"
assert channels % ic_block_factor == 0, \
"Number of input channels per group must divide {}".format(ic_block_factor)
assert out_channels % 4 == 0, \
assert out_channels % oc_block_factor == 0, \
"Number of output channels per group must divide {}".format(oc_block_factor)
packed_data = tvm.compute((batch, channels // ic_block_factor, height, width,
......@@ -99,6 +102,17 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
oc_chunk, _, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
# TODO(kumasento): these assertions ensure that the number of groups
# is smaller than or equal to the number of blocks, so that each
# group will have at least one block.
# Should we pad the channels instead, to avoid raising assertions?
assert groups <= oc_chunk, \
('Number of groups {} should be less than '
'output channel chunk size {}'.format(groups, oc_chunk))
assert groups <= ic_chunk, \
('Number of groups {} should be less than '
'input channel chunk size {}'.format(groups, ic_chunk))
if isinstance(stride, int):
stride_h = stride_w = stride
......@@ -109,9 +123,9 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
dilation_h, dilation_w = dilation
# pad the input data
pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
padding, (kernel_h, kernel_w))
# compute graph
pad_before = [0, 0, pad_top, pad_left, 0]
pad_after = [0, 0, pad_down, pad_right, 0]
pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")
......@@ -129,6 +143,17 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
kh = tvm.reduce_axis((0, kernel_h), name='kh')
kw = tvm.reduce_axis((0, kernel_w), name='kw')
# NOTE(kumasento): explanation of this snippet -
# oc_chunk//groups and ic_chunk//groups give you the number of blocks,
# i.e., chunk, per group.
# occ is the ID of the output channel block, so that occ//(oc_chunk//groups)
# produces the ID of the group.
# Multiplying that result by ic_chunk//groups gives the ID
# of the first block of the corresponding input group.
# Adding the block offset (icc) will give you the exact block ID.
# Compared with a normal convolution, group convolution only sums
# input channels from the group that an output channel resides in.
conv = tvm.compute(oshape, lambda n, occ, oh, ow, ocb:
tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc,
oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb]
......@@ -138,8 +163,10 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
axis=[icc, kh, kw, icb]))
# Type conversion
output = tvm.compute(oshape, lambda *index: conv(*index).astype(out_dtype),
num_flop = batch * oc_chunk * oc_block * out_height * out_width * \
ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups
......@@ -295,7 +322,7 @@ def schedule_group_conv2d_NCHWc_int8(cfg, s, output):
["cuda", "gpu"], ["direct", "int8"])
["cuda", "gpu"], ["int8"])
def schedule_conv2d_nchw_cuda(cfg, outs):
"""TOPI schedule callback of group conv2d for cuda gpu
......@@ -242,7 +242,7 @@ def schedule_depthwise_conv2d_NCHWc(outs):
def schedule_group_conv2d_nchw(outs):
"""Schedule for conv2d_nchw
"""Schedule for group_conv2d_nchw
......@@ -603,4 +603,4 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp
yy * stride_h + ry * dilation_h,
xx * stride_w + rx * dilation_w].astype(out_dtype) *
Filter[ff, rc, ry, rx].astype(out_dtype),
axis=[rc, ry, rx]), tag="conv2d_nchw")
axis=[rc, ry, rx]), tag='group_conv2d_nchw')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment