Unverified commit 9c12ec81 by Wei Pan, committed by GitHub

[cuDNN] Add cuDNN grouped convolutions support (#5319)

Signed-off-by: Wei Pan <weip@nvidia.com>
parent a3b13973
@@ -182,7 +182,8 @@ def conv_output_shape(tensor_format,
x_shape,
w_shape,
data_dtype,
conv_dtype):
conv_dtype,
groups=1):
"""Get output shape of 2D or 3D convolution
Parameters
@@ -205,6 +206,8 @@ def conv_output_shape(tensor_format,
data type
conv_dtype: str
convolution type
groups: int
number of groups
Returns
-------
@@ -228,7 +231,8 @@ def conv_output_shape(tensor_format,
_get_np_int32_array_handle(wshape),
_get_np_int32_array_handle(oshape),
data_dtype,
conv_dtype)
conv_dtype,
groups)
return list(oshape)
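For orientation, here is a minimal usage sketch of the extended shape helper. The concrete values are hypothetical and it needs a CUDA + cuDNN enabled TVM build at runtime; format 0 is CUDNN_TENSOR_NCHW and the grouped kernel layout is [out_channel, in_channel // groups, kh, kw].

```python
# Minimal sketch (hypothetical values) of conv_output_shape with groups.
from tvm.contrib import cudnn

oshape = cudnn.conv_output_shape(0,               # tensor_format: CUDNN_TENSOR_NCHW
                                 [1, 1],          # pad
                                 [1, 1],          # stride
                                 [1, 1],          # dilation
                                 [3, 4, 32, 32],  # x_shape (NCHW)
                                 [16, 2, 3, 3],   # w_shape: in_channel // groups = 2
                                 "float32",       # data_dtype
                                 "float32",       # conv_dtype
                                 groups=2)
assert oshape == [3, 16, 32, 32]  # grouping leaves the output shape math unchanged
```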
@@ -240,7 +244,8 @@ def conv_find_algo(tensor_format,
w_shape,
y_shape,
data_dtype,
conv_dtype):
conv_dtype,
groups=1):
"""Choose the best algo for the given input.
Parameters
@@ -265,6 +270,8 @@ def conv_find_algo(tensor_format,
data type
conv_dtype: str
convolution type
groups: int
number of groups
Returns
-------
@@ -287,7 +294,8 @@ def conv_find_algo(tensor_format,
_get_np_int32_array_handle(wshape),
_get_np_int32_array_handle(yshape),
data_dtype,
conv_dtype)
conv_dtype,
groups)
def conv_forward(x,
@@ -298,7 +306,8 @@ def conv_forward(x,
conv_mode,
tensor_format,
algo,
conv_dtype):
conv_dtype,
groups=1):
"""Create an extern op that compute 2D or 3D convolution with CuDNN
Parameters
@@ -325,6 +334,8 @@ def conv_forward(x,
if algo == -1, the best algo will be chosen by CUDNN
conv_dtype: str
convolution type
groups: int
the number of groups
Returns
-------
@@ -335,8 +346,7 @@ def conv_forward(x,
assert dims in (4, 5)
conv_dtype = x.dtype if conv_dtype is None else conv_dtype
pad, stride, dilation, _, _ = \
_prepare_global_func_params(dims - 2, pad, stride, dilation)
pad, stride, dilation, _, _ = _prepare_global_func_params(dims - 2, pad, stride, dilation)
oshape = conv_output_shape(tensor_format,
pad,
@@ -345,7 +355,8 @@ def conv_forward(x,
list(x.shape),
list(w.shape),
x.dtype,
conv_dtype)
conv_dtype,
groups)
if algo == -1:
# For now, if we try to call `cudnnFindConvolutionForwardAlgorithm` when
# using the INT8 data type, cuDNN will crash.
@@ -361,7 +372,8 @@ def conv_forward(x,
list(w.shape),
oshape,
x.dtype,
conv_dtype)
conv_dtype,
groups)
if dims == 4:
return te.extern(
@@ -380,7 +392,8 @@ def conv_forward(x,
ins[0],
ins[1],
outs[0],
conv_dtype), name="y")
conv_dtype,
groups), name="y")
return te.extern(
oshape, [x, w],
@@ -401,7 +414,8 @@ def conv_forward(x,
ins[0],
ins[1],
outs[0],
conv_dtype), name="y")
conv_dtype,
groups), name="y")
def softmax(x, axis=-1):
"""Compute softmax using CuDNN
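Taken together, the wrapper changes above let a grouped convolution be expressed end to end. A minimal sketch, mirroring the updated test further down (shapes are hypothetical; requires a CUDA + cuDNN enabled build):

```python
import tvm
from tvm import te
from tvm.contrib import cudnn

groups = 2
# NCHW data, grouped OIHW kernel: [out_channel, in_channel // groups, kh, kw]
X = te.placeholder((3, 4, 32, 32), name="X", dtype="float32")
W = te.placeholder((16, 4 // groups, 3, 3), name="W", dtype="float32")
Y = cudnn.conv_forward(X, W,
                       pad=[1, 1], stride=[1, 1], dilation=[1, 1],
                       conv_mode=1,       # cross-correlation
                       tensor_format=0,   # CUDNN_TENSOR_NCHW
                       algo=-1,           # let cuDNN choose the algorithm
                       conv_dtype="float32",
                       groups=groups)
s = te.create_schedule(Y.op)
f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d")
```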
@@ -161,7 +161,9 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
padding[1] == padding[3]:
strategy.add_implementation(
wrap_compute_conv2d(topi.cuda.conv2d_cudnn, True),
wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
need_data_layout=True,
has_groups=True),
wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
name="conv2d_cudnn.cuda",
plevel=15)
@@ -181,6 +183,20 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
else:
raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
else: # group_conv2d
# add the cuDNN implementation, if applicable
cudnn_impl = False
if target.target_name == "cuda" and "cudnn" in target.libs:
if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
padding[1] == padding[3]:
strategy.add_implementation(
wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
need_data_layout=True,
has_groups=True),
wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
name="conv2d_cudnn.cuda",
plevel=15)
cudnn_impl = True
if layout == 'NCHW':
# TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
assert kernel_layout == "OIHW"
@@ -194,7 +210,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
name="group_conv2d_NCHWc_int8.cuda")
else:
elif not cudnn_impl:
raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
return strategy
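With this strategy hook in place, a grouped conv2d should be offloadable to cuDNN simply by selecting the library in the target string. A hedged sketch, with API spellings as of this TVM version and hypothetical shapes:

```python
import tvm
from tvm import relay

# in_channel = 8, groups = 2, so the kernel carries 8 // 2 = 4 input channels
data = relay.var("data", shape=(1, 8, 32, 32), dtype="float32")
weight = relay.var("weight", shape=(16, 4, 3, 3), dtype="float32")
conv = relay.nn.conv2d(data, weight, padding=(1, 1), groups=2)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))
lib = relay.build(mod, target="cuda -libs=cudnn")  # picks conv2d_cudnn.cuda
```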
@@ -35,6 +35,7 @@ void ConvolutionForward(
int format,
int algo,
int dims,
int groups,
const int pad[],
const int stride[],
const int dilation[],
@@ -62,8 +63,10 @@ void ConvolutionForward(
// Note: For a 2D tensor, using the ND setters causes a CUDNN_STATUS_NOT_SUPPORTED error
// in the following cudnnGetConvolutionForwardWorkspaceSize() when the data type is fp16, int
CUDNN_CALL(cudnnSetConvolutionGroupCount(entry_ptr->conv_entry.conv_desc, groups));
if (dims == 2) {
// Set Desc
CUDNN_CALL(cudnnSetConvolution2dDescriptor(entry_ptr->conv_entry.conv_desc,
pad[0],
pad[1],
@@ -183,6 +186,7 @@ void ConvolutionForward(
void OutputShape(
int format,
int dims,
int groups,
const int pad[],
const int stride[],
const int dilation[],
@@ -202,6 +206,7 @@ void OutputShape(
int full_dims = dims + 2;
// conv desc
CUDNN_CALL(cudnnSetConvolutionGroupCount(entry_ptr->conv_entry.conv_desc, groups));
CUDNN_CALL(cudnnSetConvolutionNdDescriptor(entry_ptr->conv_entry.conv_desc,
dims,
pad,
@@ -240,6 +245,7 @@ void OutputShape(
// Set Input
std::vector<int> tensor_stride(full_dims);
GetCudnnStride(full_dims, x_dim, tensor_stride.data());
CUDNN_CALL(cudnnSetTensorNdDescriptor(entry_ptr->conv_entry.input_desc,
data_type,
full_dims,
@@ -264,6 +270,7 @@ void OutputShape(
void FindAlgo(
int format,
int dims,
int groups,
const int pad[],
const int stride[],
const int dilation[],
@@ -284,6 +291,7 @@ void FindAlgo(
int full_dims = dims + 2;
// conv desc
CUDNN_CALL(cudnnSetConvolutionGroupCount(entry_ptr->conv_entry.conv_desc, groups));
CUDNN_CALL(cudnnSetConvolutionNdDescriptor(entry_ptr->conv_entry.conv_desc,
dims,
pad,
@@ -360,16 +368,18 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.forward")
int algo = args[2];
int pad_v[2], stride_v[2], dilation_v[2];
for (int i = 0; i < 2; i++) {
pad_v[i] = args[3 + i];
stride_v[i] = args[5 + i];
dilation_v[i] = args[7 + i];
}
DLTensor* x = args[9];
DLTensor* w = args[10];
DLTensor* y = args[11];
std::string conv_dtype = args[12];
int groups = args[13];
ConvolutionForward(mode, format, algo, 2, pad_v, stride_v, dilation_v, x, w, y, conv_dtype);
ConvolutionForward(mode, format, algo, 2, groups, pad_v, stride_v,
dilation_v, x, w, y, conv_dtype);
});
@@ -380,17 +390,18 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv3d.forward")
int algo = args[2];
int pad_v[3], stride_v[3], dilation_v[3];
for (int i = 0; i < 3; i++) {
pad_v[i] = args[3 + i];
stride_v[i] = args[6 + i];
dilation_v[i] = args[9 + i];
}
DLTensor *x = args[12];
DLTensor *w = args[13];
DLTensor *y = args[14];
std::string conv_dtype = args[15];
int groups = args[16];
ConvolutionForward(mode, format, algo, 3, pad_v, stride_v, dilation_v, x, w, y,
conv_dtype);
ConvolutionForward(mode, format, algo, 3, groups, pad_v, stride_v,
dilation_v, x, w, y, conv_dtype);
});
@@ -406,8 +417,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv.output_shape")
void* out_shape = args[7];
std::string data_dtype = args[8];
std::string conv_dtype = args[9];
int groups = args[10];
OutputShape(format, dims, pad, stride, dilation, x_dim,
OutputShape(format, dims, groups, pad, stride, dilation, x_dim,
w_dim, out_shape, data_dtype, conv_dtype);
});
@@ -424,8 +436,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv.find_algo")
int* y_dim = static_cast<int*>(static_cast<void*>(args[7]));
std::string data_dtype = args[8];
std::string conv_dtype = args[9];
int groups = args[10];
FindAlgo(format, dims, pad, stride, dilation, x_dim,
FindAlgo(format, dims, groups, pad, stride, dilation, x_dim,
w_dim, y_dim, data_dtype, conv_dtype, ret);
});
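For reference, the resulting PackedFunc calling convention for the 2D entry point, read directly off the conv2d.forward registration above (the conv_forward wrapper earlier in this diff assembles exactly this argument list):

```python
import tvm

func = tvm.get_global_func("tvm.contrib.cudnn.conv2d.forward")
# Argument layout after this change:
#   args[0..2]  -> mode, format, algo
#   args[3..4]  -> pad_h, pad_w
#   args[5..6]  -> stride_h, stride_w
#   args[7..8]  -> dilation_h, dilation_w
#   args[9..11] -> x, w, y (DLTensor handles)
#   args[12]    -> conv_dtype (str)
#   args[13]    -> groups (int), the new trailing argument
```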
@@ -78,7 +78,6 @@ struct ConvEntry {
runtime::DeviceAPI *cuda_api;
void *workspace{nullptr};
size_t workspace_size{0};
int group_count {0};
ConvEntry();
~ConvEntry();
void UpdateWorkspace(const size_t wsize);
@@ -17,11 +17,11 @@
import tvm
from tvm import te
from tvm.contrib import cudnn
from tvm.contrib.nvcc import have_fp16
import numpy as np
import topi.testing
def verify_conv2d(data_dtype, conv_dtype, tensor_format=0):
def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1):
in_channel = 4
out_channel = 16
filter_h = 3
@@ -34,7 +34,7 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0):
dilation_w = 1
batch = 3
height = 32
weight = 32
width = 32
if not tvm.runtime.enabled("cuda"):
print("skip because cuda is not enabled...")
@@ -42,12 +42,17 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0):
if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True):
print("skip because cudnn is not enabled...")
return
if data_dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
print("Skip because gpu does not have fp16 support")
return
# schedule
if tensor_format == 0:
xshape = [batch, in_channel, height, weight]
wshape = [out_channel, in_channel, filter_h, filter_w]
xshape = [batch, in_channel, height, width]
wshape = [out_channel, in_channel // groups, filter_h, filter_w]
else:
xshape = [batch, height, weight, in_channel]
wshape = [out_channel, filter_h, filter_w, in_channel]
xshape = [batch, height, width, in_channel]
wshape = [out_channel, filter_h, filter_w, in_channel // groups]
X = te.placeholder(xshape, name='X', dtype=data_dtype)
W = te.placeholder(wshape, name='W', dtype=data_dtype)
@@ -59,39 +64,41 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0):
conv_mode=1,
tensor_format=tensor_format,
conv_dtype=conv_dtype,
algo=-1)
algo=-1,
groups=groups)
yshape = [x.value for x in Y.shape]
s = te.create_schedule(Y.op)
def verify():
ctx = tvm.gpu(0)
f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d")
x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
y_np = np.zeros(yshape).astype(data_dtype)
x = tvm.nd.array(x_np, ctx)
w = tvm.nd.array(w_np, ctx)
y = tvm.nd.array(y_np, ctx)
if tensor_format == 0:
c_np = topi.testing.conv2d_nchw_python(x_np, w_np, 1, 1)
elif tensor_format == 1:
wt = w_np.transpose((1, 2, 3, 0)) #OHWI => HWIO
c_np = topi.testing.conv2d_nhwc_python(x_np, wt, 1, 1)
f(x, w, y)
tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=3e-5, rtol=1e-3)
verify()
# validation
ctx = tvm.gpu(0)
f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d")
x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
y_np = np.zeros(yshape).astype(data_dtype)
x = tvm.nd.array(x_np, ctx)
w = tvm.nd.array(w_np, ctx)
y = tvm.nd.array(y_np, ctx)
if tensor_format == 0:
c_np = topi.testing.conv2d_nchw_python(x_np, w_np, 1, 1, groups=groups)
elif tensor_format == 1:
wt = w_np.transpose((1, 2, 3, 0)) #OHWI => HWIO
c_np = topi.testing.conv2d_nhwc_python(x_np, wt, 1, 1, groups=groups)
f(x, w, y)
tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=1e-2, rtol=1e-2)
def test_conv2d():
verify_conv2d("float32", "float32", tensor_format=0)
verify_conv2d("float16", "float32", tensor_format=1)
# Does not pass the accuracy test; needs checking
#verify_conv2d("float16", "float16", tensor_format=0)
verify_conv2d("float16", "float16", tensor_format=0)
verify_conv2d("int8", "int32", tensor_format=1)
verify_conv2d("float32", "float32", tensor_format=0, groups=2)
verify_conv2d("float16", "float32", tensor_format=1, groups=2)
verify_conv2d("float16", "float16", tensor_format=0, groups=2)
verify_conv2d("int8", "int32", tensor_format=1, groups=2)
def verify_conv3d(data_dtype, conv_dtype, tensor_format=0):
def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1):
in_channel = 4
out_channel = 16
filter_d = 3
@@ -109,7 +116,7 @@ def verify_conv3d(data_dtype, conv_dtype, tensor_format=0):
batch = 3
depth = 32
height = 32
weight = 32
width = 32
if not tvm.runtime.enabled("cuda"):
print("skip because cuda is not enabled...")
@@ -118,8 +125,9 @@ def verify_conv3d(data_dtype, conv_dtype, tensor_format=0):
print("skip because cudnn is not enabled...")
return
xshape = [batch, in_channel, depth, height, weight]
wshape = [out_channel, in_channel, filter_d, filter_h, filter_w]
# schedule
xshape = [batch, in_channel, depth, height, width]
wshape = [out_channel, in_channel // groups, filter_d, filter_h, filter_w]
X = te.placeholder(xshape, name='X', dtype=data_dtype)
W = te.placeholder(wshape, name='W', dtype=data_dtype)
@@ -131,33 +139,31 @@ def verify_conv3d(data_dtype, conv_dtype, tensor_format=0):
conv_mode=1,
tensor_format=tensor_format,
algo=-1,
conv_dtype=conv_dtype)
conv_dtype=conv_dtype,
groups=groups)
yshape = [x.value for x in Y.shape]
s = te.create_schedule(Y.op)
def verify():
ctx = tvm.gpu(0)
f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv3d")
x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
y_np = np.zeros(yshape).astype(data_dtype)
x = tvm.nd.array(x_np, ctx)
w = tvm.nd.array(w_np, ctx)
y = tvm.nd.array(y_np, ctx)
if tensor_format == 0:
c_np = topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1)
else:
raise AssertionError("For now, conv3d tensor format only supports: 0 (NCHW)")
f(x, w, y)
tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=3e-5, rtol=1e-4)
verify()
# validation
ctx = tvm.gpu(0)
f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv3d")
x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
y_np = np.zeros(yshape).astype(data_dtype)
x = tvm.nd.array(x_np, ctx)
w = tvm.nd.array(w_np, ctx)
y = tvm.nd.array(y_np, ctx)
if tensor_format == 0:
c_np = topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1, groups)
else:
raise AssertionError("For now, conv3d tensor format only supports: 0 (NCHW)")
f(x, w, y)
tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=3e-5, rtol=1e-4)
def test_conv3d():
verify_conv3d("float32", "float32", tensor_format=0)
verify_conv3d("float32", "float32", tensor_format=0, groups=2)
def verify_softmax(shape, axis, dtype="float32"):
A = te.placeholder(shape, dtype=dtype, name='A')
@@ -66,8 +66,8 @@ def schedule_conv2d_nhwc(cfg, outs):
@autotvm.register_topi_compute("conv2d_cudnn.cuda")
def conv2d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCHW',
out_dtype='float32'):
def conv2d_cudnn(cfg, data, kernel, strides, padding, dilation, groups=1,
layout='NCHW', out_dtype='float32'):
"""Compute conv2d using CuDNN library"""
if layout == 'NCHW':
tensor_format = 0 # CUDNN_TENSOR_NCHW
@@ -89,7 +89,7 @@ def conv2d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCHW',
pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))
OH = (H + pt + pb - KH) // stride_h + 1
OW = (W + pl + pr - KW) // stride_w + 1
cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) * \
cfg.add_flop(groups * 2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) * \
((KW - 1) * dilation_w + 1))
if data.dtype == "int8" or kernel.dtype == "int8":
@@ -107,7 +107,8 @@ def conv2d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCHW',
conv_mode=1,
tensor_format=tensor_format,
algo=-1, # let CUDNN choose the best algo
conv_dtype=dtype)
conv_dtype=dtype,
groups=groups)
@autotvm.register_topi_schedule("conv2d_cudnn.cuda")
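Because groups now sits before layout in the compute signature, direct callers must pass it positionally, as the test update at the end of this diff does. A hedged sketch with hypothetical shapes:

```python
import tvm
import topi
from tvm import te

data = te.placeholder((1, 8, 32, 32), name="data", dtype="float32")
kernel = te.placeholder((16, 4, 3, 3), name="kernel", dtype="float32")  # groups = 2
with tvm.target.create("cuda -libs=cudnn"):
    C = topi.cuda.conv2d_cudnn(data, kernel,
                               (1, 1),      # strides
                               (1, 1),      # padding
                               (1, 1),      # dilation
                               2,           # groups
                               'NCHW',
                               'float32')
```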
@@ -21,7 +21,7 @@ import scipy.signal
from topi.nn.util import get_pad_tuple
def conv2d_nhwc_python(a_np, w_np, stride, padding):
def _conv2d_nhwc_python(a_np, w_np, stride, padding):
"""Convolution operator in NHWC layout.
Parameters
@@ -77,3 +77,38 @@ def conv2d_nhwc_python(a_np, w_np, stride, padding):
apad, np.rot90(np.rot90(wt[f, c])), mode='valid')
bt[n, f] += out[::stride_h, ::stride_w]
return bt.transpose((0, 2, 3, 1))
def conv2d_nhwc_python(a_np, w_np, stride, padding, groups=1):
"""Convolution operator in NHWC layout.
Parameters
----------
a_np : numpy.ndarray
4-D with shape [batch, in_height, in_width, in_channel]
w_np : numpy.ndarray
4-D with shape [filter_height, filter_width, in_channel // groups, num_filter]
stride : int or a list/tuple of two ints
Stride size, or [stride_height, stride_width]
padding : int or str or a list/tuple of 2 or 4 ints
Padding size, or ['VALID', 'SAME'], or
[pad_height, pad_width] for 2 ints, or
[pad_top, pad_left, pad_bottom, pad_right] for 4 ints
groups : int
Number of groups
Returns
-------
b_np : np.ndarray
4-D with shape [batch, out_height, out_width, out_channel]
"""
a_slices = np.array_split(a_np, groups, axis=3)
w_slices = np.array_split(w_np, groups, axis=3)
b_slices = [_conv2d_nhwc_python(a_slice, w_slice, stride, padding)
for a_slice, w_slice in zip(a_slices, w_slices)]
b_np = np.concatenate(b_slices, axis=3)
return b_np
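A short usage sketch of the grouped reference above: the data is split on its channel axis and the kernel on its filter axis, each pair is convolved independently, and the per-group outputs are concatenated (hypothetical shapes):

```python
import numpy as np
from topi.testing import conv2d_nhwc_python

a = np.random.uniform(size=(1, 8, 8, 4)).astype("float32")   # NHWC, C = 4
w = np.random.uniform(size=(3, 3, 2, 16)).astype("float32")  # HWIO, I = 4 // 2
b = conv2d_nhwc_python(a, w, stride=1, padding="SAME", groups=2)
assert b.shape == (1, 8, 8, 16)
```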
@@ -73,6 +73,7 @@ def conv3d_ncdhw_python(a_np, w_np, stride, padding, groups=1):
padding : int or str or a list/tuple of three ints
Padding size, or ['VALID', 'SAME'], or [pad_depth, pad_height, pad_width]
groups : int
Number of groups
@@ -75,7 +75,7 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p
with tvm.target.create(device):
if "cudnn" in device:
C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), "NCHW", dtype)
C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), 1, "NCHW", dtype)
else:
C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), dtype)
if add_bias: