Commit 77bdd5f7 by optima2005 Committed by masahi

[RUNTIME] Add cudnn conv3d (#4418)

* [RUNTIME] Add cudnn conv3d

* add output checking to test_cudnn.verify()

* fix tests failure

* revised as per review comments

* unify conv_output_shape, conv_find_algo and conv_forward

* convert python list to tvm.array in conv_forward

* revised as per comments

* 'pass as reference' for vector args

* add back conv2d/3d separated implementation

* remove unused included header

* remove extra std::vectors

* remove unused header
parent 119c5c9c
@@ -147,44 +147,42 @@ def _get_np_int32_array_handle(arr):
    ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
    return ctypes.cast(ptr, ctypes.c_void_p)


def _prepare_global_func_params(dims,
                                pad,
                                stride,
                                dilation,
                                x_shape=None,
                                w_shape=None):
    full_dims = dims + 2
    if x_shape:
        assert isinstance(x_shape, list)
        assert len(x_shape) == full_dims
    if w_shape:
        assert isinstance(w_shape, list)
        assert len(w_shape) == full_dims

    pad = np.full(dims, pad, dtype=np.int32) if isinstance(pad, int) \
        else np.array(pad, dtype=np.int32)
    stride = np.full(dims, stride, dtype=np.int32) if isinstance(stride, int) \
        else np.array(stride, dtype=np.int32)
    dilation = np.full(dims, dilation, dtype=np.int32) if isinstance(dilation, int) \
        else np.array(dilation, dtype=np.int32)

    xshape = np.array(x_shape, dtype=np.int32) if x_shape else None
    wshape = np.array(w_shape, dtype=np.int32) if x_shape else None

    return pad, stride, dilation, xshape, wshape


def conv_output_shape(tensor_format,
                      pad,
                      stride,
                      dilation,
                      x_shape,
                      w_shape,
                      data_dtype,
                      conv_dtype):
    """Get output shape of 2D or 3D convolution

    Parameters
    ----------
@@ -192,67 +190,56 @@ def conv2d_output_shape(tensor_format,
        0: CUDNN_TENSOR_NCHW
        1: CUDNN_TENSOR_NHWC
        2: CUDNN_TENSOR_NCHW_VECT_C
    pad: int or list
        padding
    stride: int or list
        stride
    dilation: int or list
        dilation
    x_shape: list
        input shape
    w_shape: list
        weight shape
    data_dtype: str
        data type
    conv_dtype: str
        convolution type

    Returns
    -------
    oshape: list
        output shape
    """
    dims = len(x_shape)
    assert dims in (4, 5)

    pad, stride, dilation, xshape, wshape = \
        _prepare_global_func_params(dims - 2, pad, stride, dilation, x_shape, w_shape)
    oshape = np.zeros((dims), dtype=np.int32)
    func = _get_global_func("tvm.contrib.cudnn.conv.output_shape")
    func(tensor_format,
         dims - 2,
         _get_np_int32_array_handle(pad),
         _get_np_int32_array_handle(stride),
         _get_np_int32_array_handle(dilation),
         _get_np_int32_array_handle(xshape),
         _get_np_int32_array_handle(wshape),
         _get_np_int32_array_handle(oshape),
         data_dtype,
         conv_dtype)
    return list(oshape)


def conv_find_algo(tensor_format,
                   pad,
                   stride,
                   dilation,
                   x_shape,
                   w_shape,
                   y_shape,
                   data_dtype,
                   conv_dtype):
    """Choose the best algo for the given input.

    Parameters
    ----------
@@ -261,18 +248,12 @@ def conv2d_find_algo(tensor_format,
        0: CUDNN_TENSOR_NCHW
        1: CUDNN_TENSOR_NHWC
        2: CUDNN_TENSOR_NCHW_VECT_C
    pad: int or list
        padding
    stride: int or list
        stride
    dilation: int or list
        dilation
    x_shape: list
        input shape
    w_shape: list
@@ -289,43 +270,35 @@ def conv2d_find_algo(tensor_format,
    algo: int
        algo chosen by CUDNN
    """
    dims = len(x_shape)
    assert dims in (4, 5)

    pad, stride, dilation, xshape, wshape = \
        _prepare_global_func_params(dims - 2, pad, stride, dilation, x_shape, w_shape)
    yshape = np.array(y_shape, dtype=np.int32)
    func = _get_global_func("tvm.contrib.cudnn.conv.find_algo")
    return func(tensor_format,
                dims - 2,
                _get_np_int32_array_handle(pad),
                _get_np_int32_array_handle(stride),
                _get_np_int32_array_handle(dilation),
                _get_np_int32_array_handle(xshape),
                _get_np_int32_array_handle(wshape),
                _get_np_int32_array_handle(yshape),
                data_dtype,
                conv_dtype)


def conv_forward(x,
                 w,
                 pad,
                 stride,
                 dilation,
                 conv_mode,
                 tensor_format,
                 algo,
                 conv_dtype):
    """Create an extern op that computes 2D or 3D convolution with CuDNN

    Parameters
    ----------
@@ -333,18 +306,12 @@ def conv2d_forward(x,
        input feature map
    w: Tensor
        convolution weight
    pad: int or list
        padding
    stride: int or list
        stride
    dilation: int or list
        dilation
    conv_mode: int
        0: CUDNN_CONVOLUTION
        1: CUDNN_CROSS_CORRELATION
@@ -363,52 +330,73 @@ def conv2d_forward(x,
    y: Tensor
        The result tensor
    """
    dims = len(x.shape)
    assert dims in (4, 5)

    conv_dtype = x.dtype if conv_dtype is None else conv_dtype
    pad, stride, dilation, _, _ = \
        _prepare_global_func_params(dims - 2, pad, stride, dilation)

    oshape = conv_output_shape(tensor_format,
                               pad,
                               stride,
                               dilation,
                               list(x.shape),
                               list(w.shape),
                               x.dtype,
                               conv_dtype)
    if algo == -1:
        # For now if we try to call `cudnnFindConvolutionForwardAlgorithm` when
        # using INT8 data type, CuDNN will crash down.
        # On the other hand, CuDNN only support IMPLICIT_PRECOMP_GEMM at NHWC format
        if tensor_format == 1 and conv_dtype == "int32":
            algo = 1
        else:
            algo = conv_find_algo(tensor_format,
                                  pad,
                                  stride,
                                  dilation,
                                  list(x.shape),
                                  list(w.shape),
                                  oshape,
                                  x.dtype,
                                  conv_dtype)

    if dims == 4:
        return _api.extern(
            oshape, [x, w],
            lambda ins, outs: _intrin.call_packed(
                "tvm.contrib.cudnn.conv2d.forward",
                conv_mode,
                tensor_format,
                algo,
                pad[0],
                pad[1],
                stride[0],
                stride[1],
                dilation[0],
                dilation[1],
                ins[0],
                ins[1],
                outs[0],
                conv_dtype), name="y")

    return _api.extern(
        oshape, [x, w],
        lambda ins, outs: _intrin.call_packed(
            "tvm.contrib.cudnn.conv3d.forward",
            conv_mode,
            tensor_format,
            algo,
            pad[0],
            pad[1],
            pad[2],
            stride[0],
            stride[1],
            stride[2],
            dilation[0],
            dilation[1],
            dilation[2],
            ins[0],
            ins[1],
            outs[0],
......
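As a quick illustration of the unified wrapper above, a minimal sketch of a 3D call (it mirrors the new test further below and assumes a CUDA build of TVM with cuDNN enabled; shapes are arbitrary):

import tvm
from tvm.contrib import cudnn

# NCDHW input and OIDHW filter; pad/stride/dilation are given per spatial dim (d, h, w)
X = tvm.placeholder((3, 4, 32, 32, 32), name='X', dtype="float32")
W = tvm.placeholder((16, 4, 3, 3, 3), name='W', dtype="float32")
Y = cudnn.conv_forward(X, W,
                       pad=[1, 1, 1],
                       stride=[1, 1, 1],
                       dilation=[1, 1, 1],
                       conv_mode=1,         # CUDNN_CROSS_CORRELATION
                       tensor_format=0,     # CUDNN_TENSOR_NCHW
                       algo=-1,             # let conv_find_algo pick the algorithm
                       conv_dtype="float32")
# len(x.shape) == 5, so this lowers to the "tvm.contrib.cudnn.conv3d.forward" packed func
s = tvm.create_schedule(Y.op)
f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv3d")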
@@ -30,23 +30,18 @@ namespace contrib {

using namespace runtime;

void ConvolutionForward(
  int mode,
  int format,
  int algo,
  int dims,
  const int pad[],
  const int stride[],
  const int dilation[],
  DLTensor* x,
  DLTensor* w,
  DLTensor* y,
  const std::string& conv_dtype) {
  CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal();
  // Set Mode
  entry_ptr->conv_entry.mode = static_cast<cudnnConvolutionMode_t>(mode);
@@ -59,40 +54,102 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.forward")
  // Set Data Type
  entry_ptr->conv_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(conv_dtype));
  cudnnDataType_t data_type = CuDNNDataType::DLTypeToCuDNNType(x->dtype);
  // Dims includes N and C
  int full_dims = dims + 2;

  std::vector<int> dim(full_dims);
  std::vector<int> tensor_stride(full_dims);

  // Note: For 2D tensor, using ND setters causes CUDNN_STATUS_NOT_SUPPORTED error
  // in following cudnnGetConvolutionForwardWorkspaceSize() when data type is fp16, int
  if (dims == 2) {
    // Set Desc
    CUDNN_CALL(cudnnSetConvolution2dDescriptor(entry_ptr->conv_entry.conv_desc,
                                               pad[0],
                                               pad[1],
                                               stride[0],
                                               stride[1],
                                               dilation[0],
                                               dilation[1],
                                               entry_ptr->conv_entry.mode,
                                               entry_ptr->conv_entry.data_type));
    int ni, ci, hi, wi;
    if (entry_ptr->conv_entry.tensor_format == CUDNN_TENSOR_NHWC) {
      ni = 0;
      ci = 3;
      hi = 1;
      wi = 2;
    } else {
      ni = 0;
      ci = 1;
      hi = 2;
      wi = 3;
    }

    // Set Filter
    CUDNN_CALL(cudnnSetFilter4dDescriptor(entry_ptr->conv_entry.filter_desc,
                                          data_type,
                                          entry_ptr->conv_entry.tensor_format,
                                          static_cast<int>(w->shape[ni]),
                                          static_cast<int>(w->shape[ci]),
                                          static_cast<int>(w->shape[hi]),
                                          static_cast<int>(w->shape[wi])));
    // Set Input
    CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->conv_entry.input_desc,
                                          entry_ptr->conv_entry.tensor_format,
                                          data_type,
                                          static_cast<int>(x->shape[ni]),
                                          static_cast<int>(x->shape[ci]),
                                          static_cast<int>(x->shape[hi]),
                                          static_cast<int>(x->shape[wi])));
    // Set Output
    CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->conv_entry.output_desc,
                                          entry_ptr->conv_entry.tensor_format,
                                          data_type,
                                          static_cast<int>(y->shape[ni]),
                                          static_cast<int>(y->shape[ci]),
                                          static_cast<int>(y->shape[hi]),
                                          static_cast<int>(y->shape[wi])));
  } else {
    CUDNN_CALL(cudnnSetConvolutionNdDescriptor(entry_ptr->conv_entry.conv_desc,
                                               dims,
                                               pad,
                                               stride,
                                               dilation,
                                               entry_ptr->conv_entry.mode,
                                               entry_ptr->conv_entry.data_type));
    // Set Filter
    for (int i = 0; i < full_dims; i++) {
      dim[i] = static_cast<int>(w->shape[i]);
    }
    CUDNN_CALL(cudnnSetFilterNdDescriptor(entry_ptr->conv_entry.filter_desc,
                                          data_type,
                                          entry_ptr->conv_entry.tensor_format,
                                          full_dims,
                                          dim.data()));
    // Set Input
    for (int i = 0; i < full_dims; i++) {
      dim[i] = static_cast<int>(x->shape[i]);
    }
    GetCudnnStride(full_dims, dim.data(), tensor_stride.data());
    CUDNN_CALL(cudnnSetTensorNdDescriptor(entry_ptr->conv_entry.input_desc,
                                          data_type,
                                          full_dims,
                                          dim.data(),
                                          tensor_stride.data()));
    // Set Output
    for (int i = 0; i < full_dims; i++) {
      dim[i] = static_cast<int>(y->shape[i]);
    }
    GetCudnnStride(full_dims, dim.data(), tensor_stride.data());
    CUDNN_CALL(cudnnSetTensorNdDescriptor(entry_ptr->conv_entry.output_desc,
                                          data_type,
                                          full_dims,
                                          dim.data(),
                                          tensor_stride.data()));
  }

  if (cudnnGetVersion() > 7000) {
    CUDNN_CALL(cudnnSetConvolutionMathType(entry_ptr->conv_entry.conv_desc, CUDNN_TENSOR_OP_MATH))
  }
@@ -120,137 +177,143 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.forward")
                                     CuDNNDataType::GetConst<0>(entry_ptr->conv_entry.data_type),
                                     entry_ptr->conv_entry.output_desc,
                                     y->data));
}

TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.output_shape") void OutputShape(
.set_body([](TVMArgs args, TVMRetValue *ret) { int format,
int dims,
const int pad[],
const int stride[],
const int dilation[],
const int x_dim[],
const int w_dim[],
void *out_shape,
const std::string& data_dtype,
const std::string& conv_dtype) {
CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(); CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal();
int format = args[0];
int pad_h = args[1];
int pad_w = args[2];
int stride_h = args[3];
int stride_w = args[4];
int dilation_h = args[5];
int dilation_w = args[6];
int x_dim0 = args[7];
int x_dim1 = args[8];
int x_dim2 = args[9];
int x_dim3 = args[10];
int w_dim0 = args[11];
int w_dim1 = args[12];
int w_dim2 = args[13];
int w_dim3 = args[14];
void *out_shape = args[15];
std::string data_dtype = args[16];
std::string conv_dtype = args[17];
// Set Data Type // Set Data Type
entry_ptr->conv_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(conv_dtype)); entry_ptr->conv_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(conv_dtype));
cudnnDataType_t data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(data_dtype)); cudnnDataType_t data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(data_dtype));
// Set Format // Set Format
entry_ptr->conv_entry.tensor_format = static_cast<cudnnTensorFormat_t>(format); entry_ptr->conv_entry.tensor_format = static_cast<cudnnTensorFormat_t>(format);
// Dims includes N and C
int full_dims = dims + 2;
// conv desc // conv desc
CUDNN_CALL(cudnnSetConvolution2dDescriptor(entry_ptr->conv_entry.conv_desc, CUDNN_CALL(cudnnSetConvolutionNdDescriptor(entry_ptr->conv_entry.conv_desc,
pad_h, dims,
pad_w, pad,
stride_h, stride,
stride_w, dilation,
dilation_h,
dilation_w,
CUDNN_CROSS_CORRELATION, CUDNN_CROSS_CORRELATION,
entry_ptr->conv_entry.data_type)); entry_ptr->conv_entry.data_type));
// input desc
CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->conv_entry.input_desc,
entry_ptr->conv_entry.tensor_format,
data_type,
x_dim0,
x_dim1,
x_dim2,
x_dim3));
// filter desc
CUDNN_CALL(cudnnSetFilter4dDescriptor(entry_ptr->conv_entry.filter_desc,
data_type,
entry_ptr->conv_entry.tensor_format,
w_dim0,
w_dim1,
w_dim2,
w_dim3));
CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(entry_ptr->conv_entry.conv_desc,
entry_ptr->conv_entry.input_desc,
entry_ptr->conv_entry.filter_desc,
static_cast<int*>(out_shape),
static_cast<int*>(out_shape) + 1,
static_cast<int*>(out_shape) + 2,
static_cast<int*>(out_shape) + 3));
});
if (dims == 2 && entry_ptr->conv_entry.tensor_format == CUDNN_TENSOR_NHWC) {
// Set Input
CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->conv_entry.input_desc,
entry_ptr->conv_entry.tensor_format,
data_type,
x_dim[0],
x_dim[3],
x_dim[1],
x_dim[2]));
TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.find_algo") // filter desc
.set_body([](TVMArgs args, TVMRetValue *ret) { CUDNN_CALL(cudnnSetFilter4dDescriptor(entry_ptr->conv_entry.filter_desc,
data_type,
entry_ptr->conv_entry.tensor_format,
w_dim[0],
w_dim[3],
w_dim[1],
w_dim[2]));
CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(entry_ptr->conv_entry.conv_desc,
entry_ptr->conv_entry.input_desc,
entry_ptr->conv_entry.filter_desc,
static_cast<int*>(out_shape),
static_cast<int*>(out_shape) + 3,
static_cast<int*>(out_shape) + 1,
static_cast<int*>(out_shape) + 2));
} else {
// Set Input
std::vector<int> tensor_stride(full_dims);
GetCudnnStride(full_dims, x_dim, tensor_stride.data());
CUDNN_CALL(cudnnSetTensorNdDescriptor(entry_ptr->conv_entry.input_desc,
data_type,
full_dims,
x_dim,
tensor_stride.data()));
// filter desc
CUDNN_CALL(cudnnSetFilterNdDescriptor(entry_ptr->conv_entry.filter_desc,
data_type,
entry_ptr->conv_entry.tensor_format,
full_dims,
w_dim));
CUDNN_CALL(cudnnGetConvolutionNdForwardOutputDim(entry_ptr->conv_entry.conv_desc,
entry_ptr->conv_entry.input_desc,
entry_ptr->conv_entry.filter_desc,
full_dims,
static_cast<int*>(out_shape)));
}
}

void FindAlgo(
  int format,
  int dims,
  const int pad[],
  const int stride[],
  const int dilation[],
  const int x_dim[],
  const int w_dim[],
  const int y_dim[],
  const std::string& data_dtype,
  const std::string& conv_dtype,
  TVMRetValue *ret) {
  CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal();

  // Set Data Type
  entry_ptr->conv_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(conv_dtype));
  cudnnDataType_t data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(data_dtype));
  // Set Format
  entry_ptr->conv_entry.tensor_format = static_cast<cudnnTensorFormat_t>(format);
  // Dims includes N and C
  int full_dims = dims + 2;

  // conv desc
  CUDNN_CALL(cudnnSetConvolutionNdDescriptor(entry_ptr->conv_entry.conv_desc,
                                             dims,
                                             pad,
                                             stride,
                                             dilation,
                                             CUDNN_CROSS_CORRELATION,
                                             entry_ptr->conv_entry.data_type));

  std::vector<int> tensor_stride(full_dims);
  // input desc
  GetCudnnStride(full_dims, x_dim, tensor_stride.data());
  CUDNN_CALL(cudnnSetTensorNdDescriptor(entry_ptr->conv_entry.input_desc,
                                        data_type,
                                        full_dims,
                                        x_dim,
                                        tensor_stride.data()));
  // filter desc
  CUDNN_CALL(cudnnSetFilterNdDescriptor(entry_ptr->conv_entry.filter_desc,
                                        data_type,
                                        entry_ptr->conv_entry.tensor_format,
                                        full_dims,
                                        w_dim));
  // output desc
  GetCudnnStride(full_dims, y_dim, tensor_stride.data());
  CUDNN_CALL(cudnnSetTensorNdDescriptor(entry_ptr->conv_entry.output_desc,
                                        data_type,
                                        full_dims,
                                        y_dim,
                                        tensor_stride.data()));
  if (cudnnGetVersion() > 7000) {
    CUDNN_CALL(cudnnSetConvolutionMathType(entry_ptr->conv_entry.conv_desc, CUDNN_TENSOR_OP_MATH))
  }
@@ -287,6 +350,83 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.find_algo")
  }
  ret[0] = best_algo;
}


TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.forward")
.set_body([](TVMArgs args, TVMRetValue *ret) {
  int mode = args[0];
  int format = args[1];
  int algo = args[2];
  int pad_v[2], stride_v[2], dilation_v[2];
  for (int i = 0; i < 2; i++) {
    pad_v[i] = args[3 + i];
    stride_v[i] = args[5 + i];
    dilation_v[i] = args[7 + i];
  }
  DLTensor* x = args[9];
  DLTensor* w = args[10];
  DLTensor* y = args[11];
  std::string conv_dtype = args[12];

  ConvolutionForward(mode, format, algo, 2, pad_v, stride_v, dilation_v, x, w, y, conv_dtype);
});


TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv3d.forward")
.set_body([](TVMArgs args, TVMRetValue *ret) {
  int mode = args[0];
  int format = args[1];
  int algo = args[2];
  int pad_v[3], stride_v[3], dilation_v[3];
  for (int i = 0; i < 3; i++) {
    pad_v[i] = args[3 + i];
    stride_v[i] = args[6 + i];
    dilation_v[i] = args[9 + i];
  }
  DLTensor *x = args[12];
  DLTensor *w = args[13];
  DLTensor *y = args[14];
  std::string conv_dtype = args[15];

  ConvolutionForward(mode, format, algo, 3, pad_v, stride_v, dilation_v, x, w, y,
                     conv_dtype);
});


TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv.output_shape")
.set_body([](TVMArgs args, TVMRetValue *ret) {
  int format = args[0];
  int dims = args[1];
  int* pad = static_cast<int*>(static_cast<void*>(args[2]));
  int* stride = static_cast<int*>(static_cast<void*>(args[3]));
  int* dilation = static_cast<int*>(static_cast<void*>(args[4]));
  int* x_dim = static_cast<int*>(static_cast<void*>(args[5]));
  int* w_dim = static_cast<int*>(static_cast<void*>(args[6]));
  void* out_shape = args[7];
  std::string data_dtype = args[8];
  std::string conv_dtype = args[9];

  OutputShape(format, dims, pad, stride, dilation, x_dim,
              w_dim, out_shape, data_dtype, conv_dtype);
});


TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv.find_algo")
.set_body([](TVMArgs args, TVMRetValue *ret) {
  int format = args[0];
  int dims = args[1];
  int* pad = static_cast<int*>(static_cast<void*>(args[2]));
  int* stride = static_cast<int*>(static_cast<void*>(args[3]));
  int* dilation = static_cast<int*>(static_cast<void*>(args[4]));
  int* x_dim = static_cast<int*>(static_cast<void*>(args[5]));
  int* w_dim = static_cast<int*>(static_cast<void*>(args[6]));
  int* y_dim = static_cast<int*>(static_cast<void*>(args[7]));
  std::string data_dtype = args[8];
  std::string conv_dtype = args[9];

  FindAlgo(format, dims, pad, stride, dilation, x_dim,
           w_dim, y_dim, data_dtype, conv_dtype, ret);
});

}  // namespace contrib
......
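To show how the new N-D entry points replace the fixed-arity conv2d ones, a small sketch that queries the unified output-shape routine through the Python wrapper (shapes are hypothetical; it requires a cuDNN-enabled build so the packed function is registered):

from tvm.contrib import cudnn

# 3D case: the three spatial pads/strides/dilations travel to
# "tvm.contrib.cudnn.conv.output_shape" as int32 array handles rather than as scalars.
oshape = cudnn.conv_output_shape(tensor_format=0,
                                 pad=[1, 1, 1],
                                 stride=[1, 1, 1],
                                 dilation=[1, 1, 1],
                                 x_shape=[3, 4, 32, 32, 32],
                                 w_shape=[16, 4, 3, 3, 3],
                                 data_dtype="float32",
                                 conv_dtype="float32")
# With 3x3x3 filters, padding 1 and stride 1 this yields [3, 16, 32, 32, 32].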
@@ -54,6 +54,15 @@ inline void GetStride(int nbdim, const int *dims, int *strides) {
  }
}

inline void GetCudnnStride(int nbdim,
                           const int* dims,
                           int* strides) {
  int mul = 1;
  for (int i = nbdim - 1; i >= 0; --i) {
    strides[i] = mul;
    mul *= dims[i];
  }
}

struct ConvEntry {
  cudnnConvolutionDescriptor_t conv_desc;
......
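The GetCudnnStride helper above fills the packed (fully contiguous) strides that cudnnSetTensorNdDescriptor expects. A small NumPy-style sketch of the same computation, using a hypothetical NCDHW shape:

def packed_strides(dims):
    # innermost dimension gets stride 1; each outer dimension gets the product of all inner extents
    strides = [0] * len(dims)
    mul = 1
    for i in range(len(dims) - 1, -1, -1):
        strides[i] = mul
        mul *= dims[i]
    return strides

assert packed_strides([3, 4, 32, 32, 32]) == [4 * 32 * 32 * 32, 32 * 32 * 32, 32 * 32, 32, 1]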
@@ -17,11 +17,12 @@
import tvm
from tvm.contrib import cudnn
import numpy as np
import topi.testing


def verify_conv2d(data_dtype, conv_dtype, tensor_format=0):
    in_channel = 4
    out_channel = 16
    filter_h = 3
    filter_w = 3
    pad_h = 1
@@ -37,52 +38,125 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0):
    if not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled...")
        return
    if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True):
        print("skip because cudnn is not enabled...")
        return

    if tensor_format == 0:
        xshape = [batch, in_channel, height, weight]
        wshape = [out_channel, in_channel, filter_h, filter_w]
    else:
        xshape = [batch, height, weight, in_channel]
        wshape = [out_channel, filter_h, filter_w, in_channel]

    X = tvm.placeholder(xshape, name='X', dtype=data_dtype)
    W = tvm.placeholder(wshape, name='W', dtype=data_dtype)
    Y = cudnn.conv_forward(X,
                           W,
                           [pad_h, pad_w],
                           [stride_h, stride_w],
                           [dilation_h, dilation_w],
                           conv_mode=1,
                           tensor_format=tensor_format,
                           conv_dtype=conv_dtype,
                           algo=-1)
    yshape = [x.value for x in Y.shape]
    s = tvm.create_schedule(Y.op)

    def verify():
        ctx = tvm.gpu(0)
        f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d")
        x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
        w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
        y_np = np.zeros(yshape).astype(data_dtype)
        x = tvm.nd.array(x_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        y = tvm.nd.array(y_np, ctx)
        if tensor_format == 0:
            c_np = topi.testing.conv2d_nchw_python(x_np, w_np, 1, 1)
        elif tensor_format == 1:
            wt = w_np.transpose((1, 2, 3, 0))  # OHWI => HWIO
            c_np = topi.testing.conv2d_nhwc_python(x_np, wt, 1, 1)

        f(x, w, y)
        tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=1e-5, rtol=1e-3)

    verify()


def test_conv2d():
    verify_conv2d("float32", "float32", tensor_format=0)
    verify_conv2d("float16", "float32", tensor_format=1)
    # Does not pass the accuracy test yet, needs checking
    #verify_conv2d("float16", "float16", tensor_format=0)
    verify_conv2d("int8", "int32", tensor_format=1)


def verify_conv3d(data_dtype, conv_dtype, tensor_format=0):
    in_channel = 4
    out_channel = 16
    filter_d = 3
    filter_h = 3
    filter_w = 3
    pad_d = 1
    pad_h = 1
    pad_w = 1
    stride_d = 1
    stride_h = 1
    stride_w = 1
    dilation_d = 1
    dilation_h = 1
    dilation_w = 1
    batch = 3
    depth = 32
    height = 32
    weight = 32

    if not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled...")
        return
    if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True):
        print("skip because cudnn is not enabled...")
        return

    xshape = [batch, in_channel, depth, height, weight]
    wshape = [out_channel, in_channel, filter_d, filter_h, filter_w]

    X = tvm.placeholder(xshape, name='X', dtype=data_dtype)
    W = tvm.placeholder(wshape, name='W', dtype=data_dtype)
    Y = cudnn.conv_forward(X,
                           W,
                           [pad_d, pad_h, pad_w],
                           [stride_d, stride_h, stride_w],
                           [dilation_d, dilation_h, dilation_w],
                           conv_mode=1,
                           tensor_format=tensor_format,
                           algo=-1,
                           conv_dtype=conv_dtype)
    yshape = [x.value for x in Y.shape]
    s = tvm.create_schedule(Y.op)

    def verify():
        ctx = tvm.gpu(0)
        f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv3d")
        x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
        w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
        y_np = np.zeros(yshape).astype(data_dtype)
        x = tvm.nd.array(x_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        y = tvm.nd.array(y_np, ctx)
        if tensor_format == 0:
            c_np = topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1)
        else:
            raise AssertionError("For now, conv3d tensor format only support: 0(NCHW)")

        f(x, w, y)
        tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=1e-5, rtol=1e-4)

    verify()


def test_conv3d():
    verify_conv3d("float32", "float32", tensor_format=0)

if __name__ == "__main__":
    test_conv2d()
    test_conv3d()
@@ -96,18 +96,15 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCHW', ou
        else:
            dtype = data.dtype

        return cudnn.conv_forward(data,
                                  kernel,
                                  [pad_h, pad_w],
                                  [stride_h, stride_w],
                                  [dilation_h, dilation_w],
                                  conv_mode=1,
                                  tensor_format=tensor_format,
                                  algo=-1,  # let CUDNN choose the best algo
                                  conv_dtype=dtype)

    if cfg.template_key == 'winograd':
        return winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
......
@@ -24,6 +24,7 @@ from __future__ import absolute_import as _abs
from .conv2d_hwcn_python import conv2d_hwcn_python
from .conv2d_nchw_python import conv2d_nchw_python
from .conv2d_nhwc_python import conv2d_nhwc_python
from .conv3d_ncdhw_python import conv3d_ncdhw_python
from .conv2d_transpose_python import conv2d_transpose_nchw_python, conv2d_transpose_nhwc_python
from .deformable_conv2d_nchw_python import deformable_conv2d_nchw_python
from .depthwise_conv2d_python import depthwise_conv2d_python_nchw, depthwise_conv2d_python_nhwc
......
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals, too-many-branches
"""Convolution 3D in python"""
import numpy as np
import scipy.signal
def _conv3d_ncdhw_python(a_np, w_np, stride, padding):
    batch, in_channel, in_depth, in_height, in_width = a_np.shape
    num_filter, _, kernel_d, kernel_h, kernel_w = w_np.shape
    if isinstance(stride, int):
        stride_d = stride_h = stride_w = stride
    else:
        stride_d, stride_h, stride_w = stride
    if isinstance(padding, int):
        pad_d = pad_h = pad_w = padding * 2
    elif isinstance(padding, (list, tuple)):
        pad_d, pad_h, pad_w = padding[0] * 2, padding[1] * 2, padding[2] * 2
    else:
        pad_d = 0 if padding == 'VALID' else kernel_d - 1
        pad_h = 0 if padding == 'VALID' else kernel_h - 1
        pad_w = 0 if padding == 'VALID' else kernel_w - 1
    pad_front = int(np.ceil(float(pad_d) / 2))
    pad_back = pad_d - pad_front
    pad_top = int(np.ceil(float(pad_h) / 2))
    pad_bottom = pad_h - pad_top
    pad_left = int(np.ceil(float(pad_w) / 2))
    pad_right = pad_w - pad_left
    # compute the output shape
    out_channel = num_filter
    out_depth = (in_depth - kernel_d + pad_d) // stride_d + 1
    out_height = (in_height - kernel_h + pad_h) // stride_h + 1
    out_width = (in_width - kernel_w + pad_w) // stride_w + 1
    b_np = np.zeros((batch, out_channel, out_depth, out_height, out_width))
    # computation
    for n in range(batch):
        for f in range(out_channel):
            for c in range(in_channel):
                if pad_d > 0 or pad_h > 0 or pad_w > 0:
                    apad = np.zeros((in_depth + pad_d, in_height + pad_h, in_width + pad_w))
                    if pad_d == 0 and pad_h == 0:
                        apad[:, :, pad_left:-pad_right] = a_np[n, c]
                    elif pad_d == 0 and pad_w == 0:
                        apad[:, pad_top:-pad_bottom, :] = a_np[n, c]
                    elif pad_d == 0 and pad_h != 0 and pad_w != 0:
                        apad[:, pad_top:-pad_bottom, pad_left:-pad_right] = a_np[n, c]
                    elif pad_d != 0 and pad_h == 0:
                        apad[pad_front:-pad_back, :, pad_left:-pad_right] = a_np[n, c]
                    elif pad_d != 0 and pad_w == 0:
                        apad[pad_front:-pad_back, pad_top:-pad_bottom, :] = a_np[n, c]
                    elif pad_d != 0 and pad_h != 0 and pad_w != 0:
                        apad[pad_front:-pad_back, pad_top:-pad_bottom, pad_left:-pad_right] = a_np[n, c]
                else:
                    apad = a_np[n, c]
                out = scipy.signal.convolve(
                    apad, np.flip(w_np[f, c]), mode='valid')
                b_np[n, f] += out[::stride_d, ::stride_h, ::stride_w]
    return b_np


def conv3d_ncdhw_python(a_np, w_np, stride, padding, groups=1):
    """Convolution operator in NCDHW layout.

    Parameters
    ----------
    a_np : numpy.ndarray
        5-D with shape [batch, in_channel, in_depth, in_height, in_width]

    w_np : numpy.ndarray
        5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width]

    stride : int or a list/tuple of three ints
        Stride size, or [stride_depth, stride_height, stride_width]

    padding : int or str or a list/tuple of three ints
        Padding size, or ['VALID', 'SAME'], or [pad_depth, pad_height, pad_width]

    groups : int
        Number of groups

    Returns
    -------
    b_np : np.ndarray
        5-D with shape [batch, out_channel, out_depth, out_height, out_width]
    """
    a_slices = np.array_split(a_np, groups, axis=1)
    w_slices = np.array_split(w_np, groups, axis=0)
    b_slices = [_conv3d_ncdhw_python(a_slice, w_slice, stride, padding)
                for a_slice, w_slice in zip(a_slices, w_slices)]
    b_np = np.concatenate(b_slices, axis=1)
    return b_np
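A quick sanity sketch of the reference implementation above, with arbitrary shapes (this snippet is illustrative and not part of the test suite):

import numpy as np
from topi.testing import conv3d_ncdhw_python

a = np.random.uniform(size=(3, 4, 8, 8, 8)).astype("float32")
w = np.random.uniform(size=(6, 4, 3, 3, 3)).astype("float32")
# padding 1 on each spatial axis keeps the 8x8x8 extent with 3x3x3 filters
b = conv3d_ncdhw_python(a, w, stride=1, padding=(1, 1, 1))
assert b.shape == (3, 6, 8, 8, 8)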