[Perf] Enhance cudnn and cublas backend and enable TensorCore (#4353)

* add half and mix precision support to cublas backend * add TensorCore support in CuDNN * enhance CuDNN support * address comments and fix lint * fix * add fp16 test

[Perf] Enhance cudnn and cublas backend and enable TensorCore (#4353)
* add half and mix precision support to cublas backend * add TensorCore support in CuDNN * enhance CuDNN support * address comments and fix lint * fix * add fp16 test
dabde40f · Siyuan Feng · Leyuan Wang · fbb2a356 · dabde40f · dabde40f
Commit dabde40f authored Nov 24, 2019 by Siyuan Feng Committed by Leyuan Wang Nov 24, 2019
11 changed files
--- a/include/tvm/runtime/util.h
+++ b/include/tvm/runtime/util.h
@@ -39,6 +39,14 @@ namespace runtime {
 inline bool TypeMatch(TVMType t, int code, int bits, int lanes = 1) {
  return t.code == code && t.bits == bits && t.lanes == lanes;
 }
+/*!
+ * \brief Check whether two types are equal .
+ * \param lhs The left operand.
+ * \param rhs The right operand.
+ */
+inline bool TypeEqual(TVMType lhs, TVMType rhs) {
+  return lhs.code == rhs.code && lhs.bits == rhs.bits && lhs.lanes == rhs.lanes;
+}
 }  // namespace runtime
 }  // namespace tvm
 // Forward declare the intrinsic id we need

--- a/python/tvm/contrib/cublas.py
+++ b/python/tvm/contrib/cublas.py
@@ -20,7 +20,7 @@ from __future__ import absolute_import as _abs
 from .. import api as _api
 from .. import intrin as _intrin

-def matmul(lhs, rhs, transa=False, transb=False):
+def matmul(lhs, rhs, transa=False, transb=False, dtype=None):
    """Create an extern op that compute matrix mult of A and rhs with cuBLAS

    Parameters
@@ -41,13 +41,14 @@ def matmul(lhs, rhs, transa=False, transb=False):
    """
    n = lhs.shape[1] if transa else lhs.shape[0]
    m = rhs.shape[0] if transb else rhs.shape[1]
+    dtype = dtype if dtype is not None else lhs.dtype
    return _api.extern(
        (n, m), [lhs, rhs],
        lambda ins, outs: _intrin.call_packed(
            "tvm.contrib.cublas.matmul",
-            ins[0], ins[1], outs[0], transa, transb), name="C")
+            ins[0], ins[1], outs[0], transa, transb), dtype=dtype, name="C")

-def batch_matmul(lhs, rhs, transa=False, transb=False):
+def batch_matmul(lhs, rhs, transa=False, transb=False, dtype=None):
    """Create an extern op that compute batch matrix mult of A and rhs with cuBLAS

    Parameters
@@ -69,8 +70,9 @@ def batch_matmul(lhs, rhs, transa=False, transb=False):
    b = lhs.shape[0]
    n = lhs.shape[2] if transa else lhs.shape[1]
    m = rhs.shape[1] if transb else rhs.shape[2]
+    dtype = dtype if dtype is not None else lhs.dtype
    return _api.extern(
        (b, n, m), [lhs, rhs],
        lambda ins, outs: _intrin.call_packed(
            "tvm.contrib.cublas.batch_matmul",
-            ins[0], ins[1], outs[0], transa, transb), name="C")
+            ins[0], ins[1], outs[0], transa, transb), dtype=dtype, name="C")
--- a/python/tvm/contrib/cudnn.py
+++ b/python/tvm/contrib/cudnn.py
@@ -22,7 +22,6 @@ from .. import api as _api
 from .. import intrin as _intrin
 from .. import get_global_func as _get_global_func

-
 # algos can be read from cudnn.h
 _FWD_ALGOS = [
    "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM",
@@ -67,6 +66,7 @@ _ALGO_TYPE = [
    "bwd_data"
 ]

+
 def algo_to_index(algo_type, algo_name):
    """Return a index represents the algorithm, which can be used in
    calling CuDNN function
@@ -172,6 +172,7 @@ def conv2d_w_shape(in_channel,
    """
    return [out_channel, in_channel, filter_h, filter_w]

+
 def conv2d_output_shape(tensor_format,
                        pad_h,
                        pad_w,
@@ -180,7 +181,9 @@ def conv2d_output_shape(tensor_format,
                        dilation_h,
                        dilation_w,
                        x_shape,
-                        w_shape):
+                        w_shape,
+                        data_dtype,
+                        conv_dtype):
    """Get output shape of 2D convolution

    Paramters
@@ -232,7 +235,9 @@ def conv2d_output_shape(tensor_format,
         w_shape[1].value,
         w_shape[2].value,
         w_shape[3].value,
-         _get_np_int32_array_handle(oshape))
+         _get_np_int32_array_handle(oshape),
+         data_dtype,
+         conv_dtype)
    return list(oshape)


@@ -245,7 +250,9 @@ def conv2d_find_algo(tensor_format,
                     dilation_w,
                     x_shape,
                     w_shape,
-                     y_shape):
+                     y_shape,
+                     data_dtype,
+                     conv_dtype):
    """Choose the best algo for the given input.

    Paramters
@@ -272,6 +279,10 @@ def conv2d_find_algo(tensor_format,
        weight shape
    y_shape: list
        output shape
+    data_dtype: str
+        data type
+    conv_dtype: str
+        convolution type

    Returns
    -------
@@ -297,7 +308,9 @@ def conv2d_find_algo(tensor_format,
                int(y_shape[0]),
                int(y_shape[1]),
                int(y_shape[2]),
-                int(y_shape[3]))
+                int(y_shape[3]),
+                data_dtype,
+                conv_dtype)


 def conv2d_forward(x,
@@ -310,7 +323,8 @@ def conv2d_forward(x,
                   dilation_w=1,
                   conv_mode=1,
                   tensor_format=0,
-                   algo=-1):
+                   algo=-1,
+                   conv_dtype=None):
    """Create an extern op that compute 2D convolution with CuDNN

    Parameters
@@ -341,12 +355,16 @@ def conv2d_forward(x,
    algo: int
        Forward algorithm, get index from ```algo_to_index``` function
        if algo == -1, the best algo will be chosen by CUDNN
+    conv_dtype: str
+        convolution type

    Returns
    -------
    y: Tensor
        The result tensor
    """
+    conv_dtype = x.dtype if conv_dtype is None else conv_dtype
+
    oshape = conv2d_output_shape(tensor_format,
                                 pad_h,
                                 pad_w,
@@ -355,18 +373,28 @@ def conv2d_forward(x,
                                 dilation_h,
                                 dilation_w,
                                 list(x.shape),
-                                 list(w.shape))
+                                 list(w.shape),
+                                 x.dtype,
+                                 conv_dtype)
    if algo == -1:
-        algo = conv2d_find_algo(tensor_format,
-                                pad_h,
-                                pad_w,
-                                stride_h,
-                                stride_w,
-                                dilation_h,
-                                dilation_w,
-                                list(x.shape),
-                                list(w.shape),
-                                oshape)
+        # For now if we try to call `cudnnFindConvolutionForwardAlgorithm` when
+        # using INT8 data type, CuDNN will crash down.
+        # On the other hand, CuDNN only support IMPLICIT_PRECOMP_GEMM at NHWC format
+        if tensor_format == 1 and conv_dtype == "int32":
+            algo = 1
+        else:
+            algo = conv2d_find_algo(tensor_format,
+                                    pad_h,
+                                    pad_w,
+                                    stride_h,
+                                    stride_w,
+                                    dilation_h,
+                                    dilation_w,
+                                    list(x.shape),
+                                    list(w.shape),
+                                    oshape,
+                                    x.dtype,
+                                    conv_dtype)

    return _api.extern(
        oshape, [x, w],
@@ -383,4 +411,5 @@ def conv2d_forward(x,
            dilation_w,
            ins[0],
            ins[1],
-            outs[0]), name="y")
+            outs[0],
+            conv_dtype), name="y")
--- a/src/runtime/contrib/cblas/gemm_common.h
+++ b/src/runtime/contrib/cblas/gemm_common.h
@@ -93,13 +93,13 @@ inline void CallGemm(TVMArgs args, TVMRetValue *ret, TGemmOp op) {
  double alpha = args.size() > 5 ? args[5] : 1.0;
  double beta = args.size() > 6 ? args[6] : 0.0;
  op(transb, transa, ColumnCount(B, transb), RowCount(A, transa),
-     ColumnCount(A, transa), static_cast<float>(alpha),
+     ColumnCount(A, transa), static_cast<typename TGemmOp::TDatatype>(alpha),
     reinterpret_cast<typename TGemmOp::TDatatype *>(
         static_cast<char *>(B->data) + B->byte_offset),
     ColumnStride(B),
     reinterpret_cast<typename TGemmOp::TDatatype *>(
         static_cast<char *>(A->data) + A->byte_offset),
-     ColumnStride(A), static_cast<float>(beta),
+     ColumnStride(A), static_cast<typename TGemmOp::TDatatype>(beta),
     reinterpret_cast<typename TGemmOp::TDatatype *>(
         static_cast<char *>(C->data) + C->byte_offset),
     ColumnStride(C));
@@ -170,9 +170,10 @@ inline void CallBatchGemm(TVMArgs args, TVMRetValue *ret, TBatchGemmOp op) {
  DType *C_data = reinterpret_cast<typename TBatchGemmOp::TDatatype *>(
      static_cast<char *>(C->data) + C->byte_offset);
  op(batch_size, transb, transa, ColumnCount3D(B, transb),
-     RowCount3D(A, transa), ColumnCount3D(A, transa), static_cast<float>(alpha),
+     RowCount3D(A, transa), ColumnCount3D(A, transa),
+     static_cast<typename TBatchGemmOp::TDatatype>(alpha),
     B_data, B_size, ColumnStride3D(B), A_data, A_size, ColumnStride3D(A),
-     static_cast<float>(beta), C_data, C_size, ColumnStride3D(C));
+     static_cast<typename TBatchGemmOp::TDatatype>(beta), C_data, C_size, ColumnStride3D(C));
 }

 }  // namespace contrib

--- a/src/runtime/contrib/cublas/cublas.cc
+++ b/src/runtime/contrib/cublas/cublas.cc
@@ -36,12 +36,40 @@ inline cublasOperation_t BooleanToTranspose(bool item) {
  return item ? CUBLAS_OP_T : CUBLAS_OP_N;
 }

+inline void TryEnableTensorCore(cublasHandle_t hdl) {
+  // TensorCores are only supported in cublas 9.0 or higher
+  int version;
+  CHECK_CUBLAS_ERROR(cublasGetVersion(hdl, &version));
+  if (version >= 9000)
+    CHECK_CUBLAS_ERROR(cublasSetMathMode(hdl, CUBLAS_TENSOR_OP_MATH));
+}
+
+struct CublasHgemmOp {
+  typedef half TDatatype;
+  cublasHandle_t handle;
+  explicit CublasHgemmOp(cublasHandle_t hdl)
+      : handle(hdl) {}
+
+  void operator()(bool ta, bool tb,
+                  int M, int N, int K,
+                  half alpha, half* A, int lda,
+                  half* B, int ldb,
+                  half beta, half* C, int ldc) {
+    CHECK_CUBLAS_ERROR(cublasHgemm(handle,
+                                   BooleanToTranspose(ta),
+                                   BooleanToTranspose(tb),
+                                   M, N, K,
+                                   &alpha, A, lda,
+                                   B, ldb,
+                                   &beta, C, ldc));
+  }
+};
+
 struct CublasSgemmOp {
  typedef float TDatatype;
  cublasHandle_t handle;
  explicit CublasSgemmOp(cublasHandle_t hdl)
-    : handle(hdl)
-    {}
+    : handle(hdl) {}

  void operator()(bool ta, bool tb,
                  int M, int N, int K,
@@ -58,13 +86,11 @@ struct CublasSgemmOp {
  }
 };

-
 struct CublasDgemmOp {
  typedef double TDatatype;
  cublasHandle_t handle;
  explicit CublasDgemmOp(cublasHandle_t hdl)
-    : handle(hdl)
-    {}
+    : handle(hdl) {}
  void operator()(bool ta, bool tb,
                  int M, int N, int K,
                  double alpha, double* A, int lda,
@@ -80,12 +106,32 @@ struct CublasDgemmOp {
  }
 };

+struct CublasHgemmBatchOp {
+  typedef half TDatatype;
+  cublasHandle_t handle;
+  explicit CublasHgemmBatchOp(cublasHandle_t hdl)
+      : handle(hdl) {}
+  void operator()(int batch_size, bool ta, bool tb, int M, int N, int K, half alpha, half* A,
+                  int a_stride, int lda, half* B, int b_stride, int ldb, half beta, half* C,
+                  int c_stride, int ldc) {
+    CHECK_CUBLAS_ERROR(cublasHgemmStridedBatched(handle,
+                                                 BooleanToTranspose(ta),
+                                                 BooleanToTranspose(tb),
+                                                 M, N, K,
+                                                 &alpha,
+                                                 A, lda, a_stride,
+                                                 B, ldb, b_stride,
+                                                 &beta,
+                                                 C, ldc, c_stride,
+                                                 batch_size));
+  }
+};
+
 struct CublasSgemmBatchOp {
  typedef float TDatatype;
  cublasHandle_t handle;
  explicit CublasSgemmBatchOp(cublasHandle_t hdl)
-    : handle(hdl)
-    {}
+    : handle(hdl) {}
  void operator()(int batch_size, bool ta, bool tb, int M, int N, int K, float alpha, float* A,
                  int a_stride, int lda, float* B, int b_stride, int ldb, float beta, float* C,
                  int c_stride, int ldc) {
@@ -106,8 +152,7 @@ struct CublasDgemmBatchOp {
  typedef double TDatatype;
  cublasHandle_t handle;
  explicit CublasDgemmBatchOp(cublasHandle_t hdl)
-    : handle(hdl)
-    {}
+    : handle(hdl) {}
  void operator()(int batch_size, bool ta, bool tb, int M, int N, int K, double alpha, double* A,
                  int a_stride, int lda, double* B, int b_stride, int ldb, double beta, double* C,
                  int c_stride, int ldc) {
@@ -124,35 +169,203 @@ struct CublasDgemmBatchOp {
  }
 };

+// Check cublas supported mix-precision computation type and return computeType
+bool CheckMixPrecisionType(DLDataType in_dtype, DLDataType out_dtype, bool int_support = true) {
+  if (int_support && TypeMatch(out_dtype, kDLInt, 32)) {
+    return TypeMatch(in_dtype, kDLInt, 8);
+  } else if (TypeMatch(out_dtype, kDLFloat, 32)) {
+    return TypeMatch(in_dtype, kDLInt, 8) ||
+           TypeMatch(in_dtype, kDLFloat, 16);
+  } else {
+    return false;
+  }
+}
+
+inline void CallGemmEx(TVMArgs args, TVMRetValue *ret, cublasHandle_t hdl) {
+  DLTensor *A = args[0];
+  DLTensor *B = args[1];
+  DLTensor *C = args[2];
+  bool transa = args[3];
+  bool transb = args[4];
+  CHECK_EQ(A->ndim, 2);
+  CHECK_EQ(B->ndim, 2);
+  CHECK_EQ(C->ndim, 2);
+
+  CHECK_EQ(ElementStride(A), 1);
+  CHECK_EQ(ElementStride(B), 1);
+  CHECK_EQ(ElementStride(C), 1);
+
+  CHECK(TypeEqual(A->dtype, B->dtype));
+
+  // C can never be transposed.
+  CHECK(!IsInPlaceTransposed(C));
+
+  // Reversed strides indicates an in-place transpose operation.
+  transa = IsInPlaceTransposed(A) ? !transa : transa;
+  transb = IsInPlaceTransposed(B) ? !transb : transb;
+
+  CHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type";
+  CHECK(!TypeMatch(A->dtype, kDLInt, 8) ||
+      ColumnStride(A) % 4 == 0) << "leading dimension must divide 4 for int8 gemm";
+  CHECK(!TypeMatch(B->dtype, kDLInt, 8) ||
+      ColumnStride(B) % 4 == 0) << "leading dimension must divide 4 for int8 gemm";
+  double alpha = args.size() > 5 ? args[5] : 1.0;
+  double beta = args.size() > 6 ? args[6] : 0.0;
+
+  cudaDataType_t cuda_in_type = GetCudaDataType(A->dtype);
+  cudaDataType_t cuda_out_type = GetCudaDataType(C->dtype);
+  cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
+  void *alpha_ptr = nullptr, *beta_ptr = nullptr;
+  auto alpha_int = static_cast<int32_t>(alpha);
+  auto beta_int = static_cast<int32_t>(beta);
+  auto alpha_float = static_cast<float>(alpha);
+  auto beta_float = static_cast<float>(beta);
+  if (C->dtype.code == kDLInt) {
+    alpha_ptr = &alpha_int;
+    beta_ptr = &beta_int;
+  } else if (C->dtype.code == kDLFloat) {
+    alpha_ptr = &alpha_float;
+    beta_ptr = &beta_float;
+  }
+
+  auto A_data = reinterpret_cast<void *>(static_cast<char *>(A->data) + A->byte_offset);
+  auto B_data = reinterpret_cast<void *>(static_cast<char *>(B->data) + B->byte_offset);
+  auto C_data = reinterpret_cast<void *>(static_cast<char *>(C->data) + C->byte_offset);
+
+  CHECK_CUBLAS_ERROR(cublasGemmEx(hdl,
+                                 BooleanToTranspose(transb),
+                                 BooleanToTranspose(transa),
+                                 ColumnCount(B, transb),
+                                 RowCount(A, transa),
+                                 ColumnCount(A, transa),
+                                 alpha_ptr,
+                                 B_data, cuda_in_type, ColumnStride(B),
+                                 A_data, cuda_in_type, ColumnStride(A),
+                                 beta_ptr,
+                                 C_data, cuda_out_type, ColumnStride(C),
+                                 cuda_out_type, algo));
+}
+
+inline void CallBatchGemmEx(TVMArgs args, TVMRetValue *ret, cublasHandle_t hdl) {
+  DLTensor *A = args[0];
+  DLTensor *B = args[1];
+  DLTensor *C = args[2];
+  bool transa = args[3];
+  bool transb = args[4];
+  CHECK_EQ(A->ndim, 3);
+  CHECK_EQ(B->ndim, 3);
+  CHECK_EQ(C->ndim, 3);
+  int batch_size = BatchCount3D(A);
+  CHECK_EQ(BatchCount3D(B), batch_size);
+  CHECK_EQ(BatchCount3D(C), batch_size);
+  CHECK_EQ(ElementStride(A), 1);
+  CHECK_EQ(ElementStride(B), 1);
+  CHECK_EQ(ElementStride(C), 1);
+
+  CHECK(TypeEqual(A->dtype, B->dtype));
+
+  // C can never be transposed.
+  CHECK(!IsInPlaceTransposed(C));
+
+  // Reversed strides indicates an in-place transpose operation.
+  transa = IsInPlaceTransposed(A) ? !transa : transa;
+  transb = IsInPlaceTransposed(B) ? !transb : transb;
+
+  CHECK(CheckMixPrecisionType(A->dtype, C->dtype, false)) << "Unsupported data type";
+  CHECK(!TypeMatch(A->dtype, kDLInt, 8) ||
+      ColumnStride(A) % 4 == 0) << "leading dimension must divide 4 for int8 gemm";
+  CHECK(!TypeMatch(B->dtype, kDLInt, 8) ||
+      ColumnStride(B) % 4 == 0) << "leading dimension must divide 4 for int8 gemm";
+  double alpha = args.size() > 5 ? args[5] : 1.0;
+  double beta = args.size() > 6 ? args[6] : 0.0;
+
+  const int A_size = A->shape[1] * A->shape[2];
+  const int B_size = B->shape[1] * B->shape[2];
+  const int C_size = C->shape[1] * C->shape[2];
+
+  cudaDataType_t cuda_in_type = GetCudaDataType(A->dtype);
+  cudaDataType_t cuda_out_type = GetCudaDataType(C->dtype);
+  cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
+  void *alpha_ptr = nullptr, *beta_ptr = nullptr;
+  auto alpha_int = static_cast<int32_t>(alpha);
+  auto beta_int = static_cast<int32_t>(beta);
+  auto alpha_float = static_cast<float>(alpha);
+  auto beta_float = static_cast<float>(beta);
+  if (C->dtype.code == kDLInt) {
+    alpha_ptr = &alpha_int;
+    beta_ptr = &beta_int;
+  } else if (C->dtype.code == kDLFloat) {
+    alpha_ptr = &alpha_float;
+    beta_ptr = &beta_float;
+  }
+
+  auto A_data = reinterpret_cast<void *>(static_cast<char *>(A->data) + A->byte_offset);
+  auto B_data = reinterpret_cast<void *>(static_cast<char *>(B->data) + B->byte_offset);
+  auto C_data = reinterpret_cast<void *>(static_cast<char *>(C->data) + C->byte_offset);
+  CHECK_CUBLAS_ERROR(cublasGemmStridedBatchedEx(hdl,
+                                  BooleanToTranspose(transb),
+                                  BooleanToTranspose(transa),
+                                  ColumnCount3D(B, transb),
+                                  RowCount3D(A, transa),
+                                  ColumnCount3D(A, transa),
+                                  alpha_ptr,
+                                  B_data, cuda_in_type, ColumnStride3D(B), B_size,
+                                  A_data, cuda_in_type, ColumnStride3D(A), A_size,
+                                  beta_ptr,
+                                  C_data, cuda_out_type, ColumnStride3D(C), C_size,
+                                  batch_size, cuda_out_type, algo));
+}
+
 // matrix multiplication for row major
 TVM_REGISTER_GLOBAL("tvm.contrib.cublas.matmul")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
    DLTensor* A = args[0];
-
-    CHECK(TypeMatch(A->dtype, kDLFloat, 32) ||
-          TypeMatch(A->dtype, kDLFloat, 64));
+    DLTensor* C = args[2];

    CuBlasThreadEntry* entry_ptr = CuBlasThreadEntry::ThreadLocal();

-    if (TypeMatch(A->dtype, kDLFloat, 32))
-      CallGemm(args, ret, CublasSgemmOp(entry_ptr->handle));
-    else
-      CallGemm(args, ret, CublasDgemmOp(entry_ptr->handle));
+    TryEnableTensorCore(entry_ptr->handle);
+
+    if (TypeEqual(A->dtype, C->dtype)) {
+      CHECK(TypeMatch(A->dtype, kDLFloat, 16) ||
+          TypeMatch(A->dtype, kDLFloat, 32) ||
+          TypeMatch(A->dtype, kDLFloat, 64));
+
+      if (TypeMatch(A->dtype, kDLFloat, 16))
+        CallGemm(args, ret, CublasHgemmOp(entry_ptr->handle));
+      else if (TypeMatch(A->dtype, kDLFloat, 32))
+        CallGemm(args, ret, CublasSgemmOp(entry_ptr->handle));
+      else
+        CallGemm(args, ret, CublasDgemmOp(entry_ptr->handle));
+    } else {
+      CallGemmEx(args, ret, entry_ptr->handle);
+    }
 });

 TVM_REGISTER_GLOBAL("tvm.contrib.cublas.batch_matmul")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
    DLTensor* A = args[0];
+    DLTensor* C = args[2];
+

-    CHECK(TypeMatch(A->dtype, kDLFloat, 32) ||
-          TypeMatch(A->dtype, kDLFloat, 64));

    CuBlasThreadEntry* entry_ptr = CuBlasThreadEntry::ThreadLocal();

-    if (TypeMatch(A->dtype, kDLFloat, 32))
-      CallBatchGemm(args, ret, CublasSgemmBatchOp(entry_ptr->handle));
-    else
-      CallBatchGemm(args, ret, CublasDgemmBatchOp(entry_ptr->handle));
+    TryEnableTensorCore(entry_ptr->handle);
+    if (TypeEqual(A->dtype, C->dtype)) {
+      CHECK(TypeMatch(A->dtype, kDLFloat, 16) ||
+          TypeMatch(A->dtype, kDLFloat, 32) ||
+          TypeMatch(A->dtype, kDLFloat, 64));
+
+      if (TypeMatch(A->dtype, kDLFloat, 16))
+        CallBatchGemm(args, ret, CublasHgemmBatchOp(entry_ptr->handle));
+      else if (TypeMatch(A->dtype, kDLFloat, 32))
+        CallBatchGemm(args, ret, CublasSgemmBatchOp(entry_ptr->handle));
+      else
+        CallBatchGemm(args, ret, CublasDgemmBatchOp(entry_ptr->handle));
+    } else {
+      CallBatchGemmEx(args, ret, entry_ptr->handle);
+    }
 });

 }  // namespace contrib

--- a/src/runtime/contrib/cublas/cublas_utils.h
+++ b/src/runtime/contrib/cublas/cublas_utils.h
@@ -25,7 +25,7 @@
 #define TVM_RUNTIME_CONTRIB_CUBLAS_CUBLAS_UTILS_H_

 #include <dmlc/logging.h>
-
+#include <dlpack/dlpack.h>
 #include <cublas_v2.h>

 namespace tvm {
@@ -62,7 +62,27 @@ struct CuBlasThreadEntry {
  static CuBlasThreadEntry* ThreadLocal();
 };  // CuBlasThreadEntry

-
+inline cudaDataType_t GetCudaDataType(DLDataType type) {
+  if (type.code == kDLInt) {
+    switch (type.bits) {
+      case 8: return CUDA_R_8I;
+      case 32: return CUDA_R_32I;
+    }
+  } else if (type.code == kDLUInt) {
+    switch (type.bits) {
+      case 8: return CUDA_R_8U;
+      case 32: return CUDA_R_32U;
+    }
+  } else if (type.code == kDLFloat) {
+    switch (type.bits) {
+      case 16: return CUDA_R_16F;
+      case 32: return CUDA_R_32F;
+      case 64: return CUDA_R_64F;
+    }
+  }
+  LOG(FATAL) << "Unsupported cuda type";
+  return CUDA_R_16F;
+}
 }  // namespace contrib
 }  // namespace tvm


--- a/src/runtime/contrib/cudnn/conv_forward.cc
+++ b/src/runtime/contrib/cudnn/conv_forward.cc
@@ -42,9 +42,11 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.forward")
  int stride_w = args[6];
  int dilation_h = args[7];
  int dilation_w = args[8];
-  DLTensor *x = args[9];
-  DLTensor *w = args[10];
-  DLTensor *y = args[11];
+  DLTensor* x = args[9];
+  DLTensor* w = args[10];
+  DLTensor* y = args[11];
+  std::string conv_dtype = args[12];
+
  CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal();
  // Set Mode
  entry_ptr->conv_entry.mode = static_cast<cudnnConvolutionMode_t>(mode);
@@ -55,7 +57,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.forward")
  // Set Ctx
  entry_ptr->conv_entry.ctx = x->ctx;
  // Set Data Type
-  entry_ptr->conv_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(x->dtype);
+  entry_ptr->conv_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(conv_dtype));
+  cudnnDataType_t data_type = CuDNNDataType::DLTypeToCuDNNType(x->dtype);
  // Set Desc
  CUDNN_CALL(cudnnSetConvolution2dDescriptor(entry_ptr->conv_entry.conv_desc,
                                             pad_h,
@@ -68,8 +71,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.forward")
                                             entry_ptr->conv_entry.data_type));
  // Set Filter
  CUDNN_CALL(cudnnSetFilter4dDescriptor(entry_ptr->conv_entry.filter_desc,
-                                        entry_ptr->conv_entry.data_type,
-                                        CUDNN_TENSOR_NCHW,
+                                        data_type,
+                                        entry_ptr->conv_entry.tensor_format,
                                        static_cast<int>(w->shape[0]),
                                        static_cast<int>(w->shape[1]),
                                        static_cast<int>(w->shape[2]),
@@ -77,7 +80,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.forward")
  // Set Input
  CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->conv_entry.input_desc,
                                        entry_ptr->conv_entry.tensor_format,
-                                        entry_ptr->conv_entry.data_type,
+                                        data_type,
                                        static_cast<int>(x->shape[0]),
                                        static_cast<int>(x->shape[1]),
                                        static_cast<int>(x->shape[2]),
@@ -85,11 +88,15 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.forward")
  // Set Output
  CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->conv_entry.output_desc,
                                        entry_ptr->conv_entry.tensor_format,
-                                        entry_ptr->conv_entry.data_type,
+                                        data_type,
                                        static_cast<int>(y->shape[0]),
                                        static_cast<int>(y->shape[1]),
                                        static_cast<int>(y->shape[2]),
                                        static_cast<int>(y->shape[3])));
+  if (cudnnGetVersion() > 7000) {
+    CUDNN_CALL(cudnnSetConvolutionMathType(entry_ptr->conv_entry.conv_desc, CUDNN_TENSOR_OP_MATH))
+  }
+
  // Set workspace
  size_t workspace_size = 0;
  CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(entry_ptr->handle,
@@ -135,6 +142,11 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.output_shape")
  int w_dim2 = args[13];
  int w_dim3 = args[14];
  void *out_shape = args[15];
+  std::string data_dtype = args[16];
+  std::string conv_dtype = args[17];
+  // Set Data Type
+  entry_ptr->conv_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(conv_dtype));
+  cudnnDataType_t data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(data_dtype));
  // Set Format
  entry_ptr->conv_entry.tensor_format = static_cast<cudnnTensorFormat_t>(format);
  // conv desc
@@ -150,15 +162,15 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.output_shape")
  // input desc
  CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->conv_entry.input_desc,
                                        entry_ptr->conv_entry.tensor_format,
-                                        CUDNN_DATA_FLOAT,
+                                        data_type,
                                        x_dim0,
                                        x_dim1,
                                        x_dim2,
                                        x_dim3));
  // filter desc
  CUDNN_CALL(cudnnSetFilter4dDescriptor(entry_ptr->conv_entry.filter_desc,
-                                        CUDNN_DATA_FLOAT,
-                                        CUDNN_TENSOR_NCHW,
+                                        data_type,
+                                        entry_ptr->conv_entry.tensor_format,
                                        w_dim0,
                                        w_dim1,
                                        w_dim2,
@@ -196,7 +208,12 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.find_algo")
  int y_dim1 = args[16];
  int y_dim2 = args[17];
  int y_dim3 = args[18];
+  std::string data_dtype = args[19];
+  std::string conv_dtype = args[20];

+  // Set Data Type
+  entry_ptr->conv_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(conv_dtype));
+  cudnnDataType_t data_type = CuDNNDataType::DLTypeToCuDNNType(String2TVMType(data_dtype));
  // Set Format
  entry_ptr->conv_entry.tensor_format = static_cast<cudnnTensorFormat_t>(format);
  // conv desc
@@ -212,15 +229,15 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.find_algo")
  // input desc
  CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->conv_entry.input_desc,
                                        entry_ptr->conv_entry.tensor_format,
-                                        CUDNN_DATA_FLOAT,
+                                        data_type,
                                        x_dim0,
                                        x_dim1,
                                        x_dim2,
                                        x_dim3));
  // filter desc
  CUDNN_CALL(cudnnSetFilter4dDescriptor(entry_ptr->conv_entry.filter_desc,
-                                        CUDNN_DATA_FLOAT,
-                                        CUDNN_TENSOR_NCHW,
+                                        data_type,
+                                        entry_ptr->conv_entry.tensor_format,
                                        w_dim0,
                                        w_dim1,
                                        w_dim2,
@@ -229,11 +246,14 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.find_algo")
  // output desc
  CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->conv_entry.output_desc,
                                        entry_ptr->conv_entry.tensor_format,
-                                        entry_ptr->conv_entry.data_type,
+                                        data_type,
                                        y_dim0,
                                        y_dim1,
                                        y_dim2,
                                        y_dim3));
+  if (cudnnGetVersion() > 7000) {
+    CUDNN_CALL(cudnnSetConvolutionMathType(entry_ptr->conv_entry.conv_desc, CUDNN_TENSOR_OP_MATH))
+  }

  int returned_algo_count = 0;
  cudnnConvolutionFwdAlgoPerf_t perf_results[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];

--- a/tests/python/contrib/test_cublas.py
+++ b/tests/python/contrib/test_cublas.py
@@ -18,13 +18,13 @@ import tvm
 import numpy as np
 from tvm.contrib import cublas

-def test_matmul_add():
+def verify_matmul_add(in_dtype, out_dtype, rtol=1e-5):
    n = 1024
    l = 128
-    m = 235
-    A = tvm.placeholder((n, l), name='A')
-    B = tvm.placeholder((l, m), name='B')
-    C = cublas.matmul(A, B)
+    m = 236
+    A = tvm.placeholder((n, l), name='A', dtype=in_dtype)
+    B = tvm.placeholder((l, m), name='B', dtype=in_dtype)
+    C = cublas.matmul(A, B, dtype=out_dtype)
    s = tvm.create_schedule(C.op)

    def verify(target="cuda"):
@@ -36,22 +36,22 @@ def test_matmul_add():
            return
        ctx = tvm.gpu(0)
        f = tvm.build(s, [A, B, C], target)
-        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
-        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
+        a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
-            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
+            c.asnumpy(), np.dot(a.asnumpy().astype(C.dtype), b.asnumpy().astype(C.dtype)), rtol=rtol)
    verify()

-def test_batch_matmul():
+def verify_batch_matmul(in_dtype, out_dtype, rtol=1e-5):
    j = 16
    n = 1024
    l = 128
-    m = 235
-    A = tvm.placeholder((j, n, l), name='A')
-    B = tvm.placeholder((j, l, m), name='B')
-    C = cublas.batch_matmul(A, B)
+    m = 236
+    A = tvm.placeholder((j, n, l), name='A', dtype=in_dtype)
+    B = tvm.placeholder((j, l, m), name='B', dtype=in_dtype)
+    C = cublas.batch_matmul(A, B, dtype=out_dtype)
    s = tvm.create_schedule(C.op)

    def verify(target="cuda"):
@@ -68,10 +68,22 @@ def test_batch_matmul():
        c = tvm.nd.array(np.zeros((j, n, m), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
-            c.asnumpy(), np.matmul(a.asnumpy(), b.asnumpy()), rtol=1e-5)
+            c.asnumpy(), np.matmul(a.asnumpy().astype(C.dtype),
+                                   b.asnumpy().astype(C.dtype)).astype(C.dtype), rtol=rtol)
    verify()

+def test_matmul_add():
+    verify_matmul_add('float', 'float')
+    verify_matmul_add('float16', 'float')
+    verify_matmul_add('float16', 'float16', rtol=1e-2)
+    verify_matmul_add('int8', 'int32')
+
+def test_batch_matmul():
+    verify_batch_matmul('float', 'float')
+    verify_batch_matmul('float16', 'float')
+    verify_batch_matmul('float16', 'float16', rtol=1e-2)

 if __name__ == "__main__":
    test_matmul_add()
    test_batch_matmul()
+
--- a/tests/python/contrib/test_cudnn.py
+++ b/tests/python/contrib/test_cudnn.py
@@ -19,8 +19,8 @@ from tvm.contrib import cudnn
 import numpy as np


-def test_conv2d():
-    in_channel = 3
+def verify_conv2d(data_dtype, conv_dtype, tensor_format=0):
+    in_channel = 4
    out_channel = 32
    filter_h = 3
    filter_w = 3
@@ -30,21 +30,25 @@ def test_conv2d():
    stride_w = 1
    dilation_h = 1
    dilation_w = 1
+    batch = 3
+    height = 32
+    weight = 32

-    xshape = [4, 3, 32, 32]
    if not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled...")
        return
    if not tvm.get_global_func("tvm.contrib.cudnn.conv2d.output_shape", True):
        print("skip because cudnn is not enabled...")
        return
+
+    xshape = [batch, in_channel, height, weight]
    wshape = cudnn.conv2d_w_shape(in_channel,
-                              out_channel,
-                              filter_h,
-                              filter_w)
+                          out_channel,
+                          filter_h,
+                          filter_w)

-    X = tvm.placeholder(xshape, name='X')
-    W = tvm.placeholder(wshape, name='W')
+    X = tvm.placeholder(xshape, name='X', dtype=data_dtype)
+    W = tvm.placeholder(wshape, name='W', dtype=data_dtype)
    Y = cudnn.conv2d_forward(X,
                             W,
                             stride_h,
@@ -54,24 +58,31 @@ def test_conv2d():
                             dilation_h,
                             dilation_w,
                             conv_mode=1,
-                             tensor_format=0,
-                             algo=1)
+                             tensor_format=tensor_format,
+                             conv_dtype=conv_dtype,
+                             algo=-1)
    yshape = [x.value for x in Y.shape]
-    s =  tvm.create_schedule(Y.op)
+    s = tvm.create_schedule(Y.op)

    def verify():
        ctx = tvm.gpu(0)
        f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d")
-        x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32),
+        x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(data_dtype),
                         ctx)
-        w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32),
+        w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(data_dtype),
                         ctx)
-        y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32),
+        y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(data_dtype),
                         ctx)
        f(x, w, y)

    verify()

+def test_conv2d():
+    verify_conv2d("float32", "float32", tensor_format=0)
+    verify_conv2d("float16", "float32", tensor_format=1)
+    verify_conv2d("float16", "float16", tensor_format=0)
+    verify_conv2d("int8", "int32", tensor_format=1)
+

 if __name__ == "__main__":
    test_conv2d()
--- a/topi/python/topi/cuda/conv2d.py
+++ b/topi/python/topi/cuda/conv2d.py
@@ -89,6 +89,13 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCHW', ou
        cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\
                    ((KW - 1) * dilation_w + 1))

+        if data.dtype == "int8" or kernel.dtype == "int8":
+            if layout == 'NCHW':
+                raise ValueError("NCHW layout do not support int8 in cudnn")
+            dtype = "int32"
+        else:
+            dtype = data.dtype
+
        return cudnn.conv2d_forward(data,
                                    kernel,
                                    stride_h,
@@ -99,7 +106,8 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCHW', ou
                                    dilation_w,
                                    conv_mode=1,
                                    tensor_format=tensor_format,
-                                    algo=-1)  # let CUDNN choose the best algo
+                                    algo=-1,         # let CUDNN choose the best algo
+                                    conv_dtype=dtype)

    if cfg.template_key == 'winograd':
        return winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,

--- a/topi/python/topi/cuda/dense.py
+++ b/topi/python/topi/cuda/dense.py
@@ -62,8 +62,7 @@ def dense_cuda(cfg, data, weight, bias=None, out_dtype=None):
    out_dim, _ = weight.shape
    target = tvm.target.current_target()
    if "cublas" in target.libs:
-        assert out_dtype == data.dtype, "Mixed precision not supported."
-        matmul = cublas.matmul(data, weight, False, True)
+        matmul = cublas.matmul(data, weight, False, True, out_dtype)
        if bias is not None:
            matmul = tvm.compute((batch, out_dim), \
                                 lambda i, j: matmul[i, j] + bias[j], \
@@ -256,9 +255,19 @@ def dense_int8(cfg, data, weight, bias=None, out_dtype=None):
    """Dense operator for int8 on CUDA"""
    if out_dtype is None:
        out_dtype = data.dtype
+
    batch, in_dim = get_const_tuple(data.shape)
    out_dim, _ = get_const_tuple(weight.shape)

+    target = tvm.target.current_target()
+    if "cublas" in target.libs:
+        matmul = cublas.matmul(data, weight, False, True, out_dtype)
+        if bias is not None:
+            matmul = tvm.compute((batch, out_dim), \
+                                 lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), \
+                                 tag=tag.BROADCAST)
+        return matmul
+
    k = tvm.reduce_axis((0, in_dim), name='k')

    matmul = tvm.compute((batch, out_dim),
@@ -279,7 +288,14 @@ def dense_int8(cfg, data, weight, bias=None, out_dtype=None):

 @autotvm.register_topi_schedule(generic.schedule_dense, ['cuda', 'gpu'], ['int8'])
 def schedule_dense_int8(cfg, outs):
+    """Dense schedule for int8 on CUDA"""
    s = tvm.create_schedule([x.op for x in outs])
+    target = tvm.target.current_target()
+
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    if "cublas" in target.libs:
+        return generic.schedule_extern(outs)
+
    def _callback(op):
        if "dense_int8" in op.tag:
            _schedule_dense_int8(cfg, s, op.output(0))