Commit 43f54a58 by shoubhik, committed by Zhi

Qnn fully connected (#3910)

* Qnn Dense layer.

* Reformatting code.

* Reformatting code and making the test case more readable.

* Fixing lint issues.

* Fixing test method names to pass the nose-related configurations.

* Aligning the code for code style.
parent 16d4da4d
......@@ -74,10 +74,8 @@ struct QuantizeAttrs : public tvm::AttrsNode<QuantizeAttrs> {
TVM_DECLARE_ATTRS(QuantizeAttrs, "relay.attrs.QuantizeAttrs") {
TVM_ATTR_FIELD(out_dtype)
.describe("Output data type, can be one of [int8 or uint8].");
TVM_ATTR_FIELD(output_zero_point)
.describe("The zero_point for the activation of this op.");
TVM_ATTR_FIELD(output_scale)
.describe("The scale for the activation of this op.");
}
......@@ -91,7 +89,6 @@ struct DequantizeAttrs : public tvm::AttrsNode<DequantizeAttrs> {
TVM_DECLARE_ATTRS(DequantizeAttrs, "relay.attrs.DequantizeAttrs") {
TVM_ATTR_FIELD(input_zero_point)
.describe("The zero_point for the input tensor of this op.");
TVM_ATTR_FIELD(input_scale)
.describe("The scale for the input tensor of this op.");
}
......@@ -108,16 +105,12 @@ struct QnnConcatenateAttrs : public tvm::AttrsNode<QnnConcatenateAttrs> {
TVM_DECLARE_ATTRS(QnnConcatenateAttrs, "relay.attrs.QnnConcatenateAttrs") {
TVM_ATTR_FIELD(input_scales)
.describe("The list of scales of input quantized tensors.");
TVM_ATTR_FIELD(input_zero_points)
.describe("The list of zero points of input quantized tensors.");
TVM_ATTR_FIELD(output_zero_point)
.describe("The zero_point for the output tensor.");
TVM_ATTR_FIELD(output_scale)
.describe("The scale for the output tensor.");
TVM_ATTR_FIELD(axis)
.describe("The axis at which the input arrays are concatenated."
"Should lie in range `[-ndim, ndim)`.")
......@@ -199,24 +192,39 @@ struct QnnBinaryOpAttrs : public tvm::AttrsNode<QnnBinaryOpAttrs> {
TVM_DECLARE_ATTRS(QnnBinaryOpAttrs, "relay.attrs.QnnBinaryOpAttrs") {
TVM_ATTR_FIELD(lhs_zero_point)
.describe("The zero_point for the lhs input tensor of this op.");
TVM_ATTR_FIELD(lhs_scale)
.describe("The scale for the lhs input tensor of this op.");
TVM_ATTR_FIELD(rhs_zero_point)
.describe("The zero_point for the rhs input tensor of this op.");
TVM_ATTR_FIELD(rhs_scale)
.describe("The scale for the rhs input tensor of this op.");
TVM_ATTR_FIELD(output_zero_point)
.describe("The zero_point for the activation of this op.");
TVM_ATTR_FIELD(output_scale)
.describe("The scale for the activation of this op.");
}
};
/*! \brief Attributes for qnn dense operator */
struct QnnDenseAttrs : public tvm::AttrsNode<QnnDenseAttrs> {
IndexExpr units;
DataType out_dtype;
// Quantization related attributes.
int32_t input_zero_point;
int32_t kernel_zero_point;
TVM_DECLARE_ATTRS(QnnDenseAttrs, "relay.attrs.qnn.QnnDenseAttrs") {
TVM_ATTR_FIELD(units)
.describe("Number of hidden units of the dense transformation.");
TVM_ATTR_FIELD(out_dtype)
.describe("Output data type, set to explicit type under mixed precision setting");
TVM_ATTR_FIELD(input_zero_point)
.describe("The zero point of the input tensor.");
TVM_ATTR_FIELD(kernel_zero_point)
.describe("The zero point of the kernel tensor.");
}
};
} // namespace qnn
} // namespace relay
} // namespace tvm
......
......@@ -96,7 +96,7 @@ def quantize(data,
The output zero_point.
output_scale : float
The output scale.
input_dtype : str, optional
out_dtype : str, optional
The data type of the output tensor. Can be [int8, uint8]
Returns
-------
......@@ -265,7 +265,13 @@ def conv2d(data,
data_layout, kernel_layout, out_layout, out_dtype)
def add(lhs, rhs, lhs_scale, lhs_zero_point, rhs_scale, rhs_zero_point, output_scale,
def add(lhs,
rhs,
lhs_scale,
lhs_zero_point,
rhs_scale,
rhs_zero_point,
output_scale,
output_zero_point):
"""Quantized addition with numpy-style broadcasting.
......@@ -305,3 +311,41 @@ def add(lhs, rhs, lhs_scale, lhs_zero_point, rhs_scale, rhs_zero_point, output_s
lhs_scale, lhs_zero_point,
rhs_scale, rhs_zero_point,
output_scale, output_zero_point)
def quantized_dense(data,
weight,
input_zero_point,
kernel_zero_point,
units=None,
out_dtype="int32"):
"""Qnn Dense operator.
Applies a quantized linear transformation
.. math::
`Y = X * W`
Parameters
----------
data : tvm.relay.Expr
The quantized input data to the operator.
weight : tvm.relay.Expr
The quantized weight expressions.
units : int, optional
Number of hidden units of the dense transformation.
out_dtype : str, optional
Specifies the output data type for mixed-precision dense; can be int32 or int16.
Returns
-------
result : tvm.relay.Expr
The computed result.
"""
return _make.dense(data,
weight,
units,
input_zero_point,
kernel_zero_point,
out_dtype)
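A minimal usage sketch (not part of the patch; shapes and zero points are illustrative) showing how the new operator is built and then lowered by the canonicalization pass:

from tvm import relay

data = relay.var("data", shape=(2, 10), dtype="uint8")
weight = relay.var("weight", shape=(3, 10), dtype="uint8")
# Build the qnn.dense call with the zero points of the quantized tensors.
dense = relay.qnn.op.quantized_dense(data, weight,
                                     input_zero_point=127,
                                     kernel_zero_point=127,
                                     units=3,
                                     out_dtype="int32")
func = relay.Function([data, weight], dense)
mod = relay.Module.from_expr(func)
# Lower the QNN op into core Relay ops (cast, subtract, nn.dense).
mod = relay.qnn.transform.CanonicalizeOps()(mod)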
......@@ -22,7 +22,7 @@ from tvm import relay
def CanonicalizeOps():
"""Converts/Lowers an expression containing QNN ops to an expression containing only core
(non-Dialect) Relay ops. Each QNN op is lowered to a sequence of exisiting Relay ops. This is a
(non-Dialect) Relay ops. Each QNN op is lowered to a sequence of existing Relay ops. This is a
target-independent pass. One can register the lowering/transformation function for this op using
FTVMQnnCanonicalize attr_name for FTVMLegalize op attribute. An example of this transformation
is below
......@@ -40,7 +40,7 @@ def CanonicalizeOps():
output_zero_point=0,
out_dtype='int8')
# We want to utilize all the existing Relay infrastucture. So, instead of supporting this
# We want to utilize all the existing Relay infrastructure. So, instead of supporting this
# QNN requantize op, we convert it into a sequence of existing Relay operators.
mod = relay.Module.from_expr(qnn_expr)
mod = relay.qnn.transform.CanonicalizeOps()(mod)
......
......@@ -25,6 +25,7 @@
#ifndef TVM_RELAY_OP_NN_CONVOLUTION_H_
#define TVM_RELAY_OP_NN_CONVOLUTION_H_
#include <tvm/ir_pass.h>
#include <string>
#include <utility>
......
......@@ -434,6 +434,17 @@ static inline Expr Conv2D(Expr data, Expr weight, Array<IndexExpr> strides,
return CallNode::make(op, {data, weight}, Attrs(attrs), {});
}
static inline Expr Dense(Expr data,
Expr weight,
IndexExpr units,
DataType out_dtype) {
auto attrs = make_node<DenseAttrs>();
attrs->units = units;
attrs->out_dtype = out_dtype;
static const Op& op = Op::Get("nn.dense");
return CallNode::make(op, {data, weight}, Attrs(attrs), {});
}
static inline Expr Sum(Expr data, Array<Integer> axis, bool keepdims, bool exclude) {
auto attrs = make_node<ReduceAttrs>();
attrs->axis = std::move(axis);
......
......@@ -23,7 +23,6 @@
* \brief Property def of qnn convolution operator.
*/
#include <tvm/data_layout.h>
#include <tvm/ir_pass.h>
#include <tvm/relay/analysis.h>
#include <tvm/relay/base.h>
#include <tvm/relay/op.h>
......@@ -178,7 +177,7 @@ Expr Conv2DPadInput(const Expr& data, const QnnConv2DAttrs* param) {
* \param data The input expr.
* \param weight The weight expr.
* \param param The qnn conv2d attributes.
* \return The sequence of Relay operatos for term1.
* \return The sequence of Relay operators for term1.
* \note The term1 is
* Sigma(c,r,s) QW(k, c, r, s) * QA(n, c, h + r, w + s)
* This is just conv2d on int tensors.
......@@ -198,12 +197,12 @@ Expr Conv2DFirstTerm(const Expr& padded_data, const Expr& weight, const QnnConv2
* \param param The qnn conv2d attributes.
* \param kernel_h The height of kernel.
* \param kernel_w The width of kernel.
* \return The sequence of Relay operatos for term2.
* \return The sequence of Relay operators for term2.
* \note The term2 looks like this
*
* Sigma(c,r,s) zp_w * QA(n, c, h + r, w + s)
*
* Second term is not directly represetable by one Relay operator.
* Second term is not directly representable by one Relay operator.
* However, deeper analysis shows that we can reduce r,s using avg_pool2d,
* followed by a reduce on the C axis. Using avg_pool2d also gives an
* opportunity to reuse alter_op_layout infrastructure.
......@@ -313,7 +312,7 @@ Expr Conv2DThirdTerm(const Expr& weight, const Expr& zp_data, const QnnConv2DAtt
* \param in_channels The number of input channels.
* \param kernel_h The height of kernel.
* \param kernel_w The width of kernel.
* \return The sequence of Relay operatos for term4.
* \return The sequence of Relay operators for term4.
* \note The term4 looks like this
*
* Sigma(c,r,s) zp_a * zp_w
......@@ -373,7 +372,7 @@ Expr Conv2DCombineTerms(const Expr& term1, const Expr& term2, const Expr& term3,
* where QA is quantized tensor, scale_a and zp_A are quantizations
* params.
*
* Quantized convlution convolves two quantized tensors and returns a
* Quantized convolution will convolve two quantized tensors and return a
* quantized tensor of default dtype of int32, with scale equaling to the
* product of scales of input tensors, and a zero point of zero.
*
......@@ -399,7 +398,7 @@ Expr Conv2DCombineTerms(const Expr& term1, const Expr& term2, const Expr& term3,
* zero point. This might leave some performance opportunity at the
* table. Can be avoided by modifying conv2d API to accept the
* pad_const_value.
* 2) Second term is not directly represetable by one Relay operator.
* 2) Second term is not directly representable by one Relay operator.
* However, deeper analysis shows that we can reduce r,s using
* avg_pool2d, followed by a reduce on the C axis. Using avg_pool2d also
* gives an opportunity to reuse alter_op_layout infrastructure.
......@@ -408,7 +407,7 @@ Expr Conv2DCombineTerms(const Expr& term1, const Expr& term2, const Expr& term3,
* the conv is dilated. We fallback also in case of depthwise conv.
*
* The whole process can be broken down into following steps
* * Assertion checks for exisiting support, fallback if necessary
* * Assertion checks for existing support, fallback if necessary
* * Pad the input.
* * Get Term1.
* * Get Term2.
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2019 by Contributors
* \file src/relay/qnn/op/dense.cc
* \brief Property def of qnn dense operator.
*/
#include <tvm/relay/base.h>
#include <tvm/relay/op.h>
#include <tvm/relay/op_attr_types.h>
#include <tvm/relay/qnn/attrs.h>
#include "../../op/nn/nn.h"
#include "../../pass/pattern_util.h"
namespace tvm {
namespace relay {
namespace qnn {
// relay.op.qnn.dense
TVM_REGISTER_NODE_TYPE(QnnDenseAttrs);
bool QnnDenseRel(const Array<Type>& types,
int num_inputs,
const Attrs& attrs,
const TypeReporter& reporter) {
CHECK_EQ(types.size(), 3);
const auto* data = types[0].as<TensorTypeNode>();
const auto* weight = types[1].as<TensorTypeNode>();
if (data == nullptr || weight == nullptr) return false;
const auto* param = attrs.as<QnnDenseAttrs>();
CHECK(param != nullptr) << "QnnDenseAttrs cannot be nullptr.";
CHECK(data->dtype == Int(8) || data->dtype == UInt(8))
<< "Expected quantized dense type(int8, uint8) for input but was " << data->dtype;
CHECK(weight->dtype == Int(8) || weight->dtype == UInt(8))
<< "Expected quantized dense type(int8, uint8) for weight but was " << weight->dtype;
CHECK(param->out_dtype == Int(32))
<< "Expected quantized dense type(int32) for output but was " << param->out_dtype;
CHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0.";
return DenseRel<QnnDenseAttrs>(types, num_inputs, attrs, reporter);
}
// Positional relay function to create quantized dense operator used by frontend FFI.
Expr MakeQuantizedDense(Expr data,
Expr weight,
IndexExpr units,
int32_t input_zero_point,
int32_t kernel_zero_point,
DataType out_dtype) {
auto attrs = make_node<QnnDenseAttrs>();
attrs->units = std::move(units);
attrs->out_dtype = out_dtype;
attrs->input_zero_point = input_zero_point;
attrs->kernel_zero_point = kernel_zero_point;
static const Op& op = Op::Get("qnn.dense");
return CallNode::make(op, {data, weight}, Attrs(attrs), {});
}
/**
* \brief Lowers Qnn dense in terms of core operators in relay.
* Mathematically it is equal to -
* Dense((quantized_input - input_zero_point;int32), (quantized_kernel - kernel_zero_point; int32))
*
* \param attrs QnnDenseAttrs for Qnn Dense layer.
* \param new_args The new mutated args to the call node.
* \param arg_types The data types of input and output.
* \return The sequence of Relay ops for the qnn dense op.
*/
Expr QnnDenseCanonicalize(const Attrs& attrs,
const Array<Expr>& new_args,
const Array<tvm::relay::Type>& arg_types) {
CHECK_EQ(new_args.size(), 2);
Expr quantized_data = new_args[0];
Expr quantized_kernel = new_args[1];
const auto* qnn_dense_attrs = attrs.as<QnnDenseAttrs>();
Expr quantized_data_int32 = Cast(quantized_data, Int(32));
if (qnn_dense_attrs->input_zero_point != 0) {
quantized_data_int32 = Subtract(quantized_data_int32,
MakeConstantScalar(Int(32),
qnn_dense_attrs->input_zero_point));
}
Expr quantized_kernel_int32 = Cast(quantized_kernel, Int(32));
if (qnn_dense_attrs->kernel_zero_point != 0) {
quantized_kernel_int32 = Subtract(quantized_kernel_int32,
MakeConstantScalar(Int(32),
qnn_dense_attrs->kernel_zero_point));
}
Expr int32_dense = Dense(quantized_data_int32,
quantized_kernel_int32,
qnn_dense_attrs->units,
qnn_dense_attrs->out_dtype);
return int32_dense;
}
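The lowering above relies on the algebraic identity that dense on zero-point-shifted int32 tensors equals the four-term expansion discussed in the conv2d lowering (term1..term4). A small NumPy sketch, with made-up values and not part of the patch, checks that identity:

import numpy as np

q_x = np.array([[1, 3, 5, 7]], dtype=np.int32)       # quantized data, shape (1, 4)
q_w = np.array([[2, 4, 6, 8],
                [1, 1, 1, 1]], dtype=np.int32)        # quantized kernel, shape (units, 4)
zp_x, zp_w = -1, 3                                    # input/kernel zero points

# Form produced by QnnDenseCanonicalize: subtract zero points, then int32 dense.
lowered = (q_x - zp_x) @ (q_w - zp_w).T

# Equivalent four-term expansion (cf. the term1..term4 notes in conv2d.cc).
k = q_x.shape[1]
expanded = (q_x @ q_w.T
            - zp_w * q_x.sum(axis=1, keepdims=True)
            - zp_x * q_w.sum(axis=1, keepdims=True).T
            + k * zp_x * zp_w)

assert np.array_equal(lowered, expanded)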
RELAY_REGISTER_OP("qnn.dense")
.describe(R"code(Applies a linear transformation: :math:`Y = XW^T`.
- **data**: quantized(int8, uint8) `(x1, x2, ..., xn, input_dim)`
- **weight**: quantized(int8, uint8) `(units, input_dim)`
- **out**: quantized(int32) `(x1, x2, ..., xn, units)`.
)code" TVM_ADD_FILELINE)
.set_attrs_type_key("relay.attrs.qnn.QnnDenseAttrs")
.set_num_inputs(2)
.add_argument("data", "quantized nD Tensor", "Input data.")
.add_argument("weight", "quantized 2D Tensor", "Weight matrix.")
.set_support_level(11)
.add_type_rel("QDense", DenseRel<QnnDenseAttrs>)
.set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnDenseCanonicalize);
TVM_REGISTER_API("relay.qnn.op._make.dense")
.set_body_typed(MakeQuantizedDense);
} // namespace qnn
} // namespace relay
} // namespace tvm
......@@ -19,7 +19,6 @@ import tvm
import numpy as np
from tvm import relay
from tvm.relay import transform
from tvm.relay.testing import create_workload
from tvm.relay.testing import run_infer_type
from tvm.contrib import graph_runtime
......
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import tvm
import numpy as np
from tvm import relay
from tvm.contrib import graph_runtime
def make_requantize_params(input_scale, output_scale, output_zero_point, out_dtype):
config = {
'input_scale': input_scale,
'output_scale': output_scale,
'output_zero_point': output_zero_point,
'out_dtype': out_dtype
}
return config
def make_configuration(quantized_data,
quantized_kernel,
dtype,
input_shape,
kernel_shape,
input_zero_point,
kernel_zero_point,
units,
output,
out_dtype='int32',
bias=None,
requantize=None):
if requantize is not None:
assert bias is not None
config = {
'quantized_data': quantized_data,
'quantized_kernel': quantized_kernel,
'dtype': dtype,
'input_shape': input_shape,
'kernel_shape': kernel_shape,
'input_zero_point': input_zero_point,
'kernel_zero_point': kernel_zero_point,
'units': units,
'output': output,
'out_dtype': out_dtype,
'bias': bias,
'requantize': requantize
}
return config
def make_uint_configuration(use_bias=False, requantize_output=False):
input_shape, kernel_shape, output_shape = (2, 10), (3,10), (2, 3)
input_zero_point, kernel_zero_point = 127, 127
in_dtype = 'uint8'
out_dtype = 'int32' if not requantize_output else 'uint8'
units = 3
quantized_data_np = np.array([129, 131, 133, 135, 137, 139, 141, 143, 109, 107,
129, 131, 133, 135, 137, 139, 141, 111, 145, 107]) \
.astype(in_dtype) \
.reshape(input_shape)
quantized_kernel_np = np.array([129, 131, 133, 135, 137, 139, 141, 143, 145, 147,
129, 131, 133, 135, 137, 139, 141, 143, 145, 147,
129, 131, 133, 135, 137, 139, 141, 143, 145, 147]) \
.astype(in_dtype) \
.reshape(kernel_shape)
bias = np.array([4, 8, 12]).astype(out_dtype).reshape((units, )) if use_bias else None
requant_params = make_requantize_params(0.25, 1.0, 127, 'uint8') if requantize_output else None
if requantize_output:
assert use_bias
output = np.array([151, 152, 153, 185, 186, 187])
elif use_bias:
output = np.array([96, 100, 104, 232, 236, 240 ])
else:
output = np.array([92, 92, 92, 228, 228, 228 ])
output = output.astype(out_dtype).reshape(output_shape)
return make_configuration(quantized_data=quantized_data_np,
quantized_kernel=quantized_kernel_np,
dtype=in_dtype,
input_shape=input_shape,
kernel_shape=kernel_shape,
input_zero_point=input_zero_point,
kernel_zero_point=kernel_zero_point,
units=units,
output=output,
bias=bias,
requantize=requant_params)
def make_int_configuration(use_bias=False, requantize_output=False):
input_shape, kernel_shape, output_shape = (2, 10), (3,10), (2, 3)
input_zero_point, kernel_zero_point = -1, -1
in_dtype = 'int8'
out_dtype = 'int32' if not requantize_output else 'int8'
units = 3
quantized_data_np = np.array([1, 3, 5, 7, 9, 11, 13, 15, -19, -21,
1, 3, 5, 7, 9, 11, 13, -17, 17, -21]) \
.astype(in_dtype) \
.reshape(input_shape)
quantized_kernel_np = np.array([1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
1, 3, 5, 7, 9, 11, 13, 15, 17, 19]) \
.astype(in_dtype) \
.reshape(kernel_shape)
bias = np.array([4, 8, 12]).astype(out_dtype).reshape((units, )) if use_bias else None
requant_params = make_requantize_params(0.25, 1.0, -1, 'int8') if requantize_output else None
if requantize_output:
assert use_bias
output = np.array([23, 24, 25, 57, 58, 59])
elif use_bias:
output = np.array([96, 100, 104, 232, 236, 240 ])
else:
output = np.array([92, 92, 92, 228, 228, 228 ])
output = output.astype(out_dtype).reshape(output_shape)
return make_configuration(quantized_data=quantized_data_np,
quantized_kernel=quantized_kernel_np,
dtype=in_dtype,
input_shape=input_shape,
kernel_shape=kernel_shape,
input_zero_point=input_zero_point,
kernel_zero_point=kernel_zero_point,
units=units,
output=output,
bias=bias,
requantize=requant_params)
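The hard-coded expected outputs above follow directly from the same lowering: shift both tensors by their zero points and take an ordinary int32 matrix product. A quick NumPy check (not part of the test) for the int8 configuration without bias:

import numpy as np

data = np.array([[1, 3, 5, 7, 9, 11, 13, 15, -19, -21],
                 [1, 3, 5, 7, 9, 11, 13, -17, 17, -21]], dtype=np.int32)
kernel = np.array([[1, 3, 5, 7, 9, 11, 13, 15, 17, 19]] * 3, dtype=np.int32)
zp = -1  # both input_zero_point and kernel_zero_point in this configuration

out = (data - zp) @ (kernel - zp).T
print(out)
# [[ 92  92  92]
#  [228 228 228]]  -> matches the 'output' array for the no-bias case;
#                     adding the bias [4, 8, 12] gives [96, 100, 104] and [232, 236, 240].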
def qnn_dense_driver(test_configuration):
in_dtype = test_configuration['dtype']
out_dtype = test_configuration['out_dtype']
quantized_data_name = "quantized_data"
quantized_kernel_name = "quantized_kernel"
expected_out_dtype = test_configuration['out_dtype']
bias_name = 'bias'
quantized_data = relay.var(quantized_data_name,
shape=test_configuration['input_shape'],
dtype=in_dtype)
quantized_kernel = relay.var(quantized_kernel_name,
shape=test_configuration['kernel_shape'],
dtype=in_dtype)
mod = relay.qnn.op.quantized_dense(
quantized_data,
quantized_kernel,
test_configuration['input_zero_point'],
test_configuration['kernel_zero_point'],
test_configuration['units'])
if test_configuration[bias_name] is not None:
bias = relay.var(bias_name,
shape=test_configuration['bias'].shape,
dtype=out_dtype)
mod = relay.nn.bias_add(mod, bias)
if test_configuration['requantize'] is not None:
requantize_config = test_configuration['requantize']
mod = relay.qnn.op.requantize(
mod,
input_scale=requantize_config['input_scale'],
input_zero_point=0,
output_scale=requantize_config['output_scale'],
output_zero_point=requantize_config['output_zero_point'],
out_dtype=requantize_config['out_dtype'])
expected_out_dtype = requantize_config['out_dtype']
mod = relay.Function(relay.analysis.free_vars(mod), mod)
mod = relay.Module.from_expr(mod)
mod = relay.qnn.transform.CanonicalizeOps()(mod)
with relay.build_config(opt_level=2):
graph, lib, params = relay.build(mod, "llvm", params=None)
mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
mod.set_input(quantized_data_name, test_configuration[quantized_data_name])
mod.set_input(quantized_kernel_name, test_configuration[quantized_kernel_name])
if test_configuration[bias_name] is not None:
mod.set_input(bias_name, test_configuration[bias_name])
mod.set_input(**params)
mod.run()
res = mod.get_output(0).asnumpy()
np.testing.assert_equal(res, test_configuration['output'])
assert res.dtype == expected_out_dtype
def test_qnn_dense_without_bias():
uint32_output_without_bias_params = \
make_uint_configuration(use_bias=False)
int32_output_without_bias_params = \
make_int_configuration(use_bias=False)
qnn_dense_driver(uint32_output_without_bias_params)
qnn_dense_driver(int32_output_without_bias_params)
def test_qnn_dense_with_bias():
uint32_output_with_bias_params = \
make_uint_configuration(use_bias=True)
int32_output_with_bias_params = \
make_int_configuration(use_bias=True)
qnn_dense_driver(uint32_output_with_bias_params)
qnn_dense_driver(int32_output_with_bias_params)
def test_qnn_dense_with_requantized_output():
uint8_requantized_output_with_bias_params = \
make_uint_configuration(use_bias=True, requantize_output=True)
int8_requantized_output_with_bias_params = \
make_int_configuration(use_bias=True, requantize_output=True)
qnn_dense_driver(uint8_requantized_output_with_bias_params)
qnn_dense_driver(int8_requantized_output_with_bias_params)
if __name__ == "__main__":
test_qnn_dense_without_bias()
test_qnn_dense_with_bias()
test_qnn_dense_with_requantized_output()