Unverified Commit dada6761 by shoubhik Committed by GitHub

Adding support for QNN subtract op (#5153)

* Adding support for QNN subtract op

* Fixing typo.

* Fixing typo.

* Fixing lint.

* Addressing review comments.

* Renaming variables as per convention and renaming QnnBinaryOpTypes -> QnnBinaryOpType.

* Renaming QnnBinaryOpType to QnnBinaryOpTensorType, which now takes the index of the tensor type to extract, making the code more readable.

* Fixing lint.

* Moving common code to macro.

* Fixing alignment.

* Fixing typo.

* Fixing lint.

* Renaming method to pass CI.
parent f4286cc7
@@ -310,9 +310,6 @@ def add(lhs,
rhs : relay.Expr
The right hand side quantized input data.
-    lhs_scale: float
-        The scale of the lhs quantized expr.
lhs_scale: relay.Expr
The scale of the lhs quantized expr.
@@ -436,3 +433,51 @@ def mul(lhs, rhs, lhs_scale, lhs_zero_point, rhs_scale, rhs_zero_point,
lhs_scale, lhs_zero_point,
rhs_scale, rhs_zero_point,
output_scale, output_zero_point)
def subtract(lhs,
rhs,
lhs_scale,
lhs_zero_point,
rhs_scale,
rhs_zero_point,
output_scale,
output_zero_point):
"""Quantized subtraction with numpy-style broadcasting.
Parameters
----------
lhs : relay.Expr
The left hand side quantized input data.
rhs : relay.Expr
The right hand side quantized input data.
lhs_scale: relay.Expr
The scale of the lhs quantized expr.
lhs_zero_point: relay.Expr
The zero point of lhs quantized expr.
rhs_scale: relay.Expr
The scale of the rhs quantized expr.
rhs_zero_point: relay.Expr
The zero point of rhs quantized expr.
output_scale: relay.Expr
The scale of the output quantized expr.
output_zero_point: relay.Expr
The zero point of output quantized expr.
Returns
-------
result : relay.Expr
The computed result.
"""
return _make.subtract(lhs, rhs,
lhs_scale, lhs_zero_point,
rhs_scale, rhs_zero_point,
output_scale, output_zero_point)
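For reference, a minimal Python-level call to the new op might look like the sketch below; the shapes and quantization parameters are illustrative only, mirroring the values used in the tests further down:

import tvm
from tvm import relay

# Quantized uint8 inputs; qnn params are passed as Relay constants.
x = relay.var("x", shape=(1, 4), dtype="uint8")
y = relay.var("y", shape=(1, 4), dtype="uint8")
z = relay.qnn.op.subtract(lhs=x, rhs=y,
                          lhs_scale=relay.const(0.00784314, 'float32'),
                          lhs_zero_point=relay.const(127, 'int32'),
                          rhs_scale=relay.const(0.00784314, 'float32'),
                          rhs_zero_point=relay.const(127, 'int32'),
                          output_scale=relay.const(0.00784314, 'float32'),
                          output_zero_point=relay.const(127, 'int32'))
func = relay.Function([x, y], z)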
@@ -23,59 +23,27 @@
*/
#include <tvm/relay/analysis.h>
#include <tvm/relay/op_attr_types.h>
#include <tvm/relay/qnn/attrs.h>
#include "../../transforms/pattern_util.h"
#include "../../transforms/infer_layout_util.h"
#include "../util.h"
#include "op_common.h"
namespace tvm {
namespace relay {
namespace qnn {
- /*! \brief Infer layout for QNN binary broadcast operators */
- Array<Array<Layout> > QnnBinaryBroadcastLayout(const Attrs& attrs,
-                                                const Array<Layout>& new_in_layouts,
-                                                const Array<Layout>& old_in_layouts,
-                                                const Array<tvm::relay::Type>& old_in_types) {
-   // Use Relay Binary Broadcast Infer correct layout.
-   auto layouts = BinaryBroadcastLayout(attrs, new_in_layouts, old_in_layouts, old_in_types);
-   // Fill the layouts of remaining input tensors - scales and zero points. The layouts of these
-   // tensors can be treated as C.
-   Layout channel_layout = Layout("C");
-   Array<Layout> input_layouts = {layouts[0][0], layouts[0][1], channel_layout, channel_layout,
-                                  channel_layout, channel_layout, channel_layout, channel_layout};
-   Array<Layout> output_layouts = layouts[1];
-   return {input_layouts, output_layouts};
- }
/*
* \brief Canonicalizes the QNN add op.
- * \param attrs The QNN concatenate attrs.
+ * \param attrs The empty attribute.
* \param new_args The new mutated args to the call node.
* \param arg_types The types of input and output.
* \return The sequence of Relay ops for add op.
*/
Expr QnnAddCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
const Array<tvm::relay::Type>& arg_types) {
-  // Get the attrs.
-  CHECK_EQ(new_args.size(), 8);
-  auto& lhs = new_args[0];
-  auto& rhs = new_args[1];
-  auto& lhs_scale = new_args[2];
-  auto& lhs_zero_point = new_args[3];
-  auto& rhs_scale = new_args[4];
-  auto& rhs_zero_point = new_args[5];
-  auto& output_scale = new_args[6];
-  auto& output_zero_point = new_args[7];
+  // Get the args.
+  QnnBinaryOpArguments args(new_args);
// Get the input dtype and shape.
-  CHECK_EQ(arg_types.size(), 9);
-  auto tensor_type = arg_types[0].as<TensorTypeNode>();
-  CHECK(tensor_type != nullptr);
-  auto input_dtype = tensor_type->dtype;
-  auto input_shape = tensor_type->shape;
+  QnnBinaryOpTensorType input_type(arg_types, 0);
// FIXME (anijain2305) - The lowering can be further optimized. Instead of inserting requantize at
// the start, we can insert requantize at the end if both input tensors have the same qnn params. In
@@ -97,47 +65,36 @@ Expr QnnAddCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
// Q_c = Q_a' + Q_b' - zp_c
// The add op is done in int32 precision.
-  // Requantize LHS if necessary.
-  auto requantized_lhs = lhs;
-  if (!IsEqualScalar(lhs_scale, output_scale) ||
-      !IsEqualScalar(lhs_zero_point, output_zero_point)) {
-    requantized_lhs = Requantize(lhs, input_shape, lhs_scale, lhs_zero_point, output_scale,
-                                 output_zero_point, DataType::Int(32));
-  } else {
-    requantized_lhs = Cast(requantized_lhs, DataType::Int(32));
-  }
-  // Requantize RHS if necessary.
-  auto requantized_rhs = rhs;
-  if (!IsEqualScalar(rhs_scale, output_scale) ||
-      !IsEqualScalar(rhs_zero_point, output_zero_point)) {
-    requantized_rhs = Requantize(rhs, input_shape, rhs_scale, rhs_zero_point, output_scale,
-                                 output_zero_point, DataType::Int(32));
-  } else {
-    requantized_rhs = Cast(requantized_rhs, DataType::Int(32));
-  }
+  // Requantize LHS if necessary. Computes Q_a'
+  auto requantized_lhs = RequantizeOrUpcast(args.lhs, args.lhs_scale,
+                                            args.lhs_zero_point,
+                                            args.output_scale, args.output_zero_point,
+                                            input_type.shape);
+  // Requantize RHS if necessary. Computes Q_b'
+  auto requantized_rhs = RequantizeOrUpcast(args.rhs, args.rhs_scale,
+                                            args.rhs_zero_point,
+                                            args.output_scale, args.output_zero_point,
+                                            input_type.shape);
+  // Computes Q_a' + Q_b'
auto output = Add(requantized_lhs, requantized_rhs);
-  // Subtract zero point.
+  // Subtract zero point. Computes (Q_a' + Q_b') - zp_c
  auto zero_scalar = MakeConstantScalar(DataType::Int(32), 0);
-  if (!IsEqualScalar(output_zero_point, zero_scalar)) {
-    output = Subtract(output, output_zero_point);
+  if (!IsEqualScalar(args.output_zero_point, zero_scalar)) {
+    output = Subtract(output, args.output_zero_point);
}
// Go back to lower precision.
-  auto q_min = GetQmin(input_dtype);
-  auto q_max = GetQmax(input_dtype);
-  output = Clip(output, q_min, q_max);
-  return Cast(output, input_dtype);
+  return ConvertDtype(output, input_type.dtype);
}
// QNN Addition operator.
QNN_REGISTER_BINARY_OP("add")
.describe("Elementwise add with with broadcasting for quantized tensors.")
.set_support_level(11)
.set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnAddCanonicalize)
.set_attr<FInferCorrectLayout>("FInferCorrectLayout", QnnBinaryBroadcastLayout);
.set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnAddCanonicalize);
} // namespace qnn
} // namespace relay
......
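The lowering above reduces to Q_c = clip(Q_a' + Q_b' - zp_c, qmin, qmax). A rough numpy sketch of the same arithmetic, assuming a simplified float-based requantize (the helper names here are hypothetical, and TVM's Requantize uses fixed-point math, so rounding can differ at the boundaries):

import numpy as np

def requantize_approx(q, scale, zp, out_scale, out_zp):
    # Re-encode real_value = scale * (q - zp) with the output qnn params.
    real = scale * (q.astype(np.int32) - zp)
    return np.round(real / out_scale).astype(np.int32) + out_zp

def qnn_add_ref(q_a, s_a, zp_a, q_b, s_b, zp_b, s_c, zp_c, qmin=0, qmax=255):
    q_a2 = requantize_approx(q_a, s_a, zp_a, s_c, zp_c)   # Q_a'
    q_b2 = requantize_approx(q_b, s_b, zp_b, s_c, zp_c)   # Q_b'
    out = q_a2 + q_b2 - zp_c                              # (Q_a' + Q_b') - zp_c
    return np.clip(out, qmin, qmax).astype(np.uint8)      # back to lower precision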
@@ -42,22 +42,13 @@ namespace qnn {
Expr QnnMulCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
const Array<tvm::relay::Type>& arg_types) {
-  // Get the attrs.
-  CHECK_EQ(new_args.size(), 8);
-  auto& lhs = new_args[0];
-  auto& rhs = new_args[1];
-  auto& lhs_scale = new_args[2];
-  auto& lhs_zero_point = new_args[3];
-  auto& rhs_scale = new_args[4];
-  auto& rhs_zero_point = new_args[5];
-  auto& output_scale = new_args[6];
-  auto& output_zero_point = new_args[7];
+  QnnBinaryOpArguments args(new_args);
// Get the input dtype and shape.
-  CHECK_EQ(arg_types.size(), 9);
-  auto tensor_type = arg_types[0].as<TensorTypeNode>();
-  CHECK(tensor_type != nullptr);
-  auto input_dtype = tensor_type->dtype;
-  auto input_shape = tensor_type->shape;
+  QnnBinaryOpTensorType input_type(arg_types, 0);
+  // data types
+  const auto int32_dtype = DataType::Int(32);
+  const auto float32_dtype = DataType::Float(32);
/*
A tensor multiplication c = a * b can be written in terms of respective
@@ -71,31 +62,35 @@ Expr QnnMulCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
which is essentially a requantization of tensor Q' into tensor Q_c.
*/
-  auto lhs_shifted = Cast(lhs, DataType::Int(32));
-  auto rhs_shifted = Cast(rhs, DataType::Int(32));
+  auto lhs_shifted = Cast(args.lhs, int32_dtype);
+  auto rhs_shifted = Cast(args.rhs, int32_dtype);
-  auto zero_scalar = MakeConstantScalar(DataType::Int(32), 0);
-  if (!IsEqualScalar(lhs_zero_point, zero_scalar)) {
-    lhs_shifted = Subtract(lhs_shifted, lhs_zero_point);
+  auto zero_scalar = MakeConstantScalar(int32_dtype, 0);
+  if (!IsEqualScalar(args.lhs_zero_point, zero_scalar)) {
+    lhs_shifted = Subtract(lhs_shifted, args.lhs_zero_point);
  }
-  if (!IsEqualScalar(rhs_zero_point, zero_scalar)) {
-    rhs_shifted = Subtract(rhs_shifted, rhs_zero_point);
+  if (!IsEqualScalar(args.rhs_zero_point, zero_scalar)) {
+    rhs_shifted = Subtract(rhs_shifted, args.rhs_zero_point);
}
// Create a new tensor Q'
auto output = Multiply(lhs_shifted, rhs_shifted);
// Get the adjusted new scale and zero points.
float lhs_scale_float = GetScalarFromConstant<float>(lhs_scale);
float rhs_scale_float = GetScalarFromConstant<float>(rhs_scale);
float lhs_scale_float = GetScalarFromConstant<float>(args.lhs_scale);
float rhs_scale_float = GetScalarFromConstant<float>(args.rhs_scale);
float new_scale_float = lhs_scale_float * rhs_scale_float;
auto new_input_scale = MakeConstantScalar(DataType::Float(32), new_scale_float);
auto new_input_scale = MakeConstantScalar(float32_dtype, new_scale_float);
auto new_input_zero_point = zero_scalar;
// Requantize to get Q_c
-  output = Requantize(output, input_shape, new_input_scale, new_input_zero_point, output_scale,
-                      output_zero_point, input_dtype);
+  output = Requantize(output, input_type.shape,
+                      new_input_scale,
+                      new_input_zero_point,
+                      args.output_scale,
+                      args.output_zero_point,
+                      input_type.dtype);
return output;
}
......
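For multiply, the zero-centered product (Q_a - zp_a) * (Q_b - zp_b) carries scale s_a * s_b and zero point 0, so a single requantize at the end maps it to the output params. A hedged numpy sketch under the same simplified-rounding assumption as above:

import numpy as np

def qnn_mul_ref(q_a, s_a, zp_a, q_b, s_b, zp_b, s_c, zp_c, qmin=0, qmax=255):
    # Q' = (Q_a - zp_a) * (Q_b - zp_b), computed in int32.
    q_prime = (q_a.astype(np.int32) - zp_a) * (q_b.astype(np.int32) - zp_b)
    # Q' has scale s_a * s_b and zero point 0; requantize to the output params.
    out = np.round(s_a * s_b * q_prime / s_c).astype(np.int32) + zp_c
    return np.clip(out, qmin, qmax).astype(np.uint8)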
@@ -30,14 +30,152 @@
#include <tvm/relay/qnn/attrs.h>
#include <vector>
#include "../../op/type_relations.h"
#include "../../transforms/infer_layout_util.h"
#include "../util.h"
namespace tvm {
namespace relay {
namespace qnn {
static inline bool QnnBroadcastRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
/*
* Number of inputs for the Qnn binary operators.
* Refer to the QNN_REGISTER_BINARY_OP macro to see
* which operators these are.
*/
static constexpr int kNumQnnBinaryOpInputs = 8;
/*
* Number of expected arg types.
*/
static constexpr int kNumQnnBinaryOpArgTypes = 9;
/*
* \brief Simple struct to organize the inputs to the Qnn
* binary operators. The main reason to have a struct
* is to be able to perform the common checks in a
* central location.
*/
struct QnnBinaryOpArguments {
Expr lhs;
Expr rhs;
Expr lhs_scale;
Expr lhs_zero_point;
Expr rhs_scale;
Expr rhs_zero_point;
Expr output_scale;
Expr output_zero_point;
explicit QnnBinaryOpArguments(const Array<Expr>& new_args) {
CHECK_EQ(new_args.size(), kNumQnnBinaryOpInputs);
int idx = 0;
lhs = new_args[idx++];
rhs = new_args[idx++];
lhs_scale = new_args[idx++];
lhs_zero_point = new_args[idx++];
rhs_scale = new_args[idx++];
rhs_zero_point = new_args[idx++];
output_scale = new_args[idx++];
output_zero_point = new_args[idx++];
CHECK_EQ(idx, kNumQnnBinaryOpInputs);
}
};
/*
* \brief Simple structure to hold the input tensor's dtype
* and shape. This structure provides a common point for
* all the validation checks for Qnn binary operators.
*/
struct QnnBinaryOpTensorType {
DataType dtype;
Array <PrimExpr> shape;
explicit QnnBinaryOpTensorType(const Array<tvm::relay::Type>& arg_types,
const int32_t arg_idx) {
CHECK_EQ(arg_types.size(), kNumQnnBinaryOpArgTypes);
auto tensor_type = arg_types[arg_idx].as<TensorTypeNode>();
CHECK(tensor_type != nullptr);
dtype = tensor_type->dtype;
shape = tensor_type->shape;
}
};
/*
* \brief Converts the given expression to the target dtype,
* clipping to the target type's range. This is mainly used
* for converting computations done in Int32 to lower
* precision Int8 or UInt8.
* \param expr The expression whose dtype needs conversion.
* \param target_dtype The dtype of the target expression.
* \return New expression with the target dtype and possibly
* lower precision.
*/
inline Expr ConvertDtype(const Expr& expr,
const DataType& target_dtype) {
auto q_min = GetQmin(target_dtype);
auto q_max = GetQmax(target_dtype);
auto output = Clip(expr, q_min, q_max);
return Cast(output, target_dtype);
}
/*
* \brief Requantizes the given expression if its scale and
* zero point do not both match the target scale and zero
* point. This is mainly needed for requantizing the input
* tensors with the output tensor's scale and zero point to
* ease the computation of the final quantized tensor.
* \param expr The expression on which the check needs to be performed.
* \param expr_scale The scale of the expression.
* \param expr_zero_point The zero point of the expression.
* \param target_scale The scale of the output tensor.
* \param target_zero_point The zero point of the output tensor.
* \param expr_shape The shape of the input expression.
* \return New expression requantized to the target scale and zero point
* if the expression's scale or zero point differs; otherwise the given
* expression simply cast to Int32, as no requantization is needed in
* this case.
*/
inline Expr RequantizeOrUpcast(const Expr& expr,
const Expr& expr_scale,
const Expr& expr_zero_point,
const Expr& target_scale,
const Expr& target_zero_point,
const Array<PrimExpr>& expr_shape,
const DataType& target_dtype = DataType::Int(32)) {
auto result = expr;
if (!IsEqualScalar(expr_scale, target_scale) ||
!IsEqualScalar(expr_zero_point, target_zero_point)) {
result = Requantize(expr, expr_shape, expr_scale, expr_zero_point,
target_scale, target_zero_point, target_dtype);
} else {
result = Cast(result, target_dtype);
}
return result;
}
/*! \brief Infer layout for QNN binary broadcast operators */
inline Array<Array<Layout> > QnnBinaryBroadcastLayout(
const Attrs& attrs,
const Array<Layout>& new_in_layouts,
const Array<Layout>& old_in_layouts,
const Array<tvm::relay::Type>& old_in_types) {
// Use Relay Binary Broadcast Infer correct layout.
auto layouts = BinaryBroadcastLayout(attrs, new_in_layouts, old_in_layouts, old_in_types);
// Fill the layouts of remaining input tensors - scales and zero points. The layouts of these
// tensors can be treated as C.
Layout channel_layout = Layout("C");
Array<Layout> input_layouts = {layouts[0][0], layouts[0][1], channel_layout, channel_layout,
channel_layout, channel_layout, channel_layout, channel_layout};
Array<Layout> output_layouts = layouts[1];
return {input_layouts, output_layouts};
}
static inline bool QnnBroadcastRel(const Array<Type>& types,
int num_inputs,
const Attrs& attrs,
const TypeReporter& reporter) {
CHECK_EQ(types.size(), 9);
CHECK_EQ(types.size(), kNumQnnBinaryOpArgTypes);
// Check the scale and zero point types
CHECK(IsScalarType(types[2], DataType::Float(32))); // lhs_scale
@@ -74,16 +212,17 @@ static inline bool QnnBroadcastRel(const Array<Type>& types, int num_inputs, con
output_scale, output_zero_point}, Attrs(), {}); \
}); \
RELAY_REGISTER_OP("qnn." OpName) \
-    .set_num_inputs(8)                                                               \
-    .add_argument("lhs", "Tensor", "The left hand side quantized tensor.")           \
-    .add_argument("rhs", "Tensor", "The right hand side quantized tensor.")          \
-    .add_argument("lhs_scale", "Tensor", "The scale of the lhs tensor.")             \
-    .add_argument("lhs_zero_point", "Tensor", "The zero_point of the lhs tensor.")   \
-    .add_argument("rhs_scale", "Tensor", "The scale of the rhs tensor.")             \
-    .add_argument("rhs_zero_point", "Tensor", "The zero_point of the rhs tensor.")   \
-    .add_argument("output_scale", "Tensor", "The scale of the output tensor.")       \
-    .add_argument("output_zero_point", "Tensor", "The zero_point of the output tensor.") \
-    .add_type_rel("QnnBroadcast", QnnBroadcastRel)
+    .set_num_inputs(kNumQnnBinaryOpInputs)                                           \
+    .add_argument("lhs", "Tensor", "The left hand side quantized tensor.")           \
+    .add_argument("rhs", "Tensor", "The right hand side quantized tensor.")          \
+    .add_argument("lhs_scale", "Tensor", "The scale of the lhs tensor.")             \
+    .add_argument("lhs_zero_point", "Tensor", "The zero_point of the lhs tensor.")   \
+    .add_argument("rhs_scale", "Tensor", "The scale of the rhs tensor.")             \
+    .add_argument("rhs_zero_point", "Tensor", "The zero_point of the rhs tensor.")   \
+    .add_argument("output_scale", "Tensor", "The scale of the output tensor.")       \
+    .add_argument("output_zero_point", "Tensor", "The zero_point of the output tensor.") \
+    .add_type_rel("QnnBroadcast", QnnBroadcastRel)                                   \
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", QnnBinaryBroadcastLayout)
} // namespace qnn
} // namespace relay
......
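In rough Python terms, the two helpers added above behave like the following stand-ins (hypothetical names and simplified float rounding, not TVM API):

import numpy as np

def convert_dtype_ref(x, qmin, qmax, target_dtype):
    # Mirrors ConvertDtype: clip to the target type's range, then cast.
    return np.clip(x, qmin, qmax).astype(target_dtype)

def requantize_or_upcast_ref(q, scale, zp, target_scale, target_zp):
    # Mirrors RequantizeOrUpcast: requantize only when the qnn params differ;
    # otherwise just upcast to int32 so the later arithmetic cannot overflow.
    if scale == target_scale and zp == target_zp:
        return q.astype(np.int32)
    real = scale * (q.astype(np.int32) - zp)
    return np.round(real / target_scale).astype(np.int32) + target_zp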
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file src/relay/qnn/op/subtract.cc
* \brief QNN subtract operator.
*/
#include <tvm/relay/analysis.h>
#include <tvm/relay/op_attr_types.h>
#include "op_common.h"
namespace tvm {
namespace relay {
namespace qnn {
/*
* \brief Canonicalizes the QNN subtract op.
* \param attrs The empty attribute.
* \param new_args The new mutated args to the call node.
* \param arg_types The types of input and output.
* \return The sequence of Relay ops for subtract op.
*/
Expr QnnSubtractCanonicalize(const Attrs& attrs,
const Array<Expr>& new_args,
const Array<tvm::relay::Type>& arg_types) {
// Get the args.
QnnBinaryOpArguments args(new_args);
// Get the input dtype and shape.
QnnBinaryOpTensorType input_type(arg_types, 0);
// TODO(shoubhik) - The lowering can be further optimized. Instead of inserting requantize at
// the start, we can insert requantize at the end if both input tensors have the same qnn params.
// In that case, we can first subtract the tensors, add the zero point, and requantize at the end.
// This can be done in the future.
// Since the input qnn params can be different from the output qnn params, we first requantize
// the input tensors to the output qnn params. Then we call relay.subtract on the requantized
// inputs. This subtraction subtracts the output zero point one extra time, so we add it back.
// The whole process can be represented using the following equations
//
// scale_c * (Q_c - zp_c) = scale_a * (Q_a - zp_a) - scale_b * (Q_b - zp_b)
//
// After requantizing Q_a and Q_b, equation becomes,
// scale_c * (Q_c - zp_c) = scale_c * (Q_a' - zp_c) - scale_c * (Q_b' - zp_c)
// scale_c * (Q_c - zp_c) = scale_c * (Q_a' - Q_b')
//
// Comparing the LHS and RHS, it results in
// Q_c = Q_a' - Q_b' + zp_c
// The subtract op is done in int32 precision.
// Requantize LHS if necessary. Computes Q_a'
auto requantized_lhs = RequantizeOrUpcast(args.lhs, args.lhs_scale,
args.lhs_zero_point,
args.output_scale,
args.output_zero_point,
input_type.shape);
// Requantize RHS if necessary. Computes Q_b'
auto requantized_rhs = RequantizeOrUpcast(args.rhs, args.rhs_scale,
args.rhs_zero_point,
args.output_scale,
args.output_zero_point,
input_type.shape);
// Computes Q_a' - Q_b'
auto output = Subtract(requantized_lhs, requantized_rhs);
// Add zero point. Computes (Q_a' - Q_b') + zp_c
auto zero_scalar = MakeConstantScalar(DataType::Int(32), 0);
if (!IsEqualScalar(args.output_zero_point, zero_scalar)) {
output = Add(output, args.output_zero_point);
}
// Go back to lower precision.
return ConvertDtype(output, input_type.dtype);
}
// QNN Subtraction operator.
QNN_REGISTER_BINARY_OP("subtract")
.describe("Elementwise subtract with with broadcasting for quantized tensors.")
.set_support_level(11)
.set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnSubtractCanonicalize);
} // namespace qnn
} // namespace relay
} // namespace tvm
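As a quick sanity check of the same-params case: when both inputs already carry the output scale and zero point, the lowering collapses to Q_c = clip(Q_a - Q_b + zp_c). The first vector of the same-io-params test further below confirms this:

import numpy as np

q_a = np.array((140, 153, 165, 178), dtype=np.int32)
q_b = np.array((204, 178, 165, 140), dtype=np.int32)
zp_c = 127
print(np.clip(q_a - q_b + zp_c, 0, 255).astype(np.uint8))
# [ 63 102 127 165] -- the golden output used in the tests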
@@ -84,7 +84,7 @@ static inline Expr Requantize(const Expr& data, const Array<IndexExpr>& input_sh
attrs->rounding = std::move(rounding);
attrs->out_dtype = std::move(out_dtype);
return RequantizeLower(data, input_scale, input_zero_point, output_scale, output_zero_point,
attrs.operator->(), input_shape, out_dtype);
attrs.operator->(), input_shape, attrs->out_dtype);
}
static inline int64_t get_const_int(const tvm::PrimExpr& x) {
......
@@ -16,11 +16,9 @@
# under the License.
import tvm
from tvm import te
import numpy as np
from tvm import relay
from tvm.contrib import graph_runtime
import topi.testing
def test_tflite_same_io_qnn_params():
data_dtype = 'uint8'
@@ -40,15 +38,15 @@ def test_tflite_same_io_qnn_params():
mod = relay.qnn.transform.CanonicalizeOps()(mod)
func = mod["main"]
-    x_datas = [np.array((140, 153, 165, 178)).reshape((1,4)),
-               np.array((25, 153, 178, 216)).reshape((1,4)),
-               np.array((25, 153, 216, 165)).reshape((1,4))]
-    y_datas = [np.array((204, 178, 165, 140)).reshape((1,4)),
-               np.array((204, 178, 191, 25)).reshape((1,4)),
-               np.array((204, 178, 25, 191)).reshape((1,4))]
-    golden_outputs = [np.array((217,204,203,191)).reshape((1, 4)),
-                      np.array((102, 204, 242, 114)).reshape((1,4)),
-                      np.array((102, 204, 114, 229)).reshape((1,4))]
+    x_datas = [np.array((140, 153, 165, 178)).reshape((1, 4)),
+               np.array((25, 153, 178, 216)).reshape((1, 4)),
+               np.array((25, 153, 216, 165)).reshape((1, 4))]
+    y_datas = [np.array((204, 178, 165, 140)).reshape((1, 4)),
+               np.array((204, 178, 191, 25)).reshape((1, 4)),
+               np.array((204, 178, 25, 191)).reshape((1, 4))]
+    golden_outputs = [np.array((217, 204, 203, 191)).reshape((1, 4)),
+                      np.array((102, 204, 242, 114)).reshape((1, 4)),
+                      np.array((102, 204, 114, 229)).reshape((1, 4))]
for i in range(0, 3):
x_data = x_datas[i]
@@ -78,15 +76,15 @@ def test_tflite_different_io_qnn_params():
mod = relay.qnn.transform.CanonicalizeOps()(mod)
func = mod["main"]
-    x_datas = [np.array((76, 140, 153, 172)).reshape((1,4)),
-               np.array((133, 140, 146, 153)).reshape((1,4)),
-               np.array((76, 140, 172, 146)).reshape((1,4))]
-    y_datas = [np.array((136, 119, 128, 17)).reshape((1,4)),
-               np.array((136, 119, 111, 94)).reshape((1,4)),
-               np.array((136, 119, 17, 128)).reshape((1,4))]
+    x_datas = [np.array((76, 140, 153, 172)).reshape((1, 4)),
+               np.array((133, 140, 146, 153)).reshape((1, 4)),
+               np.array((76, 140, 172, 146)).reshape((1, 4))]
+    y_datas = [np.array((136, 119, 128, 17)).reshape((1, 4)),
+               np.array((136, 119, 111, 94)).reshape((1, 4)),
+               np.array((136, 119, 17, 128)).reshape((1, 4))]
     golden_outputs = [np.array((120, 154, 167, 124)).reshape((1, 4)),
-                      np.array((158, 154, 154, 150)).reshape((1,4)),
-                      np.array((120, 154, 124, 163)).reshape((1,4))]
+                      np.array((158, 154, 154, 150)).reshape((1, 4)),
+                      np.array((120, 154, 124, 163)).reshape((1, 4))]
for i in range(0, 3):
x_data = x_datas[i]
@@ -116,8 +114,8 @@ def test_saturation():
mod = relay.qnn.transform.CanonicalizeOps()(mod)
func = mod["main"]
-    x_data = np.array((255, 1, 1, 0)).reshape((1,4))
-    y_data = np.array((255, 255, 128, 0)).reshape((1,4))
+    x_data = np.array((255, 1, 1, 0)).reshape((1, 4))
+    y_data = np.array((255, 255, 128, 0)).reshape((1, 4))
golden_output = np.array((255, 255, 129, 0)).reshape((1, 4))
intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
@@ -138,8 +136,8 @@ def test_saturation():
mod = relay.qnn.transform.CanonicalizeOps()(mod)
func = mod["main"]
-    x_data = np.array((255, 1, 1, 0)).reshape((1,4))
-    y_data = np.array((255, 255, 127, 0)).reshape((1,4))
+    x_data = np.array((255, 1, 1, 0)).reshape((1, 4))
+    y_data = np.array((255, 255, 127, 0)).reshape((1, 4))
golden_output = np.array((255, 129, 65, 0)).reshape((1, 4))
intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
@@ -160,8 +158,8 @@ def test_saturation():
mod = relay.qnn.transform.CanonicalizeOps()(mod)
func = mod["main"]
-    x_data = np.array((255, 1, 1, 0)).reshape((1,4))
-    y_data = np.array((255, 255, 127, 0)).reshape((1,4))
+    x_data = np.array((255, 1, 1, 0)).reshape((1, 4))
+    y_data = np.array((255, 255, 127, 0)).reshape((1, 4))
golden_output = np.array((255, 129, 65, 0)).reshape((1, 4))
intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
@@ -182,8 +180,8 @@ def test_saturation():
mod = relay.qnn.transform.CanonicalizeOps()(mod)
func = mod["main"]
-    x_data = np.array((255, 0, 1, 0)).reshape((1,4))
-    y_data = np.array((0, 128, 64, 0)).reshape((1,4))
+    x_data = np.array((255, 0, 1, 0)).reshape((1, 4))
+    y_data = np.array((0, 128, 64, 0)).reshape((1, 4))
golden_output = np.array((255, 255, 132, 0)).reshape((1, 4))
intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
......
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import tvm
import numpy as np
from tvm import relay
def qnn_subtract_driver(x_datas, y_datas, golden_outputs,
scale_and_zp, data_dtype='uint8'):
# all x, y and golden outputs should be of the same length
assert len(x_datas) == len(y_datas)
assert len(y_datas) == len(golden_outputs)
x = relay.var("x", shape=(1, 4), dtype=data_dtype)
y = relay.var("y", shape=(1, 4), dtype=data_dtype)
lhs_scale = relay.const(scale_and_zp['lhs_scale'], 'float32')
lhs_zp = relay.const(scale_and_zp['lhs_zp'], 'int32')
rhs_scale = relay.const(scale_and_zp['rhs_scale'], 'float32')
rhs_zp = relay.const(scale_and_zp['rhs_zp'], 'int32')
output_scale = relay.const(scale_and_zp['output_scale'], 'float32')
output_zp = relay.const(scale_and_zp['output_zp'], 'int32')
z = relay.qnn.op.subtract(lhs=x, rhs=y,
lhs_scale=lhs_scale,
lhs_zero_point=lhs_zp,
rhs_scale=rhs_scale,
rhs_zero_point=rhs_zp,
output_scale=output_scale,
output_zero_point=output_zp)
func = relay.Function([x, y], z)
mod = tvm.IRModule.from_expr(func)
mod = relay.qnn.transform.CanonicalizeOps()(mod)
func = mod["main"]
for i in range(0, len(x_datas)):
x_data = x_datas[i]
y_data = y_datas[i]
golden_output = golden_outputs[i]
intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
op_res = intrp.evaluate(func)(x_data, y_data)
np.testing.assert_equal(op_res.asnumpy(), golden_output)
def test_tflite_same_io_qnn_params():
scale_and_zp = {'lhs_scale': 0.00784314,
'lhs_zp': 127,
'rhs_scale': 0.00784314,
'rhs_zp': 127,
'output_scale': 0.00784314,
'output_zp': 127}
x_datas = [np.array((140, 153, 165, 178)).reshape((1, 4)),
np.array((25, 153, 178, 216)).reshape((1, 4)),
np.array((25, 153, 216, 165)).reshape((1, 4))]
y_datas = [np.array((204, 178, 165, 140)).reshape((1, 4)),
np.array((204, 178, 191, 25)).reshape((1, 4)),
np.array((204, 178, 25, 191)).reshape((1, 4))]
golden_outputs = [np.array((63, 102, 127, 165)).reshape((1, 4)),
np.array((0, 102, 114, 255)).reshape((1, 4)),
np.array((0, 102, 255, 101)).reshape((1, 4))]
qnn_subtract_driver(x_datas, y_datas, golden_outputs, scale_and_zp)
def test_tflite_different_io_qnn_params():
scale_and_zp = {'lhs_scale': 0.0156863,
'lhs_zp': 127,
'rhs_scale': 0.0117647,
'rhs_zp': 85,
'output_scale': 0.0235294,
'output_zp': 128}
x_datas = [np.array((76, 140, 153, 172)).reshape((1, 4)),
np.array((133, 140, 146, 153)).reshape((1, 4)),
np.array((76, 140, 172, 146)).reshape((1, 4))]
y_datas = [np.array((136, 119, 128, 17)).reshape((1, 4)),
np.array((136, 119, 111, 94)).reshape((1, 4)),
np.array((136, 119, 17, 128)).reshape((1, 4))]
golden_outputs = [np.array((68, 120, 123, 192)).reshape((1, 4)),
np.array((106, 120, 128, 140)).reshape((1, 4)),
np.array((68, 120, 192, 119)).reshape((1, 4))]
qnn_subtract_driver(x_datas, y_datas, golden_outputs, scale_and_zp)
def test_saturation():
# Same params
scale_and_zp = {'lhs_scale': 0.125,
'lhs_zp': 0,
'rhs_scale': 0.125,
'rhs_zp': 0,
'output_scale': 0.125,
'output_zp': 0}
x_data = [np.array((255, 1, 1, 0)).reshape((1, 4))]
y_data = [np.array((255, 255, 128, 0)).reshape((1, 4))]
golden_output = [np.array((0, 0, 0, 0)).reshape((1, 4))]
qnn_subtract_driver(x_data, y_data, golden_output, scale_and_zp)
# Same params, different scale
scale_and_zp = {'lhs_scale': 0.125,
'lhs_zp': 0,
'rhs_scale': 0.125,
'rhs_zp': 0,
'output_scale': 0.25,
'output_zp': 0}
x_data = [np.array((255, 1, 200, 0)).reshape((1, 4))]
y_data = [np.array((255, 255, 127, 0)).reshape((1, 4))]
golden_output = [np.array((0, 0, 36, 0)).reshape((1, 4))]
qnn_subtract_driver(x_data, y_data, golden_output, scale_and_zp)
# All params different
scale_and_zp = {'lhs_scale': 0.5,
'lhs_zp': 0,
'rhs_scale': 0.25,
'rhs_zp': 0,
'output_scale': 0.125,
'output_zp': 0}
x_data = [np.array((255, 0, 1, 0)).reshape((1, 4))]
y_data = [np.array((0, 128, 64, 0)).reshape((1, 4))]
golden_output = [np.array((255, 0, 0, 0)).reshape((1, 4))]
qnn_subtract_driver(x_data, y_data, golden_output, scale_and_zp)
if __name__ == '__main__':
test_tflite_same_io_qnn_params()
test_tflite_different_io_qnn_params()
test_saturation()
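The saturation cases can also be checked by hand with the simplified requantize sketched earlier. For the all-params-different case above, the scale ratios are 0.5 / 0.125 = 4 and 0.25 / 0.125 = 2 with all zero points 0, so Q_a' = (1020, 0, 4, 0), Q_b' = (0, 256, 128, 0), and Q_a' - Q_b' = (1020, -256, -124, 0), which clips to the golden (255, 0, 0, 0):

import numpy as np

q_a = np.array((255, 0, 1, 0), dtype=np.int32)
q_b = np.array((0, 128, 64, 0), dtype=np.int32)
q_a2 = q_a * 4          # requantize lhs: 0.5 / 0.125 = 4, zero points are 0
q_b2 = q_b * 2          # requantize rhs: 0.25 / 0.125 = 2
print(np.clip(q_a2 - q_b2, 0, 255).astype(np.uint8))  # [255   0   0   0]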