Commit db4be63c
authored Jun 22, 2018 by Tianqi Chen, committed by GitHub on Jun 22, 2018

[TOPI] Numpy consistency: always broadcast binary op. (#1321)

parent 90db723d

Showing 23 changed files with 232 additions and 515 deletions (+232 −515)
Changed files:
  docs/Doxyfile                                          +1   −1
  docs/api/python/topi.rst                               +20  −17
  nnvm/python/nnvm/top/nn.py                             +3   −3
  nnvm/python/nnvm/top/tensor.py                         +2   −2
  nnvm/src/top/tensor/broadcast.cc                       +6   −6
  nnvm/src/top/tensor/elemwise.cc                        +4   −4
  python/tvm/expr.py                                     +1   −1
  python/tvm/generic.py                                  +16  −0
  tests/python/unittest/test_lang_tensor_overload_op.py  +10  −11
  topi/include/topi/broadcast.h                          +0   −0
  topi/include/topi/elemwise.h                           +0   −49
  topi/include/topi/nn/l2_normalize.h                    +10  −10
  topi/include/topi/nn/local_response_norm.h             +8   −7
  topi/python/topi/broadcast.py                          +0   −0
  topi/python/topi/cpp.py                                +0   −40
  topi/python/topi/generic_op_impl.py                    +9   −47
  topi/python/topi/math.py                               +6   −3
  topi/python/topi/tensor.py                             +0   −49
  topi/src/topi.cc                                       +38  −53
  topi/tests/python/test_topi_broadcast.py               +97  −53
  topi/tests/python/test_topi_tensor.py                  +0   −44
  topi/tests/python_cpp/test_topi_broadcast.py           +0   −114
  topi/tests/python_cpp/test_topi_transform.py           +1   −1
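To orient the reader before the per-file diffs, here is a minimal sketch (not part of the commit; shapes and the scalar value are illustrative) of what the change means at the TOPI Python API level: the explicit broadcast_* operators are folded into plain binary operators that broadcast the way NumPy does and also accept scalar operands.

import tvm
import topi

A = tvm.placeholder((5, 2, 3), name="A")
B = tvm.placeholder((2, 1), name="B")

# before this commit: a dedicated operator was needed for broadcasting
# C = topi.broadcast_add(A, B)

# after this commit: the plain op broadcasts, NumPy-style
C = topi.add(A, B)                         # result shape (5, 2, 3)
D = topi.add(A, tvm.const(2, "float32"))   # tensor-scalar goes through the same op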
docs/Doxyfile  (view file @ db4be63c)

@@ -1934,7 +1934,7 @@ ENABLE_PREPROCESSING = YES
 # The default value is: NO.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

-MACRO_EXPANSION        = NO
+MACRO_EXPANSION        = YES

 # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
 # the macro expansion is limited to the macros specified with the PREDEFINED and
docs/api/python/topi.rst  (view file @ db4be63c)

@@ -31,8 +31,6 @@ List of operators
    topi.take
    topi.full
    topi.full_like
-   topi.greater
-   topi.less
    topi.nn.relu
    topi.nn.leaky_relu
    topi.nn.dilate

@@ -49,12 +47,16 @@ List of operators
    topi.sum
    topi.min
    topi.broadcast_to
-   topi.broadcast_add
-   topi.broadcast_sub
-   topi.broadcast_mul
-   topi.broadcast_div
-   topi.broadcast_maximum
-   topi.broadcast_minimum
+   topi.add
+   topi.subtract
+   topi.multiply
+   topi.divide
+   topi.mod
+   topi.maximum
+   topi.minimum
+   topi.power
+   topi.greater
+   topi.less
    topi.image.resize

@@ -94,19 +96,20 @@ topi
 .. autofunction:: topi.take
 .. autofunction:: topi.full
 .. autofunction:: topi.full_like
-.. autofunction:: topi.greater
-.. autofunction:: topi.less
 .. autofunction:: topi.max
 .. autofunction:: topi.sum
 .. autofunction:: topi.min
 .. autofunction:: topi.broadcast_to
-.. autofunction:: topi.broadcast_add
-.. autofunction:: topi.broadcast_sub
-.. autofunction:: topi.broadcast_mul
-.. autofunction:: topi.broadcast_div
-.. autofunction:: topi.broadcast_maximum
-.. autofunction:: topi.broadcast_minimum
+.. autofunction:: topi.add
+.. autofunction:: topi.subtract
+.. autofunction:: topi.multiply
+.. autofunction:: topi.divide
+.. autofunction:: topi.mod
+.. autofunction:: topi.maximum
+.. autofunction:: topi.minimum
+.. autofunction:: topi.power
+.. autofunction:: topi.greater
+.. autofunction:: topi.less

 topi.nn
 ~~~~~~~
nnvm/python/nnvm/top/nn.py  (view file @ db4be63c)

@@ -105,7 +105,7 @@ def compute_conv2d(attrs, inputs, _):
         bias = inputs[2]
         expand_axis = 1 if layout == "NCHW" else 0
         bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
-        out = topi.broadcast_add(out, bias)
+        out = topi.add(out, bias)
     return out

 @reg.register_schedule("conv2d")

@@ -146,7 +146,7 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
     if attrs.get_bool("use_bias"):
         bias = inputs[2]
         bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
-        out = topi.broadcast_add(out, bias)
+        out = topi.add(out, bias)
     return out

 @reg.register_schedule("_contrib_conv2d_NCHWc")

@@ -181,7 +181,7 @@ def compute_conv2d_transpose(attrs, inputs, _):
     if attrs.get_bool("use_bias"):
         bias = inputs[2]
         bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
-        out = topi.broadcast_add(out, bias)
+        out = topi.add(out, bias)
     output_padding = attrs.get_int_tuple("output_padding")
     out = topi.nn.pad(out, \
         [0, 0, 0, 0], [0, 0, output_padding[0], output_padding[1]])
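The bias-add pattern above relies on the new broadcasting behaviour of topi.add: the bias is expanded with num_newaxis=2 and then broadcast against the NCHW output. A NumPy analogue of the same shape arithmetic (illustrative shapes, not part of the commit):

import numpy as np

out = np.zeros((1, 8, 14, 14), dtype="float32")   # NCHW conv output
bias = np.arange(8, dtype="float32")              # one value per channel

# topi.expand_dims(bias, axis=1, num_newaxis=2) corresponds to:
bias_3d = bias.reshape(8, 1, 1)                   # shape (C, 1, 1)

# topi.add(out, bias_3d) then broadcasts exactly like NumPy:
res = out + bias_3d                               # shape (1, 8, 14, 14)
assert res[0, 3, 0, 0] == 3.0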
nnvm/python/nnvm/top/tensor.py  (view file @ db4be63c)

@@ -244,7 +244,7 @@ reg.register_schedule("ones_like", _fschedule_elemwise)
 @reg.register_compute("greater")
 def compute_greater(_, inputs, out_info):
     """Compute definition of greater"""
-    return topi.tensor.greater(inputs[0], inputs[1], 'float32')
+    return topi.greater(inputs[0], inputs[1]).astype('float32')
 reg.register_pattern("greater", OpPattern.ELEMWISE)
 reg.register_schedule("greater", _fschedule_elemwise)

@@ -252,7 +252,7 @@ reg.register_schedule("greater", _fschedule_elemwise)
 @reg.register_compute("less")
 def compute_less(_, inputs, out_info):
     """Compute definition of less"""
-    return topi.tensor.less(inputs[0], inputs[1], 'float32')
+    return topi.less(inputs[0], inputs[1]).astype('float32')
 reg.register_pattern("less", OpPattern.ELEMWISE)
 reg.register_schedule("less", _fschedule_elemwise)
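The new compute definitions mirror NumPy semantics: compare with broadcasting, then cast the boolean-like mask to float32. A NumPy-only illustration (values are made up):

import numpy as np

lhs = np.array([[1.0, 5.0, 3.0]])
rhs = np.array([[2.0], [4.0]])

# broadcasted comparison followed by a cast, as in the compute above
mask = np.greater(lhs, rhs).astype('float32')   # shape (2, 3), entries 0.0 or 1.0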
nnvm/src/top/tensor/broadcast.cc  (view file @ db4be63c)

@@ -200,7 +200,7 @@ inline bool BinaryBroadcastCorrectLayout(const NodeAttrs& attrs,
   return true;
 }

-#define NNVM_REGISTER_BINARY_BROADCAST_OP(name)                    \
+#define NNVM_REGISTER_BINARY_BROADCAST_OP(name, TOPIOp)            \
   NNVM_REGISTER_OP(name)                                           \
   .set_num_inputs(2)                                               \
   .set_num_outputs(1)                                              \

@@ -217,13 +217,13 @@ inline bool BinaryBroadcastCorrectLayout(const NodeAttrs& attrs,
       const Array<Tensor>& inputs,                                 \
       const Array<Tensor>& out_info) {                             \
         return Array<Tensor>{                                      \
-          topi::name(inputs[0], inputs[1]) };                      \
+          topi::TOPIOp(inputs[0], inputs[1]) };                    \
     })                                                             \
   .add_argument("lhs", "Tensor", "first input")                    \
   .add_argument("rhs", "Tensor", "second input")

-NNVM_REGISTER_BINARY_BROADCAST_OP(broadcast_add)
+NNVM_REGISTER_BINARY_BROADCAST_OP(broadcast_add, add)
 .add_alias("__add_symbol__")
 .describe(R"code(Returns element-wise sum of the input arrays with broadcasting.

@@ -241,7 +241,7 @@ Example::
 )code" NNVM_ADD_FILELINE);

-NNVM_REGISTER_BINARY_BROADCAST_OP(broadcast_sub)
+NNVM_REGISTER_BINARY_BROADCAST_OP(broadcast_sub, subtract)
 .add_alias("__sub_symbol__")
 .describe(R"code(Returns element-wise difference of the input arrays with broadcasting.

@@ -259,7 +259,7 @@ Example::
 )code" NNVM_ADD_FILELINE);

-NNVM_REGISTER_BINARY_BROADCAST_OP(broadcast_mul)
+NNVM_REGISTER_BINARY_BROADCAST_OP(broadcast_mul, multiply)
 .add_alias("__mul_symbol__")
 .describe(R"code(Returns element-wise product of the input arrays with broadcasting.

@@ -276,7 +276,7 @@ Example::
 )code" NNVM_ADD_FILELINE);

-NNVM_REGISTER_BINARY_BROADCAST_OP(broadcast_div)
+NNVM_REGISTER_BINARY_BROADCAST_OP(broadcast_div, divide)
 .add_alias("__div_symbol__")
 .describe(R"code(Returns element-wise division of the input arrays with broadcasting.
nnvm/src/top/tensor/elemwise.cc  (view file @ db4be63c)

@@ -230,7 +230,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_add)
   "FTVMCompute", [](const NodeAttrs& attrs,
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
-      return Array<Tensor>{ topi::broadcast_add(inputs[0], inputs[1]) };
+      return Array<Tensor>{ topi::add(inputs[0], inputs[1]) };
   })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,

@@ -253,7 +253,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_sub)
   "FTVMCompute", [](const NodeAttrs& attrs,
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
-      return Array<Tensor>{ topi::broadcast_sub(inputs[0], inputs[1]) };
+      return Array<Tensor>{ topi::subtract(inputs[0], inputs[1]) };
   })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,

@@ -276,7 +276,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_mul)
   "FTVMCompute", [](const NodeAttrs& attrs,
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
-      return Array<Tensor>{ topi::broadcast_mul(inputs[0], inputs[1]) };
+      return Array<Tensor>{ topi::multiply(inputs[0], inputs[1]) };
   })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,

@@ -301,7 +301,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_div)
   "FTVMCompute", [](const NodeAttrs& attrs,
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
-      return Array<Tensor>{ topi::broadcast_div(inputs[0], inputs[1]) };
+      return Array<Tensor>{ topi::divide(inputs[0], inputs[1]) };
   })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,
python/tvm/expr.py  (view file @ db4be63c)

@@ -137,7 +137,7 @@ class ExprOp(object):
         expr : Expr
             Expression with new type
         """
-        return _make.static_cast(dtype, self)
+        return _generic.cast(self, dtype)

 class EqualOp(NodeGeneric, ExprOp):
python/tvm/generic.py  (view file @ db4be63c)

@@ -79,3 +79,19 @@ def divide(lhs, rhs):
         The result Expr of divide operaton.
     """
     return _make.Div(lhs, rhs)
+
+
+def cast(src, dtype):
+    """Generic cast operator.
+
+    Parameters
+    ----------
+    src : object
+        The source operand.
+
+    Returns
+    -------
+    op : tvm.Expr
+        The result Expr of divide operaton.
+    """
+    return _make.static_cast(dtype, src)
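Together with the expr.py change above, casting a scalar expression now routes through tvm.generic.cast, which topi can later rebind (see generic_op_impl.py further down). A minimal usage sketch, assuming a default int32 variable:

import tvm

n = tvm.var("n")                # an int32 Expr
x = n.astype("float32")         # ExprOp.astype -> tvm.generic.cast -> _make.static_cast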
tests/python/unittest/test_lang_tensor_overload_op.py  (view file @ db4be63c)

@@ -15,11 +15,11 @@ def test_operator_type_and_tags():
     assert isinstance(k + n, tvm.expr.Expr)
     assert isinstance(n + n, tvm.expr.Expr)
-    assert isinstance(k + A, tvm.expr.Expr)
-    assert isinstance(A + k, tvm.expr.Expr)
-    assert isinstance(n + A, tvm.expr.Expr)
-    assert isinstance(A + n, tvm.expr.Expr)
-    assert isinstance(A + A, tvm.expr.Expr)
+    assert isinstance(k + A, tvm.tensor.Tensor)
+    assert isinstance(A + k, tvm.tensor.Tensor)
+    assert isinstance(n + A, tvm.tensor.Tensor)
+    assert isinstance(A + n, tvm.tensor.Tensor)
+    assert isinstance(A + A, tvm.tensor.Tensor)
     assert isinstance(k + B, tvm.tensor.Tensor)
     assert isinstance(B + k, tvm.tensor.Tensor)

@@ -33,8 +33,8 @@ def test_operator_type_and_tags():
     assert (B + k).op.tag == topi.tag.ELEMWISE
     assert (n + B).op.tag == topi.tag.ELEMWISE
     assert (B + n).op.tag == topi.tag.ELEMWISE
-    assert (A + B).op.tag == topi.tag.ELEMWISE
-    assert (B + A).op.tag == topi.tag.ELEMWISE
+    assert (A + B).op.tag == topi.tag.BROADCAST
+    assert (B + A).op.tag == topi.tag.BROADCAST
     assert (B + B).op.tag == topi.tag.BROADCAST
     assert isinstance(k + B2, tvm.expr.Expr)

@@ -42,8 +42,8 @@ def test_operator_type_and_tags():
     assert isinstance(n + B2, tvm.expr.Expr)
     assert isinstance(B2 + n, tvm.expr.Expr)
     assert isinstance(B2 + B2, tvm.expr.Expr)
-    assert isinstance(B2 + A, tvm.expr.Expr)
-    assert isinstance(A + B2, tvm.expr.Expr)
+    assert isinstance(B2 + A, tvm.tensor.Tensor)
+    assert isinstance(A + B2, tvm.tensor.Tensor)
     assert isinstance(B2 + B, tvm.tensor.Tensor)
     assert isinstance(B + B2, tvm.tensor.Tensor)

@@ -246,4 +246,4 @@ if __name__ == "__main__":
     test_combination()
     test_tensor_scalar_bop()
     test_broadcast_bop()
-    test_conv2d_scalar_bop()
\ No newline at end of file
+    test_conv2d_scalar_bop()
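The updated assertions encode the new rule: any binary op that touches a Tensor now yields a Tensor, and tensor-tensor ops are tagged BROADCAST rather than ELEMWISE. A stripped-down sketch of the same checks (placeholder shape is illustrative):

import tvm

k = 10
A = tvm.placeholder((4, 5), name="A")

C = A + k      # tensor-scalar: a Tensor, elementwise tag
D = A + A      # tensor-tensor: a Tensor whose op is tagged as broadcast
assert isinstance(C, tvm.tensor.Tensor)
assert isinstance(D, tvm.tensor.Tensor)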
topi/include/topi/broadcast.h  (view file @ db4be63c)

This diff is collapsed.
topi/include/topi/elemwise.h  (view file @ db4be63c)

@@ -69,55 +69,6 @@ inline Tensor negative(const Tensor& x,
 }

-/*!
- * \brief Creates an operation that raises each element of tensor x to power y
- *
- * \param x The input tensor
- * \param y The exponent
- * \param name The name of the operation
- * \param tag The tag to mark the operation
- *
- * \return A Tensor whose op member is the pow operation
- */
-inline Tensor pow(const Tensor& x,
-                  const Expr& y,
-                  std::string name = "tensor",
-                  std::string tag = kElementWise) {
-  return compute(x->shape, [&](const Array<Var>& i) {
-    return tvm::pow(x(i), y);
-  }, name, tag);
-}
-
-/*!
- * \brief Creates an operation that performs pointwise left shift by n bits
- *
- * \param x The input tensor
- * \param n The number of bits to shift by
- *
- * \return A Tensor whose op member is the left shift operation
- */
-inline Tensor operator<<(const Tensor& x,
-                         const Expr& n) {
-  return compute(x->shape, [&](const Array<Var>& i) {
-    return x(i) << n;
-  }, "tensor", kElementWise);
-}
-
-/*!
- * \brief Creates an operation that performs pointwise right shift by n bits
- *
- * \param x The input tensor
- * \param n The number of bits to shift by
- *
- * \return A Tensor whose op member is the right shift operation
- */
-inline Tensor operator>>(const Tensor& x,
-                         const Expr& n) {
-  return compute(x->shape, [&](const Array<Var>& i) {
-    return x(i) >> n;
-  }, "tensor", kElementWise);
-}
-
 /*!
  * \brief Creates an operation that clips each element of a tensor to
  * the interval [a_min, a_max]
  *
topi/include/topi/nn/l2_normalize.h  (view file @ db4be63c)

@@ -26,20 +26,20 @@ using namespace tvm;
 * \return A Tensor whose op member is the l2 normalization operation
 */
 inline Tensor l2_normalize(const Tensor& data,
                            float eps,
                            const Array<Expr>& axis,
                            std::string name = "tensor",
                            std::string tag = "l2_normalize") {
   CHECK_EQ(data->shape.size(), 4) << "L2 normalization requires 4-D input";
   auto input_shape = data->shape;
-  Tensor dot_value = pow(data, static_cast<float>(2.0));
+  Tensor dot_value = topi::power(data, static_cast<float>(2.0));
   Tensor sum_value = topi::sum(dot_value, axis, true);
   Tensor expand_sum = topi::broadcast_to(sum_value, input_shape);
-  return topi::broadcast_div(data, topi::sqrt(
+  return topi::divide(data, topi::sqrt(
     tvm::compute(expand_sum->shape,
                  [&](const Array<Var>& i){
                    return (max(expand_sum(i), eps));
                  }, name = name, tag = tag)));
 }
 }  // namespace nn
 }  // namespace topi
topi/include/topi/nn/local_response_norm.h  (view file @ db4be63c)

@@ -63,13 +63,14 @@ inline Tensor lrn(const Tensor& data,
                   {rxs});
       });
   }
   auto sqrt_sum_up = tvm::compute(
       input_shape,
       [&](Var i, Var j, Var k, Var l) {
         return tvm::pow(bias + (alpha * sqr_sum(i, j, k, l) / size), beta);
       });
-  return topi::broadcast_div(data, sqrt_sum_up);
+  return topi::divide(data, sqrt_sum_up);
 }
 }  // namespace nn
 }  // namespace topi
topi/python/topi/broadcast.py  (view file @ db4be63c)

This diff is collapsed.
topi/python/topi/cpp.py  (view file @ db4be63c)

@@ -5,7 +5,6 @@ import ctypes
 from imp import new_module as _new_module
 from tvm._ffi.function import _init_api_prefix
 from tvm._ffi import libinfo
-import tvm as _tvm

 def _get_lib_names():
     if sys.platform.startswith('win32'):

@@ -35,7 +34,6 @@ def _create_module(name):
     return mod

 # pylint: disable-msg=C0103
 nn = _create_module("nn")
 _init_api_prefix("topi.cpp.nn", "topi.nn")
 generic = _create_module("generic")

@@ -52,41 +50,3 @@ yolo2 = _create_module("vision.yolo2")
 _init_api_prefix("topi.cpp.vision.yolo2", "topi.vision.yolo2")
 image = _create_module("image")
 _init_api_prefix("topi.cpp.image", "topi.image")
-
-class IntVector(object):
-    """Handle to std::vector<int> instance """
-    _tvm_tcode = 27
-
-    def __init__(self, handle):
-        self.handle = handle
-
-    def __del__(self):
-        _tvm.nd.free_extension_handle(self.handle, 27)
-
-    @property
-    def _tvm_handle(self):
-        return self.handle.value
-
-    def __getitem__(self, idx):
-        return ivec_get(self, idx)
-
-_tvm.register_extension(IntVector, IntVector)
-
-class Target(object):
-    """Handle to C++ Target instance """
-    _tvm_tcode = 28
-
-    def __init__(self, handle):
-        self.handle = handle
-
-    def __del__(self):
-        _tvm.nd.free_extension_handle(self.handle, 28)
-
-    @property
-    def _tvm_handle(self):
-        return self.handle.value
-
-    def __getitem__(self, idx):
-        return ivec_get(self, idx)
-
-_tvm.register_extension(Target, Target)
topi/python/topi/generic_op_impl.py  (view file @ db4be63c)

@@ -3,10 +3,10 @@
 from __future__ import absolute_import as _abs
 import tvm
 from . import broadcast as _broadcast
-from . import tag
+from . import math as _math

-def _make_bop(elementwise_bop, broadcast_bop, orig_bop):
+def _make_bop(broadcast_bop, orig_bop):
     """Make a specific overloaded binary operator of Tensor when applicable;
     apply the original operator if it is not supposed to be overloaded.

@@ -23,9 +23,6 @@ def _make_bop(elementwise_bop, broadcast_bop, orig_bop):
     Parameters
     ----------
-    elementwise_bop : operator function
-        Operator for element-wise tensor-scalar operation, for rule (2).
-
     broadcast_bop : operator function
         Operator for broadcast tensor-tensor operation, for rule (1).

@@ -66,36 +63,9 @@ def _make_bop(elementwise_bop, broadcast_bop, orig_bop):
         tvm.Expr (otherwise)
             The result of {op} operation.
         """
-        def _get_rank(x):
-            """Get the rank of a value.
-            If x is Tensor, then return its rank;
-            if x is scalar_like (i.e., numeric types, Expr, or TensorSlice), return 0;
-            otherwise, return -1.
-            """
-            if isinstance(x, tvm.tensor.Tensor):
-                return len(x.shape)
-            elif isinstance(x, (int, float, tvm.expr.Expr, tvm.tensor.TensorSlice)):
-                return 0
-            return -1
-
-        rl = _get_rank(lhs)
-        rr = _get_rank(rhs)
-        if rl == -1 or rr == -1 or (rl == 0 and rr == 0):
-            return orig_bop(lhs, rhs)
-        elif rl > 0 and rr > 0:
-            return broadcast_bop(lhs, rhs)
-        elif rl == 0:
-            f = lambda *i: elementwise_bop(lhs, rhs(*i))
-            with tvm.tag_scope(tag=tag.ELEMWISE):
-                return tvm.compute(rhs.shape, f, "tensor_" + name)
-        elif rr == 0:
-            f = lambda *i: elementwise_bop(lhs(*i), rhs)
-            with tvm.tag_scope(tag=tag.ELEMWISE):
-                return tvm.compute(lhs.shape, f, "tensor_" + name)
-        else:
-            raise AssertionError("Cannot reach this line.")
+        if not isinstance(lhs, tvm.tensor.Tensor) and not isinstance(rhs, tvm.tensor.Tensor):
+            return orig_bop(lhs, rhs)
+        return broadcast_bop(lhs, rhs)
     _tensor_bop_impl.__doc__ = _tensor_bop_impl.__doc__.format(op=name)
     return _tensor_bop_impl

@@ -106,18 +76,10 @@ def _bind_generic_ops():
     __op_priority__ = 1
     if __op_priority__ > tvm.generic.__op_priority__:
         tvm.generic.__op_priority__ = __op_priority__
-        tvm.generic.add = _make_bop(lambda x, y: x + y,
-                                    _broadcast.broadcast_add,
-                                    tvm.generic.add)
-        tvm.generic.subtract = _make_bop(lambda x, y: x - y,
-                                         _broadcast.broadcast_sub,
-                                         tvm.generic.subtract)
-        tvm.generic.multiply = _make_bop(lambda x, y: x * y,
-                                         _broadcast.broadcast_mul,
-                                         tvm.generic.multiply)
-        tvm.generic.divide = _make_bop(lambda x, y: x / y,
-                                       _broadcast.broadcast_div,
-                                       tvm.generic.divide)
+        tvm.generic.add = _make_bop(_broadcast.add, tvm.generic.add)
+        tvm.generic.subtract = _make_bop(_broadcast.subtract, tvm.generic.subtract)
+        tvm.generic.multiply = _make_bop(_broadcast.multiply, tvm.generic.multiply)
+        tvm.generic.divide = _make_bop(_broadcast.divide, tvm.generic.divide)
+        tvm.generic.cast = _math.cast

 _bind_generic_ops()
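After the rewrite, the dispatch rule in _make_bop is simply: if neither operand is a Tensor, fall back to the original scalar operator; otherwise call the broadcasting operator. A plain-Python illustration of that rule using NumPy stand-ins (not tvm code):

import numpy as np

def make_bop(broadcast_bop, orig_bop, tensor_type):
    def impl(lhs, rhs):
        if not isinstance(lhs, tensor_type) and not isinstance(rhs, tensor_type):
            return orig_bop(lhs, rhs)        # pure scalar case
        return broadcast_bop(lhs, rhs)       # anything involving a tensor broadcasts
    return impl

add = make_bop(np.add, lambda a, b: a + b, np.ndarray)
assert add(2, 3) == 5                                      # scalar fallback
assert add(np.ones((2, 1)), np.ones(3)).shape == (2, 3)    # broadcasting path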
topi/python/topi/math.py  (view file @ db4be63c)

@@ -258,14 +258,14 @@ def clip(x, a_min, a_max):
     return tvm.compute(x.shape, _compute)

-@tvm.tag_scope(tag=tag.ELEMWISE)
 def cast(x, dtype):
     """Cast input to specified data type.

     Parameters
     ----------
-    x : tvm.Tensor
+    x : tvm.Tensor or Expr
         Input argument.

     dtype : str
         Data type.

@@ -274,4 +274,7 @@ def cast(x, dtype):
     y : tvm.Tensor
         The result.
     """
-    return tvm.compute(x.shape, lambda *i: x(*i).astype(dtype))
+    if isinstance(x, tvm.tensor.Tensor):
+        return tvm.compute(
+            x.shape, lambda *i: x(*i).astype(dtype), tag=tag.ELEMWISE)
+    return tvm.make.static_cast(dtype, x)
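topi.cast is now polymorphic: a Tensor input produces an element-wise cast compute (tagged ELEMWISE), while a scalar Expr input falls back to static_cast. A short usage sketch (shapes are illustrative):

import tvm
import topi

A = tvm.placeholder((2, 3), dtype="float32", name="A")
B = topi.cast(A, "int32")        # Tensor -> element-wise cast compute

x = tvm.var("x", dtype="int32")
y = topi.cast(x, "float32")      # Expr -> tvm.make.static_cast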
topi/python/topi/tensor.py  (view file @ db4be63c)

@@ -68,52 +68,3 @@ def full_like(x, fill_value):
     """
     dtype = x.dtype
     return tvm.compute(x.shape, lambda *i: tvm.const(fill_value, dtype))
-
-
-@tvm.tag_scope(tag=tag.ELEMWISE)
-def greater(lhs, rhs, out_type=tvm.int8):
-    """Compare two input tensors element-wise and return an mask tensor
-    which contains 1 if lhs > rhs holds else 0
-
-    Parameters
-    ----------
-    lhs : tvm.Tensor
-        Left input argument.
-    rhs : tvm.Tensor
-        Right argument.
-    out_type: str
-        Output data type. Default is int8
-
-    Returns
-    -------
-    y : tvm.Tensor
-        The result.
-    """
-    return tvm.compute(lhs.shape,
-                       lambda *i: tvm.select(lhs(*i) > rhs(*i),
-                                             tvm.const(1, out_type),
-                                             tvm.const(0, out_type)))
-
-
-@tvm.tag_scope(tag=tag.ELEMWISE)
-def less(lhs, rhs, out_type=tvm.int8):
-    """Compare two input tensors element-wise and return an mask tensor
-    which contains 1 if lhs < rhs holds else 0
-
-    Parameters
-    ----------
-    lhs : tvm.Tensor
-        Left input argument.
-    rhs : tvm.Tensor
-        Right argument.
-    out_type: str
-        Output data type. Default is int8
-
-    Returns
-    -------
-    y : tvm.Tensor
-        The result.
-    """
-    return tvm.compute(lhs.shape,
-                       lambda *i: tvm.select(lhs(*i) < rhs(*i),
-                                             tvm.const(1, out_type),
-                                             tvm.const(0, out_type)))
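With the dedicated helpers gone, the replacement pattern used elsewhere in this commit is to call the broadcasting comparison and cast its result to the desired mask type, e.g. (illustrative shapes):

import tvm
import topi

A = tvm.placeholder((3, 4), name="A")
B = tvm.placeholder((3, 4), name="B")

# old (removed): C = topi.tensor.greater(A, B, 'int8')
C = topi.greater(A, B).astype("int8")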
topi/src/topi.cc  (view file @ db4be63c)

@@ -67,51 +67,55 @@ Array<Expr> ArrayOrInt(TVMArgValue arg) {
   }
 }

+inline bool IsTensorType(TVMArgValue arg) {
+  return (arg.type_code() == kNodeHandle &&
+          arg.node_sptr()->is_type<tvm::TensorNode>());
+}
+
 TVM_REGISTER_GLOBAL("topi.TEST_create_target")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = tvm::Target::create(args[0]);
   });

 /* Ops from broadcast.h */
+#define TOPI_REGISTER_BCAST_OP(OpName, Op)                                  \
+  TVM_REGISTER_GLOBAL(OpName)                                               \
+  .set_body([](TVMArgs args, TVMRetValue *rv) {                             \
+    bool lhs_is_tensor = IsTensorType(args[0]);                             \
+    bool rhs_is_tensor = IsTensorType(args[1]);                             \
+    if (lhs_is_tensor && rhs_is_tensor) {                                   \
+      *rv = Op(args[0].operator tvm::Tensor(),                              \
+               args[1].operator tvm::Tensor());                             \
+    } else if (!lhs_is_tensor && rhs_is_tensor) {                           \
+      *rv = Op(args[0].operator tvm::Expr(),                                \
+               args[1].operator tvm::Tensor());                             \
+    } else if (lhs_is_tensor && !rhs_is_tensor) {                           \
+      *rv = Op(args[0].operator tvm::Tensor(),                              \
+               args[1].operator tvm::Expr());                               \
+    } else if (!lhs_is_tensor && !rhs_is_tensor) {                          \
+      *rv = Op(args[0].operator tvm::Expr(),                                \
+               args[1].operator tvm::Expr());                               \
+    }                                                                       \
+  });                                                                       \
+
 TVM_REGISTER_GLOBAL("topi.broadcast_to")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = broadcast_to(args[0], args[1]);
   });

-TVM_REGISTER_GLOBAL("topi.broadcast_add")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = broadcast_add(args[0], args[1]);
-  });
-
-TVM_REGISTER_GLOBAL("topi.broadcast_sub")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = broadcast_sub(args[0], args[1]);
-  });
-
-TVM_REGISTER_GLOBAL("topi.broadcast_mul")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = broadcast_mul(args[0], args[1]);
-  });
-
-TVM_REGISTER_GLOBAL("topi.broadcast_div")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = broadcast_div(args[0], args[1]);
-  });
-
-TVM_REGISTER_GLOBAL("topi.broadcast_maximum")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = broadcast_maximum(args[0], args[1]);
-  });
-
-TVM_REGISTER_GLOBAL("topi.broadcast_minimum")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = broadcast_minimum(args[0], args[1]);
-  });
-
-TVM_REGISTER_GLOBAL("topi.broadcast_pow")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = broadcast_pow(args[0], args[1]);
-  });
+TOPI_REGISTER_BCAST_OP("topi.add", topi::add);
+TOPI_REGISTER_BCAST_OP("topi.subtract", topi::subtract);
+TOPI_REGISTER_BCAST_OP("topi.multiply", topi::multiply);
+TOPI_REGISTER_BCAST_OP("topi.divide", topi::divide);
+TOPI_REGISTER_BCAST_OP("topi.mod", topi::mod);
+TOPI_REGISTER_BCAST_OP("topi.maximum", topi::maximum);
+TOPI_REGISTER_BCAST_OP("topi.minimum", topi::minimum);
+TOPI_REGISTER_BCAST_OP("topi.power", topi::power);
+TOPI_REGISTER_BCAST_OP("topi.left_shift", topi::left_shift);
+TOPI_REGISTER_BCAST_OP("topi.right_shift", topi::right_shift);
+TOPI_REGISTER_BCAST_OP("topi.greater", topi::greater);
+TOPI_REGISTER_BCAST_OP("topi.less", topi::less);

 /* Ops from elemwise.h */
 TVM_REGISTER_GLOBAL("topi.exp")

@@ -149,25 +153,6 @@ TVM_REGISTER_GLOBAL("topi.negative")
   *rv = negative(args[0]);
   });

-TVM_REGISTER_GLOBAL("topi.pow")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = pow(args[0], args[1]);
-  });
-
-TVM_REGISTER_GLOBAL("topi.left_shift")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  Tensor lhs = args[0];
-  Expr rhs = args[1];
-  *rv = lhs >> rhs;
-  });
-
-TVM_REGISTER_GLOBAL("topi.right_shift")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  Tensor lhs = args[0];
-  Expr rhs = args[1];
-  *rv = lhs << rhs;
-  });
-
 TVM_REGISTER_GLOBAL("topi.clip")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = clip(args[0], args[1], args[2]);
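Because TOPI_REGISTER_BCAST_OP dispatches on whether each argument is a Tensor, the Python-visible ops cover tensor-tensor, tensor-scalar and scalar-scalar combinations; in the scalar-scalar case the result is a plain Expr rather than a compute. A sketch of the behaviour the new tests below rely on (names and shapes are illustrative):

import tvm
import topi

a = tvm.var("a", dtype="float32")
b = tvm.var("b", dtype="float32")
A = tvm.placeholder((3,), dtype="float32", name="A")

c0 = topi.subtract(a, b)   # Expr, Expr     -> plain Expr, no compute created
c1 = topi.subtract(A, b)   # Tensor, Expr   -> broadcast Tensor
c2 = topi.subtract(A, A)   # Tensor, Tensor -> broadcast Tensor
assert isinstance(c0, tvm.expr.Expr)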
topi/tests/python/test_topi_broadcast.py  (view file @ db4be63c)

@@ -4,10 +4,10 @@ import numpy as np
 import tvm
 import topi

-def verify_broadcast_to_ele(in_shape, out_shape):
+def verify_broadcast_to_ele(in_shape, out_shape, fbcast):
     # Build the logic and compile the function
     A = tvm.placeholder(shape=in_shape, name="A")
-    B = topi.broadcast_to(A, out_shape)
+    B = fbcast(A, out_shape)
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:

@@ -32,24 +32,20 @@ def verify_broadcast_to_ele(in_shape, out_shape):
     check_device("rocm")

-def verify_broadcast_binary_ele(lhs_shape, rhs_shape, typ="add"):
+def verify_broadcast_binary_ele(lhs_shape, rhs_shape,
+                                ftopi, fnumpy,
+                                lhs_min=-100, lhs_max=100,
+                                rhs_min=-100, rhs_max=100,
+                                dtype="float32"):
     # Build the logic and compile the function
-    A = tvm.placeholder(shape=lhs_shape, name="A")
-    B = tvm.placeholder(shape=rhs_shape, name="B")
-    if typ == "add":
-        C = topi.broadcast_add(A, B)
-    elif typ == "sub":
-        C = topi.broadcast_sub(A, B)
-    elif typ == "div":
-        C = topi.broadcast_div(A, B)
-    elif typ == "mul":
-        C = topi.broadcast_mul(A, B)
-    elif typ == "maximum":
-        C = topi.broadcast_maximum(A, B)
-    elif typ == "minimum":
-        C = topi.broadcast_minimum(A, B)
-    else:
-        raise NotImplementedError
+    A = (tvm.var("A", dtype=dtype) if lhs_shape is None else
+         tvm.placeholder(shape=lhs_shape, name="A", dtype=dtype))
+    B = (tvm.var("B", dtype=dtype) if rhs_shape is None else
+         tvm.placeholder(shape=rhs_shape, name="B", dtype=dtype))
+    C = ftopi(A, B)
+    if (isinstance(A, tvm.expr.Expr) and isinstance(B, tvm.expr.Expr)):
+        assert(isinstance(C, tvm.expr.Expr))
+        return
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:

@@ -58,54 +54,102 @@ def verify_broadcast_binary_ele(lhs_shape, rhs_shape, typ="add"):
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             s = topi.generic.schedule_broadcast(C)
-        foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ)
-        lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
-        rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
-        if typ == "add":
-            out_npy = lhs_npy + rhs_npy
-        elif typ == "sub":
-            out_npy = lhs_npy - rhs_npy
-        elif typ == "div":
-            rhs_npy = np.abs(rhs_npy) + 0.001
-            out_npy = lhs_npy / rhs_npy
-        elif typ == "mul":
-            out_npy = lhs_npy * rhs_npy
-        elif typ == "maximum":
-            out_npy = np.maximum(lhs_npy, rhs_npy)
-        elif typ == "minimum":
-            out_npy = np.minimum(lhs_npy, rhs_npy)
-        else:
-            raise NotImplementedError
-        lhs_nd = tvm.nd.array(lhs_npy, ctx)
-        rhs_nd = tvm.nd.array(rhs_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
+        foo = tvm.build(s, [A, B, C], device,
+                        name="broadcast_binary" + "_" + ftopi.__name__)
+        if lhs_shape is None:
+            lhs_npy = float(np.random.uniform(low=lhs_min, high=lhs_max))
+            if dtype.startswith('int'):
+                lhs_npy = int(lhs_npy)
+            lhs_nd = lhs_npy
+        else:
+            lhs_npy = np.random.uniform(low=lhs_min, high=lhs_max,
+                                        size=lhs_shape).astype(A.dtype)
+            lhs_nd = tvm.nd.array(lhs_npy, ctx)
+        if rhs_shape is None:
+            rhs_npy = float(np.random.uniform(low=rhs_min, high=rhs_max))
+            if dtype.startswith('int'):
+                lhs_npy = int(lhs_npy)
+            rhs_nd = rhs_npy
+        else:
+            rhs_npy = np.random.uniform(low=rhs_min, high=rhs_max,
                                        size=rhs_shape).astype(A.dtype)
+            rhs_nd = tvm.nd.array(rhs_npy, ctx)
+        out_npy = fnumpy(lhs_npy, rhs_npy)
+        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx)
         for _ in range(1):
             foo(lhs_nd, rhs_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)

-    check_device("vulkan")
     check_device("opencl")
+    check_device("vulkan")
     check_device("cuda")
     check_device("metal")
     check_device("rocm")

 def test_broadcast_to():
-    verify_broadcast_to_ele((1,), (10,))
-    verify_broadcast_to_ele((), (10,))
-    verify_broadcast_to_ele((1, 1, 5, 4), (3, 4, 4, 4, 5, 4))
-    verify_broadcast_to_ele((1, 128, 1, 32), (64, 128, 64, 32))
+    verify_broadcast_to_ele((1,), (10,), topi.broadcast_to)
+    verify_broadcast_to_ele((), (10,), topi.broadcast_to)
+    verify_broadcast_to_ele((1, 1, 5, 4), (3, 4, 4, 4, 5, 4), topi.broadcast_to)
+    verify_broadcast_to_ele((1, 128, 1, 32), (64, 128, 64, 32), topi.broadcast_to)

+def test_add():
+    verify_broadcast_binary_ele(
+        (5, 2, 3), (2, 1), topi.add, np.add)

+def test_subtract():
+    verify_broadcast_binary_ele(
+        (5, 2, 3), (), topi.subtract, np.subtract)
+    verify_broadcast_binary_ele(
+        (5, 2, 3), None, topi.subtract, np.subtract)
+    verify_broadcast_binary_ele(
+        None, None, topi.subtract, np.subtract)
+    verify_broadcast_binary_ele(
+        (1, 32), (64, 32), topi.subtract, np.subtract)

+def test_multiply():
+    verify_broadcast_binary_ele(
+        (5, 64, 128), (2, 5, 64, 1), topi.multiply, np.multiply)

+def test_divide():
+    verify_broadcast_binary_ele(
+        None, (10,), topi.divide, np.divide, rhs_min=0.0001)
+    verify_broadcast_binary_ele(
+        (2, 3, 1, 32), (64, 32), topi.divide, np.divide, rhs_min=0.0001)

+def test_maximum_minmum():
+    verify_broadcast_binary_ele(
+        (32,), (64, 32), topi.maximum, np.maximum)
+    verify_broadcast_binary_ele(
+        (1, 2, 2, 1, 32), (64, 32), topi.minimum, np.minimum)

+def test_power():
+    verify_broadcast_binary_ele(
+        (1, 2, 2), (2,), topi.power, np.power,
+        lhs_min=0.001, rhs_min=0.001, rhs_max=2)

+def test_mod():
+    verify_broadcast_binary_ele(
+        (1, 2, 2), (2,), topi.mod, np.mod,
+        lhs_min=0.001, rhs_min=1, dtype="int32")

-def test_broadcast_binary():
-    verify_broadcast_binary_ele((5, 2, 3), (2, 1), typ="add")
-    verify_broadcast_binary_ele((5, 2, 3), (), typ="add")
-    verify_broadcast_binary_ele((5, 64, 128), (2, 5, 64, 1), typ="mul")
-    verify_broadcast_binary_ele((2, 3, 1, 32), (64, 32), typ="div")
-    verify_broadcast_binary_ele((1, 32), (64, 32), typ="sub")
-    verify_broadcast_binary_ele((32,), (64, 32), typ="maximum")
-    verify_broadcast_binary_ele((1, 2, 2, 1, 32), (64, 32), typ="minimum")

+def test_cmp():
+    # explicit specify the output type
+    def greater(x, y):
+        return topi.greater(x, y).astype("int8")
+    def less(x, y):
+        return topi.less(x, y).astype("int8")
+    verify_broadcast_binary_ele(
+        (1, 2, 2), (2,), greater, np.greater)
+    verify_broadcast_binary_ele(
+        (2, 1, 2), (2, 3, 1), less, np.less)

 if __name__ == "__main__":
-    test_broadcast_binary()
+    test_cmp()
+    test_mod()
+    test_add()
+    test_subtract()
+    test_multiply()
+    test_divide()
+    test_maximum_minmum()
+    test_power()
     test_broadcast_to()
topi/tests/python/test_topi_tensor.py  (view file @ db4be63c)

@@ -69,44 +69,6 @@ def verify_full(shape, dtype, fill_value):
         check_device(device)

-def verify_comparator(shape, dtype, out_type='int8'):
-    A = tvm.placeholder(shape, dtype, name="A")
-    B = tvm.placeholder(shape, dtype, name="B")
-    C = topi.less(A, B)
-    s_less = tvm.create_schedule([C.op])
-
-    D = tvm.placeholder(shape, dtype, name="D")
-    E = tvm.placeholder(shape, dtype, name="E")
-    F = topi.greater(D, E, out_type)
-    s_greater = tvm.create_schedule([F.op])
-
-    @memoize("topi.tests.test_topi_indicator")
-    def get_ref_data():
-        return [np.random.uniform(0, 10, size=shape).astype(dtype),
-                np.random.uniform(0, 10, size=shape).astype(dtype)]
-    [np_l, np_r] = get_ref_data()
-
-    def check_device(device):
-        if not tvm.module.enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        ctx = tvm.context(device, 0)
-        out = tvm.nd.array(np.zeros(shape, dtype=out_type), ctx)
-        tvm_l = tvm.nd.array(np_l, ctx)
-        tvm_r = tvm.nd.array(np_r, ctx)
-
-        f = tvm.build(s_less, [A, B, C], device, name="less")
-        f(tvm_l, tvm_r, out)
-        np.testing.assert_allclose(out.asnumpy(),
-                                   np.less(np_l, np_r).astype(out_type), rtol=1e-5)
-
-        f = tvm.build(s_greater, [D, E, F], device, name="greater")
-        f(tvm_l, tvm_r, out)
-        np.testing.assert_allclose(out.asnumpy(),
-                                   np.greater(np_l, np_r).astype(out_type), rtol=1e-5)
-
-    for device in ["llvm"]:
-        check_device(device)
-
 def test_elemwise_sum():
     verify_elemwise_sum(1, "float32")
     verify_elemwise_sum(5, "float32")

@@ -118,12 +80,6 @@ def test_full():
     verify_full((10,), "int32", 7)

-def test_comparator():
-    verify_comparator((3, 4, 5), "float32")
-    verify_comparator((7,), "int32")
-    verify_comparator((3, 4, 5), "float32", "int8")
-
 if __name__ == "__main__":
     test_elemwise_sum()
     test_full()
-    test_comparator()
topi/tests/python_cpp/test_topi_broadcast.py  deleted, 100644 → 0  (view file @ 90db723d)

The entire file is removed. Its previous contents:

"""Test code for broadcasting operators."""
import os
import numpy as np
import tvm
import topi

def verify_broadcast_to_ele(in_shape, out_shape):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    B = topi.cpp.broadcast_to(A, out_shape)
    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        target = topi.cpp.TEST_create_target(device)
        s = topi.cpp.cuda.schedule_injective(target, [B])
        ctx = tvm.context(device, 0)
        foo = tvm.build(s, [A, B], device, name="broadcast_to")
        data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
        out_npy = np.broadcast_to(data_npy, out_shape)
        data_nd = tvm.nd.array(data_npy, ctx)
        out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
        for _ in range(1):
            foo(data_nd, out_nd)
        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)

    check_device("opencl")
    check_device("cuda")
    #check_device("metal")
    #check_device("rocm")

def verify_broadcast_binary_ele(lhs_shape, rhs_shape, typ="add"):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=lhs_shape, name="A")
    B = tvm.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.cpp.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.cpp.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.cpp.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.cpp.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.cpp.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.cpp.broadcast_minimum(A, B)
    elif typ == "pow":
        C = topi.cpp.broadcast_pow(A, B)
    else:
        raise NotImplementedError
    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        target = topi.cpp.TEST_create_target(device)
        s = topi.cpp.cuda.schedule_injective(target, [C])
        ctx = tvm.context(device, 0)
        foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ)
        lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
        rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
        if typ == "add":
            out_npy = lhs_npy + rhs_npy
        elif typ == "sub":
            out_npy = lhs_npy - rhs_npy
        elif typ == "div":
            rhs_npy = np.abs(rhs_npy) + 0.001
            out_npy = lhs_npy / rhs_npy
        elif typ == "mul":
            out_npy = lhs_npy * rhs_npy
        elif typ == "maximum":
            out_npy = np.maximum(lhs_npy, rhs_npy)
        elif typ == "minimum":
            out_npy = np.minimum(lhs_npy, rhs_npy)
        elif typ == "pow":
            out_npy = lhs_npy ** rhs_npy
        else:
            raise NotImplementedError
        lhs_nd = tvm.nd.array(lhs_npy, ctx)
        rhs_nd = tvm.nd.array(rhs_npy, ctx)
        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
        for _ in range(1):
            foo(lhs_nd, rhs_nd, out_nd)
        np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)

    check_device("opencl")
    check_device("cuda")
    #check_device("metal")
    #check_device("rocm")

def test_broadcast_to():
    verify_broadcast_to_ele((1,), (10,))
    verify_broadcast_to_ele((), (10,))
    verify_broadcast_to_ele((1, 1, 5, 4), (3, 4, 4, 4, 5, 4))
    verify_broadcast_to_ele((1, 128, 1, 32), (64, 128, 64, 32))

def test_broadcast_binary():
    verify_broadcast_binary_ele((5, 2, 3), (2, 1), typ="add")
    verify_broadcast_binary_ele((5, 2, 3), (), typ="add")
    verify_broadcast_binary_ele((5, 64, 128), (2, 5, 64, 1), typ="mul")
    verify_broadcast_binary_ele((2, 3, 1, 32), (64, 32), typ="div")
    verify_broadcast_binary_ele((1, 32), (64, 32), typ="sub")
    verify_broadcast_binary_ele((32,), (64, 32), typ="maximum")
    verify_broadcast_binary_ele((1, 2, 2, 1, 32), (64, 32), typ="minimum")
    verify_broadcast_binary_ele((1, 32), (64, 32), typ="pow")

if __name__ == "__main__":
    test_broadcast_to()
    test_broadcast_binary()
topi/tests/python_cpp/test_topi_transform.py  (view file @ db4be63c)

@@ -243,7 +243,7 @@ def verify_concatenate_broadcast(shapes, axis, rhs_shape):
     for i, shape in enumerate(shapes):
         tensor_l.append(tvm.placeholder(shape, name="A" + str(i)))
     out_tensor = topi.cpp.concatenate(tensor_l, axis)
-    C = topi.cpp.broadcast_add(out_tensor, B)
+    C = out_tensor + B
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist: