Unverified commit 6d1f4c0b by Tianqi Chen, committed by GitHub

[RELAY][EXPR] Make const numpy consistent (#2349)

parent 3516cbe0
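In short, as the hunks below show, tvm.const(value) with an inferred default dtype becomes tvm.const(value, dtype), and relay.const no longer downcasts numpy's int64/float64 defaults to int32/float32. A minimal before/after sketch (a hedged illustration, not taken from the diff):

    import tvm
    x = tvm.const(1, "int32")   # previously tvm.const(1), which defaulted to int32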
......@@ -26,22 +26,53 @@ handle = "handle"
def min_value(dtype):
"""minimum value of dtype"""
"""minimum value of dtype
Parameters
----------
dtype : str
The data type.
Returns
-------
value : tvm.Expr
The minimum value of dtype.
"""
return _api_internal._min_value(dtype)
def max_value(dtype):
"""maximum value of dtype"""
"""maximum value of dtype
Parameters
----------
dtype : str
The data type.
Returns
-------
value : tvm.Expr
The maximum value of dtype.
"""
return _api_internal._max_value(dtype)
def const(value, dtype=None):
"""construct a constant"""
if dtype is None:
if isinstance(value, _Integral):
dtype = 'int32'
else:
dtype = 'float32'
def const(value, dtype):
"""construct a constant
Parameters
----------
value : number
The content of the constant number.
dtype : str
The data type.
Returns
-------
const_val: tvm.Expr
The result expression.
"""
return _api_internal._const(value, dtype)
......
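A short usage sketch of the tvm.api helpers documented above (assuming the top-level tvm namespace re-exports them, as it does for const):

    import tvm

    lo = tvm.min_value("float32")     # smallest float32 value as a tvm.Expr
    hi = tvm.max_value("int8")        # largest int8 value as a tvm.Expr
    c = tvm.const(3.14, "float32")    # dtype is now a required argument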
......@@ -43,7 +43,7 @@ def bind(func_id, args):
_internal_assert(isinstance(args[0], str), \
"A loop bind's first argument should be a string!")
iter_var = _api.thread_axis(args[0])
low, ext = _api.const(0), args[1]
low, ext = _api.const(0, "int32"), args[1]
for_type = None
return iter_var, low, ext, for_type
......
......@@ -4,6 +4,8 @@ import ast
import operator
import logging
import sys
from numbers import Integral
from .util import _internal_assert
from . import calls
from . import util
......@@ -137,6 +139,15 @@ class HybridParser(ast.NodeVisitor):
return self._args[s]
return self.alloc_buffers[s][0]
def _const(self, value, dtype=None):
if dtype is None:
if isinstance(value, bool):
dtype = "bool"
elif isinstance(value, Integral):
dtype = "int32"
else:
dtype = "float32"
return _api.const(value, dtype)
#pylint: disable=invalid-name, missing-docstring
def visit_Module(self, node):
......@@ -172,9 +183,9 @@ class HybridParser(ast.NodeVisitor):
if isinstance(res, tuple):
buf = res[0]
if isinstance(node.ctx, ast.Load):
return _make.Call(buf.dtype, buf.name, [_api.const(0)], \
return _make.Call(buf.dtype, buf.name, [self._const(0)], \
_expr.Call.Halide, buf.op, buf.value_index)
return buf, [_api.const(0)]
return buf, [self._const(0)]
if isinstance(node.ctx, ast.Load):
return res
return None
......@@ -183,7 +194,7 @@ class HybridParser(ast.NodeVisitor):
def visit_Num(self, node):
return _api.const(node.n)
return self._const(node.n)
def visit_AugAssign(self, node):
......@@ -193,7 +204,7 @@ class HybridParser(ast.NodeVisitor):
_internal_assert(len(buf) == 2, "LHS is supposed to be (buf, args)!")
buf, args = buf
else:
args = [_api.const(0)]
args = [self._const(0)]
_internal_assert(isinstance(buf, Tensor), "LHS is supposed to be Tensor!")
read = _make.Call(buf.dtype, buf.name, args, _expr.Call.Halide, buf.op, buf.value_index)
......@@ -378,7 +389,7 @@ class HybridParser(ast.NodeVisitor):
if iter_var is None:
_internal_assert(for_type is not None, "The loop bind function parse error!")
offset = iter_var = _api.var(_name)
if not _ir_pass.Equal(low, _api.const(0)):
if not _ir_pass.Equal(low, self._const(0)):
offset = iter_var + low
self.loops_above[_name] = offset
else:
......@@ -389,7 +400,7 @@ class HybridParser(ast.NodeVisitor):
if for_type is None:
res = _make.AttrStmt(iter_var, 'thread_extent', ext, _body)
else:
res = _make.For(iter_var, _api.const(0), ext, for_type, 0, _body)
res = _make.For(iter_var, self._const(0), ext, for_type, 0, _body)
self.loops_above.pop(_name)
return res
......
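The _const helper above infers a dtype for bare Python literals in hybrid scripts. A standalone sketch of the same rule (infer_const is a hypothetical name used only for illustration):

    from numbers import Integral
    import tvm

    def infer_const(value):
        # mirrors HybridParser._const: bool -> "bool", integer -> "int32", otherwise "float32"
        if isinstance(value, bool):        # bool must be checked before Integral
            dtype = "bool"
        elif isinstance(value, Integral):
            dtype = "int32"
        else:
            dtype = "float32"
        return tvm.const(value, dtype)

    assert infer_const(1).dtype == "int32"
    assert infer_const(2.5).dtype == "float32"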
......@@ -465,12 +465,6 @@ def const(value, dtype=None):
"""
if isinstance(value, (_base.numeric_types, (bool, list))):
value = _np.array(value, dtype=dtype)
# convert default to int32 and float32
if dtype is None:
if value.dtype == "float64":
value = value.astype("float32")
elif value.dtype == "int64":
value = value.astype("int32")
if isinstance(value, (_np.ndarray, _np.generic)):
value = _nd.array(value)
......
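With the implicit float64 -> float32 and int64 -> int32 conversion removed, relay.const keeps the numpy dtype of its argument unless a dtype is passed, which is why the updated tests below supply the dtype (or a typed numpy array) explicitly. A hedged sketch:

    import numpy as np
    from tvm import relay

    i_data = relay.const(0, "int32")              # explicit dtype, as in the tests
    one = relay.const(np.array(1.0, "float32"))   # dtype taken from the numpy array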
......@@ -37,7 +37,8 @@ def test_tuple_value():
def test_id():
x = relay.var('x', 'float32')
ident = relay.Function([x], x)
check_eval(ident, [1.0], 1.0)
one = np.array(1.0, 'float32')
check_eval(ident, [one], one)
def test_add_const():
......@@ -60,8 +61,8 @@ def test_equal():
j = relay.var('i', shape=[], dtype='int32')
z = relay.equal(i, j)
func = relay.Function([i, j], z, ret_type=relay.TensorType([], 'bool'))
i_data = relay.const(0)
j_data = relay.const(0)
i_data = relay.const(0, 'int32')
j_data = relay.const(0, 'int32')
check_eval(func, [i_data, j_data], True)
......@@ -96,10 +97,10 @@ def test_loop():
i = relay.var('i', shape=[], dtype='int32')
accum = relay.var('accum', shape=[], dtype='int32')
sb = ScopeBuilder()
with sb.if_scope(relay.equal(i, relay.const(0))):
with sb.if_scope(relay.equal(i, relay.const(0, 'int32'))):
sb.ret(accum)
with sb.else_scope():
one_less = relay.subtract(i, relay.const(1))
one_less = relay.subtract(i, relay.const(1, 'int32'))
new_accum = relay.add(accum, i)
sb.ret(relay.Call(sum_up, [one_less, new_accum]))
func = relay.Function([i, accum], sb.get())
......
......@@ -13,10 +13,11 @@ def test_debug():
global _test_debug_hit
_test_debug_hit = True
prog = debug(x, debug_func=did_exec)
result = ex.evaluate(prog, { x: const(1) })
result = ex.evaluate(prog, { x: const(1, 'int32') })
assert _test_debug_hit
assert result.asnumpy() == 1
def test_debug_with_expr():
global _test_debug_hit
_test_debug_hit = False
......@@ -27,6 +28,6 @@ def test_debug_with_expr():
global _test_debug_hit
_test_debug_hit = True
prog = debug(x + x * x, debug_func=did_exec)
result = ex.evaluate(prog, { x: const(2) })
result = ex.evaluate(prog, { x: const(2, 'int32') })
assert _test_debug_hit
assert result.asnumpy() == 6
......@@ -329,7 +329,7 @@ def test_full():
for target, ctx in ctx_list():
for kind in ["graph", "debug"]:
intrp = relay.create_executor(kind, ctx=ctx, target=target)
op_res = intrp.evaluate(func)(fill_value)
op_res = intrp.evaluate(func)(np.array(fill_value, dtype))
tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
verify_full(4, (1, 3, 4, 4), "int32")
verify_full(4.0, (1, 4), "float32")
......@@ -365,7 +365,7 @@ def test_full_like():
for target, ctx in ctx_list():
for kind in ["graph", "debug"]:
intrp = relay.create_executor(kind, ctx=ctx, target=target)
op_res = intrp.evaluate(func)(x_data, fill_value)
op_res = intrp.evaluate(func)(x_data, np.array(fill_value, dtype))
tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
verify_full_like((1, 3, 4, 4), 4, "int32")
verify_full_like((1, 1), 44.0, "float32")
......
......@@ -20,13 +20,13 @@ def test_alter_op():
@register_alter_op_layout("nn.conv2d", level=100)
def alter_conv2d(attrs, inputs, tinfos):
data, weight = inputs
weight = relay.multiply(weight, relay.const(2.0))
weight = relay.multiply(weight, relay.const(2.0, "float32"))
return relay.nn.conv2d(data, weight, **attrs)
def expected():
x = relay.var("x", shape=(1, 64, 56, 56))
weight = relay.var('weight', shape=(64, 64, 3, 3))
y = relay.nn.conv2d(x, relay.multiply(weight, relay.const(2.0)),
y = relay.nn.conv2d(x, relay.multiply(weight, relay.const(2.0, "float32")),
channels=64,
kernel_size=(3, 3),
padding=(1, 1))
......@@ -313,4 +313,3 @@ if __name__ == "__main__":
test_alter_layout_dual_path()
test_alter_layout_resnet()
test_alter_layout_broadcast_op()
......@@ -21,8 +21,8 @@ def test_simplify():
assert zz.a == x and zz.b.value == 4
n = tvm.var('n')
assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n % (-1)), tvm.const(0))
assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n % 1), tvm.const(0))
assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n % (-1)), tvm.const(0, "int32"))
assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n % 1), tvm.const(0, "int32"))
assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n / 1), n)
tvm.ir_pass.CanonicalSimplify(n / (-1))
# This is not true in the current implementation
......@@ -67,10 +67,11 @@ def test_modular():
ry = tvm.var("ry")
y = tvm.var("y")
x = tvm.var("x")
vmap = {rx: tvm.Range(tvm.const(0), tvm.const(3)),
ry: tvm.Range(tvm.const(0), tvm.const(3)),
y: tvm.Range(tvm.const(0), tvm.const(2)),
x: tvm.Range(tvm.const(0), tvm.const(14))}
i32_const = lambda x: tvm.const(x, "int32")
vmap = {rx: tvm.Range(i32_const(0), i32_const(3)),
ry: tvm.Range(i32_const(0), i32_const(3)),
y: tvm.Range(i32_const(0), i32_const(2)),
x: tvm.Range(i32_const(0), i32_const(14))}
idx = ry * 16 + rx + y * 16 + x
z1 = tvm.ir_pass.CanonicalSimplify(idx // 16, vmap)
z2 = tvm.ir_pass.CanonicalSimplify(idx % 16, vmap)
......@@ -82,4 +83,4 @@ if __name__ == "__main__":
test_modular()
test_simplify()
test_mul()
test_simplify_minmax()
\ No newline at end of file
test_simplify_minmax()
import tvm
def test_const():
x = tvm.const(1)
x = tvm.const(1, "int32")
print(x.dtype)
assert x.dtype == tvm.int32
assert isinstance(x, tvm.expr.IntImm)
def test_make():
x = tvm.const(1)
x = tvm.const(1, "int32")
y = tvm.var("x")
z = x + y
assert isinstance(tvm.max(x, y), tvm.expr.Max)
assert isinstance(tvm.min(x, y), tvm.expr.Min)
def test_ir():
x = tvm.const(1)
x = tvm.const(1, "int32")
y = tvm.make.IntImm('int32', 1)
z = x + y
stmt = tvm.make.Evaluate(z)
......
......@@ -2,7 +2,7 @@ import tvm
def test_const_fold():
def check(f, *args):
x = f(*[tvm.const(x) for x in args])
x = f(*[tvm.const(x, "int32") for x in args])
y = f(*args)
if not isinstance(x, (tvm.expr.IntImm, tvm.expr.UIntImm)) or x.value != int(y):
raise ValueError("check error: %s vs %s " % (x, y))
......
......@@ -2,8 +2,8 @@ import tvm
def test_const_saveload_json():
# save load json
x = tvm.const(1)
y = tvm.const(10)
x = tvm.const(1, "int32")
y = tvm.const(10, "int32")
z = x + y
z = z + z
json_str = tvm.save_json(z)
......@@ -13,8 +13,8 @@ def test_const_saveload_json():
def test_make_smap():
# save load json
x = tvm.const(1)
y = tvm.const(10)
x = tvm.const(1, "int32")
y = tvm.const(10, "int32")
z = tvm.expr.Add(x, y)
smap = tvm.convert({"z": z, "x": x})
json_str = tvm.save_json(tvm.convert([smap]))
......
......@@ -29,13 +29,13 @@ def test_basic():
def test_bound():
m = tvm.var('m')
vrange = tvm.convert({m: tvm.Range(tvm.const(0), tvm.const(10))})
vrange = tvm.convert({m: tvm.Range(tvm.const(0, "int32"), tvm.const(10, "int32"))})
ret = tvm.ir_pass.Simplify(m % 10, vrange)
assert ret == m
def test_canonical():
x = tvm.var("x")
z = tvm.const(3)
z = tvm.const(3, "int32")
ret = tvm.ir_pass.CanonicalSimplify(x / (z*z) - x / (z*z))
assert(tvm.ir_pass.Equal(ret, 0))
......
......@@ -238,7 +238,8 @@ def test_parallel_alloc():
n = tvm.var("n")
with ib.for_range(0, n, name="t") as i:
ib.scope_attr(
tvm.const(1) , "pragma_scope", tvm.make.StringImm("parallel_launch_point"))
tvm.const(1, "int32") , "pragma_scope",
tvm.make.StringImm("parallel_launch_point"))
with ib.for_range(0, n, name="i", for_type="parallel") as i:
with ib.for_range(0, 10, name="j") as j:
A = ib.allocate("float32", n, name="A", scope="global")
......
......@@ -24,7 +24,7 @@ def test_unroll_loop():
assert ret.for_type == tvm.stmt.For.Unrolled
ib = tvm.ir_builder.create()
ib.scope_attr(tvm.const(0), "pragma_auto_unroll_max_step", 16)
ib.scope_attr(tvm.const(0, "int32"), "pragma_auto_unroll_max_step", 16)
ib.emit(stmt)
wrapped = ib.get()
wrapped = tvm.make.Block(wrapped, stmt)
......@@ -54,4 +54,4 @@ def test_unroll_fake_loop():
if __name__ == "__main__":
test_unroll_loop()
test_unroll_fake_loop()
\ No newline at end of file
test_unroll_fake_loop()
......@@ -272,7 +272,8 @@ def test_schedule_cache_relayout4():
def test_schedule_bound_condition():
A = tvm.placeholder((64,), name='A', dtype="float32")
Apad = tvm.compute((66,), lambda i: tvm.select(tvm.all(i>0, i < 65), A[i-1], tvm.const(0.)), name='Apad')
Apad = tvm.compute((66,), lambda i: tvm.select(
tvm.all(i>0, i < 65), A[i-1], tvm.const(0., "float32")), name='Apad')
Apad2 = tvm.compute((66,), lambda i: Apad[i]*2, name='Apad2')
s = tvm.create_schedule(Apad2.op)
AL1 = s.cache_read(A,"local",[Apad])
......
......@@ -320,7 +320,7 @@ def bitpack(data, bits, pack_axis, bit_axis, pack_type, name="QuantizeInput"):
element = data(*idx)
for b in range(bits):
extracted_bit = ((element & tvm.const(masks[b])) >> b).astype(pack_type)
extracted_bit = ((element & tvm.const(masks[b], "int32")) >> b).astype(pack_type)
packed_data[b] = (packed_data[b] | extracted_bit)
if k < data_width - 1:
packed_data[b] = packed_data[b] << 1
......
......@@ -4,7 +4,7 @@ from topi import util
def test_util():
x = tvm.const(100)
x = tvm.const(100, "int32")
assert util.get_const_int(x) == 100
assert util.get_const_tuple((x, x)) == (100, 100)
......
......@@ -6,7 +6,7 @@ from topi import util
def test_util():
x = tvm.const(100)
x = tvm.const(100, "int32")
assert util.get_const_int(x) == 100
assert util.get_const_tuple((x, x)) == (100, 100)
......
import tvm
import topi
from topi import util
def test_util():
x = tvm.const(100)
assert util.get_const_int(x) == 100
assert util.get_const_tuple((x, x)) == (100, 100)
def test_ewise():
m = tvm.var('m')
l = tvm.var('l')
A = tvm.placeholder((m, l), name='A')
def test_apply(func, name):
B = func(A)
assert tuple(B.shape) == tuple(A.shape)
assert B.op.body[0].name == name
test_apply(topi.cpp.exp, "exp")
test_apply(topi.cpp.tanh, "tanh")
test_apply(topi.cpp.sigmoid, "sigmoid")
test_apply(topi.cpp.log, "log")
test_apply(topi.cpp.sqrt, "sqrt")
def test_flatten_tag():
A = tvm.placeholder((3, 4), name='A')
B = topi.cpp.nn.flatten(A)
assert B.op.tag == topi.tag.INJECTIVE
if __name__ == "__main__":
test_util()
test_ewise()
test_flatten_tag()
"""Test code for binary neural network operators."""
import numpy as np
import tvm
import topi
from topi.util import get_const_tuple
from tvm.contrib.pickle_memoize import memoize
def verify_binary_dense(batch, in_dim, out_dim):
A = tvm.placeholder((batch, in_dim), name='A')
B = tvm.placeholder((out_dim, in_dim), name='B')
bnn_A = topi.cpp.nn.binarize_pack(A, 1)
bnn_B = topi.cpp.nn.binarize_pack(B, 1)
# binary dense
bnn_A1 = tvm.placeholder(bnn_A.shape, dtype=bnn_A.dtype)
bnn_B1 = tvm.placeholder(bnn_B.shape, dtype=bnn_B.dtype)
bnn_C = topi.cpp.nn.binary_dense(bnn_A1, bnn_B1)
# schedule
target = topi.cpp.TEST_create_target("llvm")
s1 = topi.cpp.x86.schedule_binarize_pack(target, [bnn_A])
s2 = topi.cpp.x86.schedule_binarize_pack(target, [bnn_B])
s3 = topi.cpp.x86.schedule_binary_dense(target, [bnn_C])
dtype = A.dtype
@memoize("topi.tests.test_topi_binary_dense")
def get_ref_data():
# generate random matrix of +1 or -1 value
a_np = (np.random.randint(2, size=(batch, in_dim)) * 2 - 1).astype(dtype)
b_np = (np.random.randint(2, size=(out_dim, in_dim)) * 2 - 1).astype(dtype)
c_np = np.dot(a_np, b_np.T)
return (a_np, b_np, c_np)
a_np, b_np, c_np = get_ref_data()
ctx = tvm.cpu(0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(b_np, ctx)
bnn_a = tvm.nd.array(np.zeros(get_const_tuple(bnn_A.shape), dtype=bnn_A.dtype), ctx)
bnn_b = tvm.nd.array(np.zeros(get_const_tuple(bnn_B.shape), dtype=bnn_B.dtype), ctx)
bnn_c = tvm.nd.array(np.zeros(get_const_tuple(bnn_C.shape), dtype=bnn_C.dtype), ctx)
f1 = tvm.build(s1, [A, bnn_A], 'llvm')
f2 = tvm.build(s2, [B, bnn_B], 'llvm')
f3 = tvm.build(s3, [bnn_A1, bnn_B1, bnn_C], 'llvm')
f1(a, bnn_a)
f2(b, bnn_b)
f3(bnn_a, bnn_b, bnn_c)
tvm.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5)
def test_binary_dense():
verify_binary_dense(1, 4096, 1024)
verify_binary_dense(1, 1024, 1000)
if __name__ == "__main__":
test_binary_dense()
"""Test code for clip operator"""
import numpy as np
import tvm
import topi
from topi.util import get_const_tuple
from tvm.contrib.pickle_memoize import memoize
def verify_clip(N, a_min, a_max, dtype):
A = tvm.placeholder((N, N), dtype=dtype, name='A')
B = topi.cpp.clip(A, a_min, a_max)
# use memoize to pickle the test data for next time use
@memoize("topi.tests.test_topi_clip")
def get_ref_data():
a_np = np.random.uniform(a_min*2, a_max*2, size=(N, N)).astype(dtype)
b_np = np.clip(a_np, a_min, a_max)
return a_np, b_np
a_np, b_np = get_ref_data()
def check_device(device):
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
target = topi.cpp.TEST_create_target(device)
s = topi.cpp.generic.default_schedule(target, [B], False)
ctx = tvm.cpu(0) if device == "llvm" else tvm.gpu(0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
f = tvm.build(s, [A, B], device, name="clip")
f(a, b)
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ['llvm']:
check_device(device)
def test_clip():
verify_clip(1024, -127, 127, 'int8')
verify_clip(1024, -127, 127, 'int16')
verify_clip(1024, -127, 127, 'float32')
if __name__ == "__main__":
test_clip()
"""Test code for dense operator"""
import numpy as np
import tvm
import topi
from topi.util import get_const_tuple
from tvm.contrib.pickle_memoize import memoize
def verify_dense(batch, in_dim, out_dim, use_bias=True):
A = tvm.placeholder((batch, in_dim), name='A')
B = tvm.placeholder((out_dim, in_dim), name='B')
C = tvm.placeholder((out_dim,), name='C')
D = topi.cpp.nn.dense(A, B, C if use_bias else None)
D = topi.cpp.nn.relu(D)
dtype = A.dtype
# use memoize to pickle the test data for next time use
@memoize("topi.tests.test_topi_dense")
def get_ref_data():
a_np = np.random.uniform(size=(batch, in_dim)).astype(dtype)
b_np = np.random.uniform(size=(out_dim, in_dim)).astype(dtype)
c_np = np.random.uniform(size=(out_dim,)).astype(dtype)
if use_bias:
d_np = np.maximum(np.dot(a_np, b_np.T) + c_np, 0.0)
else:
d_np = np.maximum(np.dot(a_np, b_np.T), 0.0)
return (a_np, b_np, c_np, d_np)
# get the test data
a_np, b_np, c_np, d_np = get_ref_data()
def check_device(device):
ctx = tvm.context(device, 0)
if not ctx.exist:
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.schedule_dense(target, [D])
elif device == "rocm":
s = topi.cpp.rocm.schedule_dense(target, [D])
else:
s = topi.cpp.cuda.schedule_dense(target, [D])
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(b_np, ctx)
c = tvm.nd.array(c_np, ctx)
d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
f = tvm.build(s, [A, B, C, D], device, name="dense")
f(a, b, c, d)
tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
for device in ['cuda', 'opencl', 'metal', 'rocm']:
check_device(device)
def test_dense():
verify_dense(1, 1024, 1000, use_bias=True)
verify_dense(1, 1024, 1000, use_bias=False)
if __name__ == "__main__":
test_dense()
import tvm
import topi
import topi.testing
import numpy as np
def test_dilate():
target = 'llvm'
ctx = tvm.cpu(0)
def _test_dilate(input_size, strides):
Input = tvm.placeholder((input_size))
Output = topi.cpp.nn.dilate(Input, strides)
tgt = topi.cpp.TEST_create_target(target)
schedule = topi.cpp.generic.default_schedule(tgt, [Output], True)
input_np = np.random.uniform(size=input_size).astype(Input.dtype)
output_np = topi.testing.dilate_python(input_np, strides)
input_tvm = tvm.nd.array(input_np, ctx=ctx)
output_size = topi.util.get_const_tuple(Output.shape)
output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx)
f = tvm.build(schedule, [Input, Output], target)
f(input_tvm, output_tvm)
tvm.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5)
_test_dilate((32,), (2,))
_test_dilate((32,32), (2,2))
_test_dilate((1,3,32,32), (1,1,1,1))
_test_dilate((1,3,32,32), (2,2,2,2))
_test_dilate((1,32,32,3,3), (1,1,1,1,1))
_test_dilate((1,32,32,3,3), (2,2,2,2,2))
_test_dilate((1,32,32,32,3,3), (1,1,1,2,2,2))
_test_dilate((1,32,32,32,3,3), (2,2,2,1,1,1))
if __name__ == "__main__":
test_dilate()
"""Test code for l2 normalization"""
import numpy as np
import tvm
import topi
import logging
from topi.util import get_const_tuple
import topi.testing
def verify_l2_normalize(shape, eps, axis=None):
'''Verify l2 normalization operator by comparing outputs from tvm and numpy implementation'''
A = tvm.placeholder(shape, name='A')
B = topi.cpp.nn.l2_normalize(A, eps, axis)
dtype = A.dtype
a_np = np.random.uniform(size=shape).astype(dtype)
b_np = topi.testing.l2_normalize_python(a_np, eps, axis)
def check_device(device):
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.default_schedule(target, [B], False)
else:
s = topi.cpp.cuda.schedule_l2_normalize(target, [B])
ctx = tvm.context(device, 0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
func = tvm.build(s, [A, B], device, name="l2_normalize")
func(a, b)
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm']:
check_device(device)
def test_l2_normalize():
verify_l2_normalize((1, 3, 20, 20), 0.001)
verify_l2_normalize((1, 3, 20, 20), 0.001, (1,))
verify_l2_normalize((1, 3, 20, 20), 0.001, (1, 2))
verify_l2_normalize((1, 3, 20, 20), 0.001, (2, 3))
verify_l2_normalize((1, 3, 20, 20), 0.001, (0, 3))
verify_l2_normalize((1, 3, 20, 20), 0.001, (0, 2, 3))
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
test_l2_normalize()
"""Test code for LRN"""
import numpy as np
import tvm
import topi
import logging
from topi.util import get_const_tuple
import topi.testing
def verify_lrn(shape, size, axis, bias, alpha, beta):
'''Verify Local response normalization operator by comparing outputs from tvm and numpy implementation'''
A = tvm.placeholder(shape, name='A')
B = topi.cpp.nn.lrn(A, size, axis, alpha, beta, bias)
dtype = A.dtype
a_np = np.random.uniform(size=shape).astype(dtype)
b_np = topi.testing.lrn_python(a_np, size, axis, bias, alpha, beta)
def check_device(device):
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.default_schedule(target, [B], False)
else:
s = topi.cpp.cuda.schedule_lrn(target, [B])
ctx = tvm.context(device, 0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
f = tvm.build(s, [A, B], device)
f(a, b)
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-1)
for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm']:
check_device(device)
def test_lrn():
verify_lrn((1, 3, 5, 5), 3, 3, 1.0, 1.0, 0.5)
verify_lrn((1, 3, 5, 5), 3, 3, 1.0, 1.0, 0.5)
verify_lrn((1, 3, 20, 20), 3, 1, 2.0, 1.0, 0.75)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
test_lrn()
"""Test code for pooling"""
import numpy as np
import tvm
import topi
import math
from topi.util import get_const_tuple
pool_code = {
"avg": 0,
"max": 1
}
def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_pad=True):
iw = ih
kw = kh
sw = sh
pt, pl, pb, pr = padding
A = tvm.placeholder((n, ic, ih, iw), name='A')
B = topi.cpp.nn.pool(A, [kh, kw], [sh, sw], padding,
pool_code[pool_type], ceil_mode, "NCHW", count_include_pad)
B = topi.cpp.nn.relu(B)
dtype = A.dtype
bshape = get_const_tuple(B.shape)
ashape = get_const_tuple(A.shape)
if ceil_mode:
assert bshape[2] == int(math.ceil(float(ashape[2] - kh + pt + pb) / sh) + 1)
assert bshape[3] == int(math.ceil(float(ashape[3] - kw + pl + pr) / sw) + 1)
else:
assert bshape[2] == int(math.floor(float(ashape[2] - kh + pt + pb) / sh) + 1)
assert bshape[3] == int(math.floor(float(ashape[3] - kw + pl + pr) / sw) + 1)
a_np = np.random.uniform(size=(n, ic, ih, iw)).astype(dtype)
pad_np = np.zeros(shape=(n, ic, ih+pt+pb, iw+pl+pr)).astype(dtype)
no_zero = (range(n), range(ic), (range(pt, ih+pt)), (range(pl, iw+pl)))
pad_np[np.ix_(*no_zero)] = a_np
_, oc, oh, ow = get_const_tuple(B.shape)
b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype)
if pool_type == 'avg':
for i in range(oh):
for j in range(ow):
if count_include_pad:
b_np[:,:,i,j] = np.mean(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw], axis=(2,3))
else:
pad_count = np.sum(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw] > 0, axis=(2,3))
b_np[:,:,i,j] = np.sum(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw], axis=(2,3)) / np.maximum(pad_count, 1)
elif pool_type =='max':
for i in range(oh):
for j in range(ow):
b_np[:,:,i,j] = np.max(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw], axis=(2,3))
b_np = np.maximum(b_np, 0.0)
def check_device(device):
ctx = tvm.context(device, 0)
if not ctx.exist:
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.default_schedule(target, [B], False)
else:
s = topi.cpp.cuda.schedule_pool(target, [B])
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
f = tvm.build(s, [A, B], device)
f(a, b)
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ['cuda', 'opencl', 'metal', 'rocm']:
check_device(device)
def test_pool():
verify_pool(1, 256, 32, 2, 2, [0, 0, 0, 0], 'avg', False, True)
verify_pool(1, 256, 31, 3, 3, [1, 2, 1, 2], 'avg', False, True)
verify_pool(1, 256, 32, 2, 2, [1, 2, 1, 2], 'avg', False, False)
verify_pool(1, 256, 31, 4, 4, [3, 3, 3, 3], 'avg', False, False)
verify_pool(1, 256, 31, 4, 4, [0, 0, 0, 0], 'avg', False, False)
verify_pool(1, 256, 32, 2, 2, [0, 0, 0, 0], 'max', False)
verify_pool(1, 256, 31, 3, 3, [2, 1, 2, 1], 'max', False)
verify_pool(1, 256, 31, 3, 3, [2, 1, 2, 1], 'max', True)
verify_pool(1, 256, 31, 3, 3, [2, 1, 0, 3], 'avg', False, True)
verify_pool(1, 256, 32, 2, 2, [0, 3, 2, 1], 'avg', False, False)
verify_pool(1, 256, 31, 3, 3, [1, 0, 3, 2], 'max', False)
verify_pool(1, 256, 31, 3, 3, [3, 2, 1, 0], 'max', True)
def verify_global_pool(n, c, h, w, pool_type):
A = tvm.placeholder((n, c, h, w), name='A')
B = topi.cpp.nn.global_pool(A, pool_code[pool_type])
B = topi.cpp.nn.relu(B)
a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
if pool_type == 'avg':
b_np = np.mean(a_np, axis=(2,3), keepdims=True)
elif pool_type =='max':
b_np = np.max(a_np, axis=(2,3), keepdims=True)
b_np = np.maximum(b_np, 0.0)
def check_device(device):
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.default_schedule(target, [B], False)
else:
s = topi.cpp.cuda.schedule_global_pool(target, [B])
ctx = tvm.context(device, 0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
f = tvm.build(s, [A, B], device)
f(a, b)
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ['cuda', 'opencl', 'metal', 'rocm']:
check_device(device)
def test_global_pool():
verify_global_pool(1, 1024, 7, 7, 'avg')
verify_global_pool(4, 1024, 7, 7, 'avg')
verify_global_pool(1, 1024, 7, 7, 'max')
verify_global_pool(4, 1024, 7, 7, 'max')
if __name__ == "__main__":
test_pool()
test_global_pool()
"""Test code for reduce."""
import os
import numpy as np
import tvm
import topi
def _my_npy_argmax(arr, axis, keepdims):
if not keepdims:
return arr.argmax(axis=axis)
else:
if axis is not None:
out_shape = list(arr.shape)
out_shape[axis] = 1
else:
out_shape = [1 for _ in range(len(arr.shape))]
return arr.argmax(axis=axis).reshape(out_shape)
def _my_npy_argmin(arr, axis, keepdims):
if not keepdims:
return arr.argmin(axis=axis)
else:
out_shape = list(arr.shape)
out_shape[axis] = 1
return arr.argmin(axis=axis).reshape(out_shape)
def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
# Build the logic and compile the function
dat_dtype = "float32"
A = tvm.placeholder(shape=in_shape, name="A", dtype=dat_dtype)
A1 = topi.cpp.sqrt(topi.cpp.exp(A))
out_dtype = "float32"
if type == "sum":
B = topi.cpp.sum(A1, axis, keepdims)
elif type == "max":
B = topi.cpp.max(A1, axis, keepdims)
elif type == "min":
B = topi.cpp.min(A1, axis, keepdims)
elif type == "argmax":
B = topi.cpp.argmax(A1, axis, keepdims)
out_dtype = "int32"
elif type == "argmin":
B = topi.cpp.argmin(A1, axis, keepdims)
out_dtype = "int32"
elif type == "prod":
B = topi.cpp.prod(A1, axis, keepdims)
else:
raise NotImplementedError
def check_device(device):
ctx = tvm.context(device, 0)
if not ctx.exist:
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.default_schedule(target, [B], True)
else:
s = topi.cpp.cuda.schedule_reduce(target, [B])
foo = tvm.build(s, [A, B], device, name=type)
# Test
in_npy = np.random.uniform(size=in_shape).astype(np.float32)
in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32)
if type == "sum":
out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
elif type == "max":
out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
elif type == "min":
out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
elif type == "argmax":
out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
elif type == "argmin":
out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
elif type == "prod":
out_npy = in_npy_map.prod(axis=axis, keepdims=keepdims)
else:
raise NotImplementedError
data_tvm = tvm.nd.array(in_npy, ctx=ctx)
out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
for _ in range(1):
foo(data_tvm, out_tvm)
if type == "argmax" or type == "argmin":
out_tvm_indices = out_tvm.asnumpy()
if keepdims:
out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis)
if axis is None:
out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
else:
other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis+1):]))
sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
out_tvm_val = in_npy_map[sel_indices]
if type == "argmax":
tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
elif type == "argmin":
tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
else:
tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
for device in ["cuda", "opencl", "metal", "llvm", "rocm"]:
check_device(device)
def test_reduce_map():
verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
axis=(1, 2, 3),
keepdims=True,
type="sum")
verify_reduce_map_ele(in_shape=(128, 24 * 128 * 24),
axis=(1,),
keepdims=False,
type="max")
verify_reduce_map_ele(in_shape=(32, 128, 24),
axis=None,
keepdims=True,
type="sum")
verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
axis=(0, 2),
keepdims=False,
type="min")
verify_reduce_map_ele(in_shape=(128, 4, 4, 128),
axis=(1, ),
keepdims=True,
type="prod")
verify_reduce_map_ele(in_shape=(4, 4),
axis=(0, 1),
keepdims=False,
type="prod")
verify_reduce_map_ele(in_shape=(32, 128),
axis=1,
keepdims=True,
type="argmax")
verify_reduce_map_ele(in_shape=(32, 24, 32, 24),
axis=2,
keepdims=False,
type="argmin")
verify_reduce_map_ele(in_shape=(31, 21, 15),
axis=None,
keepdims=True,
type="argmax")
verify_reduce_map_ele(in_shape=(31, 21, 15),
axis=None,
keepdims=False,
type="sum")
if __name__ == "__main__":
test_reduce_map()
"""Test code for region"""
import logging
import numpy as np
import tvm
import topi
import topi.testing
from topi.util import get_const_tuple
def verify_region(batch, in_size, in_channel, n, classes, coords, background, l_softmax):
'''Verify region operator by comparing outputs from tvm and numpy implementation'''
in_height = in_width = in_size
A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
B = topi.cpp.yolo.region(A, n, classes, coords, background, l_softmax)
a_shape = get_const_tuple(A.shape)
dtype = A.dtype
def get_ref_data_region():
'''Randomly initialize the data variables and get reference output for the region operation'''
a_np = np.random.uniform(size=a_shape).astype(dtype)
b_np = topi.testing.region_python(a_np, n, classes, coords, background, l_softmax)
return a_np, b_np
a_np, b_np = get_ref_data_region()
def check_device(device):
'''Check the device is available and if so, build and run the program'''
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.default_schedule(target, [B], False)
else:
s = topi.cpp.rocm.schedule_region(target, [B])
ctx = tvm.context(device, 0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
func = tvm.build(s, [A, B], device, name="region")
func(a, b)
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']:
check_device(device)
def test_region():
verify_region(1, 19, 425, 5, 80, 4, 0, 1)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
test_region()
"""Test code for relu activation"""
import os
import numpy as np
import tvm
import topi
from topi.util import get_const_tuple
def verify_relu(m, n, dtype):
A = tvm.placeholder((m, n), name='A', dtype=dtype)
B = topi.cpp.nn.relu(A)
assert B.dtype == dtype
a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype)
b_np = a_np * (a_np > 0)
def check_device(device):
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.schedule_injective(target, [B])
else:
s = topi.cpp.cuda.schedule_injective(target, [B])
ctx = tvm.context(device, 0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
foo = tvm.build(s, [A, B], device, name="relu")
foo(a, b)
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ['cuda', 'opencl', 'metal', 'rocm']:
check_device(device)
def verify_leaky_relu(m, alpha):
A = tvm.placeholder((m,), name='A')
B = topi.cpp.nn.leaky_relu(A, alpha)
device = "llvm"
target = topi.cpp.TEST_create_target(device)
s = topi.cpp.generic.schedule_injective(target, [B])
a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype)
b_np = a_np * (a_np > 0) + a_np * (a_np < 0) * alpha
ctx = tvm.cpu(0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
foo = tvm.build(s, [A, B], device, name="leaky_relu")
foo(a, b)
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def verify_prelu(x, w, axis, weight_reshape):
X = tvm.placeholder((x), name='X')
W = tvm.placeholder((w), name='W')
x_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(X.shape)).astype(X.dtype)
w_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(W.shape)).astype(W.dtype)
def _prelu_numpy(x, W):
return (x < 0) * (x *W.reshape(weight_reshape)) + (x>=0) * x
out_np = _prelu_numpy(x_np, w_np)
B = topi.cpp.nn.prelu(X, W, axis)
device = "llvm"
target = topi.cpp.TEST_create_target(device)
s = topi.cpp.generic.schedule_injective(target, [B])
ctx = tvm.cpu(0)
x_tvm = tvm.nd.array(x_np, ctx)
w_tvm = tvm.nd.array(w_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(X.shape), dtype=B.dtype), ctx)
foo = tvm.build(s, [X, W, B], "llvm", name="prelu")
foo(x_tvm, w_tvm, b)
tvm.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5)
def test_relu():
for dtype in ['float32', 'float64', 'int32', 'int16', 'int8', 'int64']:
verify_relu(10, 128, dtype)
def test_leaky_relu():
verify_leaky_relu(100, 0.5)
def test_prelu():
verify_prelu((1, 3, 2, 2), (3,), 1, (3, 1, 1))
verify_prelu((1, 3, 2, 2), (2,), 2, (2, 1))
if __name__ == "__main__":
test_relu()
test_leaky_relu()
test_prelu()
"""Test code for reorg"""
import logging
import numpy as np
import tvm
import topi
import topi.testing
from topi.util import get_const_tuple
def verify_reorg(batch, in_size, in_channel, stride):
'''Verify reorg operator by comparing outputs from tvm and numpy implementation'''
in_height = in_width = in_size
A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
B = topi.cpp.vision.reorg(A, stride)
a_shape = get_const_tuple(A.shape)
dtype = A.dtype
def get_ref_data_reorg():
'''Randomly initialize the data variables and get reference output for the reorg operation'''
a_np = np.random.uniform(size=a_shape).astype(dtype)
b_np = topi.testing.reorg_python(a_np, stride)
return a_np, b_np
a_np, b_np = get_ref_data_reorg()
def check_device(device):
'''Check the device is available and if so, build and run the program'''
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.default_schedule(target, [B], False)
else:
s = topi.cpp.cuda.schedule_injective(target, [B])
ctx = tvm.context(device, 0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
func = tvm.build(s, [A, B], device, name="reorg")
func(a, b)
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']:
check_device(device)
def test_reorg():
verify_reorg(1, 38, 64, 2)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
test_reorg()
"""Test code for softmax"""
import os
import numpy as np
import tvm
import topi
import logging
import topi.testing
from topi.util import get_const_tuple
def verify_softmax(m, n):
A = tvm.placeholder((m, n), name='A')
B = topi.cpp.nn.softmax(A, 1)
# confirm lower works
s = tvm.create_schedule([B.op])
tvm.lower(s, [A, B], simple_mode=True)
a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
b_np = topi.testing.softmax_python(a_np)
def check_device(device):
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.default_schedule(target, [B], False)
else:
s = topi.cpp.cuda.schedule_softmax(target, [B])
ctx = tvm.context(device, 0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
foo = tvm.build(s, [A, B], device, name="softmax")
foo(a, b)
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ['cuda', 'opencl', 'metal', 'rocm']:
check_device(device)
def test_softmax():
verify_softmax(32, 10)
verify_softmax(3, 4)
def verify_log_softmax(m, n):
A = tvm.placeholder((m, n), name='A')
B = topi.cpp.nn.log_softmax(A)
# confirm lower works
s = tvm.create_schedule([B.op])
tvm.lower(s, [A, B], simple_mode=True)
a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
b_np = topi.testing.log_softmax_python(a_np)
def check_device(device):
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.default_schedule(target, [B], False)
else:
s = topi.cpp.cuda.schedule_softmax(target, [B])
ctx = tvm.context(device, 0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
foo = tvm.build(s, [A, B], device, name="log_softmax")
foo(a, b)
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ["cuda", "opencl", "metal", "rocm"]:
check_device(device)
def test_log_softmax():
verify_log_softmax(32, 10)
verify_log_softmax(3, 4)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
test_softmax()
test_log_softmax()
"""Test code for tensor operator"""
import numpy as np
import tvm
import topi
def verify_elemwise_sum(num_args, dtype):
shape = (3,5,4)
tvm_placeholders = []
for i in range(num_args):
tvm_placeholders.append(
tvm.placeholder(shape, name="data"+str(i), dtype=dtype))
esum = topi.cpp.elemwise_sum(tvm_placeholders)
s = tvm.create_schedule([esum.op])
def get_ref_data():
np_nd = [np.random.uniform(0, 10, size=shape).astype(dtype)
for i in range(num_args)]
return np_nd
np_nd = get_ref_data()
def check_device(device):
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
ctx = tvm.context(device, 0)
out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
f = tvm.build(s, tvm_placeholders + [esum], device, name="elemwise_sum")
tvm_nd = [tvm.nd.array(nd, ctx) for nd in np_nd] + [out]
f(*tvm_nd)
np_out = np.sum(np.array(np_nd), axis=0)
tvm.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5)
for device in ["llvm"]:
check_device(device)
def verify_full(shape, dtype, fill_value):
A = tvm.placeholder(shape, dtype=dtype, name="A")
B = topi.cpp.full_like(A, fill_value)
C = topi.cpp.full(shape, dtype, fill_value)
s1 = tvm.create_schedule([B.op])
s2 = tvm.create_schedule([C.op])
def get_ref_data():
return np.full(shape, fill_value, dtype)
np_nd = get_ref_data()
def check_device(device):
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
target = topi.cpp.TEST_create_target(device)
ctx = tvm.context(device, 0)
out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
f = tvm.build(s1, [A, B], device, name="full_like")
f(tvm.nd.array(np.zeros(shape, dtype), ctx), out)
tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
f = tvm.build(s2, [C], device, name="full")
f(out)
tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
for device in ["llvm"]:
check_device(device)
def test_elemwise_sum():
verify_elemwise_sum(1, "float32")
verify_elemwise_sum(5, "float32")
verify_elemwise_sum(4, "int32")
def test_full():
verify_full((3,4,5), "float32", 3.14)
verify_full((10,), "int32", 7)
if __name__ == "__main__":
test_elemwise_sum()
test_full()
......@@ -34,7 +34,7 @@ import numpy as np
# our customized lowering pass to manipulate the IR directly instead of using schedule primitives.
#
n = tvm.const(128)
n = tvm.const(128, "int32")
a = tvm.placeholder((n, ), name="a")
b = tvm.placeholder((n, ), name="b")
c = tvm.compute((n, ), lambda i: a[i] + b[i], name='c')
......
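For context on this tutorial hunk: a customized lowering pass in this version of TVM is usually registered through build_config's add_lower_pass option. A minimal self-contained sketch with a hypothetical no-op pass (the pass name and phase number are illustrative only):

    import tvm

    def my_pass(stmt):
        # a no-op pass; a real pass would rewrite stmt using tvm.ir_pass utilities
        return stmt

    n = tvm.const(128, "int32")
    a = tvm.placeholder((n,), name="a")
    b = tvm.compute((n,), lambda i: a[i] + 1, name="b")
    s = tvm.create_schedule(b.op)

    # hook the pass in at lowering phase 1
    with tvm.build_config(add_lower_pass=[(1, my_pass)]):
        print(tvm.lower(s, [a, b], simple_mode=True))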
......@@ -46,7 +46,7 @@ Apad = tvm.compute(
lambda yy, xx, cc, nn: tvm.select(
tvm.all(yy >= pad, yy - pad < in_size,
xx >= pad, xx - pad < in_size),
A[yy - pad, xx - pad, cc, nn], tvm.const(0.)),
A[yy - pad, xx - pad, cc, nn], tvm.const(0., "float32")),
name='Apad')
# Create reduction variables
rc = tvm.reduce_axis((0, in_channel), name='rc')
......@@ -64,7 +64,7 @@ B = tvm.compute(
###############################################################################
# Memory Hierarchy
# ----------------
#
#
# We first specify the memory hierarchy for buffers. The figure below shows the
# GPU memory hierarchy. One important difference from CPU memory hierarchy is
# that GPU provides a cache buffer called shared memory, which is managed by
......
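As a hedged sketch of the memory-hierarchy step described above (a small standalone matmul rather than the tutorial's convolution; tensor names are illustrative):

    import tvm

    n = 64
    A = tvm.placeholder((n, n), name="A")
    W = tvm.placeholder((n, n), name="W")
    k = tvm.reduce_axis((0, n), name="k")
    B = tvm.compute((n, n), lambda i, j: tvm.sum(A[i, k] * W[j, k], axis=k), name="B")
    s = tvm.create_schedule(B.op)

    # stage the inputs in GPU shared memory (the cache buffer mentioned above),
    # then in per-thread registers ("local"), and accumulate B locally
    AA = s.cache_read(A, "shared", [B])
    WW = s.cache_read(W, "shared", [B])
    AL = s.cache_read(AA, "local", [B])
    WL = s.cache_read(WW, "local", [B])
    BL = s.cache_write(B, "local")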
......@@ -700,7 +700,7 @@ def inject_alu_intrin(stmt_in):
elif isinstance(loop_body.value, tvm.expr.Load):
alu_opcode = env.dev.ALU_OPCODE_SHR
lhs = loop_body.value
rhs = tvm.const(0)
rhs = tvm.const(0, "int32")
else:
raise RuntimeError(
"Expression not recognized %s, %s, %s" % (
......