Commit 6fdb662e authored by Pariksheet Pinjari, committed by Tianqi Chen

CPP support for region and reorg operators (#1115)

parent 431a42bf
/*!
* Copyright (c) 2018 by Contributors
* \file cuda/vision.h
* \brief CUDA schedule for vision operations
*/
#ifndef TOPI_CUDA_VISION_H_
#define TOPI_CUDA_VISION_H_
#include "tvm/tvm.h"
#include "tvm/build_module.h"
#include "topi/tags.h"
#include "topi/detail/array_utils.h"
#include "topi/contrib/cublas.h"
#include "topi/generic/extern.h"
namespace topi {
using namespace tvm;
namespace cuda {
/*!
* \brief Create a CUDA schedule for region
*
* \param target The target to generate a schedule for.
* \param outs The output tensors.
*
* \return A schedule for the given ops.
*/
inline Schedule schedule_region(const Target &target, const Array<Tensor>& outs) {
  Array<Operation> out_ops;
  for (auto t : outs) {
    out_ops.push_back(t->op);
  }
  auto s = create_schedule(out_ops);
  auto output = outs[0]->op.output(0);
  auto num_thread = 64;
  // Schedule the softmax stage: bind the max/sum reductions to threads and
  // rfactor the sum so the cross-thread reduction can be parallelized.
  auto _schedule_softmax = [&](const Operation& softmax_op) {
    auto softmax_inputs = softmax_op->InputTensors();
    auto softmax = softmax_inputs[0];
    auto max_elem = softmax_inputs[1];
    auto expsum = softmax_inputs[2];
    auto block_x = tvm::thread_axis(Range(), "blockIdx.x");
    auto thread_x = tvm::thread_axis(Range(0, num_thread), "threadIdx.x");
    s[max_elem].bind(max_elem->op.as<ComputeOpNode>()->axis[0], block_x);
    auto k = expsum->op.as<ComputeOpNode>()->reduce_axis[0];
    IterVar ko, ki;
    s[expsum].split(k, num_thread, &ko, &ki);
    auto ef = s.rfactor(expsum, ki)[0];
    s[expsum].bind(s[expsum]->op.as<ComputeOpNode>()->axis[0], block_x);
    s[expsum].bind(s[expsum]->op.as<ComputeOpNode>()->reduce_axis[0], thread_x);
    s[ef].compute_at(s[expsum], s[expsum]->op.as<ComputeOpNode>()->reduce_axis[0]);
    // Only thread 0 writes the final reduction result.
    s[expsum].set_store_predicate(static_cast<Expr>(thread_x) == 0);
    IterVar tx, xi;
    s[softmax_op].split_by_nparts(softmax_op.as<ComputeOpNode>()->axis[1], num_thread, &tx, &xi);
    s[softmax_op].bind(tx, thread_x);
    return max_elem->op.as<ComputeOpNode>()->InputTensors()[0];
  };
  std::function<void(Operation)> traverse;
  traverse = [&](const Operation& op) {
    // Inline all one-to-one-mapping operators except the last stage (output)
    if (is_injective(op->tag)) {
      if (!detail::contains(s->outputs, op)) {
        s[op].compute_inline();
      }
      for (auto tensor : op->InputTensors()) {
        if (tensor->op->InputTensors().size() > 0) {
          traverse(tensor->op);
        }
      }
    } else if (op->tag == "softmax_output") {
      auto tensor = _schedule_softmax(op);
      if (tensor->op->InputTensors().size() > 0) {
        traverse(tensor->op);
      }
    } else {
      LOG(ERROR) << "Unsupported operator " << op->tag;
    }
  };
  traverse(outs[0]->op);
  // Bind the final output stage to blocks and threads.
  auto k = output->op.as<ComputeOpNode>()->axis[0];
  IterVar bx, tx;
  s[output].split(k, num_thread, &bx, &tx);
  s[output].bind(bx, tvm::thread_axis(Range(), "blockIdx.x"));
  s[output].bind(tx, tvm::thread_axis(Range(), "threadIdx.x"));
  return s;
}
} // namespace cuda
} // namespace topi
#endif // TOPI_CUDA_VISION_H_
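The schedule above is exposed to Python through the "topi.cuda.schedule_region" packed function registered later in this diff. A minimal end-to-end sketch of driving it from Python (illustrative only, not part of the commit; shapes and parameters are borrowed from the region test at the bottom of this page):

import numpy as np
import tvm
import topi

# 425 channels = 5 anchors * (80 classes + 4 coords + 1)
A = tvm.placeholder((1, 425, 19, 19), name='A')
B = topi.cpp.yolo2.region(A, 5, 80, 4, 0, 1)
target = topi.cpp.TEST_create_target("cuda")
s = topi.cpp.cuda.schedule_region(target, [B])  # the C++ schedule defined above
func = tvm.build(s, [A, B], "cuda", name="region")
ctx = tvm.context("cuda", 0)
a = tvm.nd.array(np.random.uniform(size=(1, 425, 19, 19)).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros((1, 425, 19, 19), dtype=B.dtype), ctx)
func(a, b)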
/*!
* Copyright (c) 2018 by Contributors
* \file rocm/vision.h
* \brief rocm schedule for region operation
*/
#ifndef TOPI_ROCM_VISION_H_
#define TOPI_ROCM_VISION_H_
#include "tvm/tvm.h"
#include "tvm/build_module.h"
#include "topi/tags.h"
#include "topi/detail/array_utils.h"
#include "topi/contrib/rocblas.h"
#include "topi/generic/extern.h"
#include "topi/cuda/vision.h"
namespace topi {
using namespace tvm;
namespace rocm {
/*!
* \brief Create a rocm schedule for region
*
* \param target The target to generate a schedule for.
* \param outs The output tensors.
*
* \return A schedule for the given ops.
*/
inline Schedule schedule_region(const Target &target, const Array<Tensor>& outs) {
return topi::cuda::schedule_region(target, outs);
}
} // namespace rocm
} // namespace topi
#endif // TOPI_ROCM_VISION_H_
/*!
* Copyright (c) 2018 by Contributors
* \brief Reorg op constructions
* \file vision/reorg.h
*/
#ifndef TOPI_VISION_REORG_H_
#define TOPI_VISION_REORG_H_
#include <algorithm>
#include <string>
#include "topi/detail/constant_utils.h"
#include "topi/reduction.h"
#include "topi/tags.h"
#include "topi/transform.h"
#include "tvm/tvm.h"
namespace topi {
namespace vision {
using namespace tvm;
/*!
* \brief Reorg operation
*
* \param data The 4-D input tensor in NCHW layout
* \param stride The input integer used as stride in reorg operation
* \param name The name of the operation
* \param tag The tag to mark the operation
*
* \return A Tensor whose op member is the reorg operation
*/
inline Tensor reorg(const Tensor &data,
                    int stride = 1,
                    std::string name = "tensor",
                    std::string tag = "reorg_output") {
  auto input_shape = data->shape;
  int batch = GetConstInt(input_shape[0]);
  int c_in = GetConstInt(input_shape[1]);
  int h_in = GetConstInt(input_shape[2]);
  int w_in = GetConstInt(input_shape[3]);
  int out_c = c_in / (stride * stride);
  // Index mapping taken from darknet's reorg layer; the intermediate tensor keeps
  // the input shape and is reshaped to the final layout below.
  auto out = tvm::compute(input_shape,
                          [&](Var b, Var k, Var j, Var i) {
                            return data(b * stride * stride,
                                        (k % out_c) * stride * stride,
                                        (j * stride + (k / out_c) / stride) * stride,
                                        (i * stride + (k / out_c) % stride));
                          },
                          name,
                          tag);
  out_c = c_in * stride * stride;
  int out_h = h_in / stride;
  int out_w = w_in / stride;
  Array<Expr> out_shape = {batch, out_c, out_h, out_w};
  return reshape(out, out_shape);
}
} // namespace vision
} // namespace topi
#endif // TOPI_VISION_REORG_H_
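Reorg rearranges a (batch, c_in, h, w) activation into (batch, c_in * stride * stride, h / stride, w / stride) without changing the number of elements. A quick shape-level sketch of calling the new C++ operator from Python (illustrative; the dimensions match the reorg test at the bottom of this page):

import tvm
import topi

A = tvm.placeholder((1, 64, 38, 38), name='A')
B = topi.cpp.vision.reorg(A, 2)  # expected output shape: (1, 256, 19, 19)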
/*!
* Copyright (c) 2018 by Contributors
* \brief Region op constructions
* \file vision/yolo2/region.h
*/
#ifndef TOPI_VISION_YOLO2_REGION_H_
#define TOPI_VISION_YOLO2_REGION_H_
#include <algorithm>
#include <string>
#include "topi/detail/constant_utils.h"
#include "topi/reduction.h"
#include "topi/tags.h"
#include "topi/transform.h"
#include "topi/nn/softmax.h"
#include "tvm/tvm.h"
namespace topi {
namespace vision {
namespace yolo2 {
using namespace tvm;
using namespace nn;
/*!
* \brief region operation
*
* \param data The 4-D input tensor whose channel dimension equals num * (classes + coords + 1)
* \param num Darknet layer parameter n
* \param classes number of classes in the yolo model
* \param coords Darknet layer parameter coords
* \param background Darknet layer parameter background
* \param l_softmax if true apply softmax
* \param name The name of the operation
* \param tag The tag to mark the operation
*
* \return A Tensor whose op member is the region operation
*/
inline Tensor region(const Tensor &data,
                     int num,
                     int classes,
                     int coords,
                     int background,
                     int l_softmax,
                     std::string name = "tensor",
                     std::string tag = "region_output") {
  auto input_shape = data->shape;
  int split_size = classes + coords + 1;
  // Group the channels per anchor box: (batch, num, coords + 1 + classes, h, w).
  Array<Expr> intermediate_shape = {input_shape[0],
                                    num,
                                    split_size,
                                    input_shape[2],
                                    input_shape[3]};
  auto data_block = reshape(data, intermediate_shape);
  Array<Expr> split_indices;
  for (int i = 1; i < split_size; ++i) {
    split_indices.push_back(i);
  }
  // Split into single-channel slices so each attribute can be transformed separately.
  Array<Tensor> split_res = split(data_block, split_indices, 2);
  split_res.Set(0, sigmoid(split_res[0]));
  split_res.Set(1, sigmoid(split_res[1]));
  if (!background) {
    split_res.Set(coords, sigmoid(split_res[coords]));
  }
  if (l_softmax) {
    // Softmax over the class scores (and the objectness slice when background is set).
    int offset = coords + static_cast<int>(!background);
    Array<Tensor> softmax_input(split_res.begin() + offset, split_res.end());
    auto softmax_output = softmax(concatenate(softmax_input, 2), 2);
    Array<Tensor> data_block_1(split_res.begin(), split_res.begin() + offset);
    data_block_1.push_back(softmax_output);
    split_res = data_block_1;
  }
  Tensor out = concatenate(split_res, 2);
  return reshape(out, input_shape);
}
} // namespace yolo2
} // namespace vision
} // namespace topi
#endif // TOPI_VISION_YOLO2_REGION_H_
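In NumPy terms, region reshapes the channels into num groups of (coords + 1 + classes) attributes, applies sigmoid to the x/y offsets (and to the objectness slice when background is 0), optionally applies softmax to the remaining slices, and reshapes back. A rough NumPy sketch of that behaviour, written from the C++ above (this is not the topi.testing reference implementation):

import numpy as np

def region_ref(x, num, classes, coords, background, do_softmax):
    batch, c_in, h_in, w_in = x.shape
    split_size = classes + coords + 1
    y = x.reshape(batch, num, split_size, h_in, w_in).astype(np.float64)
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    y[:, :, 0] = sigmoid(y[:, :, 0])                # x offsets
    y[:, :, 1] = sigmoid(y[:, :, 1])                # y offsets
    if not background:
        y[:, :, coords] = sigmoid(y[:, :, coords])  # objectness
    if do_softmax:
        off = coords + int(not background)
        e = np.exp(y[:, :, off:] - y[:, :, off:].max(axis=2, keepdims=True))
        y[:, :, off:] = e / e.sum(axis=2, keepdims=True)
    return y.reshape(batch, c_in, h_in, w_in)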
......@@ -46,6 +46,10 @@ rocm = _create_module("rocm")
_init_api_prefix("topi.cpp.rocm", "topi.rocm")
x86 = _create_module("x86")
_init_api_prefix("topi.cpp.x86", "topi.x86")
vision = _create_module("vision")
_init_api_prefix("topi.cpp.vision", "topi.vision")
yolo2 = _create_module("vision.yolo2")
_init_api_prefix("topi.cpp.vision.yolo2", "topi.vision.yolo2")
class IntVector(object):
"""Handle to std::vector<int> instance """
......
......@@ -16,4 +16,5 @@ from .pooling import schedule_pool, schedule_global_pool
from .conv2d_transpose_nchw import schedule_conv2d_transpose_nchw
from .extern import schedule_extern
from .vision import schedule_region
from .vision import schedule_reorg
from .nn import schedule_lrn, schedule_l2norm
......@@ -2,8 +2,26 @@
"""Schedule for vision operators"""
from __future__ import absolute_import as _abs
import tvm
from .. import tag
from .. import generic
from .. import cpp
@generic.schedule_reorg.register(["cuda", "gpu"])
def schedule_reorg(outs):
    """Schedule for reorg operator.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of reorg
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for reorg.
    """
    target = tvm.target.current_target(allow_none=False)
    cpp_target = cpp.TEST_create_target(target.target_name)
    return cpp.cuda.schedule_injective(cpp_target, outs)
@generic.schedule_region.register(["cuda", "gpu"])
def schedule_region(outs):
......@@ -19,47 +37,6 @@ def schedule_region(outs):
    s: Schedule
        The computation schedule for region.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])
    output = outs[0].op.output(0)
    # num_thread = 64 works for larger tensors; bigger values hit resource_unavailable errors
    num_thread = 64

    def _schedule_softmax(softmax_op):
        softmax = softmax_op.input_tensors[0]
        max_elem = softmax_op.input_tensors[1]
        expsum = softmax_op.input_tensors[2]
        block_x = tvm.thread_axis("blockIdx.x")
        thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
        s[max_elem].bind(max_elem.op.axis[0], block_x)
        k = expsum.op.reduce_axis[0]
        ko, ki = s[expsum].split(k, factor=num_thread)
        ef = s.rfactor(expsum, ki)
        s[expsum].bind(s[expsum].op.axis[0], block_x)
        s[expsum].bind(s[expsum].op.reduce_axis[0], thread_x)
        s[ef].compute_at(s[expsum], s[expsum].op.reduce_axis[0])
        s[expsum].set_store_predicate(thread_x.var.equal(0))
        tx, xi = s[softmax_op].split(softmax_op.axis[1], nparts=num_thread)
        s[softmax_op].bind(softmax_op.axis[0], block_x)
        s[softmax_op].bind(tx, thread_x)
        return max_elem.op.input_tensors[0]

    def _traverse(op):
        if tag.is_injective(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors:
                    _traverse(tensor.op)
        elif op.tag == 'softmax_output':
            tensor = _schedule_softmax(op)
            if tensor.op.input_tensors:
                _traverse(tensor.op)
        else:
            raise RuntimeError("Unsupported operator: %s" % op.tag)

    _traverse(outs[0].op)
    k = output.op.axis[0]
    bx, tx = s[output].split(k, factor=num_thread)
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    return s
    target = tvm.target.current_target(allow_none=False)
    cpp_target = cpp.TEST_create_target(target.target_name)
    return cpp.cuda.schedule_region(cpp_target, outs)
"""Generic vision operators"""
from __future__ import absolute_import as _abs
import tvm
from .. import cpp
def _default_schedule(outs, auto_inline):
"""Default schedule for llvm."""
......@@ -47,7 +48,9 @@ def schedule_reorg(outs):
s: Schedule
The computation schedule for the op.
"""
return _default_schedule(outs, False)
target = tvm.target.current_target(allow_none=False)
cpp_target = cpp.TEST_create_target(target.target_name)
return cpp.generic.default_schedule(cpp_target, outs, False)
@tvm.target.generic_func
def schedule_region(outs):
......@@ -64,4 +67,6 @@ def schedule_region(outs):
    s: Schedule
        The computation schedule for the op.
    """
    return _default_schedule(outs, False)
    target = tvm.target.current_target(allow_none=False)
    cpp_target = cpp.TEST_create_target(target.target_name)
    return cpp.generic.default_schedule(cpp_target, outs, False)
# pylint: disable=invalid-name, unused-variable
"""Schedule for vision operator"""
from __future__ import absolute_import as _abs
import topi
import tvm
from .. import generic
from .. import cpp
@generic.schedule_region.register(["rocm"])
def schedule_region(outs):
......@@ -19,4 +20,6 @@ def schedule_region(outs):
    s: Schedule
        The computation schedule for region.
    """
    return topi.cuda.schedule_region(outs)
    target = tvm.target.current_target(allow_none=False)
    cpp_target = cpp.TEST_create_target(target.target_name)
    return cpp.rocm.schedule_region(cpp_target, outs)
......@@ -5,8 +5,7 @@ Reorg operator, used in darknet.
"""
from __future__ import absolute_import as _abs
import tvm
from .. import util
from .. import transform
from .. import cpp
@tvm.target.generic_func
def reorg(data, stride):
......@@ -25,15 +24,4 @@ def reorg(data, stride):
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, c_in, h_in, w_in = util.get_const_tuple(data.shape)
    out_c = int(c_in / (stride * stride))
    out = tvm.compute((batch, c_in, h_in, w_in), lambda b, k, j, i:
                      data[b * stride * stride,
                           (k % out_c) * stride * stride,
                           (j*stride + (k / out_c) / stride) * stride,
                           (i*stride + (k / out_c) % stride)],
                      tag="reorg")
    out_c = int(c_in * stride * stride)
    out_h = int(h_in / stride)
    out_w = int(w_in / stride)
    return transform.reshape(out, (batch, out_c, out_h, out_w))
    return cpp.vision.reorg(data, stride)
......@@ -6,10 +6,7 @@ Region operator, used in darknet.
"""
from __future__ import absolute_import as _abs
import tvm
from ... import transform
from ... import util
from ... import math
from ... import nn
from ... import cpp
@tvm.target.generic_func
def region(data, num, classes, coords, background, softmax=True):
......@@ -39,25 +36,4 @@ def region(data, num, classes, coords, background, softmax=True):
    out : tvm.Tensor
        4-D with shape [batch, c_in, h_in, w_in]
    """
    batch, c_in, h_in, w_in = util.get_const_tuple(data.shape)
    split_indices = classes+coords+1
    data_block = transform.reshape(data, (batch, num, split_indices, h_in, w_in))
    split_res = transform.split(data_block, split_indices, 2)
    split_res[0] = math.sigmoid(split_res[0])
    split_res[1] = math.sigmoid(split_res[1])
    if not background:
        split_res[coords] = math.sigmoid(split_res[coords])
    if softmax:
        offset = coords + int(not background)
        data_block_1 = []
        data_block_1.append(transform.concatenate(split_res[0:offset], 2))
        temp_out = transform.concatenate(split_res[offset:split_indices], 2)
        temp_out = nn.softmax(temp_out, axis=2)
        data_block_1.append(temp_out)
        split_res = data_block_1
    out = transform.concatenate(split_res, 2)
    out = transform.reshape(out, data.shape)
    return out
    return cpp.yolo2.region(data, num, classes, coords, background, softmax)
......@@ -24,6 +24,8 @@
#include <topi/nn/pooling.h>
#include <topi/nn/softmax.h>
#include <topi/vision/reorg.h>
#include <topi/vision/yolo2/region.h>
#include <topi/generic/default.h>
#include <topi/generic/extern.h>
#include <topi/generic/injective.h>
......@@ -34,12 +36,14 @@
#include <topi/cuda/pooling.h>
#include <topi/cuda/reduction.h>
#include <topi/cuda/softmax.h>
#include <topi/cuda/vision.h>
#include <topi/x86/bnn.h>
#include <topi/x86/default.h>
#include <topi/x86/injective.h>
#include <topi/rocm/dense.h>
#include <topi/rocm/vision.h>
namespace tvm {
namespace runtime {
......@@ -338,6 +342,14 @@ TVM_REGISTER_GLOBAL("topi.nn.log_softmax")
  *rv = nn::log_softmax(args[0]);
  });

TVM_REGISTER_GLOBAL("topi.vision.reorg")
.set_body([](TVMArgs args, TVMRetValue *rv) {
  *rv = vision::reorg(args[0], args[1]);
  });

TVM_REGISTER_GLOBAL("topi.vision.yolo2.region")
.set_body([](TVMArgs args, TVMRetValue *rv) {
  *rv = vision::yolo2::region(args[0], args[1], args[2], args[3], args[4], args[5]);
  });
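Besides the topi.cpp Python modules, the two registrations above are also reachable directly from the global function registry. A quick check from Python (illustrative; assumes the TOPI shared library has been loaded):

import tvm

reorg_f = tvm.get_global_func("topi.vision.reorg")
region_f = tvm.get_global_func("topi.vision.yolo2.region")
# Both are PackedFunc handles wrapping the C++ implementations registered above.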
/* Generic schedules */
TVM_REGISTER_GLOBAL("topi.generic.default_schedule")
.set_body([](TVMArgs args, TVMRetValue *rv) {
......@@ -394,6 +406,10 @@ TVM_REGISTER_GLOBAL("topi.rocm.schedule_dense")
  *rv = topi::rocm::schedule_dense(args[0], args[1]);
  });

TVM_REGISTER_GLOBAL("topi.rocm.schedule_region")
.set_body([](TVMArgs args, TVMRetValue *rv) {
  *rv = topi::rocm::schedule_region(args[0], args[1]);
  });
/* CUDA schedules */
TVM_REGISTER_GLOBAL("topi.cuda.dense_cuda")
.set_body([](TVMArgs args, TVMRetValue *rv) {
......@@ -435,6 +451,11 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_softmax")
  *rv = topi::cuda::schedule_softmax(args[0], args[1]);
  });

TVM_REGISTER_GLOBAL("topi.cuda.schedule_region")
.set_body([](TVMArgs args, TVMRetValue *rv) {
  *rv = topi::cuda::schedule_region(args[0], args[1]);
  });
/*! \brief Builder function for instantiating schedules. */
using FTVMScheduleBuilder = std::function<
  tvm::Schedule(const tvm::Target& target, const tvm::Array<tvm::Tensor>& outs)>;
......
......@@ -3,6 +3,8 @@ import numpy as np
import topi
from topi.util import get_const_tuple
import tvm
import topi.testing
def verify_region(batch, in_size, in_channel, n, classes, coords, background, l_softmax):
    '''Verify region operator by comparing outputs from tvm and numpy implementation'''
    in_height = in_width = in_size
......@@ -27,7 +29,10 @@ def verify_region(batch, in_size, in_channel, n, classes, coords, background, l_
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            if device == 'llvm':
                s = topi.generic.vision.schedule_region([B])
            else:
                s = topi.cuda.vision.schedule_region([B])
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        func = tvm.build(s, [A, B], device)
......
......@@ -3,6 +3,7 @@ import numpy as np
import topi
from topi.util import get_const_tuple
import tvm
import topi.testing
def verify_reorg(batch, in_size, in_channel, stride):
    '''Verify reorg operator by comparing outputs from tvm and numpy implementation'''
......@@ -29,8 +30,10 @@ def verify_reorg(batch, in_size, in_channel, stride):
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_injective([B])
            if device == 'llvm':
                s = topi.generic.schedule_reorg([B])
            else:
                s = topi.cuda.schedule_reorg([B])
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        func = tvm.build(s, [A, B], device)
......
"""Test code for region"""
import logging
import numpy as np
import tvm
import topi
import topi.testing
from topi.util import get_const_tuple
def verify_region(batch, in_size, in_channel, n, classes, coords, background, l_softmax):
'''Verify region operator by comparing outputs from tvm and numpy implementation'''
in_height = in_width = in_size
A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
B = topi.cpp.yolo2.region(A, n, classes, coords, background, l_softmax)
a_shape = get_const_tuple(A.shape)
dtype = A.dtype
def get_ref_data_region():
'''Randomly initialize the data variables and get refernce output for the region operation'''
a_np = np.random.uniform(size=a_shape).astype(dtype)
b_np = topi.testing.region_python(a_np, n, classes, coords, background, l_softmax)
return a_np, b_np
a_np, b_np = get_ref_data_region()
def check_device(device):
'''Check the device is available and if so, build and run the program'''
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.default_schedule(target, [B], False)
else:
s = topi.cpp.rocm.schedule_region(target, [B])
ctx = tvm.context(device, 0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
func = tvm.build(s, [A, B], device, name="region")
func(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']:
check_device(device)
def test_region():
verify_region(1, 19, 425, 5, 80, 4, 0, 1)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
test_region()
"""Test code for reorg"""
import logging
import numpy as np
import tvm
import topi
import topi.testing
from topi.util import get_const_tuple
def verify_reorg(batch, in_size, in_channel, stride):
'''Verify reorg operator by comparing outputs from tvm and numpy implementation'''
in_height = in_width = in_size
A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
B = topi.cpp.vision.reorg(A, stride)
a_shape = get_const_tuple(A.shape)
dtype = A.dtype
def get_ref_data_reorg():
'''Randomly initialize the data variables and get refernce output for the reorg operation'''
a_np = np.random.uniform(size=a_shape).astype(dtype)
b_np = topi.testing.reorg_python(a_np, stride)
return a_np, b_np
a_np, b_np = get_ref_data_reorg()
def check_device(device):
'''Check the device is available and if so, build and run the program'''
if not tvm.module.enabled(device):
print("Skip because %s is not enabled" % device)
return
print("Running on target: %s" % device)
target = topi.cpp.TEST_create_target(device)
if device == "llvm":
s = topi.cpp.generic.default_schedule(target, [B], False)
else:
s = topi.cpp.cuda.schedule_injective(target, [B])
ctx = tvm.context(device, 0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
func = tvm.build(s, [A, B], device, name="reorg")
func(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']:
check_device(device)
def test_reorg():
verify_reorg(1, 38, 64, 2)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
test_reorg()