Commit 8d263e37 by Lianmin Zheng, committed by Tianqi Chen

[CODEGEN] fix vector conversion for opencl (#783)

* support more argument types in depthwise_conv2d

* mark all pointers as 'restrict' & fix vector conversion for OpenCL

parent 3ff2d958
@@ -38,14 +38,17 @@ void CodeGenC::AddFunction(LoweredFunc f) {
     if (i != 0) stream << ", ";
     if (v.type().is_handle()) {
       auto it = alloc_storage_scope_.find(v.get());
-      if (it != alloc_storage_scope_.end()) {
+      if (it != alloc_storage_scope_.end())
         PrintStorageScope(it->second, stream);
-        stream << ' ';
-      }
-      if (handle_data_type_.count(v.get())) {
-        PrintType(handle_data_type_.at(v.get()), stream);
-        stream << "*";
+      stream << ' ';
+      if (handle_data_type_.count(v.get())) {
+        PrintType(handle_data_type_.at(v.get()), stream);
+      } else {
+        stream << "void";
+      }
+      stream << "*";
       if (f->is_restricted && restrict_keyword_.length() != 0) {
         stream << ' ' << restrict_keyword_;
       }
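With this change the parameter printer treats every handle argument uniformly: the element type (or void when no type was registered) is printed first, then the *, and the restrict qualifier is applied to all pointer parameters instead of only the typed ones. A minimal sketch of how to observe the effect, assuming the TVM 0.x Python API this commit targets (names, shapes and split factors here are illustrative):

    import tvm

    n = tvm.var("n")
    A = tvm.placeholder((n,), name="A")
    B = tvm.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = tvm.create_schedule(B.op)
    bx, tx = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
    f = tvm.build(s, [A, B], "opencl")
    # Every pointer parameter should now carry the restrict qualifier,
    # e.g. "__kernel void ...(__global float* restrict A, ...)".
    print(f.imported_modules[0].get_source())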
@@ -402,12 +405,9 @@ inline void PrintBinaryIntrinsitc(const Call* op,
   }
 }
 void CodeGenC::VisitExpr_(const Cast *op, std::ostream& os) { // NOLINT(*)
-  os << "(";
-  this->PrintType(op->type, os);
-  os << ")";
-  os << '(';
-  this->PrintExpr(op->value, os);
-  os << ')';
+  std::stringstream value;
+  this->PrintExpr(op->value, value);
+  os << CastFromTo(value.str(), op->value.type(), op->type);
 }
 void CodeGenC::VisitExpr_(const Variable *op, std::ostream& os) { // NOLINT(*)
   os << GetVarID(op);
......
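The Cast visitor no longer hard-codes a C-style "(type)(value)" cast. It renders the operand into a temporary stream and delegates the cast syntax to CastFromTo, which becomes a virtual hook (see the header change below) so backends with different cast syntax, such as OpenCL, can override it. A small sketch of an expression that exercises this path, assuming the Python API of this era (the dtype choice is illustrative):

    import tvm

    n = tvm.var("n")
    A = tvm.placeholder((n,), name="A", dtype="int8")
    # astype lowers to a Cast node; CodeGenC now prints it via CastFromTo.
    B = tvm.compute((n,), lambda i: A[i].astype("float32"), name="B")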
@@ -142,6 +142,8 @@ class CodeGenC :
   // print store of single element.
   virtual void PrintVecElemStore(
       const std::string& vec, Type t, int i, const std::string& value);
+  // Get a cast type from to
+  virtual std::string CastFromTo(std::string value, Type from, Type target);

 protected:
   // Print reference to struct location
@@ -150,8 +152,6 @@ class CodeGenC :
   // print reference to a buffer as type t in index.
   std::string GetBufferRef(
       Type t, const Variable* buffer, Expr index);
-  // Get a cast type from to
-  std::string CastFromTo(std::string value, Type from, Type target);
   /*!
    * \brief If buffer is allocated as type t.
    * \param buf_var The buffer variable.
......
@@ -175,6 +175,22 @@ void CodeGenOpenCL::PrintStorageScope(
   }
 }
+std::string CodeGenOpenCL::CastFromTo(std::string value, Type from, Type target) {
+  if (from == target) return value;
+  std::ostringstream os;
+  if (target.lanes() == 1) {
+    os << "((";
+    this->PrintType(target, os);
+    os << ")" << value << ")";
+  } else {  // convert vector type
+    os << "(";
+    os << "convert_";
+    this->PrintType(target, os);
+    os << "(" << value << "))";
+  }
+  return os.str();
+}
+
 void CodeGenOpenCL::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLINT(*)
   std::string v = PrintExpr(op->value);
   os << "((";
......
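OpenCL C does not allow C-style casts between vector types; conversions must go through the built-in convert_<type>() family, e.g. convert_float4 for a 4-lane conversion to float. The override above therefore keeps the ((type)value) form for scalars (lanes() == 1) and emits convert_<type>(value) for vectors, which is the actual fix behind the commit title. An end-to-end sketch, assuming the TVM 0.x API (shape and vector factor are illustrative):

    import tvm

    A = tvm.placeholder((64,), name="A", dtype="int8")
    B = tvm.compute((64,), lambda i: A[i].astype("float32"), name="B")
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=4)
    s[B].bind(xo, tvm.thread_axis("threadIdx.x"))
    s[B].vectorize(xi)
    f = tvm.build(s, [A, B], "opencl")
    # The 4-lane cast should now print as convert_float4(...) instead of
    # an invalid C-style vector cast such as ((float4)(...)).
    print(f.imported_modules[0].get_source())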
@@ -34,6 +34,8 @@ class CodeGenOpenCL final : public CodeGenC {
   // the address of load/store
   void PrintVecAddr(const Variable* buffer, Type t,
                     Expr base, std::ostream& os); // NOLINT(*)
+  std::string CastFromTo(std::string value, Type from, Type target); // NOLINT(*)
+
   // overload visitor
   void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*)
......
@@ -191,9 +191,6 @@ class HostDeviceSplitter : public IRMutator {
       auto it = handle_data_type_.find(v.get());
       if (it != handle_data_type_.end()) {
         n->handle_data_type.Set(v, it->second);
-      } else {
-        // int32 as a placeholder
-        n->handle_data_type.Set(v, make_const(UInt(32), 0));
       }
     }
   }
......
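This deletion works together with the void fallback added to CodeGenC::AddFunction in the first hunk: handles without a known element type used to be registered with a dummy placeholder (the comment says int32, though the code built a UInt(32) constant), which forced the generated signature to claim a concrete pointer type. With the fallback in place, unknown handles are simply left out of handle_data_type and are printed as void*.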
@@ -31,9 +31,14 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype='float32'):
     Output : tvm.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
+    out_dtype = Input.dtype
     batch, in_channel, in_height, in_width = Input.shape
     filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape
-    stride_h, stride_w = stride
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
         padding, (filter_height, filter_width))
@@ -82,7 +87,10 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding):
     """
     batch, in_height, in_width, in_channel = Input.shape
     filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape
-    stride_h, stride_w = stride
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
         padding, (filter_height, filter_width))
@@ -131,7 +139,10 @@ def depthwise_conv2d_backward_input_nhwc(Filter, Out_grad, oshape, ishape, strid
     batch, in_h, in_w, in_c = ishape
     _, out_h, out_w, out_c = oshape
     filter_h, filter_w, _, channel_multiplier = Filter.shape
-    stride_h, stride_w = stride
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
     dilated_out_grad = dilate(Out_grad, [1, stride_h, stride_w, 1], name='dilated_out_grad')
@@ -186,7 +197,10 @@ def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, strid
     batch, out_h, out_w, out_c = oshape
     filter_h, filter_w, _, channel_multiplier = fshape
     in_c = Input.shape[3].value
-    stride_h, stride_w = stride
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
     pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (filter_h, filter_w))
......
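All four depthwise_conv2d declarations above now normalize stride the same way, so callers may pass either a single int or an explicit (stride_h, stride_w) pair. An illustrative usage sketch (the shapes are made up; assumes the TOPI API of this commit):

    import tvm
    import topi

    Input = tvm.placeholder((1, 32, 112, 112), name='Input')
    Filter = tvm.placeholder((32, 1, 3, 3), name='Filter')
    # Equivalent: an int stride is broadcast to both spatial axes.
    Out1 = topi.nn.depthwise_conv2d_nchw(Input, Filter, stride=2, padding='SAME')
    Out2 = topi.nn.depthwise_conv2d_nchw(Input, Filter, stride=(2, 2), padding='SAME')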
@@ -27,7 +27,11 @@ def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding):
     """
     batch, in_channel, in_height, in_width = input_np.shape
     _, channel_multiplier, filter_height, filter_width = filter_np.shape
-    stride_h, stride_w = stride
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+
     # calculate output shape
     if padding == 'VALID':
         out_channel = in_channel * channel_multiplier
@@ -84,7 +88,11 @@ def depthwise_conv2d_python_nhwc(input_np, filter_np, stride, padding):
     """
     batch, in_height, in_width, in_channel = input_np.shape
     filter_height, filter_width, _, channel_multiplier = filter_np.shape
-    stride_h, stride_w = stride
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+
    # calculate output shape
    if padding == 'VALID':
        out_channel = in_channel * channel_multiplier
......
@@ -7,18 +7,17 @@ from tvm.contrib.pickle_memoize import memoize
 from topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_nhwc

-def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding):
+def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding):
     in_width = in_height
     filter_channel = in_channel
     filter_width = filter_height
-    stride_w = stride_h
     # placeholder
     Input = tvm.placeholder((batch, in_channel, in_height, in_width), name='Input')
     Filter = tvm.placeholder((filter_channel, channel_multiplier, filter_height, filter_width), name='Filter')
     Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
     Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
     # declare
-    DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, stride=[stride_h, stride_w], padding=padding)
+    DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, stride=stride, padding=padding)
     ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
     Relu = topi.nn.relu(ScaleShift)
@@ -56,7 +55,7 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu
     shift_np = np.random.uniform(size=shift_shape).astype(dtype)
     # correctness with scipy
     depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
-        input_np, filter_np, stride=[stride_h, stride_w], padding=padding)
+        input_np, filter_np, stride=stride, padding=padding)
     scale_shift_scipy = np.zeros(shape=scale_shift_shape)
     for c in range(in_channel * channel_multiplier):
         scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c]
......
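Since the test helper now forwards stride verbatim to both the TOPI declaration and the scipy reference, it can exercise both calling conventions. Hypothetical invocations (the workload numbers below are illustrative, not taken from the test suite):

    # An int stride and a pair stride both hit the new normalization path.
    depthwise_conv2d_with_workload_nchw(1, 32, 112, 1, 3, 1, "SAME")
    depthwise_conv2d_with_workload_nchw(4, 256, 64, 2, 5, (2, 2), "VALID")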