Commit f81e2873 by Bing Xu, committed by Leyuan Wang

[AlterLayout] NCHWc upsampling, fix depthwise conv (#2806)

* [AlterLayout] NCHW upsampling

* [Relay][Pass] Fix Depthwise AlterLayout
parent 7ef32cdc
......@@ -345,3 +345,28 @@ def schedule_contrib_conv2d_NCHWc(attrs, outs, target):
reg.register_pattern("nn.contrib_conv2d_NCHWc",
OpPattern.OUT_ELEMWISE_FUSABLE)
@reg.register_compute("nn.contrib_depthwise_conv2d_NCHWc")
def compute_contrib_depthwise_conv2d_NCHWc(attrs, inputs, out_dtype, target):
"""Compute definition of depthwise conv2d NCHWc"""
# pylint: disable=assignment-from-no-return
padding = attrs.get_int_tuple("padding")
strides = attrs.get_int_tuple("strides")
dilation = attrs.get_int_tuple("dilation")
data_layout = attrs.get_str("data_layout")
out_layout = attrs.get_str("out_layout")
out_dtype = attrs.get_str("out_dtype")
out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype
out = topi.nn.depthwise_conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation,
data_layout, out_layout, out_dtype)
return [out]
@reg.register_schedule("nn.contrib_depthwise_conv2d_NCHWc")
def schedule_contrib_depthwise_conv2d_NCHWc(attrs, outs, target):
"""Schedule definition of contrib_conv2d_NCHWc"""
with target:
return topi.generic.schedule_depthwise_conv2d_NCHWc(outs)
reg.register_pattern("nn.contrib_depthwise_conv2d_NCHWc",
OpPattern.OUT_ELEMWISE_FUSABLE)
......@@ -927,6 +927,70 @@ def contrib_conv2d_nchwc(data,
groups, channels, kernel_size, data_layout,
kernel_layout, out_layout, out_dtype)
def contrib_depthwise_conv2d_nchwc(data,
kernel,
strides=(1, 1),
padding=(0, 0),
dilation=(1, 1),
groups=1,
channels=None,
kernel_size=None,
data_layout="NCHW8c",
kernel_layout="OIHW",
out_layout="",
out_dtype=""):
r"""Variant of 2D depthwise convolution.
This operator takes the weight as the depthwise convolution kernel
and depthwise convolves it with data to produce an output, following a specialized
NCHWc data layout.
Parameters
----------
data : tvm.relay.Expr
The input data to the operator.
kernel : tvm.relay.Expr
The kernel expression.
strides : tuple of int, optional
The strides of convolution.
padding : tuple of int, optional
The padding applied to both sides of the input before convolution.
dilation : tuple of int, optional
Specifies the dilation rate to be used for dilated convolution.
groups : int, optional
Number of groups for grouped convolution.
channels : int, optional
Number of output channels of this convolution.
kernel_size : tuple of int, optional
The spatial dimensions of the convolution kernel.
data_layout : str, optional
Layout of the input.
kernel_layout : str, optional
Layout of the weight.
out_layout : str, optional
Layout of the output; by default, out_layout is the same as data_layout.
out_dtype : str, optional
Specifies the output data type for mixed precision conv2d.
Returns
-------
result : tvm.relay.Expr
The computed result.
"""
return _make.contrib_depthwise_conv2d_NCHWc(data, kernel, strides, padding, dilation,
groups, channels, kernel_size, data_layout,
kernel_layout, out_layout, out_dtype)
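For illustration, a minimal usage sketch of this wrapper, assuming 32 input channels packed into two chunks of 16 (NCHW16c data, OIHW1i16o kernel); the concrete shapes and block size are assumptions chosen for the example, not part of this change:
# Hypothetical shapes: 32 depthwise channels split into 2 chunks of 16 (assumed block size).
from tvm import relay
data = relay.var("data", shape=(1, 2, 28, 28, 16))        # 5-D packed input (NCHW16c)
kernel = relay.var("kernel", shape=(2, 1, 3, 3, 1, 16))   # 6-D packed depthwise kernel
out = relay.nn.contrib_depthwise_conv2d_nchwc(
    data, kernel,
    strides=(1, 1), padding=(1, 1), groups=32,
    channels=32, kernel_size=(3, 3),
    data_layout="NCHW16c", kernel_layout="OIHW1i16o", out_layout="NCHW16c")
func = relay.Function([data, kernel], out)                # output is again a 5-D packed tensor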
def contrib_conv2d_winograd_weight_transform(weight,
tile_size):
......
......@@ -582,5 +582,57 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_NCHWc")
Conv2DInferCorrectLayout<Conv2DAttrs>);
// Positional relay function to create depthwise conv2d NCHWc operator
// used by frontend FFI.
Expr MakeDepthwiseConv2DNCHWc(Expr data,
Expr kernel,
Array<IndexExpr> strides,
Array<IndexExpr> padding,
Array<IndexExpr> dilation,
int groups,
IndexExpr channels,
Array<IndexExpr> kernel_size,
std::string data_layout,
std::string kernel_layout,
std::string out_layout,
DataType out_dtype) {
auto attrs = make_node<Conv2DAttrs>();
attrs->strides = std::move(strides);
attrs->padding = std::move(padding);
attrs->dilation = std::move(dilation);
attrs->groups = groups;
attrs->channels = channels;
attrs->kernel_size = std::move(kernel_size);
attrs->data_layout = std::move(data_layout);
attrs->kernel_layout = std::move(kernel_layout);
attrs->out_layout = std::move(out_layout);
attrs->out_dtype = std::move(out_dtype);
static const Op& op = Op::Get("nn.contrib_depthwise_conv2d_NCHWc");
return CallNode::make(op, {data, kernel}, Attrs(attrs), {});
}
TVM_REGISTER_API("relay.op.nn._make.contrib_depthwise_conv2d_NCHWc")
.set_body([](const TVMArgs& args, TVMRetValue* rv) {
runtime::detail::unpack_call<Expr, 12>(MakeDepthwiseConv2DNCHWc, args, rv);
});
RELAY_REGISTER_OP("nn.contrib_depthwise_conv2d_NCHWc")
.describe(R"code(Compute conv2d with NCHWc data layout. Only supports NCHW layout.
- **data**: Input is 5D packed tensor.
- **weight**: 6D packed tensor.
- **out**: Output is 5D packed tensor
)code" TVM_ADD_FILELINE)
.set_attrs_type_key("relay.attrs.DepthwiseConv2D")
.set_num_inputs(2)
.add_argument("data", "Tensor", "The input tensor.")
.add_argument("weight", "Tensor", "The weight tensor.")
.set_support_level(10)
.add_type_rel("Conv2D", Conv2DRel)
.set_attr<FInferCorrectLayout>("FInferCorrectLayout",
Conv2DInferCorrectLayout<Conv2DAttrs>);
} // namespace relay
} // namespace tvm
......@@ -18,6 +18,31 @@ namespace relay {
TVM_REGISTER_NODE_TYPE(UpSamplingAttrs);
template <typename T>
Array<Array<Layout> > UpsamplingInferCorrectLayout(
const Attrs& attrs,
const Array<Layout>& new_in_layouts,
const Array<Layout>& old_in_layouts,
const Array<Array<IndexExpr>> &old_in_shapes) {
// NOTE: Discard "const" qualifier here.
T *params = const_cast<T*>(attrs.as<T>());
if (new_in_layouts.defined()) {
CHECK_EQ(new_in_layouts.size(), 1);
Layout raw_layout(params->layout);
Layout input = new_in_layouts[0];
if (input.IndexOf(LayoutAxis::Get('W')) == raw_layout.IndexOf(LayoutAxis::Get('W')) &&
input.IndexOf(LayoutAxis::Get('H')) == raw_layout.IndexOf(LayoutAxis::Get('H')) &&
!input.Contains(LayoutAxis::Get('w')) && !input.Contains(LayoutAxis::Get('h'))) {
params->layout = input.name(); // modify self to follow the input layout
}
}
Layout inferred_layout(params->layout);
return Array<Array<Layout> >{{inferred_layout}, {inferred_layout}};
}
bool UpSamplingRel(const Array<Type>& types,
int num_inputs,
const Attrs& attrs,
......@@ -91,6 +116,8 @@ RELAY_REGISTER_OP("nn.upsampling")
.add_argument("data", "Tensor", "The input tensor.")
.set_support_level(2)
.add_type_rel("UpSampling", UpSamplingRel)
.set_attr<FInferCorrectLayout>("FInferCorrectLayout",
UpsamplingInferCorrectLayout<UpSamplingAttrs>)
.set_attr<TOpPattern>("TOpPattern", kInjective)
.set_attr<FTVMCompute>(
"FTVMCompute", [](const Attrs& attrs,
......@@ -101,14 +128,16 @@ RELAY_REGISTER_OP("nn.upsampling")
CHECK(uattrs != nullptr);
auto out_tt = out_type.as<TensorTypeNode>();
CHECK(out_tt) << "expected a tensor type: " << out_type;
CHECK(uattrs->layout == "NCHW" || uattrs->layout == "NHWC")
const auto layout = uattrs->layout;
const auto base_layout = layout.substr(0, 4);
CHECK(base_layout == "NCHW" || layout == "NHWC")
<< "unknown layout: " << uattrs->layout;
Array<HalideIR::Expr> oshape;
if (uattrs->layout == "NCHW") {
if (base_layout == "NCHW") {
oshape.push_back(out_tt->shape[2]);
oshape.push_back(out_tt->shape[3]);
} else if (uattrs->layout == "NHWC") {
} else if (layout == "NHWC") {
oshape.push_back(out_tt->shape[1]);
oshape.push_back(out_tt->shape[2]);
}
......
......@@ -114,7 +114,7 @@ single-line-if-stmt=no
no-space-check=trailing-comma,dict-separator
# Maximum number of lines in a module
max-module-lines=1000
max-module-lines=1500
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
......
......@@ -411,6 +411,51 @@ def test_alter_layout_concatenate():
assert(alpha_equal(a, b))
def test_alter_layout_nchw_upsamping_op():
"""Test upsamping operators """
def before():
x = relay.var("x", shape=(1, 32, 28, 28))
weight = relay.var('weight', shape=(32, 32, 3, 3))
y = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1))
y = relay.nn.upsampling(y, scale=2)
y = relay.nn.avg_pool2d(y, pool_size=(2, 2), strides=(2, 2))
y = relay.Function(free_vars(y), y)
return y
@register_alter_op_layout("nn.conv2d", level=108)
def alter_conv2d(attrs, inputs, tinfos):
data, weight = inputs
new_attrs = dict(attrs)
new_attrs['data_layout'] = 'NCHW16c'
return relay.nn.conv2d(data, weight, **new_attrs)
def expected():
x = relay.var("x", shape=(1, 32, 28, 28))
weight = relay.var("weight")
x = relay.layout_transform(x, "NCHW", "NCHW16c")
y = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1),
data_layout="NCHW16c")
y = relay.nn.upsampling(y, scale=2, layout="NCHW16c")
y = relay.nn.avg_pool2d(y, pool_size=(2, 2), strides=(2, 2), layout='NCHW16c')
y = relay.layout_transform(y, "NCHW16c", "NCHW")
y = relay.Function(free_vars(y), y)
return y
a = before()
a = infer_type(a)
a = canonicalize_ops(a)
a = infer_type(a)
a = alter_op_layout(a)
a = infer_type(a)
b = expected()
b = infer_type(b)
assert(alpha_equal(a, b))
if __name__ == "__main__":
test_alter_op()
test_alter_return_none()
......@@ -420,3 +465,4 @@ if __name__ == "__main__":
test_alter_layout_broadcast_op()
test_alter_layout_scalar()
test_alter_layout_concatenate()
test_alter_layout_nchw_upsamping_op()
......@@ -4,5 +4,8 @@ export PYTHONPATH=python:topi/python
make cython || exit -1
make cython3 || exit -1
rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc
rm -rf topi/python/topi/*.pyc topi/python/topi/*/*.pyc topi/python/topi/*/*/*.pyc topi/python/topi/*/*/*/*.pyc
python -m nose -v topi/tests/python || exit -1
python3 -m nose -v topi/tests/python || exit -1
......@@ -135,6 +135,45 @@ inline Tensor resize_nearest_neighbor_nchw(const Tensor& input,
}
/*!
* \brief Resize given tensor to given shape using nearest neighbour for NCHWc
*
* \param input The input tensor.
* \param shape Output shape to resize to.
* \param align_corners To preserve centers of 4 corner pixels
* \param name Name of the operation
* \param tag The tag to mark the operation
*
* \return A Tensor resized to given shape
*/
inline Tensor resize_nearest_neighbor_nchwc(const Tensor& input,
const Array<Expr>& shape,
bool align_corners = false,
std::string name = "tensor",
std::string tag = kInjective) {
Array<Expr> out_shape;
out_shape.push_back(input->shape[0]);
out_shape.push_back(input->shape[1]);
out_shape.push_back(shape[0]);
out_shape.push_back(shape[1]);
out_shape.push_back(input->shape[4]);
Expr h_ratio = shape[0] / input->shape[2];
Expr w_ratio = shape[1] / input->shape[3];
return compute(
out_shape, [&](const Array<Var>& indices) {
Array<Expr> idx;
idx.push_back(indices[0]);
idx.push_back(indices[1]);
idx.push_back(indices[2] / h_ratio);
idx.push_back(indices[3] / w_ratio);
idx.push_back(indices[4]);
return input(idx);
}, name, tag);
}
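For reference, a NumPy sketch of the same nearest-neighbour index mapping over a packed NCHWc tensor; the helper name and the integer-ratio assumption (output size an exact multiple of the input size) are illustrative, not part of this change:
import numpy as np

def resize_nearest_nchwc_ref(x, out_h, out_w):
    """Reference nearest-neighbour resize for a (N, C_chunk, H, W, c_block) array."""
    n, cc, h, w, cb = x.shape
    h_ratio, w_ratio = out_h // h, out_w // w   # mirrors shape[i] / input->shape[i + 2] above
    out = np.empty((n, cc, out_h, out_w, cb), dtype=x.dtype)
    for oh in range(out_h):
        for ow in range(out_w):
            out[:, :, oh, ow, :] = x[:, :, oh // h_ratio, ow // w_ratio, :]
    return out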
/*!
* \brief Resize given tensor to given shape using nearest neighbour
*
* \param input The input tensor.
......@@ -153,11 +192,17 @@ inline Tensor resize_nearest_neighbor(const Tensor& input,
std::string name = "tensor",
std::string tag = kInjective) {
CHECK_EQ(align_corners, false) << "Align corners not supported for nearest neighbour";
auto base_layout = layout.substr(0, 4);
if (layout == "NHWC") {
return resize_nearest_neighbor_nhwc(input, shape, align_corners);
} else {
} else if (layout == "NCHW") {
return resize_nearest_neighbor_nchw(input, shape, align_corners);
} else if (base_layout == "NCHW") {
// NCHWc
return resize_nearest_neighbor_nchwc(input, shape, align_corners);
} else {
LOG(FATAL) << "Unknown layout: " << layout;
return Tensor();
}
}
......
......@@ -292,7 +292,7 @@ def depthwise_conv2d_NCHWc(Input, Filter, stride, padding, dilation,
5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
Filter : tvm.Tensor
4-D with shape [out_channel_chunk, filter_height, filter_width, out_channel_block]
6-D with shape [out_channel_chunk, 1, filter_height, filter_width, 1, out_channel_block]
In NCHWc depthwise convolution, the kernel's in_channel and channel_multiplier dimensions are merged into a single output-channel dimension, which is then tiled into chunks and blocks.
......@@ -317,6 +317,6 @@ def depthwise_conv2d_NCHWc(Input, Filter, stride, padding, dilation,
Returns
-------
Output : tvm.Tensor
4-D with shape [batch, out_channel, out_height, out_width]
5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
"""
raise ValueError("missing register for topi.nn.depthwise_conv2d_NCHWc")
......@@ -30,8 +30,8 @@ def upsampling(data, scale, layout="NCHW", method='NEAREST_NEIGHBOR'):
4-D with shape [batch, channel, in_height*scale, in_width*scale]
or [batch, in_height*scale, in_width*scale, channel]
"""
if layout == "NCHW":
base_layout = layout[0:4]
if base_layout == "NCHW":
out_shape = (simplify(data.shape[2] * scale), simplify(data.shape[3] * scale))
elif layout == "NHWC":
out_shape = (simplify(data.shape[1] * scale), simplify(data.shape[2] * scale))
......
# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
"""Conv2D schedule on x86"""
import logging
import tvm
from tvm import autotvm
from tvm.autotvm.task.topi_integration import deserialize_args
......@@ -16,6 +18,8 @@ from ..nn.pad import pad
from . import conv2d_avx_1x1, conv2d_avx_common
logger = logging.getLogger('topi')
def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False):
"""
Get default schedule config for the workload
......@@ -290,7 +294,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F):
batch_size, in_channel, height, width = get_const_tuple(data.shape)
groups = attrs.get_int("groups")
out_channel = attrs.get_int("channels")
out_channel = attrs.get_int("channels") if F == sym else attrs.get_int("channels").value
padding = attrs.get_int_tuple("padding")
strides = attrs.get_int_tuple("strides")
dilation = attrs.get_int_tuple("dilation")
......@@ -330,16 +334,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F):
new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
dtype=data.dtype)
if is_depthwise:
# channel, channel_multiplier, kh, kw -> out_channel_chunk, kh, kw, out_channel_block
# in which out_channel = merge(channel, channel_multiplier)
kernel_sym = copy_inputs[1]
kernel_sym = sym.reshape(kernel_sym, shape=(out_channel//oc_bn, oc_bn, kh, kw))
kernel_sym = sym.transpose(kernel_sym, axes=(0, 2, 3, 1))
copy_inputs[1] = kernel_sym
if is_depthwise:
new_attrs['kernel_layout'] = 'OIHW1i%do' % oc_bn
# Store altered operator's config
new_kernel = tvm.placeholder((out_channel//oc_bn, kh, kw, oc_bn), dtype=kernel.dtype)
new_kernel = tvm.placeholder((out_channel//oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel.dtype)
new_workload = autotvm.task.args_to_workload(
[new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name],
new_attrs['out_layout'], out_dtype], depthwise_conv2d_NCHWc)
......@@ -356,9 +355,16 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F):
new_attrs['out_layout'], out_dtype], conv2d_NCHWc)
dispatch_ctx.update(target, new_workload, cfg)
if F == sym:
return F.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs)
if is_depthwise:
if F == sym:
logging.warning("Use native layout for depthwise convolution on NNVM.")
return None
return F.nn.contrib_depthwise_conv2d_nchwc(*copy_inputs, **new_attrs)
else:
if F == sym:
return F.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs)
@autotvm.register_topi_compute(conv2d_NCHWc, 'cpu', 'direct')
......
......@@ -58,7 +58,7 @@ def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation,
layout, out_layout, out_dtype=None):
out_dtype = data.dtype if out_dtype is None else out_dtype
batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape)
out_channel_chunk, filter_height, filter_width, out_channel_block \
out_channel_chunk, _, filter_height, filter_width, __, out_channel_block \
= get_const_tuple(kernel.shape)
strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
......@@ -102,7 +102,7 @@ def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation,
oh*HSTR+kh, ow*WSTR+kw,
((oco * out_channel_block + oci) // channel_multiplier) % in_channel_block]
.astype(out_dtype) *
kernel[oco, kh, kw, oci].astype(out_dtype)),
kernel[oco, 0, kh, kw, 0, oci].astype(out_dtype)),
axis=[kh, kw]),
name='DepthwiseConv2d', tag="depthwise_conv2d_NCHWc")
return Output
......
......@@ -216,7 +216,8 @@ def _transform_kernel(kernel, bn):
out_channel = channel * channel_multiplier
kernel = np.reshape(kernel, (out_channel//bn, bn, kh, kw))
kernel = np.transpose(kernel, (0, 2, 3, 1))
return kernel
out_channel_chunk, kh, kw, out_channel_block = kernel.shape
return kernel.reshape(out_channel_chunk, 1, kh, kw, 1, out_channel_block)
def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1):
in_width = in_height
......@@ -246,7 +247,7 @@ def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_m
# placeholder
Input = tvm.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='Input')
Filter = tvm.placeholder((out_channel//oc_block, filter_height, filter_width, oc_block), name='Filter')
Filter = tvm.placeholder((out_channel//oc_block, 1, filter_height, filter_width, 1, oc_block), name='Filter')
in_layout = "NCHW%dc" % ic_block
out_layout = "NCHW%dc" % oc_block
dtype = 'float32'
......@@ -297,10 +298,12 @@ def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_m
input_tvm = tvm.nd.array(input_np, ctx)
filter_tvm = tvm.nd.array(filter_np, ctx)
depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
dtype=DepthwiseConv2d.dtype), ctx)
relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
# launch kernel 1 (depthwise_conv2d)
print(filter_tvm.shape)
f1(input_tvm, filter_tvm, depthwise_conv2d_tvm)
# launch kernel 2 (depthwise_conv2d + relu)
f2(input_tvm, filter_tvm, relu_tvm)
......