Commit 672147c8 by Lianmin Zheng, committed by Tianqi Chen

add conv2d transpose and fix bugs (#1566)

parent 6d4cf448
......@@ -254,6 +254,7 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
int groups;
std::string layout;
std::string kernel_layout;
int out_dtype;
bool use_bias;
DMLC_DECLARE_PARAMETER(Conv2DTransposeParam) {
......@@ -286,6 +287,10 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
.describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc."
"'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
"dimensions respectively.");
DMLC_DECLARE_DTYPE_FIELD(out_dtype)
.add_enum("same", -1)
.set_default(-1)
.describe("Output data type, set to explicit type under mixed precision setting");
DMLC_DECLARE_FIELD(use_bias).set_default(true)
.describe("Whether the layer uses a bias vector.");
}
......
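For reference, a minimal sketch (not part of this commit) of how the new out_dtype attribute can be set from the NNVM symbol API; the channel count and the int32 accumulation type are illustrative only.

import nnvm.symbol as sym

data = sym.Variable("data")
# out_dtype defaults to "same" (output dtype follows the input dtype);
# an explicit type such as "int32" requests wider accumulation when the
# graph is compiled with low-precision inputs.
net = sym.conv2d_transpose(data, channels=32, kernel_size=(4, 4),
                           strides=(2, 2), padding=(1, 1),
                           out_dtype="int32", use_bias=False)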
......@@ -42,28 +42,31 @@ def deconv2d_bn_relu(data, prefix, **kwargs):
def get_symbol(oshape, ngf=128, code=None):
"""get symbol of dcgan generator"""
assert oshape[-1] == 32, "Only support 32x32 image"
assert oshape[-2] == 32, "Only support 32x32 image"
assert oshape[-1] == 64, "Only support 64x64 image"
assert oshape[-2] == 64, "Only support 64x64 image"
code = sym.Variable("data") if code is None else code
net = sym.dense(code, name="g1", units=4*4*ngf*4, use_bias=False)
net = sym.dense(code, name="g1", units=4*4*ngf*8, use_bias=False)
net = sym.relu(net)
# 4 x 4
net = sym.reshape(net, shape=(-1, ngf * 4, 4, 4))
net = sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
# 8 x 8
net = deconv2d_bn_relu(
net, ishape=(ngf * 4, 4, 4), oshape=(ngf * 2, 8, 8), kshape=(4, 4), prefix="g2")
net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
# 16x16
net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 8, 8), oshape=(ngf, 16, 16), kshape=(4, 4), prefix="g3")
net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
# 32x32
net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
# 64x64
net = deconv2d(
net, ishape=(ngf, 16, 16), oshape=oshape[-3:], kshape=(4, 4), name="g4_deconv")
net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
net = sym.tanh(net)
return net
def get_workload(batch_size, oshape=(3, 32, 32), ngf=128, random_len=100, dtype="float32"):
def get_workload(batch_size, oshape=(3, 64, 64), ngf=128, random_len=100, dtype="float32"):
"""Get benchmark workload for a DCGAN generator
Parameters
......
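A quick size check for the new 64x64 generator, assuming the deconv2d helper (not shown in this hunk) uses stride 2 and padding 1 with the 4x4 kernels above:

def deconv_out(in_size, kernel=4, stride=2, pad=1):
    # transposed-conv output size: (in - 1) * stride - 2 * pad + kernel
    return (in_size - 1) * stride - 2 * pad + kernel

size = 4                                  # after the reshape to (ngf*8, 4, 4)
for prefix in ["g2", "g3", "g4", "g5_deconv"]:
    size = deconv_out(size)
    print(prefix, size)                   # 8, 16, 32, 64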
......@@ -251,11 +251,15 @@ def compute_conv2d_transpose(attrs, inputs, _):
strides = attrs.get_int_tuple("strides")
dilation = attrs.get_int_tuple("dilation")
groups = attrs.get_int("groups")
out_dtype = attrs.get_string("out_dtype")
layout = attrs["layout"]
out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
assert layout == "NCHW", "only support nchw for now"
assert dilation == (1, 1), "not support dilate now"
assert groups == 1, "only support groups == 1 for now"
out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding)
out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding, out_dtype)
if attrs.get_bool("use_bias"):
bias = inputs[2]
bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
......
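The expand_dims call above reshapes the bias so it broadcasts over the NCHW output; a NumPy illustration (the channel count is arbitrary):

import numpy as np

bias = np.zeros(16)                        # (C,)
bias = bias[:, np.newaxis, np.newaxis]     # (C, 1, 1), like expand_dims(axis=1, num_newaxis=2)
out = np.zeros((1, 16, 8, 8)) + bias       # broadcasts over (N, C, H, W)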
......@@ -556,7 +556,7 @@ v (batch_size, channels, out_height, out_width) if `layout` is `NCHW`
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DTransposeParam>)
.set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DTransposeParam>)
.set_attr<FInferShape>("FInferShape", Conv2DTransposeInferShape)
.set_attr<FInferType>("FInferType", ElemwiseType<-1, 1>)
.set_attr<FInferType>("FInferType", Conv2DInferType<Conv2DTransposeParam>)
.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DTransposeCorrectLayout)
.set_num_outputs(1)
.set_num_inputs(UseBiasNumInputs<Conv2DTransposeParam>)
......
......@@ -40,24 +40,27 @@ def deconv2d_bn_relu(data, prefix, **kwargs):
net = mx.sym.Activation(net, name="%s_act" % prefix, act_type='relu')
return net
def get_symbol(oshape=(3, 32, 32), ngf=128, code=None):
def get_symbol(oshape=(3, 64, 64), ngf=128, code=None):
"""get symbol of dcgan generator"""
assert oshape[-1] == 32, "Only support 32x32 image"
assert oshape[-2] == 32, "Only support 32x32 image"
assert oshape[-1] == 64, "Only support 64x64 image"
assert oshape[-2] == 64, "Only support 64x64 image"
code = mx.sym.Variable("data") if code is None else code
net = mx.sym.FullyConnected(code, name="g1", num_hidden=4*4*ngf*4, no_bias=True, flatten=False)
net = mx.sym.FullyConnected(code, name="g1", num_hidden=ngf*8*4*4, no_bias=True, flatten=False)
net = mx.sym.Activation(net, act_type='relu')
# 4 x 4
net = mx.sym.reshape(net, shape=(-1, ngf * 4, 4, 4))
net = mx.sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
# 8 x 8
net = deconv2d_bn_relu(
net, ishape=(ngf * 4, 4, 4), oshape=(ngf * 2, 8, 8), kshape=(4, 4), prefix="g2")
net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
# 16x16
net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 8, 8), oshape=(ngf, 16, 16), kshape=(4, 4), prefix="g3")
net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
# 32x32
net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
# 64x64
net = deconv2d(
net, ishape=(ngf, 16, 16), oshape=oshape[-3:], kshape=(4, 4), name="g4_deconv")
net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
net = mx.sym.Activation(net, act_type='tanh')
return net
......@@ -345,7 +345,9 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
msg = msg.split('\n')[-2].split(": ")[1]
except Exception: # pylint: disable=broad-except
pass
raise InstantiationError(msg)
res_pack.append(MeasureResult((InstantiationError(msg),),
MeasureErrorNo.INSTANTIATION_ERROR,
tstamp - tic, tstamp))
else:
res_pack.append(MeasureResult((RuntimeError(msg),),
MeasureErrorNo.COMPILE_HOST,
......
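With this change an invalid schedule no longer aborts the whole measurement batch; it is reported as a normal MeasureResult. A hedged sketch of how a caller might filter such results after a tuning run:

from tvm.autotvm.measure import MeasureErrorNo

def valid_results(inputs, results):
    """Keep only (input, result) pairs that did not fail at instantiation."""
    return [(inp, res) for inp, res in zip(inputs, results)
            if res.error_no != MeasureErrorNo.INSTANTIATION_ERROR]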
......@@ -21,6 +21,8 @@ import numpy as np
from tvm import target as _target
from .space import ConfigSpace
logger = logging.getLogger('autotvm')
class DispatchContext(object):
......@@ -120,7 +122,12 @@ def dispatcher(fworkload):
raise RuntimeError("DispatchContext is not initialized")
workload = func(*args, **kwargs)
cfg = context.query(tgt, workload)
if cfg.template_key:
return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
else:
assert dispatch_dict, "No func registered for this dispatcher"
for v in dispatch_dict.values():
return v(cfg, *args, **kwargs)
fdecorate = decorate(fworkload, dispatch_func)
fdecorate.register = register
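The added branch means a fallback configuration (which carries no template_key) is routed to whichever implementation is registered, instead of failing on the dictionary lookup. A toy sketch of the decision, with hypothetical names:

def dispatch(cfg, dispatch_dict, *args, **kwargs):
    if cfg.template_key:                   # tuned record names a concrete template
        return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
    # fallback ConfigSpace: pick any registered implementation
    assert dispatch_dict, "No func registered for this dispatcher"
    impl = next(iter(dispatch_dict.values()))
    return impl(cfg, *args, **kwargs)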
......@@ -159,13 +166,18 @@ class ApplyHistoryBest(DispatchContext):
Otherwise, it is an iterator.
default: ConfigEntity, optional
The default config to return when no history records
allow_fallback: bool
Whether to allow using a fallback configuration if no tuned
result can be found.
"""
def __init__(self, records, default=None):
def __init__(self, records, default=None, allow_fallback=False):
super(ApplyHistoryBest, self).__init__()
self.best_by_targetkey = {}
self.best_by_model = {}
self._default = default
self._allow_fallback = allow_fallback
self.fallback = {}
if records:
self.load(records)
......@@ -244,5 +256,18 @@ class ApplyHistoryBest(DispatchContext):
if self._default:
return self._default
if self._allow_fallback:
key = (target, workload)
if key in self.fallback:
return self.fallback[key]
logger.warning(
"Cannot find config for target=%s, workload=%s. A fallback configuration "
"is used, which may bring great performance regression.", target, workload)
cfg = ConfigSpace()
self.fallback[key] = cfg
return cfg
raise RuntimeError(
"Cannot find config for target=%s, workload=%s" % (target, workload))
"Cannot find config for target=%s, workload=%s. You need to do tuning "
"for this workload to get the config." % (target, workload))
......@@ -53,12 +53,14 @@ class TaskExtractEnv:
import nnvm
self.symbol2topi = {
nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw]
nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw],
nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose],
}
self.topi_to_task = {
topi.nn.conv2d: "topi_nn_conv2d",
topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw",
}
self._register_dummy()
......@@ -110,6 +112,15 @@ class TaskExtractEnv:
s = topi.generic.schedule_depthwise_conv2d_nchw([C])
return s, [A, W, C]
@register("topi_nn_conv2d_transpose_nchw")
def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
A, W = args[:2]
C = topi.nn.conv2d_transpose_nchw(*args, **kwargs)
s = topi.generic.schedule_conv2d_transpose_nchw([C])
return s, [A, W, C]
def reset(self):
"""Reset task collections"""
self.task_collection = []
......
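With conv2d_transpose registered, transposed-convolution workloads can now be extracted for tuning. A hedged sketch using the DCGAN test workload above; the target and the (1, 100) latent shape follow get_workload's defaults and are assumptions here:

import nnvm
import nnvm.testing
import tvm
from tvm import autotvm

net, params = nnvm.testing.dcgan.get_workload(batch_size=1)
tasks = autotvm.task.extract_from_graph(
    net, shape={"data": (1, 100)}, dtype="float32",
    symbols=(nnvm.sym.conv2d_transpose,),
    target=tvm.target.arm_cpu("rasp3b"))
print(tasks)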
......@@ -9,6 +9,7 @@ TVM will download these parameters for you when you create the target for the fi
import logging
import os
import json
import sys
from .task import ApplyHistoryBest
from .. import target as _target
......@@ -27,7 +28,7 @@ def _alias(name):
return table.get(name, name)
def context(target, extra_files=None):
def context(target, extra_files=None, allow_fallback=False):
"""Return the dispatch context with pre-tuned parameters.
The corresponding downloaded *.log files under tophub root path will be loaded.
Users can also add their own files in argument `extra_files`.
......@@ -38,9 +39,12 @@ def context(target, extra_files=None):
The compilation target
extra_files: list of str, optional
Extra log files to load
allow_fallback: bool
Whether to allow using a fallback configuration if no tuned
result can be found.
"""
rootpath = AUTOTVM_TOPHUB_ROOT_PATH
best_context = ApplyHistoryBest([])
best_context = ApplyHistoryBest([], allow_fallback=allow_fallback)
if isinstance(target, str):
target = _target.create(target)
......@@ -99,7 +103,15 @@ def check_package(backend):
if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, backend + ".log")):
return
if sys.version_info >= (3,):
import urllib.request as urllib2
else:
import urllib2
try:
download_package(backend)
except urllib2.URLError:
logging.warning("Failed to download tophub package for %s", backend)
def list_packages():
......
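End-to-end, the flag is passed through tophub.context; a hedged sketch compiling the DCGAN generator with fallback configs for any workload missing from the downloaded logs (shapes follow the DCGAN defaults):

import nnvm.compiler
import nnvm.testing
import tvm
from tvm import autotvm

net, params = nnvm.testing.dcgan.get_workload(batch_size=1)
target = tvm.target.arm_cpu("rasp3b")
with autotvm.tophub.context(target, allow_fallback=True):
    graph, lib, params = nnvm.compiler.build(
        net, target=target, shape={"data": (1, 100)},
        params=params, dtype="float32")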
......@@ -118,8 +118,8 @@ def progress_bar(total, prefix=''):
ctx.cur_flops = flops
ctx.best_flops = tuner.best_flops
sys.stdout.write('%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
'| %.2f s\r' %
sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
'| %.2f s' %
(prefix, ctx.cur_flops/1e9, ctx.best_flops/1e9, ctx.ct, ctx.total,
time.time() - tic))
sys.stdout.flush()
......
......@@ -2,4 +2,5 @@
from . import conv2d
from . import depthwise_conv2d
from . import conv2d_transpose
from . import bitserial_conv2d
......@@ -42,7 +42,7 @@ def schedule_conv2d_nchw_arm_cpu(cfg, outs):
def _callback(op):
# schedule conv2d
if 'spatial_conv_output' in op.tag:
if 'spatial_conv2d_output' in op.tag:
output = op.output(0)
conv = op.input_tensors[0]
......@@ -60,7 +60,7 @@ def schedule_conv2d_nchw_arm_cpu(cfg, outs):
_schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
if 'winograd_conv_output' in op.tag:
if 'winograd_conv2d_output' in op.tag:
output = op.output(0)
_schedule_winograd(cfg, s, output, outs[0])
......@@ -72,7 +72,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
assert layout == "NCHW", "Only support NCHW"
out_dtype = out_dtype or data.dtype
_, CI, IH, IW = get_const_tuple(data.shape)
N, CI, IH, IW = get_const_tuple(data.shape)
if len(kernel.shape) == 4:
pre_packed = False
CO, _, KH, KW = get_const_tuple(kernel.shape)
......@@ -81,13 +81,12 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
CO = CO * VC
pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (KH, KW))
pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
N = 1
OH = (IH + pad_top + pad_down - KH) // HSTR + 1
OH = (IH + pad_top + pad_bottom - KH) // HSTR + 1
OW = (IW + pad_left + pad_right - KW) // WSTR + 1
data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_down, pad_right])
data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right])
# ==================== define configuration space ====================
n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
......@@ -145,7 +144,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
output = tvm.compute(oshape, lambda n, co, h, w:
conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
name='output_unpack', tag='spatial_conv_output',
name='output_unpack', tag='spatial_conv2d_output',
attrs={'workload': _conv_arg_to_workload(data, kernel, strides, padding,
layout, out_dtype)})
return output
......@@ -195,11 +194,14 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
if kernel_vec.op.name == 'kernel_vec':
co, _, _, _, _ = s[kernel_vec].op.axis
if autotvm.GLOBAL_SCOPE.in_tuning:
# kernel packing will be pre-computed during compliation, so we skip
# kernel packing will be pre-computed during compilation, so we skip
# this part to make tuning records correct
s[kernel_vec].pragma(co, 'debug_skip_region')
else:
s[kernel_vec].parallel(co)
elif kernel_vec.op.name == 'kernel_vec_conv2d_transpose': # for conv2d transpose
co, _, _, _, _ = s[kernel_vec].op.axis
s[kernel_vec].parallel(co)
return s
......@@ -330,7 +332,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_
# unpack output
output = tvm.compute((N, K, H, W), lambda n, k, h, w:
Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
name='output', tag='winograd_conv_output',
name='output', tag='winograd_conv2d_output',
attrs={'workload': _winograd_conv_arg_to_workload(
data, kernel, strides, padding, layout, out_dtype, tile_size)})
......@@ -462,7 +464,7 @@ def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
s = tvm.create_schedule([x.op for x in outs])
def _callback(op):
if 'winograd_conv_output' in op.tag:
if 'winograd_conv2d_output' in op.tag:
output = op.output(0)
_schedule_winograd(cfg, s, output, outs[0])
......
# pylint: disable=invalid-name, unused-variable
"""Transposed 2D convolution operators (sometimes called Deconvolution)."""
from __future__ import absolute_import as _abs
import tvm
from tvm import autotvm
from ..generic import schedule_conv2d_transpose_nchw
from ..nn import conv2d_transpose_nchw, dilate, pad, get_pad_tuple
from ..util import get_const_tuple, traverse_inline
from .conv2d import _schedule_spatial_pack
@autotvm.task.register_topi_compute(conv2d_transpose_nchw, "arm_cpu", "direct")
def conv2d_transpose_nchw_arm(cfg, Input, Filter, strides, padding, out_dtype):
"""Transposed 2D convolution nchw forward operator.
Parameters
----------
Input : tvm.Tensor
4-D with shape [batch, in_channel, in_height, in_width]
Filter : tvm.Tensor
4-D with shape [in_channel, num_filter, filter_height, filter_width]
strides : tuple of two ints
The spatial stride along height and width
padding : int or str
Padding size, or ['VALID', 'SAME']
out_dtype: str
The output data type. This is used for mixed precision.
Returns
-------
Output : tvm.Tensor
4-D with shape [batch, out_channel, out_height, out_width]
"""
return _decl_spatial_pack(cfg, Input, Filter, strides, padding, "NCHW", out_dtype, 2)
def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
assert layout == "NCHW", "Only support NCHW"
out_dtype = out_dtype or data.dtype
N, CI, IH, IW = get_const_tuple(data.shape)
_, CO, KH, KW = get_const_tuple(kernel.shape)
pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
bpad_top, bpad_bottom = KH - 1 - pad_top, KH - 1 - pad_bottom
bpad_left, bpad_right = KW - 1 - pad_left, KW - 1 - pad_right
HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
OH = (IH - 1) * HSTR - pad_top - pad_bottom + KH
OW = (IW - 1) * WSTR - pad_left - pad_right + KW
dilated_input = dilate(data, [1, 1, HSTR, WSTR])
data_pad = pad(dilated_input, [0, 0, bpad_top, bpad_left], [0, 0, bpad_bottom, bpad_right])
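# Transposed convolution is realized as: dilate the input by the stride,
# pad each border with (kernel - 1 - pad), then run a plain correlation
# against the kernel flipped along H and W (the KH-1-kh / KW-1-kw indexing below).
# Example: IH = 4, HSTR = 2, KH = 4, pad_top = pad_bottom = 1
#          -> OH = (4 - 1) * 2 - 1 - 1 + 4 = 8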
# ==================== define configuration space ====================
n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
if num_tile == 2: # for arm cpu
co, vc = cfg.define_split('tile_co', co, num_outputs=2)
oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
elif num_tile == 3: # for mali gpu
co, _, vc = cfg.define_split('tile_co', co, num_outputs=3)
oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3)
ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3)
else:
raise RuntimeError("Invalid num_tile")
cfg.define_reorder("reorder_0",
[n, co, oh, ow, ci, kh, kw, vh, vw, vc],
policy='candidate', candidate=[
[n, co, oh, ow, ci, kh, kw, vh, vw, vc],
[n, co, oh, ow, ci, kh, kw, vc, vh, vw]])
cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
# ====================================================================
VC = cfg["tile_co"].size[-1]
VH = cfg["tile_oh"].size[-1]
VW = cfg["tile_ow"].size[-1]
dvshape = (N, OH // VH, OW // VW, CI, VH + KH-1, VW + KW-1)
kvshape = (CO // VC, CI, KH, KW, VC)
ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
oshape = (N, CO, OH, OW)
data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
data_pad[n][ci][h*VH + vh][w*VW + vw],
name='data_vec')
kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc:
kernel[ci][co*VC+vc][kh][kw],
name='kernel_vec_conv2d_transpose')
ci = tvm.reduce_axis((0, CI), name='ci')
kh = tvm.reduce_axis((0, KH), name='kh')
kw = tvm.reduce_axis((0, KW), name='kw')
conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
tvm.sum(data_vec[n, h, w, ci, vh + kh, vw + kw].astype(out_dtype) *
kernel_vec[co, ci, KH - 1 - kh, KW - 1 - kw, vc].astype(out_dtype),
axis=[ci, kh, kw]), name='conv')
output = tvm.compute(oshape, lambda n, co, h, w:
conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
name='output_unpack', tag='spatial_conv2d_transpose_output')
return output
# register customized schedule for arm cpu.
@autotvm.task.register_topi_schedule(schedule_conv2d_transpose_nchw, "arm_cpu", "direct")
def schedule_conv2d_transpose_arm(cfg, outs):
"""Schedule conv2d transpose for arm cpu"""
s = tvm.create_schedule([x.op for x in outs])
def _callback(op):
if 'spatial_conv2d_transpose_output' in op.tag:
output = op.output(0)
conv = op.input_tensors[0]
data_vec = conv.op.input_tensors[0]
data_pad = data_vec.op.input_tensors[0]
dilated_input = data_pad.op.input_tensors[0]
s[data_pad].compute_inline()
s[dilated_input].compute_inline()
kernel_vec = conv.op.input_tensors[1]
if kernel_vec.op.name == 'kernel_vec':
kernel = kernel_vec.op.input_tensors[0]
else:
kernel = kernel_vec
if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
s[kernel].compute_inline()
_schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
traverse_inline(s, outs[0].op, _callback)
return s
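A hedged sketch of tuning this template on its own; the task name matches the register("topi_nn_conv2d_transpose_nchw") entry added earlier in this commit, while the shapes and target string are illustrative assumptions:

from tvm import autotvm

task = autotvm.task.create(
    "topi_nn_conv2d_transpose_nchw",
    args=(('TENSOR', (1, 128, 16, 16), 'float32'),   # data   (N, CI, IH, IW)
          ('TENSOR', (128, 64, 4, 4), 'float32'),    # kernel (CI, CO, KH, KW)
          (2, 2), 1, 'float32'),                     # strides, padding, out_dtype
    target="llvm -device=arm_cpu")
print(task.config_space)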
......@@ -15,7 +15,16 @@ autotvm.task.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', 'direct',
# register customized schedule for arm cpu.
@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct')
def schedule_depthwise_conv2d_nchw_(cfg, outs):
"""Schedule depthwise conv2d"""
"""Schedule depthwise conv2d
Parameters
----------
cfg: ConfigEntity
The configuration of this template
outs: Array of Tensor
The computation graph description of depthwise conv2d
in the format of an array of tensors.
"""
outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
s = tvm.create_schedule([x.op for x in outs])
......@@ -79,10 +88,8 @@ def schedule_depthwise_conv2d_nchw_(cfg, outs):
return s
scheduled_ops = []
def _callback(op):
if op.tag == 'depthwise_conv2d_nchw' and op not in scheduled_ops:
if op.tag == 'depthwise_conv2d_nchw':
output = op.output(0)
kernel = op.input_tensors[1]
data = op.input_tensors[0]
......@@ -92,7 +99,5 @@ def schedule_depthwise_conv2d_nchw_(cfg, outs):
data = data_pad.op.input_tensors[0]
_schedule(cfg, s, data, data_pad, kernel, output)
scheduled_ops.append(op)
traverse_inline(s, outs[0].op, _callback)
return s
......@@ -10,7 +10,7 @@ from ..util import simplify
@tvm.target.generic_func
def conv2d_transpose_nchw(Input, Filter, strides, padding):
def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype):
"""Transposed 2D convolution nchw forward operator.
Parameters
......@@ -27,6 +27,9 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding):
padding : int or str
Padding size, or ['VALID', 'SAME']
out_dtype : str
The output data type. This is used for mixed precision.
Returns
-------
Output : tvm.Tensor
......@@ -58,7 +61,8 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding):
Output = tvm.compute(
(batch, out_c, out_h, out_w),
lambda b, c, h, w: tvm.sum(
PaddedInput[b, dc, h+dh, w+dw] * Filter[dc, c, filter_h-1-dh, filter_w-1-dw],
PaddedInput[b, dc, h+dh, w+dw].astype(out_dtype) *
Filter[dc, c, filter_h-1-dh, filter_w-1-dw].astype(out_dtype),
axis=[dc, dh, dw]), tag="conv2d_transpose_nchw")
return Output
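The extra argument enables a mixed-precision declaration, e.g. int8 data accumulated in int32; a minimal sketch with illustrative shapes:

import tvm
import topi

data = tvm.placeholder((1, 32, 8, 8), dtype="int8", name="data")
kernel = tvm.placeholder((32, 16, 4, 4), dtype="int8", name="kernel")
# args: data, kernel, strides, padding, out_dtype
out = topi.nn.conv2d_transpose_nchw(data, kernel, (2, 2), 1, "int32")
assert out.dtype == "int32"   # operands are cast to out_dtype before the reduction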
......@@ -40,7 +40,7 @@ def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, paddin
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def test_conv2d():
with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b')):
with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b'), allow_fallback=True):
verify_conv2d(1, 56, 64, 64, 3, 1, 1)
if __name__ == "__main__":
......
......@@ -12,7 +12,7 @@ def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel,
A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
W = tvm.placeholder((in_channel, num_filter, kernel, kernel), name='W')
B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding)
B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding, A.dtype)
C = topi.nn.relu(B)
a_shape = get_const_tuple(A.shape)
......
......@@ -62,7 +62,7 @@ import tvm.contrib.graph_runtime as runtime
def get_network(name, batch_size):
"""Get the symbol definition and random weight of a network"""
shape = {"data": (batch_size, 3, 224, 224)}
input_shape = (batch_size, 3, 224, 224)
output_shape = (batch_size, 1000)
if name =='resnet-18':
......@@ -90,7 +90,7 @@ def get_network(name, batch_size):
else:
raise ValueError("Unsupported network: " + name)
return net, params, shape, output_shape
return net, params, input_shape, output_shape
#################################################################
# Start RPC Tracker
......@@ -226,8 +226,8 @@ tuning_option = {
def tune_tasks(tasks,
measure_option,
tuner='xgb',
n_trial=500,
early_stopping=200,
n_trial=1000,
early_stopping=None,
log_filename='tuning.log',
use_transfer_learning=True,
try_winograd=True):
......@@ -283,10 +283,10 @@ def tune_tasks(tasks,
def tune_and_evaluate():
# extract workloads from nnvm graph
print("Extract tasks...")
net, params, shape, out_shape = get_network(network, batch_size=1)
tasks = autotvm.task.extract_from_graph(net, shape=shape, dtype=dtype,
symbols=(nnvm.sym.conv2d,),
target=target)
net, params, input_shape, out_shape = get_network(network, batch_size=1)
tasks = autotvm.task.extract_from_graph(net, target=target,
shape={'data': input_shape}, dtype=dtype,
symbols=(nnvm.sym.conv2d,))
# run tuning tasks
print("Tuning...")
......@@ -298,7 +298,7 @@ def tune_and_evaluate():
with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
graph, lib, params = nnvm.compiler.build(
net, target=target,
shape=shape, params=params, dtype=dtype)
shape={'data': input_shape}, params=params, dtype=dtype)
# export library
tmp = tempdir()
......@@ -319,7 +319,7 @@ def tune_and_evaluate():
# upload parameters to device
ctx = remote.context(str(target), 0)
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
data_tvm = tvm.nd.array((np.random.uniform(size=shape['data'])).astype(dtype))
data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
module = runtime.create(graph, rlib, ctx)
module.set_input('data', data_tvm)
module.set_input(**rparams)
......@@ -341,35 +341,33 @@ def tune_and_evaluate():
# -------------
# The tuning needs to train xgboost models and use them for prediction.
# So a high performance CPU is recommended.
# It takes about 1.5 hour on a 32T AMD Ryzen CPU.
# It takes about 2 hours on a 32T AMD Ryzen CPU.
# One sample output is
#
# .. code-block:: bash
#
# Extract tasks...
# Tuning...
# [Task 1/16] Current/Best: 13.15/ 20.49 GFLOPS | Progress: (297/1000) | 348.51 s Done.
# [Task 2/16] Current/Best: 16.66/ 22.64 GFLOPS | Progress: (475/1000) | 415.42 s Done.
# [Task 3/16] Current/Best: 10.33/ 14.19 GFLOPS | Progress: (306/1000) | 239.61 s Done.
# [Task 4/16] Current/Best: 13.29/ 20.88 GFLOPS | Progress: (242/1000) | 227.48 s Done.
# [Task 5/16] Current/Best: 13.28/ 15.61 GFLOPS | Progress: (237/1000) | 191.56 s Done.
# [Task 6/16] Current/Best: 20.16/ 23.86 GFLOPS | Progress: (315/1000) | 304.31 s Done.
# [Task 7/16] Current/Best: 9.22/ 22.00 GFLOPS | Progress: (458/1000) | 433.26 s Done.
# [Task 8/16] Current/Best: 14.12/ 17.80 GFLOPS | Progress: (270/1000) | 240.73 s Done.
# [Task 9/16] Current/Best: 14.59/ 24.02 GFLOPS | Progress: (209/1000) | 213.61 s Done.
# [Task 10/16] Current/Best: 9.86/ 21.74 GFLOPS | Progress: (367/1000) | 359.93 s Done.
# [Task 11/16] Current/Best: 5.01/ 18.86 GFLOPS | Progress: (202/1000) | 191.18 s Done.
# [Task 12/16] Current/Best: 8.61/ 25.23 GFLOPS | Progress: (220/1000) | 220.74 s Done.
# [Task 13/16] Current/Best: 10.87/ 25.79 GFLOPS | Progress: (465/1000) | 902.14 s Done.
# [Task 14/16] Current/Best: 15.33/ 29.38 GFLOPS | Progress: (239/1000) | 481.33 s Done.
# [Task 15/16] Current/Best: 12.09/ 38.60 GFLOPS | Progress: (476/1000) | 928.35 s Done.
# [Task 16/16] Current/Best: 16.77/ 47.08 GFLOPS | Progress: (255/1000) | 439.91 s Done.
# [Task 1/16] Current/Best: 18.85/ 19.67 GFLOPS | Progress: (353/1000) | 387.05 s Done.
# [Task 2/16] Current/Best: 16.10/ 23.50 GFLOPS | Progress: (444/1000) | 379.99 s Done.
# [Task 3/16] Current/Best: 5.49/ 13.96 GFLOPS | Progress: (610/1000) | 485.87 s Done.
# [Task 4/16] Current/Best: 10.07/ 20.48 GFLOPS | Progress: (430/1000) | 391.66 s Done.
# [Task 5/16] Current/Best: 11.50/ 15.50 GFLOPS | Progress: (374/1000) | 356.03 s Done.
# [Task 6/16] Current/Best: 10.76/ 23.77 GFLOPS | Progress: (526/1000) | 526.42 s Done.
# [Task 7/16] Current/Best: 12.71/ 22.03 GFLOPS | Progress: (341/1000) | 322.96 s Done.
# [Task 8/16] Current/Best: 8.60/ 17.91 GFLOPS | Progress: (272/1000) | 236.08 s Done.
# [Task 9/16] Current/Best: 15.37/ 23.62 GFLOPS | Progress: (275/1000) | 275.18 s Done.
# [Task 10/16] Current/Best: 6.62/ 23.01 GFLOPS | Progress: (330/1000) | 315.02 s Done.
# [Task 11/16] Current/Best: 1.85/ 21.39 GFLOPS | Progress: (281/1000) | 239.19 s Done.
# [Task 12/16] Current/Best: 15.41/ 24.02 GFLOPS | Progress: (258/1000) | 270.82 s Done.
# [Task 13/16] Current/Best: 17.96/ 25.79 GFLOPS | Progress: (380/1000) | 738.29 s Done.
# [Task 14/16] Current/Best: 14.81/ 31.17 GFLOPS | Progress: (413/1000) | 799.21 s Done.
# [Task 15/16] Current/Best: 24.39/ 40.97 GFLOPS | Progress: (355/1000) | 700.25 s Done.
# [Task 16/16] Current/Best: 9.42/ 49.90 GFLOPS | Progress: (348/1000) | 603.84 s Done.
# Compile...
# Upload...
# Evaluate inference time cost...
# Mean inference time (std dev): 156.51 ms (0.89 ms)
#
# Mean inference time (std dev): 157.29 ms (1.74 ms)
######################################################################
#
......
......@@ -109,7 +109,7 @@ print(out.asnumpy().flatten()[0:10])
# Save and Load Compiled Module
# -----------------------------
# We can also save the graph, lib and parameters into files and load them
# back in development environment.
# back in the deployment environment.
####################################################
......