Commit 672147c8 by Lianmin Zheng, committed by Tianqi Chen

add conv2d transpose and fix bugs (#1566)

parent 6d4cf448
......@@ -254,6 +254,7 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
int groups;
std::string layout;
std::string kernel_layout;
int out_dtype;
bool use_bias;
DMLC_DECLARE_PARAMETER(Conv2DTransposeParam) {
......@@ -286,6 +287,10 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
.describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc."
"'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
"dimensions respectively.");
DMLC_DECLARE_DTYPE_FIELD(out_dtype)
.add_enum("same", -1)
.set_default(-1)
.describe("Output data type, set to explicit type under mixed precision setting");
DMLC_DECLARE_FIELD(use_bias).set_default(true)
.describe("Whether the layer uses a bias vector.");
}
......
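For reference, a minimal sketch (not part of this commit) of how the new out_dtype attribute can be set from the NNVM symbol API; the channel count and the int32 accumulation type are illustrative only.

import nnvm.symbol as sym

data = sym.Variable("data")
# out_dtype defaults to "same" (output dtype follows the input dtype);
# an explicit type such as "int32" requests wider accumulation when the
# graph is compiled with low-precision inputs.
net = sym.conv2d_transpose(data, channels=32, kernel_size=(4, 4),
                           strides=(2, 2), padding=(1, 1),
                           out_dtype="int32", use_bias=False)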
......@@ -42,28 +42,31 @@ def deconv2d_bn_relu(data, prefix, **kwargs):
def get_symbol(oshape, ngf=128, code=None):
"""get symbol of dcgan generator"""
assert oshape[-1] == 32, "Only support 32x32 image"
assert oshape[-2] == 32, "Only support 32x32 image"
assert oshape[-1] == 64, "Only support 64x64 image"
assert oshape[-2] == 64, "Only support 64x64 image"
code = sym.Variable("data") if code is None else code
net = sym.dense(code, name="g1", units=4*4*ngf*4, use_bias=False)
net = sym.dense(code, name="g1", units=4*4*ngf*8, use_bias=False)
net = sym.relu(net)
# 4 x 4
net = sym.reshape(net, shape=(-1, ngf * 4, 4, 4))
net = sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
# 8 x 8
net = deconv2d_bn_relu(
net, ishape=(ngf * 4, 4, 4), oshape=(ngf * 2, 8, 8), kshape=(4, 4), prefix="g2")
net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
# 16x16
net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 8, 8), oshape=(ngf, 16, 16), kshape=(4, 4), prefix="g3")
net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
# 32x32
net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
# 64x64
net = deconv2d(
net, ishape=(ngf, 16, 16), oshape=oshape[-3:], kshape=(4, 4), name="g4_deconv")
net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
net = sym.tanh(net)
return net
def get_workload(batch_size, oshape=(3, 32, 32), ngf=128, random_len=100, dtype="float32"):
def get_workload(batch_size, oshape=(3, 64, 64), ngf=128, random_len=100, dtype="float32"):
"""Get benchmark workload for a DCGAN generator
Parameters
......
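A quick size check for the new 64x64 generator, assuming the deconv2d helper (not shown in this hunk) uses stride 2 and padding 1 with the 4x4 kernels above:

def deconv_out(in_size, kernel=4, stride=2, pad=1):
    # transposed-conv output size: (in - 1) * stride - 2 * pad + kernel
    return (in_size - 1) * stride - 2 * pad + kernel

size = 4                                  # after the reshape to (ngf*8, 4, 4)
for prefix in ["g2", "g3", "g4", "g5_deconv"]:
    size = deconv_out(size)
    print(prefix, size)                   # 8, 16, 32, 64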
......@@ -251,11 +251,15 @@ def compute_conv2d_transpose(attrs, inputs, _):
strides = attrs.get_int_tuple("strides")
dilation = attrs.get_int_tuple("dilation")
groups = attrs.get_int("groups")
out_dtype = attrs.get_string("out_dtype")
layout = attrs["layout"]
out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
assert layout == "NCHW", "only support nchw for now"
assert dilation == (1, 1), "not support dilate now"
assert groups == 1, "only support groups == 1 for now"
out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding)
out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding, out_dtype)
if attrs.get_bool("use_bias"):
bias = inputs[2]
bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
......
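The expand_dims call above reshapes the bias so it broadcasts over the NCHW output; a NumPy illustration (the channel count is arbitrary):

import numpy as np

bias = np.zeros(16)                        # (C,)
bias = bias[:, np.newaxis, np.newaxis]     # (C, 1, 1), like expand_dims(axis=1, num_newaxis=2)
out = np.zeros((1, 16, 8, 8)) + bias       # broadcasts over (N, C, H, W)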
......@@ -556,7 +556,7 @@ v (batch_size, channels, out_height, out_width) if `layout` is `NCHW`
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DTransposeParam>)
.set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DTransposeParam>)
.set_attr<FInferShape>("FInferShape", Conv2DTransposeInferShape)
.set_attr<FInferType>("FInferType", ElemwiseType<-1, 1>)
.set_attr<FInferType>("FInferType", Conv2DInferType<Conv2DTransposeParam>)
.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DTransposeCorrectLayout)
.set_num_outputs(1)
.set_num_inputs(UseBiasNumInputs<Conv2DTransposeParam>)
......
......@@ -40,24 +40,27 @@ def deconv2d_bn_relu(data, prefix, **kwargs):
net = mx.sym.Activation(net, name="%s_act" % prefix, act_type='relu')
return net
def get_symbol(oshape=(3, 32, 32), ngf=128, code=None):
def get_symbol(oshape=(3, 64, 64), ngf=128, code=None):
"""get symbol of dcgan generator"""
assert oshape[-1] == 32, "Only support 32x32 image"
assert oshape[-2] == 32, "Only support 32x32 image"
assert oshape[-1] == 64, "Only support 64x64 image"
assert oshape[-2] == 64, "Only support 64x64 image"
code = mx.sym.Variable("data") if code is None else code
net = mx.sym.FullyConnected(code, name="g1", num_hidden=4*4*ngf*4, no_bias=True, flatten=False)
net = mx.sym.FullyConnected(code, name="g1", num_hidden=ngf*8*4*4, no_bias=True, flatten=False)
net = mx.sym.Activation(net, act_type='relu')
# 4 x 4
net = mx.sym.reshape(net, shape=(-1, ngf * 4, 4, 4))
net = mx.sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
# 8 x 8
net = deconv2d_bn_relu(
net, ishape=(ngf * 4, 4, 4), oshape=(ngf * 2, 8, 8), kshape=(4, 4), prefix="g2")
net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
# 16x16
net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 8, 8), oshape=(ngf, 16, 16), kshape=(4, 4), prefix="g3")
net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
# 32x32
net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
# 64x64
net = deconv2d(
net, ishape=(ngf, 16, 16), oshape=oshape[-3:], kshape=(4, 4), name="g4_deconv")
net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
net = mx.sym.Activation(net, act_type='tanh')
return net
......@@ -345,7 +345,9 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
msg = msg.split('\n')[-2].split(": ")[1]
except Exception: # pylint: disable=broad-except
pass
raise InstantiationError(msg)
res_pack.append(MeasureResult((InstantiationError(msg),),
MeasureErrorNo.INSTANTIATION_ERROR,
tstamp - tic, tstamp))
else:
res_pack.append(MeasureResult((RuntimeError(msg),),
MeasureErrorNo.COMPILE_HOST,
......
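With this change an invalid schedule no longer aborts the whole measurement batch; it is reported as a normal MeasureResult. A hedged sketch of how a caller might filter such results after a tuning run:

from tvm.autotvm.measure import MeasureErrorNo

def valid_results(inputs, results):
    """Keep only (input, result) pairs that did not fail at instantiation."""
    return [(inp, res) for inp, res in zip(inputs, results)
            if res.error_no != MeasureErrorNo.INSTANTIATION_ERROR]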
......@@ -21,6 +21,8 @@ import numpy as np
from tvm import target as _target
from .space import ConfigSpace
logger = logging.getLogger('autotvm')
class DispatchContext(object):
......@@ -120,7 +122,12 @@ def dispatcher(fworkload):
raise RuntimeError("DispatchContext is not initialized")
workload = func(*args, **kwargs)
cfg = context.query(tgt, workload)
if cfg.template_key:
return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
else:
assert dispatch_dict, "No func registered for this dispatcher"
for v in dispatch_dict.values():
return v(cfg, *args, **kwargs)
fdecorate = decorate(fworkload, dispatch_func)
fdecorate.register = register
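The added branch means a fallback configuration (which carries no template_key) is routed to whichever implementation is registered, instead of failing on the dictionary lookup. A toy sketch of the decision, with hypothetical names:

def dispatch(cfg, dispatch_dict, *args, **kwargs):
    if cfg.template_key:                   # tuned record names a concrete template
        return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
    # fallback ConfigSpace: pick any registered implementation
    assert dispatch_dict, "No func registered for this dispatcher"
    impl = next(iter(dispatch_dict.values()))
    return impl(cfg, *args, **kwargs)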
......@@ -159,13 +166,18 @@ class ApplyHistoryBest(DispatchContext):
Otherwise, it is an iterator.
default: ConfigEntity, optional
The default config to return when no history records
allow_fallback: bool
Whether to allow using a fallback configuration if no tuned
result can be found.
"""
def __init__(self, records, default=None):
def __init__(self, records, default=None, allow_fallback=False):
super(ApplyHistoryBest, self).__init__()
self.best_by_targetkey = {}
self.best_by_model = {}
self._default = default
self._allow_fallback = allow_fallback
self.fallback = {}
if records:
self.load(records)
......@@ -244,5 +256,18 @@ class ApplyHistoryBest(DispatchContext):
if self._default:
return self._default
if self._allow_fallback:
key = (target, workload)
if key in self.fallback:
return self.fallback[key]
logger.warning(
"Cannot find config for target=%s, workload=%s. A fallback configuration "
"is used, which may bring great performance regression.", target, workload)
cfg = ConfigSpace()
self.fallback[key] = cfg
return cfg
raise RuntimeError(
"Cannot find config for target=%s, workload=%s" % (target, workload))
"Cannot find config for target=%s, workload=%s. You need to do tuning "
"for this workload to get the config." % (target, workload))
......@@ -53,12 +53,14 @@ class TaskExtractEnv:
import nnvm
self.symbol2topi = {
nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw]
nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw],
nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose],
}
self.topi_to_task = {
topi.nn.conv2d: "topi_nn_conv2d",
topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw",
}
self._register_dummy()
......@@ -110,6 +112,15 @@ class TaskExtractEnv:
s = topi.generic.schedule_depthwise_conv2d_nchw([C])
return s, [A, W, C]
@register("topi_nn_conv2d_transpose_nchw")
def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
A, W = args[:2]
C = topi.nn.conv2d_transpose_nchw(*args, **kwargs)
s = topi.generic.schedule_conv2d_transpose_nchw([C])
return s, [A, W, C]
def reset(self):
"""Reset task collections"""
self.task_collection = []
......
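With conv2d_transpose registered, transposed-convolution workloads can now be extracted for tuning. A hedged sketch using the DCGAN test workload above; the target and the (1, 100) latent shape follow get_workload's defaults and are assumptions here:

import nnvm
import nnvm.testing
import tvm
from tvm import autotvm

net, params = nnvm.testing.dcgan.get_workload(batch_size=1)
tasks = autotvm.task.extract_from_graph(
    net, shape={"data": (1, 100)}, dtype="float32",
    symbols=(nnvm.sym.conv2d_transpose,),
    target=tvm.target.arm_cpu("rasp3b"))
print(tasks)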
......@@ -9,6 +9,7 @@ TVM will download these parameters for you when you create the target for the fi
import logging
import os
import json
import sys
from .task import ApplyHistoryBest
from .. import target as _target
......@@ -27,7 +28,7 @@ def _alias(name):
return table.get(name, name)
def context(target, extra_files=None):
def context(target, extra_files=None, allow_fallback=False):
"""Return the dispatch context with pre-tuned parameters.
The corresponding downloaded *.log files under tophub root path will be loaded.
Users can also add their own files in argument `extra_files`.
......@@ -38,9 +39,12 @@ def context(target, extra_files=None):
The compilation target
extra_files: list of str, optional
Extra log files to load
allow_fallback: bool
Whether to allow using a fallback configuration if no tuned
result can be found.
"""
rootpath = AUTOTVM_TOPHUB_ROOT_PATH
best_context = ApplyHistoryBest([])
best_context = ApplyHistoryBest([], allow_fallback=allow_fallback)
if isinstance(target, str):
target = _target.create(target)
......@@ -99,7 +103,15 @@ def check_package(backend):
if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, backend + ".log")):
return
if sys.version_info >= (3,):
import urllib.request as urllib2
else:
import urllib2
try:
download_package(backend)
except urllib2.URLError:
logging.warning("Failed to download tophub package for %s", backend)
def list_packages():
......
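End-to-end, the flag is passed through tophub.context; a hedged sketch compiling the DCGAN generator with fallback configs for any workload missing from the downloaded logs (shapes follow the DCGAN defaults):

import nnvm.compiler
import nnvm.testing
import tvm
from tvm import autotvm

net, params = nnvm.testing.dcgan.get_workload(batch_size=1)
target = tvm.target.arm_cpu("rasp3b")
with autotvm.tophub.context(target, allow_fallback=True):
    graph, lib, params = nnvm.compiler.build(
        net, target=target, shape={"data": (1, 100)},
        params=params, dtype="float32")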
......@@ -118,8 +118,8 @@ def progress_bar(total, prefix=''):
ctx.cur_flops = flops
ctx.best_flops = tuner.best_flops
sys.stdout.write('%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
'| %.2f s\r' %
sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
'| %.2f s' %
(prefix, ctx.cur_flops/1e9, ctx.best_flops/1e9, ctx.ct, ctx.total,
time.time() - tic))
sys.stdout.flush()
......
......@@ -2,4 +2,5 @@
from . import conv2d
from . import depthwise_conv2d
from . import conv2d_transpose
from . import bitserial_conv2d
......@@ -42,7 +42,7 @@ def schedule_conv2d_nchw_arm_cpu(cfg, outs):
def _callback(op):
# schedule conv2d
if 'spatial_conv_output' in op.tag:
if 'spatial_conv2d_output' in op.tag:
output = op.output(0)
conv = op.input_tensors[0]
......@@ -60,7 +60,7 @@ def schedule_conv2d_nchw_arm_cpu(cfg, outs):
_schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
if 'winograd_conv_output' in op.tag:
if 'winograd_conv2d_output' in op.tag:
output = op.output(0)
_schedule_winograd(cfg, s, output, outs[0])
......@@ -72,7 +72,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
assert layout == "NCHW", "Only support NCHW"
out_dtype = out_dtype or data.dtype
_, CI, IH, IW = get_const_tuple(data.shape)
N, CI, IH, IW = get_const_tuple(data.shape)
if len(kernel.shape) == 4:
pre_packed = False
CO, _, KH, KW = get_const_tuple(kernel.shape)
......@@ -81,13 +81,12 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
CO = CO * VC
pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (KH, KW))
pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
N = 1
OH = (IH + pad_top + pad_down - KH) // HSTR + 1
OH = (IH + pad_top + pad_bottom - KH) // HSTR + 1
OW = (IW + pad_left + pad_right - KW) // WSTR + 1
data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_down, pad_right])
data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right])
# ==================== define configuration space ====================
n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
......@@ -145,7 +144,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
output = tvm.compute(oshape, lambda n, co, h, w:
conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
name='output_unpack', tag='spatial_conv_output',
name='output_unpack', tag='spatial_conv2d_output',
attrs={'workload': _conv_arg_to_workload(data, kernel, strides, padding,
layout, out_dtype)})
return output
......@@ -195,11 +194,14 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
if kernel_vec.op.name == 'kernel_vec':
co, _, _, _, _ = s[kernel_vec].op.axis
if autotvm.GLOBAL_SCOPE.in_tuning:
# kernel packing will be pre-computed during compliation, so we skip
# kernel packing will be pre-computed during compilation, so we skip
# this part to make tuning records correct
s[kernel_vec].pragma(co, 'debug_skip_region')
else:
s[kernel_vec].parallel(co)
elif kernel_vec.op.name == 'kernel_vec_conv2d_transpose': # for conv2d transpose
co, _, _, _, _ = s[kernel_vec].op.axis
s[kernel_vec].parallel(co)
return s
......@@ -330,7 +332,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_
# unpack output
output = tvm.compute((N, K, H, W), lambda n, k, h, w:
Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
name='output', tag='winograd_conv_output',
name='output', tag='winograd_conv2d_output',
attrs={'workload': _winograd_conv_arg_to_workload(
data, kernel, strides, padding, layout, out_dtype, tile_size)})
......@@ -462,7 +464,7 @@ def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
s = tvm.create_schedule([x.op for x in outs])
def _callback(op):
if 'winograd_conv_output' in op.tag:
if 'winograd_conv2d_output' in op.tag:
output = op.output(0)
_schedule_winograd(cfg, s, output, outs[0])
......
# pylint: disable=invalid-name, unused-variable
"""Transposed 2D convolution operators (sometimes called Deconvolution)."""
from __future__ import absolute_import as _abs
import tvm
from tvm import autotvm
from ..generic import schedule_conv2d_transpose_nchw
from ..nn import conv2d_transpose_nchw, dilate, pad, get_pad_tuple
from ..util import get_const_tuple, traverse_inline
from .conv2d import _schedule_spatial_pack
@autotvm.task.register_topi_compute(conv2d_transpose_nchw, "arm_cpu", "direct")
def conv2d_transpose_nchw_arm(cfg, Input, Filter, strides, padding, out_dtype):
"""Transposed 2D convolution nchw forward operator.
Parameters
----------
Input : tvm.Tensor
4-D with shape [batch, in_channel, in_height, in_width]
Filter : tvm.Tensor
4-D with shape [in_channel, num_filter, filter_height, filter_width]
strides : tuple of two ints
The spatial stride along height and width
padding : int or str
Padding size, or ['VALID', 'SAME']
out_dtype: str
The output data type. This is used for mixed precision.
Returns
-------
Output : tvm.Tensor
4-D with shape [batch, out_channel, out_height, out_width]
"""
return _decl_spatial_pack(cfg, Input, Filter, strides, padding, "NCHW", out_dtype, 2)
def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
assert layout == "NCHW", "Only support NCHW"
out_dtype = out_dtype or data.dtype
N, CI, IH, IW = get_const_tuple(data.shape)
_, CO, KH, KW = get_const_tuple(kernel.shape)
pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
bpad_top, bpad_bottom = KH - 1 - pad_top, KH - 1 - pad_bottom
bpad_left, bpad_right = KW - 1 - pad_left, KW - 1 - pad_right
HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
OH = (IH - 1) * HSTR - pad_top - pad_bottom + KH
OW = (IW - 1) * WSTR - pad_left - pad_right + KW
dilated_input = dilate(data, [1, 1, HSTR, WSTR])
data_pad = pad(dilated_input, [0, 0, bpad_top, bpad_left], [0, 0, bpad_bottom, bpad_right])
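# Transposed convolution is realized as: dilate the input by the stride,
# pad each border with (kernel - 1 - pad), then run a plain correlation
# against the kernel flipped along H and W (the KH-1-kh / KW-1-kw indexing below).
# Example: IH = 4, HSTR = 2, KH = 4, pad_top = pad_bottom = 1
#          -> OH = (4 - 1) * 2 - 1 - 1 + 4 = 8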
# ==================== define configuration space ====================
n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
if num_tile == 2: # for arm cpu
co, vc = cfg.define_split('tile_co', co, num_outputs=2)
oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
elif num_tile == 3: # for mali gpu
co, _, vc = cfg.define_split('tile_co', co, num_outputs=3)
oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3)
ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3)
else:
raise RuntimeError("Invalid num_tile")
cfg.define_reorder("reorder_0",
[n, co, oh, ow, ci, kh, kw, vh, vw, vc],
policy='candidate', candidate=[
[n, co, oh, ow, ci, kh, kw, vh, vw, vc],
[n, co, oh, ow, ci, kh, kw, vc, vh, vw]])
cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
# ====================================================================
VC = cfg["tile_co"].size[-1]
VH = cfg["tile_oh"].size[-1]
VW = cfg["tile_ow"].size[-1]
dvshape = (N, OH // VH, OW // VW, CI, VH + KH-1, VW + KW-1)
kvshape = (CO // VC, CI, KH, KW, VC)
ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
oshape = (N, CO, OH, OW)
data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
data_pad[n][ci][h*VH + vh][w*VW + vw],
name='data_vec')
kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc:
kernel[ci][co*VC+vc][kh][kw],
name='kernel_vec_conv2d_transpose')
ci = tvm.reduce_axis((0, CI), name='ci')
kh = tvm.reduce_axis((0, KH), name='kh')
kw = tvm.reduce_axis((0, KW), name='kw')
conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
tvm.sum(data_vec[n, h, w, ci, vh + kh, vw + kw].astype(out_dtype) *
kernel_vec[co, ci, KH - 1 - kh, KW - 1 - kw, vc].astype(out_dtype),
axis=[ci, kh, kw]), name='conv')
output = tvm.compute(oshape, lambda n, co, h, w:
conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
name='output_unpack', tag='spatial_conv2d_transpose_output')
return output
# register customized schedule for arm cpu.
@autotvm.task.register_topi_schedule(schedule_conv2d_transpose_nchw, "arm_cpu", "direct")
def schedule_conv2d_transpose_arm(cfg, outs):
"""Schedule conv2d transpose for arm cpu"""
s = tvm.create_schedule([x.op for x in outs])
def _callback(op):
if 'spatial_conv2d_transpose_output' in op.tag:
output = op.output(0)
conv = op.input_tensors[0]
data_vec = conv.op.input_tensors[0]
data_pad = data_vec.op.input_tensors[0]
dilated_input = data_pad.op.input_tensors[0]
s[data_pad].compute_inline()
s[dilated_input].compute_inline()
kernel_vec = conv.op.input_tensors[1]
if kernel_vec.op.name == 'kernel_vec':
kernel = kernel_vec.op.input_tensors[0]
else:
kernel = kernel_vec
if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
s[kernel].compute_inline()
_schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
traverse_inline(s, outs[0].op, _callback)
return s
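A hedged sketch of tuning this template on its own; the task name matches the register("topi_nn_conv2d_transpose_nchw") entry added earlier in this commit, while the shapes and target string are illustrative assumptions:

from tvm import autotvm

task = autotvm.task.create(
    "topi_nn_conv2d_transpose_nchw",
    args=(('TENSOR', (1, 128, 16, 16), 'float32'),   # data   (N, CI, IH, IW)
          ('TENSOR', (128, 64, 4, 4), 'float32'),    # kernel (CI, CO, KH, KW)
          (2, 2), 1, 'float32'),                     # strides, padding, out_dtype
    target="llvm -device=arm_cpu")
print(task.config_space)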
......@@ -15,7 +15,16 @@ autotvm.task.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', 'direct',
# register customized schedule for arm cpu.
@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct')
def schedule_depthwise_conv2d_nchw_(cfg, outs):
"""Schedule depthwise conv2d"""
"""Schedule depthwise conv2d
Parameters
----------
cfg: ConfigEntity
The configuration of this template
outs: Array of Tensor
The computation graph description of depthwise conv2d
in the format of an array of tensors.
"""
outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
s = tvm.create_schedule([x.op for x in outs])
......@@ -79,10 +88,8 @@ def schedule_depthwise_conv2d_nchw_(cfg, outs):
return s
scheduled_ops = []
def _callback(op):
if op.tag == 'depthwise_conv2d_nchw' and op not in scheduled_ops:
if op.tag == 'depthwise_conv2d_nchw':
output = op.output(0)
kernel = op.input_tensors[1]
data = op.input_tensors[0]
......@@ -92,7 +99,5 @@ def schedule_depthwise_conv2d_nchw_(cfg, outs):
data = data_pad.op.input_tensors[0]
_schedule(cfg, s, data, data_pad, kernel, output)
scheduled_ops.append(op)
traverse_inline(s, outs[0].op, _callback)
return s
......@@ -10,7 +10,7 @@ from ..util import simplify
@tvm.target.generic_func
def conv2d_transpose_nchw(Input, Filter, strides, padding):
def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype):
"""Transposed 2D convolution nchw forward operator.
Parameters
......@@ -27,6 +27,9 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding):
padding : int or str
Padding size, or ['VALID', 'SAME']
out_dtype : str
The output data type. This is used for mixed precision.
Returns
-------
Output : tvm.Tensor
......@@ -58,7 +61,8 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding):
Output = tvm.compute(
(batch, out_c, out_h, out_w),
lambda b, c, h, w: tvm.sum(
PaddedInput[b, dc, h+dh, w+dw] * Filter[dc, c, filter_h-1-dh, filter_w-1-dw],
PaddedInput[b, dc, h+dh, w+dw].astype(out_dtype) *
Filter[dc, c, filter_h-1-dh, filter_w-1-dw].astype(out_dtype),
axis=[dc, dh, dw]), tag="conv2d_transpose_nchw")
return Output
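The extra argument enables a mixed-precision declaration, e.g. int8 data accumulated in int32; a minimal sketch with illustrative shapes:

import tvm
import topi

data = tvm.placeholder((1, 32, 8, 8), dtype="int8", name="data")
kernel = tvm.placeholder((32, 16, 4, 4), dtype="int8", name="kernel")
# args: data, kernel, strides, padding, out_dtype
out = topi.nn.conv2d_transpose_nchw(data, kernel, (2, 2), 1, "int32")
assert out.dtype == "int32"   # operands are cast to out_dtype before the reduction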
......@@ -40,7 +40,7 @@ def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, paddin
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def test_conv2d():
with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b')):
with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b'), allow_fallback=True):
verify_conv2d(1, 56, 64, 64, 3, 1, 1)
if __name__ == "__main__":
......
......@@ -12,7 +12,7 @@ def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel,
A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
W = tvm.placeholder((in_channel, num_filter, kernel, kernel), name='W')
B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding)
B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding, A.dtype)
C = topi.nn.relu(B)
a_shape = get_const_tuple(A.shape)
......
......@@ -62,7 +62,7 @@ import tvm.contrib.graph_runtime as runtime
def get_network(name, batch_size):
"""Get the symbol definition and random weight of a network"""
shape = {"data": (batch_size, 3, 224, 224)}
input_shape = (batch_size, 3, 224, 224)
output_shape = (batch_size, 1000)
if name =='resnet-18':
......@@ -90,7 +90,7 @@ def get_network(name, batch_size):
else:
raise ValueError("Unsupported network: " + name)
return net, params, shape, output_shape
return net, params, input_shape, output_shape
#################################################################
# Start RPC Tracker
......@@ -226,8 +226,8 @@ tuning_option = {
def tune_tasks(tasks,
measure_option,
tuner='xgb',
n_trial=500,
early_stopping=200,
n_trial=1000,
early_stopping=None,
log_filename='tuning.log',
use_transfer_learning=True,
try_winograd=True):
......@@ -283,10 +283,10 @@ def tune_tasks(tasks,
def tune_and_evaluate():
# extract workloads from nnvm graph
print("Extract tasks...")
net, params, shape, out_shape = get_network(network, batch_size=1)
tasks = autotvm.task.extract_from_graph(net, shape=shape, dtype=dtype,
symbols=(nnvm.sym.conv2d,),
target=target)
net, params, input_shape, out_shape = get_network(network, batch_size=1)
tasks = autotvm.task.extract_from_graph(net, target=target,
shape={'data': input_shape}, dtype=dtype,
symbols=(nnvm.sym.conv2d,))
# run tuning tasks
print("Tuning...")
......@@ -298,7 +298,7 @@ def tune_and_evaluate():
with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
graph, lib, params = nnvm.compiler.build(
net, target=target,
shape=shape, params=params, dtype=dtype)
shape={'data': input_shape}, params=params, dtype=dtype)
# export library
tmp = tempdir()
......@@ -319,7 +319,7 @@ def tune_and_evaluate():
# upload parameters to device
ctx = remote.context(str(target), 0)
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
data_tvm = tvm.nd.array((np.random.uniform(size=shape['data'])).astype(dtype))
data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
module = runtime.create(graph, rlib, ctx)
module.set_input('data', data_tvm)
module.set_input(**rparams)
......@@ -341,35 +341,33 @@ def tune_and_evaluate():
# -------------
# The tuning needs to train xgboost models and use them for prediction.
# So a high performance CPU is recommended.
# It takes about 1.5 hour on a 32T AMD Ryzen CPU.
# It takes about 2 hours on a 32T AMD Ryzen CPU.
# One sample output is
#
# .. code-block:: bash
#
# Extract tasks...
# Tuning...
# [Task 1/16] Current/Best: 13.15/ 20.49 GFLOPS | Progress: (297/1000) | 348.51 s Done.
# [Task 2/16] Current/Best: 16.66/ 22.64 GFLOPS | Progress: (475/1000) | 415.42 s Done.
# [Task 3/16] Current/Best: 10.33/ 14.19 GFLOPS | Progress: (306/1000) | 239.61 s Done.
# [Task 4/16] Current/Best: 13.29/ 20.88 GFLOPS | Progress: (242/1000) | 227.48 s Done.
# [Task 5/16] Current/Best: 13.28/ 15.61 GFLOPS | Progress: (237/1000) | 191.56 s Done.
# [Task 6/16] Current/Best: 20.16/ 23.86 GFLOPS | Progress: (315/1000) | 304.31 s Done.
# [Task 7/16] Current/Best: 9.22/ 22.00 GFLOPS | Progress: (458/1000) | 433.26 s Done.
# [Task 8/16] Current/Best: 14.12/ 17.80 GFLOPS | Progress: (270/1000) | 240.73 s Done.
# [Task 9/16] Current/Best: 14.59/ 24.02 GFLOPS | Progress: (209/1000) | 213.61 s Done.
# [Task 10/16] Current/Best: 9.86/ 21.74 GFLOPS | Progress: (367/1000) | 359.93 s Done.
# [Task 11/16] Current/Best: 5.01/ 18.86 GFLOPS | Progress: (202/1000) | 191.18 s Done.
# [Task 12/16] Current/Best: 8.61/ 25.23 GFLOPS | Progress: (220/1000) | 220.74 s Done.
# [Task 13/16] Current/Best: 10.87/ 25.79 GFLOPS | Progress: (465/1000) | 902.14 s Done.
# [Task 14/16] Current/Best: 15.33/ 29.38 GFLOPS | Progress: (239/1000) | 481.33 s Done.
# [Task 15/16] Current/Best: 12.09/ 38.60 GFLOPS | Progress: (476/1000) | 928.35 s Done.
# [Task 16/16] Current/Best: 16.77/ 47.08 GFLOPS | Progress: (255/1000) | 439.91 s Done.
# [Task 1/16] Current/Best: 18.85/ 19.67 GFLOPS | Progress: (353/1000) | 387.05 s Done.
# [Task 2/16] Current/Best: 16.10/ 23.50 GFLOPS | Progress: (444/1000) | 379.99 s Done.
# [Task 3/16] Current/Best: 5.49/ 13.96 GFLOPS | Progress: (610/1000) | 485.87 s Done.
# [Task 4/16] Current/Best: 10.07/ 20.48 GFLOPS | Progress: (430/1000) | 391.66 s Done.
# [Task 5/16] Current/Best: 11.50/ 15.50 GFLOPS | Progress: (374/1000) | 356.03 s Done.
# [Task 6/16] Current/Best: 10.76/ 23.77 GFLOPS | Progress: (526/1000) | 526.42 s Done.
# [Task 7/16] Current/Best: 12.71/ 22.03 GFLOPS | Progress: (341/1000) | 322.96 s Done.
# [Task 8/16] Current/Best: 8.60/ 17.91 GFLOPS | Progress: (272/1000) | 236.08 s Done.
# [Task 9/16] Current/Best: 15.37/ 23.62 GFLOPS | Progress: (275/1000) | 275.18 s Done.
# [Task 10/16] Current/Best: 6.62/ 23.01 GFLOPS | Progress: (330/1000) | 315.02 s Done.
# [Task 11/16] Current/Best: 1.85/ 21.39 GFLOPS | Progress: (281/1000) | 239.19 s Done.
# [Task 12/16] Current/Best: 15.41/ 24.02 GFLOPS | Progress: (258/1000) | 270.82 s Done.
# [Task 13/16] Current/Best: 17.96/ 25.79 GFLOPS | Progress: (380/1000) | 738.29 s Done.
# [Task 14/16] Current/Best: 14.81/ 31.17 GFLOPS | Progress: (413/1000) | 799.21 s Done.
# [Task 15/16] Current/Best: 24.39/ 40.97 GFLOPS | Progress: (355/1000) | 700.25 s Done.
# [Task 16/16] Current/Best: 9.42/ 49.90 GFLOPS | Progress: (348/1000) | 603.84 s Done.
# Compile...
# Upload...
# Evaluate inference time cost...
# Mean inference time (std dev): 156.51 ms (0.89 ms)
#
# Mean inference time (std dev): 157.29 ms (1.74 ms)
######################################################################
#
......
......@@ -109,7 +109,7 @@ print(out.asnumpy().flatten()[0:10])
# Save and Load Compiled Module
# -----------------------------
# We can also save the graph, lib and parameters into files and load them
# back in development environment.
# back in the deployment environment.
####################################################
......