Commit 672147c8 by Lianmin Zheng Committed by Tianqi Chen

add conv2d transpose and fix bugs (#1566)

parent 6d4cf448
...@@ -254,6 +254,7 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> { ...@@ -254,6 +254,7 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
int groups; int groups;
std::string layout; std::string layout;
std::string kernel_layout; std::string kernel_layout;
int out_dtype;
bool use_bias; bool use_bias;
DMLC_DECLARE_PARAMETER(Conv2DTransposeParam) { DMLC_DECLARE_PARAMETER(Conv2DTransposeParam) {
...@@ -286,6 +287,10 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> { ...@@ -286,6 +287,10 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
.describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc." .describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc."
"'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width" "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
"dimensions respectively."); "dimensions respectively.");
DMLC_DECLARE_DTYPE_FIELD(out_dtype)
.add_enum("same", -1)
.set_default(-1)
.describe("Output data type, set to explicit type under mixed precision setting");
DMLC_DECLARE_FIELD(use_bias).set_default(true) DMLC_DECLARE_FIELD(use_bias).set_default(true)
.describe("Whether the layer uses a bias vector."); .describe("Whether the layer uses a bias vector.");
} }
......
...@@ -42,28 +42,31 @@ def deconv2d_bn_relu(data, prefix, **kwargs): ...@@ -42,28 +42,31 @@ def deconv2d_bn_relu(data, prefix, **kwargs):
def get_symbol(oshape, ngf=128, code=None): def get_symbol(oshape, ngf=128, code=None):
"""get symbol of dcgan generator""" """get symbol of dcgan generator"""
assert oshape[-1] == 32, "Only support 32x32 image" assert oshape[-1] == 64, "Only support 64x64 image"
assert oshape[-2] == 32, "Only support 32x32 image" assert oshape[-2] == 64, "Only support 64x64 image"
code = sym.Variable("data") if code is None else code code = sym.Variable("data") if code is None else code
net = sym.dense(code, name="g1", units=4*4*ngf*4, use_bias=False) net = sym.dense(code, name="g1", units=4*4*ngf*8, use_bias=False)
net = sym.relu(net) net = sym.relu(net)
# 4 x 4 # 4 x 4
net = sym.reshape(net, shape=(-1, ngf * 4, 4, 4)) net = sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
# 8 x 8 # 8 x 8
net = deconv2d_bn_relu( net = deconv2d_bn_relu(
net, ishape=(ngf * 4, 4, 4), oshape=(ngf * 2, 8, 8), kshape=(4, 4), prefix="g2") net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
# 16x16 # 16x16
net = deconv2d_bn_relu( net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 8, 8), oshape=(ngf, 16, 16), kshape=(4, 4), prefix="g3") net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
# 32x32 # 32x32
net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
# 64x64
net = deconv2d( net = deconv2d(
net, ishape=(ngf, 16, 16), oshape=oshape[-3:], kshape=(4, 4), name="g4_deconv") net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
net = sym.tanh(net) net = sym.tanh(net)
return net return net
def get_workload(batch_size, oshape=(3, 32, 32), ngf=128, random_len=100, dtype="float32"): def get_workload(batch_size, oshape=(3, 64, 64), ngf=128, random_len=100, dtype="float32"):
"""Get benchmark workload for a DCGAN generator """Get benchmark workload for a DCGAN generator
Parameters Parameters
......
...@@ -251,11 +251,15 @@ def compute_conv2d_transpose(attrs, inputs, _): ...@@ -251,11 +251,15 @@ def compute_conv2d_transpose(attrs, inputs, _):
strides = attrs.get_int_tuple("strides") strides = attrs.get_int_tuple("strides")
dilation = attrs.get_int_tuple("dilation") dilation = attrs.get_int_tuple("dilation")
groups = attrs.get_int("groups") groups = attrs.get_int("groups")
out_dtype = attrs.get_string("out_dtype")
layout = attrs["layout"] layout = attrs["layout"]
out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
assert layout == "NCHW", "only support nchw for now" assert layout == "NCHW", "only support nchw for now"
assert dilation == (1, 1), "not support dilate now" assert dilation == (1, 1), "not support dilate now"
assert groups == 1, "only support groups == 1 for now" assert groups == 1, "only support groups == 1 for now"
out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding)
out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding, out_dtype)
if attrs.get_bool("use_bias"): if attrs.get_bool("use_bias"):
bias = inputs[2] bias = inputs[2]
bias = topi.expand_dims(bias, axis=1, num_newaxis=2) bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
......
...@@ -556,7 +556,7 @@ v (batch_size, channels, out_height, out_width) if `layout` is `NCHW` ...@@ -556,7 +556,7 @@ v (batch_size, channels, out_height, out_width) if `layout` is `NCHW`
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DTransposeParam>) .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DTransposeParam>)
.set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DTransposeParam>) .set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DTransposeParam>)
.set_attr<FInferShape>("FInferShape", Conv2DTransposeInferShape) .set_attr<FInferShape>("FInferShape", Conv2DTransposeInferShape)
.set_attr<FInferType>("FInferType", ElemwiseType<-1, 1>) .set_attr<FInferType>("FInferType", Conv2DInferType<Conv2DTransposeParam>)
.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DTransposeCorrectLayout) .set_attr<FCorrectLayout>("FCorrectLayout", Conv2DTransposeCorrectLayout)
.set_num_outputs(1) .set_num_outputs(1)
.set_num_inputs(UseBiasNumInputs<Conv2DTransposeParam>) .set_num_inputs(UseBiasNumInputs<Conv2DTransposeParam>)
......
...@@ -40,24 +40,27 @@ def deconv2d_bn_relu(data, prefix, **kwargs): ...@@ -40,24 +40,27 @@ def deconv2d_bn_relu(data, prefix, **kwargs):
net = mx.sym.Activation(net, name="%s_act" % prefix, act_type='relu') net = mx.sym.Activation(net, name="%s_act" % prefix, act_type='relu')
return net return net
def get_symbol(oshape=(3, 32, 32), ngf=128, code=None): def get_symbol(oshape=(3, 64, 64), ngf=128, code=None):
"""get symbol of dcgan generator""" """get symbol of dcgan generator"""
assert oshape[-1] == 32, "Only support 32x32 image" assert oshape[-1] == 64, "Only support 64x64 image"
assert oshape[-2] == 32, "Only support 32x32 image" assert oshape[-2] == 64, "Only support 64x64 image"
code = mx.sym.Variable("data") if code is None else code code = mx.sym.Variable("data") if code is None else code
net = mx.sym.FullyConnected(code, name="g1", num_hidden=4*4*ngf*4, no_bias=True, flatten=False) net = mx.sym.FullyConnected(code, name="g1", num_hidden=ngf*8*4*4, no_bias=True, flatten=False)
net = mx.sym.Activation(net, act_type='relu') net = mx.sym.Activation(net, act_type='relu')
# 4 x 4 # 4 x 4
net = mx.sym.reshape(net, shape=(-1, ngf * 4, 4, 4)) net = mx.sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
# 8 x 8 # 8 x 8
net = deconv2d_bn_relu( net = deconv2d_bn_relu(
net, ishape=(ngf * 4, 4, 4), oshape=(ngf * 2, 8, 8), kshape=(4, 4), prefix="g2") net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
# 16x16 # 16x16
net = deconv2d_bn_relu( net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 8, 8), oshape=(ngf, 16, 16), kshape=(4, 4), prefix="g3") net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
# 32x32 # 32x32
net = deconv2d_bn_relu(
net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
# 64x64
net = deconv2d( net = deconv2d(
net, ishape=(ngf, 16, 16), oshape=oshape[-3:], kshape=(4, 4), name="g4_deconv") net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
net = mx.sym.Activation(net, act_type='tanh') net = mx.sym.Activation(net, act_type='tanh')
return net return net
...@@ -345,7 +345,9 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat, ...@@ -345,7 +345,9 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
msg = msg.split('\n')[-2].split(": ")[1] msg = msg.split('\n')[-2].split(": ")[1]
except Exception: # pylint: disable=broad-except except Exception: # pylint: disable=broad-except
pass pass
raise InstantiationError(msg) res_pack.append(MeasureResult((InstantiationError(msg),),
MeasureErrorNo.INSTANTIATION_ERROR,
tstamp - tic, tstamp))
else: else:
res_pack.append(MeasureResult((RuntimeError(msg),), res_pack.append(MeasureResult((RuntimeError(msg),),
MeasureErrorNo.COMPILE_HOST, MeasureErrorNo.COMPILE_HOST,
......
...@@ -21,6 +21,8 @@ import numpy as np ...@@ -21,6 +21,8 @@ import numpy as np
from tvm import target as _target from tvm import target as _target
from .space import ConfigSpace
logger = logging.getLogger('autotvm') logger = logging.getLogger('autotvm')
class DispatchContext(object): class DispatchContext(object):
...@@ -120,7 +122,12 @@ def dispatcher(fworkload): ...@@ -120,7 +122,12 @@ def dispatcher(fworkload):
raise RuntimeError("DispatchContext is not initialized") raise RuntimeError("DispatchContext is not initialized")
workload = func(*args, **kwargs) workload = func(*args, **kwargs)
cfg = context.query(tgt, workload) cfg = context.query(tgt, workload)
return dispatch_dict[cfg.template_key](cfg, *args, **kwargs) if cfg.template_key:
return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
else:
assert dispatch_dict, "No func registered for this dispatcher"
for v in dispatch_dict.values():
return v(cfg, *args, **kwargs)
fdecorate = decorate(fworkload, dispatch_func) fdecorate = decorate(fworkload, dispatch_func)
fdecorate.register = register fdecorate.register = register
...@@ -159,13 +166,18 @@ class ApplyHistoryBest(DispatchContext): ...@@ -159,13 +166,18 @@ class ApplyHistoryBest(DispatchContext):
Otherwise, it is an iterator. Otherwise, it is an iterator.
default: ConfigEntity, optional default: ConfigEntity, optional
The default config to return when no history records The default config to return when no history records
allow_fallback: bool
Whether to allow the use of a fallback configuration when no
tuned result can be found.
""" """
def __init__(self, records, default=None): def __init__(self, records, default=None, allow_fallback=False):
super(ApplyHistoryBest, self).__init__() super(ApplyHistoryBest, self).__init__()
self.best_by_targetkey = {} self.best_by_targetkey = {}
self.best_by_model = {} self.best_by_model = {}
self._default = default self._default = default
self._allow_fallback = allow_fallback
self.fallback = {}
if records: if records:
self.load(records) self.load(records)
...@@ -244,5 +256,18 @@ class ApplyHistoryBest(DispatchContext): ...@@ -244,5 +256,18 @@ class ApplyHistoryBest(DispatchContext):
if self._default: if self._default:
return self._default return self._default
if self._allow_fallback:
key = (target, workload)
if key in self.fallback:
return self.fallback[key]
logger.warning(
"Cannot find config for target=%s, workload=%s. A fallback configuration "
"is used, which may bring great performance regression.", target, workload)
cfg = ConfigSpace()
self.fallback[key] = cfg
return cfg
raise RuntimeError( raise RuntimeError(
"Cannot find config for target=%s, workload=%s" % (target, workload)) "Cannot find config for target=%s, workload=%s. You need to do tuning "
"for this workload to get the config." % (target, workload))
...@@ -53,12 +53,14 @@ class TaskExtractEnv: ...@@ -53,12 +53,14 @@ class TaskExtractEnv:
import nnvm import nnvm
self.symbol2topi = { self.symbol2topi = {
nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw] nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw],
nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose],
} }
self.topi_to_task = { self.topi_to_task = {
topi.nn.conv2d: "topi_nn_conv2d", topi.nn.conv2d: "topi_nn_conv2d",
topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw", topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw",
} }
self._register_dummy() self._register_dummy()
...@@ -110,6 +112,15 @@ class TaskExtractEnv: ...@@ -110,6 +112,15 @@ class TaskExtractEnv:
s = topi.generic.schedule_depthwise_conv2d_nchw([C]) s = topi.generic.schedule_depthwise_conv2d_nchw([C])
return s, [A, W, C] return s, [A, W, C]
@register("topi_nn_conv2d_transpose_nchw")
def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
A, W = args[:2]
C = topi.nn.conv2d_transpose_nchw(*args, **kwargs)
s = topi.generic.schedule_conv2d_transpose_nchw([C])
return s, [A, W, C]
def reset(self): def reset(self):
"""Reset task collections""" """Reset task collections"""
self.task_collection = [] self.task_collection = []
......
...@@ -9,6 +9,7 @@ TVM will download these parameters for you when you create the target for the fi ...@@ -9,6 +9,7 @@ TVM will download these parameters for you when you create the target for the fi
import logging import logging
import os import os
import json import json
import sys
from .task import ApplyHistoryBest from .task import ApplyHistoryBest
from .. import target as _target from .. import target as _target
...@@ -27,7 +28,7 @@ def _alias(name): ...@@ -27,7 +28,7 @@ def _alias(name):
return table.get(name, name) return table.get(name, name)
def context(target, extra_files=None): def context(target, extra_files=None, allow_fallback=False):
"""Return the dispatch context with pre-tuned parameters. """Return the dispatch context with pre-tuned parameters.
The corresponding downloaded *.log files under tophub root path will be loaded. The corresponding downloaded *.log files under tophub root path will be loaded.
Users can also add their own files in argument `extra_files`. Users can also add their own files in argument `extra_files`.
...@@ -38,9 +39,12 @@ def context(target, extra_files=None): ...@@ -38,9 +39,12 @@ def context(target, extra_files=None):
The compilation target The compilation target
extra_files: list of str, optional extra_files: list of str, optional
Extra log files to load Extra log files to load
allow_fallback: bool
Whether to allow the use of a fallback configuration when no
tuned result can be found.
""" """
rootpath = AUTOTVM_TOPHUB_ROOT_PATH rootpath = AUTOTVM_TOPHUB_ROOT_PATH
best_context = ApplyHistoryBest([]) best_context = ApplyHistoryBest([], allow_fallback=allow_fallback)
if isinstance(target, str): if isinstance(target, str):
target = _target.create(target) target = _target.create(target)
...@@ -99,7 +103,15 @@ def check_package(backend): ...@@ -99,7 +103,15 @@ def check_package(backend):
if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, backend + ".log")): if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, backend + ".log")):
return return
download_package(backend)
if sys.version_info >= (3,):
import urllib.request as urllib2
else:
import urllib2
try:
download_package(backend)
except urllib2.URLError:
logging.warning("Failed to download tophub package for %s", backend)
def list_packages(): def list_packages():
......
...@@ -118,8 +118,8 @@ def progress_bar(total, prefix=''): ...@@ -118,8 +118,8 @@ def progress_bar(total, prefix=''):
ctx.cur_flops = flops ctx.cur_flops = flops
ctx.best_flops = tuner.best_flops ctx.best_flops = tuner.best_flops
sys.stdout.write('%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) ' sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
'| %.2f s\r' % '| %.2f s' %
(prefix, ctx.cur_flops/1e9, ctx.best_flops/1e9, ctx.ct, ctx.total, (prefix, ctx.cur_flops/1e9, ctx.best_flops/1e9, ctx.ct, ctx.total,
time.time() - tic)) time.time() - tic))
sys.stdout.flush() sys.stdout.flush()
......
...@@ -2,4 +2,5 @@ ...@@ -2,4 +2,5 @@
from . import conv2d from . import conv2d
from . import depthwise_conv2d from . import depthwise_conv2d
from . import conv2d_transpose
from . import bitserial_conv2d from . import bitserial_conv2d
...@@ -42,7 +42,7 @@ def schedule_conv2d_nchw_arm_cpu(cfg, outs): ...@@ -42,7 +42,7 @@ def schedule_conv2d_nchw_arm_cpu(cfg, outs):
def _callback(op): def _callback(op):
# schedule conv2d # schedule conv2d
if 'spatial_conv_output' in op.tag: if 'spatial_conv2d_output' in op.tag:
output = op.output(0) output = op.output(0)
conv = op.input_tensors[0] conv = op.input_tensors[0]
...@@ -60,7 +60,7 @@ def schedule_conv2d_nchw_arm_cpu(cfg, outs): ...@@ -60,7 +60,7 @@ def schedule_conv2d_nchw_arm_cpu(cfg, outs):
_schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0]) _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
if 'winograd_conv_output' in op.tag: if 'winograd_conv2d_output' in op.tag:
output = op.output(0) output = op.output(0)
_schedule_winograd(cfg, s, output, outs[0]) _schedule_winograd(cfg, s, output, outs[0])
...@@ -72,7 +72,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n ...@@ -72,7 +72,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
assert layout == "NCHW", "Only support NCHW" assert layout == "NCHW", "Only support NCHW"
out_dtype = out_dtype or data.dtype out_dtype = out_dtype or data.dtype
_, CI, IH, IW = get_const_tuple(data.shape) N, CI, IH, IW = get_const_tuple(data.shape)
if len(kernel.shape) == 4: if len(kernel.shape) == 4:
pre_packed = False pre_packed = False
CO, _, KH, KW = get_const_tuple(kernel.shape) CO, _, KH, KW = get_const_tuple(kernel.shape)
...@@ -81,13 +81,12 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n ...@@ -81,13 +81,12 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
CO, _, KH, KW, VC = get_const_tuple(kernel.shape) CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
CO = CO * VC CO = CO * VC
pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (KH, KW)) pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
N = 1 OH = (IH + pad_top + pad_bottom - KH) // HSTR + 1
OH = (IH + pad_top + pad_down - KH) // HSTR + 1
OW = (IW + pad_left + pad_right - KW) // WSTR + 1 OW = (IW + pad_left + pad_right - KW) // WSTR + 1
data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_down, pad_right]) data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right])
# ==================== define configuration space ==================== # ==================== define configuration space ====================
n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW) n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
...@@ -145,7 +144,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n ...@@ -145,7 +144,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
output = tvm.compute(oshape, lambda n, co, h, w: output = tvm.compute(oshape, lambda n, co, h, w:
conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC], conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
name='output_unpack', tag='spatial_conv_output', name='output_unpack', tag='spatial_conv2d_output',
attrs={'workload': _conv_arg_to_workload(data, kernel, strides, padding, attrs={'workload': _conv_arg_to_workload(data, kernel, strides, padding,
layout, out_dtype)}) layout, out_dtype)})
return output return output
...@@ -195,11 +194,14 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, ...@@ -195,11 +194,14 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
if kernel_vec.op.name == 'kernel_vec': if kernel_vec.op.name == 'kernel_vec':
co, _, _, _, _ = s[kernel_vec].op.axis co, _, _, _, _ = s[kernel_vec].op.axis
if autotvm.GLOBAL_SCOPE.in_tuning: if autotvm.GLOBAL_SCOPE.in_tuning:
# kernel packing will be pre-computed during compliation, so we skip # kernel packing will be pre-computed during compilation, so we skip
# this part to make tuning records correct # this part to make tuning records correct
s[kernel_vec].pragma(co, 'debug_skip_region') s[kernel_vec].pragma(co, 'debug_skip_region')
else: else:
s[kernel_vec].parallel(co) s[kernel_vec].parallel(co)
elif kernel_vec.op.name == 'kernel_vec_conv2d_transpose': # for conv2d transpose
co, _, _, _, _ = s[kernel_vec].op.axis
s[kernel_vec].parallel(co)
return s return s
...@@ -330,7 +332,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_ ...@@ -330,7 +332,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_
# unpack output # unpack output
output = tvm.compute((N, K, H, W), lambda n, k, h, w: output = tvm.compute((N, K, H, W), lambda n, k, h, w:
Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m], Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
name='output', tag='winograd_conv_output', name='output', tag='winograd_conv2d_output',
attrs={'workload': _winograd_conv_arg_to_workload( attrs={'workload': _winograd_conv_arg_to_workload(
data, kernel, strides, padding, layout, out_dtype, tile_size)}) data, kernel, strides, padding, layout, out_dtype, tile_size)})
...@@ -462,7 +464,7 @@ def schedule_conv2d_winograd_without_weight_transform_(cfg, outs): ...@@ -462,7 +464,7 @@ def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
s = tvm.create_schedule([x.op for x in outs]) s = tvm.create_schedule([x.op for x in outs])
def _callback(op): def _callback(op):
if 'winograd_conv_output' in op.tag: if 'winograd_conv2d_output' in op.tag:
output = op.output(0) output = op.output(0)
_schedule_winograd(cfg, s, output, outs[0]) _schedule_winograd(cfg, s, output, outs[0])
......
# pylint: disable=invalid-name, unused-variable
"""Transposed 2D convolution operators (sometimes called Deconvolution)."""
from __future__ import absolute_import as _abs
import tvm
from tvm import autotvm
from ..generic import schedule_conv2d_transpose_nchw
from ..nn import conv2d_transpose_nchw, dilate, pad, get_pad_tuple
from ..util import get_const_tuple, traverse_inline
from .conv2d import _schedule_spatial_pack
@autotvm.task.register_topi_compute(conv2d_transpose_nchw, "arm_cpu", "direct")
def conv2d_transpose_nchw_arm(cfg, Input, Filter, strides, padding, out_dtype):
    """NCHW transposed 2D convolution (forward) for arm cpu.

    This is a thin entry point: the computation itself is declared by
    ``_decl_spatial_pack`` in this module, called with the "NCHW" layout
    and two-way tiling.

    Parameters
    ----------
    cfg
        Tuning configuration, forwarded unchanged to the spatial-pack template.

    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [in_channel, num_filter, filter_height, filter_width]

    strides : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    out_dtype: str
        The output data type. This is used for mixed precision.

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    layout = "NCHW"
    # two-way splits are the arm cpu variant of the template
    num_tile = 2
    return _decl_spatial_pack(cfg, Input, Filter, strides, padding, layout, out_dtype, num_tile)
def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
    """Declare a transposed 2D convolution using the spatial-pack template.

    The input is dilated by the strides and padded, after which the result is
    computed as a stride-1 convolution with the spatially flipped kernel on
    packed (tiled) data and kernel tensors.

    Parameters
    ----------
    cfg
        Tuning configuration; the split / reorder / annotate knobs are
        defined on it here and the chosen tile sizes are read back from it.
    data : tvm.Tensor
        4-D input, unpacked as (N, CI, IH, IW).
    kernel : tvm.Tensor
        4-D filter, unpacked as (CI, CO, KH, KW).
    strides : int or tuple/list of two ints
        Stride along height and width; a scalar is used for both.
    padding
        Padding specification handed to ``get_pad_tuple``.
    layout : str
        Must be "NCHW".
    out_dtype : str
        Accumulation / output data type; ``data.dtype`` is used when falsy.
    num_tile : int
        2 (arm cpu) or 3 (mali gpu): number of parts each spatial axis is
        split into.

    Returns
    -------
    output : tvm.Tensor
        4-D tensor of shape (N, CO, OH, OW).

    Raises
    ------
    RuntimeError
        If ``num_tile`` is neither 2 nor 3.
    """
    assert layout == "NCHW", "Only support NCHW"

    out_dtype = out_dtype or data.dtype
    N, CI, IH, IW = get_const_tuple(data.shape)
    _, CO, KH, KW = get_const_tuple(kernel.shape)

    # padding of the corresponding forward convolution, and the complementary
    # padding that is applied to the dilated input
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
    bpad_top = KH - 1 - pad_top
    bpad_bottom = KH - 1 - pad_bottom
    bpad_left = KW - 1 - pad_left
    bpad_right = KW - 1 - pad_right

    if isinstance(strides, (tuple, list)):
        HSTR, WSTR = strides
    else:
        HSTR, WSTR = strides, strides

    OH = (IH - 1) * HSTR - pad_top - pad_bottom + KH
    OW = (IW - 1) * WSTR - pad_left - pad_right + KW

    dilated_input = dilate(data, [1, 1, HSTR, WSTR])
    pad_before = [0, 0, bpad_top, bpad_left]
    pad_after = [0, 0, bpad_bottom, bpad_right]
    data_pad = pad(dilated_input, pad_before, pad_after)

    # ==================== define configuration space ====================
    n = cfg.axis(N)
    co = cfg.axis(CO)
    oh = cfg.axis(OH)
    ow = cfg.axis(OW)
    ci = cfg.reduce_axis(CI)
    kh = cfg.reduce_axis(KH)
    kw = cfg.reduce_axis(KW)

    if num_tile == 2:  # for arm cpu
        co, vc = cfg.define_split('tile_co', co, num_outputs=2)
        oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
        ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
    elif num_tile == 3:  # for mali gpu
        co, _, vc = cfg.define_split('tile_co', co, num_outputs=3)
        oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3)
        ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3)
    else:
        raise RuntimeError("Invalid num_tile")

    # the two candidate orders only differ in where the channel tile goes
    outer_axes = [n, co, oh, ow, ci, kh, kw]
    cfg.define_reorder("reorder_0",
                       outer_axes + [vh, vw, vc],
                       policy='candidate',
                       candidate=[outer_axes + [vh, vw, vc],
                                  outer_axes + [vc, vh, vw]])

    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
    # ====================================================================

    # innermost tile sizes selected by the configuration
    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    dvshape = (N, OH // VH, OW // VW, CI, VH + KH-1, VW + KW-1)
    kvshape = (CO // VC, CI, KH, KW, VC)
    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (N, CO, OH, OW)

    def _pack_data(n, h, w, ci, vh, vw):
        # gather one (VH + KH - 1) x (VW + KW - 1) input window per output tile
        return data_pad[n][ci][h*VH + vh][w*VW + vw]

    def _pack_kernel(co, ci, kh, kw, vc):
        # move the output channel tile to the innermost position
        return kernel[ci][co*VC+vc][kh][kw]

    data_vec = tvm.compute(dvshape, _pack_data, name='data_vec')
    kernel_vec = tvm.compute(kvshape, _pack_kernel, name='kernel_vec_conv2d_transpose')

    red_ci = tvm.reduce_axis((0, CI), name='ci')
    red_kh = tvm.reduce_axis((0, KH), name='kh')
    red_kw = tvm.reduce_axis((0, KW), name='kw')

    def _conv(n, co, h, w, vh, vw, vc):
        # the kernel is read flipped along both spatial axes
        lhs = data_vec[n, h, w, red_ci, vh + red_kh, vw + red_kw].astype(out_dtype)
        rhs = kernel_vec[co, red_ci, KH - 1 - red_kh, KW - 1 - red_kw, vc].astype(out_dtype)
        return tvm.sum(lhs * rhs, axis=[red_ci, red_kh, red_kw])

    def _unpack(n, co, h, w):
        # map the tiled result back to plain (N, CO, OH, OW) indexing
        return conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC]

    conv = tvm.compute(ovshape, _conv, name='conv')
    output = tvm.compute(oshape, _unpack,
                         name='output_unpack', tag='spatial_conv2d_transpose_output')
    return output
# register customized schedule for arm cpu.
@autotvm.task.register_topi_schedule(schedule_conv2d_transpose_nchw, "arm_cpu", "direct")
def schedule_conv2d_transpose_arm(cfg, outs):
    """Create the arm cpu schedule for conv2d transpose.

    Parameters
    ----------
    cfg
        Tuning configuration, forwarded to ``_schedule_spatial_pack``.
    outs
        Iterable of output tensors; a schedule is created over their ops and
        the traversal starts from ``outs[0]``.

    Returns
    -------
    s
        The schedule created by ``tvm.create_schedule``.
    """
    s = tvm.create_schedule([tensor.op for tensor in outs])

    def _callback(op):
        # only ops carrying the spatial-pack conv2d transpose tag are scheduled
        if 'spatial_conv2d_transpose_output' not in op.tag:
            return

        output = op.output(0)
        conv = op.input_tensors[0]

        # walk back from the packed data to the padded and dilated input,
        # and inline both of those stages
        data_vec = conv.op.input_tensors[0]
        data_pad = data_vec.op.input_tensors[0]
        dilated_input = data_pad.op.input_tensors[0]
        s[data_pad].compute_inline()
        s[dilated_input].compute_inline()

        kernel_vec = conv.op.input_tensors[1]
        is_packed = kernel_vec.op.name == 'kernel_vec'
        kernel = kernel_vec.op.input_tensors[0] if is_packed else kernel_vec
        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
            s[kernel].compute_inline()

        _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])

    traverse_inline(s, outs[0].op, _callback)
    return s
...@@ -15,7 +15,16 @@ autotvm.task.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', 'direct', ...@@ -15,7 +15,16 @@ autotvm.task.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', 'direct',
# register customized schedule for arm cpu. # register customized schedule for arm cpu.
@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct') @autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct')
def schedule_depthwise_conv2d_nchw_(cfg, outs): def schedule_depthwise_conv2d_nchw_(cfg, outs):
"""Schedule depthwise conv2d""" """Schedule depthwise conv2d
Parameters
----------
cfg: ConfigEntity
The configuration of this template
outs: Array of Tensor
The computation graph description of depthwise convolution2d
in the format of an array of tensors.
"""
outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
s = tvm.create_schedule([x.op for x in outs]) s = tvm.create_schedule([x.op for x in outs])
...@@ -79,10 +88,8 @@ def schedule_depthwise_conv2d_nchw_(cfg, outs): ...@@ -79,10 +88,8 @@ def schedule_depthwise_conv2d_nchw_(cfg, outs):
return s return s
scheduled_ops = []
def _callback(op): def _callback(op):
if op.tag == 'depthwise_conv2d_nchw' and op not in scheduled_ops: if op.tag == 'depthwise_conv2d_nchw':
output = op.output(0) output = op.output(0)
kernel = op.input_tensors[1] kernel = op.input_tensors[1]
data = op.input_tensors[0] data = op.input_tensors[0]
...@@ -92,7 +99,5 @@ def schedule_depthwise_conv2d_nchw_(cfg, outs): ...@@ -92,7 +99,5 @@ def schedule_depthwise_conv2d_nchw_(cfg, outs):
data = data_pad.op.input_tensors[0] data = data_pad.op.input_tensors[0]
_schedule(cfg, s, data, data_pad, kernel, output) _schedule(cfg, s, data, data_pad, kernel, output)
scheduled_ops.append(op)
traverse_inline(s, outs[0].op, _callback) traverse_inline(s, outs[0].op, _callback)
return s return s
...@@ -10,7 +10,7 @@ from ..util import simplify ...@@ -10,7 +10,7 @@ from ..util import simplify
@tvm.target.generic_func @tvm.target.generic_func
def conv2d_transpose_nchw(Input, Filter, strides, padding): def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype):
"""Transposed 2D convolution nchw forward operator. """Transposed 2D convolution nchw forward operator.
Parameters Parameters
...@@ -27,6 +27,9 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding): ...@@ -27,6 +27,9 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding):
padding : int or str padding : int or str
Padding size, or ['VALID', 'SAME'] Padding size, or ['VALID', 'SAME']
out_dtype : str
The output data type. This is used for mixed precision.
Returns Returns
------- -------
Output : tvm.Tensor Output : tvm.Tensor
...@@ -58,7 +61,8 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding): ...@@ -58,7 +61,8 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding):
Output = tvm.compute( Output = tvm.compute(
(batch, out_c, out_h, out_w), (batch, out_c, out_h, out_w),
lambda b, c, h, w: tvm.sum( lambda b, c, h, w: tvm.sum(
PaddedInput[b, dc, h+dh, w+dw] * Filter[dc, c, filter_h-1-dh, filter_w-1-dw], PaddedInput[b, dc, h+dh, w+dw].astype(out_dtype) *
Filter[dc, c, filter_h-1-dh, filter_w-1-dw].astype(out_dtype),
axis=[dc, dh, dw]), tag="conv2d_transpose_nchw") axis=[dc, dh, dw]), tag="conv2d_transpose_nchw")
return Output return Output
...@@ -40,7 +40,7 @@ def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, paddin ...@@ -40,7 +40,7 @@ def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, paddin
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def test_conv2d(): def test_conv2d():
with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b')): with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b'), allow_fallback=True):
verify_conv2d(1, 56, 64, 64, 3, 1, 1) verify_conv2d(1, 56, 64, 64, 3, 1, 1)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -12,7 +12,7 @@ def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel, ...@@ -12,7 +12,7 @@ def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel,
A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
W = tvm.placeholder((in_channel, num_filter, kernel, kernel), name='W') W = tvm.placeholder((in_channel, num_filter, kernel, kernel), name='W')
B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding) B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding, A.dtype)
C = topi.nn.relu(B) C = topi.nn.relu(B)
a_shape = get_const_tuple(A.shape) a_shape = get_const_tuple(A.shape)
......
...@@ -62,7 +62,7 @@ import tvm.contrib.graph_runtime as runtime ...@@ -62,7 +62,7 @@ import tvm.contrib.graph_runtime as runtime
def get_network(name, batch_size): def get_network(name, batch_size):
"""Get the symbol definition and random weight of a network""" """Get the symbol definition and random weight of a network"""
shape = {"data": (batch_size, 3, 224, 224)} input_shape = (batch_size, 3, 224, 224)
output_shape = (batch_size, 1000) output_shape = (batch_size, 1000)
if name =='resnet-18': if name =='resnet-18':
...@@ -90,7 +90,7 @@ def get_network(name, batch_size): ...@@ -90,7 +90,7 @@ def get_network(name, batch_size):
else: else:
raise ValueError("Unsupported network: " + name) raise ValueError("Unsupported network: " + name)
return net, params, shape, output_shape return net, params, input_shape, output_shape
################################################################# #################################################################
# Start RPC Tracker # Start RPC Tracker
...@@ -226,8 +226,8 @@ tuning_option = { ...@@ -226,8 +226,8 @@ tuning_option = {
def tune_tasks(tasks, def tune_tasks(tasks,
measure_option, measure_option,
tuner='xgb', tuner='xgb',
n_trial=500, n_trial=1000,
early_stopping=200, early_stopping=None,
log_filename='tuning.log', log_filename='tuning.log',
use_transfer_learning=True, use_transfer_learning=True,
try_winograd=True): try_winograd=True):
...@@ -283,10 +283,10 @@ def tune_tasks(tasks, ...@@ -283,10 +283,10 @@ def tune_tasks(tasks,
def tune_and_evaluate(): def tune_and_evaluate():
# extract workloads from nnvm graph # extract workloads from nnvm graph
print("Extract tasks...") print("Extract tasks...")
net, params, shape, out_shape = get_network(network, batch_size=1) net, params, input_shape, out_shape = get_network(network, batch_size=1)
tasks = autotvm.task.extract_from_graph(net, shape=shape, dtype=dtype, tasks = autotvm.task.extract_from_graph(net, target=target,
symbols=(nnvm.sym.conv2d,), shape={'data': input_shape}, dtype=dtype,
target=target) symbols=(nnvm.sym.conv2d,))
# run tuning tasks # run tuning tasks
print("Tuning...") print("Tuning...")
...@@ -298,7 +298,7 @@ def tune_and_evaluate(): ...@@ -298,7 +298,7 @@ def tune_and_evaluate():
with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']): with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
graph, lib, params = nnvm.compiler.build( graph, lib, params = nnvm.compiler.build(
net, target=target, net, target=target,
shape=shape, params=params, dtype=dtype) shape={'data': input_shape}, params=params, dtype=dtype)
# export library # export library
tmp = tempdir() tmp = tempdir()
...@@ -319,7 +319,7 @@ def tune_and_evaluate(): ...@@ -319,7 +319,7 @@ def tune_and_evaluate():
# upload parameters to device # upload parameters to device
ctx = remote.context(str(target), 0) ctx = remote.context(str(target), 0)
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
data_tvm = tvm.nd.array((np.random.uniform(size=shape['data'])).astype(dtype)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
module = runtime.create(graph, rlib, ctx) module = runtime.create(graph, rlib, ctx)
module.set_input('data', data_tvm) module.set_input('data', data_tvm)
module.set_input(**rparams) module.set_input(**rparams)
...@@ -341,35 +341,33 @@ def tune_and_evaluate(): ...@@ -341,35 +341,33 @@ def tune_and_evaluate():
# ------------- # -------------
# The tuning needs to train xgboost models and use them for prediction. # The tuning needs to train xgboost models and use them for prediction.
# So a high performance CPU is recommended. # So a high performance CPU is recommended.
# It takes about 1.5 hour on a 32T AMD Ryzen CPU. # It takes about 2 hours on a 32T AMD Ryzen CPU.
# One sample output is # One sample output is
# #
# .. code-block:: bash # .. code-block:: bash
# #
# Extract tasks... # Extract tasks...
# Tuning... # Tuning...
# [Task 1/16] Current/Best: 13.15/ 20.49 GFLOPS | Progress: (297/1000) | 348.51 s Done. # [Task 1/16] Current/Best: 18.85/ 19.67 GFLOPS | Progress: (353/1000) | 387.05 s Done.
# [Task 2/16] Current/Best: 16.66/ 22.64 GFLOPS | Progress: (475/1000) | 415.42 s Done. # [Task 2/16] Current/Best: 16.10/ 23.50 GFLOPS | Progress: (444/1000) | 379.99 s Done.
# [Task 3/16] Current/Best: 10.33/ 14.19 GFLOPS | Progress: (306/1000) | 239.61 s Done. # [Task 3/16] Current/Best: 5.49/ 13.96 GFLOPS | Progress: (610/1000) | 485.87 s Done.
# [Task 4/16] Current/Best: 13.29/ 20.88 GFLOPS | Progress: (242/1000) | 227.48 s Done. # [Task 4/16] Current/Best: 10.07/ 20.48 GFLOPS | Progress: (430/1000) | 391.66 s Done.
# [Task 5/16] Current/Best: 13.28/ 15.61 GFLOPS | Progress: (237/1000) | 191.56 s Done. # [Task 5/16] Current/Best: 11.50/ 15.50 GFLOPS | Progress: (374/1000) | 356.03 s Done.
# [Task 6/16] Current/Best: 20.16/ 23.86 GFLOPS | Progress: (315/1000) | 304.31 s Done. # [Task 6/16] Current/Best: 10.76/ 23.77 GFLOPS | Progress: (526/1000) | 526.42 s Done.
# [Task 7/16] Current/Best: 9.22/ 22.00 GFLOPS | Progress: (458/1000) | 433.26 s Done. # [Task 7/16] Current/Best: 12.71/ 22.03 GFLOPS | Progress: (341/1000) | 322.96 s Done.
# [Task 8/16] Current/Best: 14.12/ 17.80 GFLOPS | Progress: (270/1000) | 240.73 s Done. # [Task 8/16] Current/Best: 8.60/ 17.91 GFLOPS | Progress: (272/1000) | 236.08 s Done.
# [Task 9/16] Current/Best: 14.59/ 24.02 GFLOPS | Progress: (209/1000) | 213.61 s Done. # [Task 9/16] Current/Best: 15.37/ 23.62 GFLOPS | Progress: (275/1000) | 275.18 s Done.
# [Task 10/16] Current/Best: 9.86/ 21.74 GFLOPS | Progress: (367/1000) | 359.93 s Done. # [Task 10/16] Current/Best: 6.62/ 23.01 GFLOPS | Progress: (330/1000) | 315.02 s Done.
# [Task 11/16] Current/Best: 5.01/ 18.86 GFLOPS | Progress: (202/1000) | 191.18 s Done. # [Task 11/16] Current/Best: 1.85/ 21.39 GFLOPS | Progress: (281/1000) | 239.19 s Done.
# [Task 12/16] Current/Best: 8.61/ 25.23 GFLOPS | Progress: (220/1000) | 220.74 s Done. # [Task 12/16] Current/Best: 15.41/ 24.02 GFLOPS | Progress: (258/1000) | 270.82 s Done.
# [Task 13/16] Current/Best: 10.87/ 25.79 GFLOPS | Progress: (465/1000) | 902.14 s Done. # [Task 13/16] Current/Best: 17.96/ 25.79 GFLOPS | Progress: (380/1000) | 738.29 s Done.
# [Task 14/16] Current/Best: 15.33/ 29.38 GFLOPS | Progress: (239/1000) | 481.33 s Done. # [Task 14/16] Current/Best: 14.81/ 31.17 GFLOPS | Progress: (413/1000) | 799.21 s Done.
# [Task 15/16] Current/Best: 12.09/ 38.60 GFLOPS | Progress: (476/1000) | 928.35 s Done. # [Task 15/16] Current/Best: 24.39/ 40.97 GFLOPS | Progress: (355/1000) | 700.25 s Done.
# [Task 16/16] Current/Best: 16.77/ 47.08 GFLOPS | Progress: (255/1000) | 439.91 s Done. # [Task 16/16] Current/Best: 9.42/ 49.90 GFLOPS | Progress: (348/1000) | 603.84 s Done.
# Compile... # Compile...
# Upload... # Upload...
# Evaluate inference time cost... # Evaluate inference time cost...
# Mean inference time (std dev): 156.51 ms (0.89 ms) # Mean inference time (std dev): 157.29 ms (1.74 ms)
#
###################################################################### ######################################################################
# #
......
...@@ -109,7 +109,7 @@ print(out.asnumpy().flatten()[0:10]) ...@@ -109,7 +109,7 @@ print(out.asnumpy().flatten()[0:10])
# Save and Load Compiled Module # Save and Load Compiled Module
# ----------------------------- # -----------------------------
# We can also save the graph, lib and parameters into files and load them # We can also save the graph, lib and parameters into files and load them
# back in development environment. # back in deploy environment.
#################################################### ####################################################
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment