Commit a02916b5 by hlu1 Committed by Tianqi Chen

winograd_nnpack (#2721)

parent 7f942474
......@@ -155,6 +155,24 @@ struct Conv2DWinogradAttrs : public tvm::AttrsNode<Conv2DWinogradAttrs> {
/*! \brief Attributes used in winograd weight transformation operators */
struct Conv2DWinogradNNPACKWeightTransformAttrs
: public tvm::AttrsNode<Conv2DWinogradNNPACKWeightTransformAttrs> {
int convolution_algorithm;
DataType out_dtype;
"relay.attrs.Conv2DWinogradNNPACKWeightTransformAttrs") {
"The convolution algorithm for Winograd NNPACK. "
"E.g. tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8 for WT_8x8, "
"tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16 for WT_8x8_FP16");
.describe("Output data type, set to explicit type under mixed precision setting");
/*! \brief Attributes used in softmax operators */
struct SoftmaxAttrs : public tvm::AttrsNode<SoftmaxAttrs> {
......@@ -183,6 +183,26 @@ struct WinogradWeightTransformParam : public dmlc::Parameter<WinogradWeightTrans
static const constexpr int kWeight = 0;
struct WinogradNNPACKWeightTransformParam
: public dmlc::Parameter<WinogradNNPACKWeightTransformParam> {
int convolution_algorithm;
int out_dtype;
DMLC_DECLARE_PARAMETER(WinogradNNPACKWeightTransformParam) {
"The convolution algorithm for Winograd NNPACK. "
"E.g. tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8 for WT_8x8, "
"tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16 for WT_8x8_FP16");
.add_enum("same", -1)
.describe("Output data type, set to explicit type under mixed precision setting");
static const constexpr int kWeight = 0;
struct WinogradConv2DParam : public dmlc::Parameter<WinogradConv2DParam> {
int channels;
TShape kernel_size;
......@@ -161,6 +161,10 @@ def alter_conv2d_layout(attrs, inputs, tinfos):
sym.contrib_conv2d_winograd_weight_transform = \
sym.contrib_conv2d_winograd_nnpack_without_weight_transform = \
sym.contrib_conv2d_winograd_nnpack_weight_transform = \
sym.nn = sym
# map relay argument names to nnvm argument names
......@@ -274,6 +278,49 @@ reg.register_pattern("_contrib_conv2d_winograd_without_weight_transform",
def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, _):
    """Compute definition of the Winograd NNPACK weight transform.

    Reads ``convolution_algorithm`` and ``out_dtype`` from ``attrs`` and forwards
    the weight tensor (``inputs[0]``) to
    ``topi.nn.conv2d_winograd_nnpack_weight_transform``.
    """
    convolution_algorithm = attrs.get_int('convolution_algorithm')
    out_dtype = attrs.get_str('out_dtype')
    # "same" is a placeholder rather than a real dtype: resolve it to the weight's
    # dtype, as compute_contrib_conv2d_winograd_nnpack_without_weight_transform does.
    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
    return topi.nn.conv2d_winograd_nnpack_weight_transform(
        inputs[0], convolution_algorithm, out_dtype)
def schedule_contrib_conv2d_winograd_nnpack_weight_transform(attrs, outs, target):
    """Schedule definition of the Winograd NNPACK weight transform.

    Delegates to the generic topi schedule for ``outs``; ``attrs`` and ``target``
    are not read by this body.
    """
    schedule = topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs)
    return schedule
reg.register_pattern("_contrib_conv2d_winograd_nnpack_weight_transform", OpPattern.OPAQUE)
def compute_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, inputs, _):
    """Compute definition of conv2d with Winograd NNPACK on a pre-transformed weight.

    ``inputs`` holds the data tensor, the transformed weight and, when the
    ``use_bias`` attribute is set, the bias tensor.
    """
    pad = attrs.get_int_tuple("padding")
    stride = attrs.get_int_tuple("strides")
    dilate = attrs.get_int_tuple("dilation")
    num_groups = attrs.get_int("groups")
    data_layout = attrs.get_str("layout")
    dtype = attrs.get_str("out_dtype")
    if dtype == "same":
        # "same" means: reuse the dtype of the data tensor.
        dtype = inputs[0].dtype

    assert dilate == (1, 1), "Do not support dilate now"
    assert num_groups == 1, "Do not supoort arbitrary group number"

    # pylint: disable=assignment-from-no-return
    return topi.nn.conv2d_winograd_nnpack_without_weight_transform(
        inputs[0], inputs[1], inputs[2] if attrs.get_bool("use_bias") else None,
        stride, pad, dilate, data_layout, dtype)
def schedule_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, outs, target):
    """Schedule definition of conv2d with Winograd NNPACK on a pre-transformed weight.

    Delegates to the generic topi schedule for ``outs``; ``attrs`` and ``target``
    are not read by this body.
    """
    schedule = topi.generic.schedule_conv2d_winograd_nnpack_without_weight_transform(outs)
    return schedule
# conv2d_transpose
def compute_conv2d_transpose(attrs, inputs, _):
......@@ -130,13 +130,14 @@ inline bool Conv2DInferShape(const nnvm::NodeAttrs& attrs,
return true;
template<class Param>
inline bool WinogradConv2DInferShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape>* in_shape,
std::vector<TShape>* out_shape) {
static const Layout kNCHW("NCHW");
static const Layout kOIHW("OIHW");
const WinogradConv2DParam& param = nnvm::get<WinogradConv2DParam>(attrs.parsed);
const Param& param = nnvm::get<Param>(attrs.parsed);
const Layout in_layout(param.layout);
const Layout kernel_layout(param.kernel_layout);
......@@ -403,7 +404,7 @@ NNVM_REGISTER_OP(_contrib_conv2d_winograd_without_weight_transform)
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<WinogradConv2DParam>)
.set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<WinogradConv2DParam>)
.set_attr<FInferShape>("FInferShape", WinogradConv2DInferShape)
.set_attr<FInferShape>("FInferShape", WinogradConv2DInferShape<WinogradConv2DParam>)
.set_attr<FInferType>("FInferType", Conv2DInferType<WinogradConv2DParam>)
.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout<WinogradConv2DParam>)
......@@ -412,6 +413,82 @@ NNVM_REGISTER_OP(_contrib_conv2d_winograd_without_weight_transform)
inline bool Conv2DWinogradNNPACKWTInferType(const nnvm::NodeAttrs& attrs,
std::vector<int>* in_type,
std::vector<int>* out_type) {
const WinogradNNPACKWeightTransformParam& param =
CHECK_EQ(in_type->size(), 1U) << "Input:[weight]";
CHECK_EQ(out_type->size(), 1U);
if (param.out_dtype != -1) {
NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_type, 0, param.out_dtype);
} else {
ElemwiseType<1, 1>(attrs, in_type, out_type);
return true;
.describe(R"code(Weight transformation of winograd fast convolution algorithm.
Separate this into another nnvm symbol in order to enable Precompute Pass to compute the
weight transformation in advance.
- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1])
.add_argument("weight", "4D Tensor", "Weight tensor.")
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<WinogradNNPACKWeightTransformParam>)
.set_attr<FInferShape>("FInferShape", [](const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_shape,
std::vector<TShape> *out_shape) {
const TShape &wshape = (*in_shape)[0];
CHECK_EQ(wshape.ndim(), 4) << "Weight should be a 4 dimensional tensor";
TShape oshape({wshape[0], wshape[1], 8, 8});
NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape);
return true;
.set_attr<FCorrectLayout>("FCorrectLayout", [](const NodeAttrs& attrs,
std::vector<Layout> *ilayouts,
const std::vector<Layout> *last_ilayouts,
std::vector<Layout> *olayouts) {
Layout layout("OIHW");
NNVM_ASSIGN_LAYOUT(*ilayouts, 0, layout);
NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout);
return true;
.set_attr<FInferType>("FInferType", Conv2DWinogradNNPACKWTInferType)
.describe(R"code(Compute conv2d with winograd nnpack.
- **data**: Input is 4D array of shape (batch_size, in_channels, height, width)
- **weight**: Any shape
We do not check shape for this input tensor.
- **bias**: (channels,)
- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width)
.add_argument("data", "4D Tensor", "Input data.")
.add_argument("weight", "4D Tensor", "Transformed weight tensor.")
.add_argument("bias", "1D Tensor", "Bias parameter.")
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DParam>)
.set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DParam>)
.set_attr<FInferShape>("FInferShape", WinogradConv2DInferShape<Conv2DParam>)
.set_attr<FInferType>("FInferType", Conv2DInferType<Conv2DParam>)
.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout<Conv2DParam>)
.describe(R"code(2D convolution grad.
......@@ -149,11 +149,12 @@ def convolution_inference_without_weight_transform(
ins[2] if bias is not None else 0,
outs[0], padding[0], padding[1], padding[2], padding[3],
stride[0], stride[1], nthreads, algorithm), name="C")
stride[0], stride[1], nthreads, algorithm), name="C", dtype='float32')
def convolution_inference_weight_transform(
kernel, nthreads=1,
"""Create an extern op to do inference convolution of 3D tensor data and
4D tensor kernel and 1D tensor bias with nnpack.
......@@ -171,13 +172,14 @@ def convolution_inference_weight_transform(
assert algorithm in (ConvolutionAlgorithm.WT_8x8, ConvolutionAlgorithm.WT_8x8_FP16)
output_channels, input_channels, _, _ = kernel.shape
transform_tile_size = 8
if not isinstance(dtype, str):
dtype = dtype.dtype
return _api.extern(
(output_channels, input_channels, transform_tile_size, transform_tile_size),
lambda ins, outs: _intrin.call_packed(
ins[0], outs[0], nthreads, algorithm), name="transform_kernel")
ins[0], outs[0], nthreads, algorithm), name="transform_kernel", dtype=dtype)
......@@ -326,6 +326,58 @@ def schedule_contrib_conv2d_winograd_weight_transform(attrs, outs, target):
# winograd nnpack related operators
def compute_contrib_conv2d_winograd_nnpack_without_weight_transform(
        attrs, inputs, out_dtype, target):
    """Compute definition of conv2d_winograd_nnpack_without_weight_transform

    ``inputs[0]`` is the data tensor and ``inputs[1]`` the pre-transformed weight.
    The ``out_dtype`` argument is overridden by the ``out_dtype`` attribute; an
    empty attribute value falls back to the dtype of the data tensor.
    Returns a single-element list holding the output tensor.
    """
    # pylint: disable=assignment-from-no-return
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    data_layout = attrs.get_str("data_layout")
    out_dtype = attrs.get_str("out_dtype")
    out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype
    assert dilation == (1, 1), "Do not support dilate now"
    assert groups == 1, "Do not supoort arbitrary group number"
    # No bias
    out = topi.nn.conv2d_winograd_nnpack_without_weight_transform(
        inputs[0], inputs[1], None, strides, padding, dilation, data_layout,
        out_dtype)
    return [out]
def schedule_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, outs, target):
    """Schedule definition of conv2d_winograd_nnpack_without_weight_transform"""
    # The generic schedule is looked up while ``target`` is the active context.
    with target:
        sch = topi.generic.schedule_conv2d_winograd_nnpack_without_weight_transform(outs)
    return sch
def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, out_dtype, target):
    """Compute definition of contrib_conv2d_winograd_nnpack_weight_transform"""
    algorithm = attrs.get_int('convolution_algorithm')
    weight = inputs[0]
    # The result is wrapped in a single-element list.
    return [topi.nn.conv2d_winograd_nnpack_weight_transform(weight, algorithm, out_dtype)]
def schedule_contrib_conv2d_winograd_nnpack_weight_transform(attrs, outs, target):
    """Schedule definition of contrib_conv2d_winograd_nnpack_weight_transform"""
    # The generic schedule is looked up while ``target`` is the active context.
    with target:
        sch = topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs)
    return sch
def compute_contrib_conv2d_NCHWc(attrs, inputs, out_dtype, target):
"""Compute definition of conv2d NCHWc"""
#pylint: disable=invalid-name, too-many-lines
"""Neural network operations."""
from __future__ import absolute_import as _abs
from ...expr import TupleWrapper
......@@ -862,6 +863,72 @@ def contrib_conv2d_winograd_without_weight_transform(data,
kernel_layout, out_layout, out_dtype)
def contrib_conv2d_winograd_nnpack_without_weight_transform(data,
strides=(1, 1),
padding=(0, 0),
dilation=(1, 1),
r"""2D convolution with the NNPACK implementation of winograd algorithm.
The basic parameters are the same as the ones in vanilla conv2d.
It assumes the weight is pre-transformed by nn.contrib_conv2d_winograd_nnpack_weight_transform
data : tvm.relay.Expr
The input data to the operator.
weight : tvm.relay.Expr
The weight expressions.
strides : tuple of int, optional
The strides of convolution.
padding : tuple of int, optional
The padding of convolution on both sides of inputs before convolution.
dilation : tuple of int, optional
Specifies the dilation rate to be used for dilated convolution.
groups : int, optional
Number of groups for grouped convolution.
channels : int, optional
Number of output channels of this convolution.
kernel_size : tuple of int, optional
The spatial dimensions of the convolution kernel.
data_layout : str, optional
Layout of the input.
kernel_layout : str, optional
Layout of the weight.
out_layout : str, optional
Layout of the output, by default, out_layout is the same as data_layout
out_dtype : str, optional
Specifies the output data type for mixed precision conv2d.
result : tvm.relay.Expr
The computed result.
return _make.contrib_conv2d_winograd_nnpack_without_weight_transform(
data, weight, strides, padding, dilation,
groups, channels, kernel_size, data_layout,
kernel_layout, out_layout, out_dtype)
def contrib_conv2d_nchwc(data,
strides=(1, 1),
......@@ -1013,3 +1080,28 @@ def contrib_conv2d_winograd_weight_transform(weight,
The computed result.
return _make.contrib_conv2d_winograd_weight_transform(weight, tile_size)
def contrib_conv2d_winograd_nnpack_weight_transform(weight,
r"""Weight Transformation part for 2D convolution with winograd algorithm.
We separate this as a single op to enable pre-compute for inference.
Use this together with nn.contrib_conv2d_winograd_nnpack_without_weight_transform
weight : tvm.relay.Expr
The weight expressions.
convolution_algorithm : int
The convolution algorithm for Winograd NNPACK, e.g. tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8.
result : tvm.relay.Expr
The computed result.
return _make.contrib_conv2d_winograd_nnpack_weight_transform(
weight, convolution_algorithm, out_dtype)
......@@ -19,5 +19,10 @@ class Conv2DWinogradWeightTransformAttrs(Attrs):
class Conv2DWinogradNNPACKWeightTransformAttrs(Attrs):
"""Attribute of nn.contrib_conv2d_winograd_nnpack_weight_transform"""
class GlobalPool2DAttrs(Attrs):
"""Attribute of nn.global_pool"""
......@@ -189,20 +189,20 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra
CHECK(workspace_buffer != nullptr);
for (auto n = 0; n < input->shape[0]; ++n) {
nnp_status status = nnp_convolution_inference(
algo, nnp_convolution_transform_strategy_reuse, input_channels, output_channels,
input_size, input_padding, kernel_size, stride_size,
static_cast<float *>(input->data) + n * input->shape[1] *
input->shape[2] *
static_cast<float *>(transformed_kernel->data),
bias ? static_cast<float *>(bias->data) : zero_bias->data(),
static_cast<float *>(output->data) + n * output->shape[1] *
output->shape[2] *
workspace_buffer, &workspace_size,
nnp_activation_identity, nullptr, entry->threadpool, nullptr);
CHECK_EQ(status, nnp_status_success);
nnp_status status = nnp_convolution_inference(
algo, nnp_convolution_transform_strategy_reuse, input_channels, output_channels,
input_size, input_padding, kernel_size, stride_size,
static_cast<float *>(input->data) + n * input->shape[1] *
input->shape[2] *
static_cast<float *>(transformed_kernel->data),
bias ? static_cast<float *>(bias->data) : zero_bias->data(),
static_cast<float *>(output->data) + n * output->shape[1] *
output->shape[2] *
workspace_buffer, &workspace_size,
nnp_activation_identity, nullptr, entry->threadpool, nullptr);
CHECK_EQ(status, nnp_status_success);
cpu_api->FreeWorkspace(ctx, workspace_buffer);
......@@ -344,6 +344,7 @@ v (batch_size, channels, out_height, out_width) if `layout` is `NCHW`
// relay.nn.contrib_conv2d_winograd_without_weight_transform
template<class Param>
bool Conv2DWinogradRel(const Array<Type>& types,
int num_inputs,
const Attrs& attrs,
......@@ -354,7 +355,7 @@ bool Conv2DWinogradRel(const Array<Type>& types,
static const Layout kNCHW("NCHW");
static const Layout kOIHW("OIHW");
const Conv2DWinogradAttrs* param =<Conv2DWinogradAttrs>();
const Param* param =<Param>();
CHECK(param != nullptr);
const Layout in_layout(param->data_layout);
const Layout kernel_layout(param->kernel_layout);
......@@ -467,7 +468,7 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_without_weight_transform")
.add_argument("data", "Tensor", "The input tensor.")
.add_argument("weight", "Tensor", "The weight tensor.")
.add_type_rel("Conv2DWinograd", Conv2DWinogradRel)
.add_type_rel("Conv2DWinograd", Conv2DWinogradRel<Conv2DWinogradAttrs>)
......@@ -511,8 +512,8 @@ Expr MakeConv2DWinogradWeightTransform(Expr weight,
.set_body([](const TVMArgs& args, TVMRetValue* rv) {
runtime::detail::unpack_call<Expr, 2>(MakeConv2DWinogradWeightTransform, args, rv);
runtime::detail::unpack_call<Expr, 2>(MakeConv2DWinogradWeightTransform, args, rv);
......@@ -530,6 +531,124 @@ weight transformation in advance.
.add_type_rel("Conv2DWinogradWeightTransform", Conv2DWinogradWeightTransformRel);
// Positional relay function to create conv2d winograd nnpack operator
// used by frontend FFI.
Expr MakeConv2DWinogradNNPACK(Expr data,
Expr weight,
Array<IndexExpr> strides,
Array<IndexExpr> padding,
Array<IndexExpr> dilation,
int groups,
IndexExpr channels,
Array<IndexExpr> kernel_size,
std::string data_layout,
std::string kernel_layout,
std::string out_layout,
DataType out_dtype) {
auto attrs = make_node<Conv2DAttrs>();
attrs->strides = std::move(strides);
attrs->padding = std::move(padding);
attrs->dilation = std::move(dilation);
attrs->groups = groups;
attrs->channels = channels;
attrs->kernel_size = std::move(kernel_size);
attrs->data_layout = std::move(data_layout);
attrs->kernel_layout = std::move(kernel_layout);
attrs->out_layout = std::move(out_layout);
attrs->out_dtype = std::move(out_dtype);
static const Op& op = Op::Get("nn.contrib_conv2d_winograd_nnpack_without_weight_transform");
return CallNode::make(op, {data, weight}, Attrs(attrs), {});
.set_body([](const TVMArgs& args, TVMRetValue* rv) {
runtime::detail::unpack_call<Expr, 12>(MakeConv2DWinogradNNPACK, args, rv);
.describe(R"code(Compute conv2d with winograd nnpack. Only supports NCHW layout.
This operator assumes the weight tensor is already pre-transformed by
- **data**: Input is 4D array of shape (batch_size, in_channels, height, width)
- **weight**: Any shape
We do not check the shape for this input tensor. Since different backend
has different layout strategy.
- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width)
.add_argument("data", "Tensor", "The input tensor.")
.add_argument("weight", "Tensor", "The weight tensor.")
.add_type_rel("Conv2DWinogradNNPACKRel", Conv2DWinogradRel<Conv2DAttrs>)
.set_attr<FInferCorrectLayout>("FInferCorrectLayout", Conv2DInferCorrectLayout<Conv2DAttrs>);
// relay.nn.contrib_conv2d_winograd_nnpack_weight_transform
bool Conv2DWinogradNNPACKWeightTransformRel(const Array<Type>& types,
int num_inputs,
const Attrs& attrs,
const TypeReporter& reporter) {
CHECK_EQ(types.size(), 2);
const auto* data = types[0].as<TensorTypeNode>();
if (data == nullptr) {
return false;
const Conv2DWinogradNNPACKWeightTransformAttrs* param =<Conv2DWinogradNNPACKWeightTransformAttrs>();
CHECK(param != nullptr);
CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout";
std::vector<IndexExpr> oshape{
DataType out_dtype = param->out_dtype;
if (out_dtype.bits() == 0) {
out_dtype = data->dtype;
reporter->Assign(types[1], TensorTypeNode::make(Array<IndexExpr>(oshape), out_dtype));
return true;
Expr MakeConv2DWinogradNNPACKWeightTransform(Expr weight,
int convolution_algorithm,
DataType out_dtype) {
auto attrs = make_node<Conv2DWinogradNNPACKWeightTransformAttrs>();
attrs->convolution_algorithm = convolution_algorithm;
attrs->out_dtype = std::move(out_dtype);
static const Op& op = Op::Get("nn.contrib_conv2d_winograd_nnpack_weight_transform");
return CallNode::make(op, {weight}, Attrs(attrs), {});
.set_body([](const TVMArgs& args, TVMRetValue* rv) {
runtime::detail::unpack_call<Expr, 3>(MakeConv2DWinogradNNPACKWeightTransform, args, rv);
.describe(R"code(Weight transformation of winograd fast convolution algorithm with NNPACK.
Separate this into another symbol in order to enable Precompute Pass to compute the
weight transformation in advance.
- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1])
.add_argument("weight", "Tensor", "The weight tensor.")
.add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel);
// Positional relay function to create conv2d NCHWc operator
// used by frontend FFI.
Expr MakeConv2DNCHWc(Expr data,
import numpy as np
import tvm
from tvm import autotvm
from import FallbackConfigEntity
from tvm.contrib import nnpack
from tvm.contrib.pickle_memoize import memoize
import topi
import topi.testing
from topi.util import get_const_tuple
from nose import SkipTest
def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False,
devices=['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']):
print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
in_height = in_width = in_size
A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
bias = tvm.placeholder((num_filter, 1, 1), name='bias')
a_shape = get_const_tuple(A.shape)
w_shape = get_const_tuple(W.shape)
bias_shape = get_const_tuple(bias.shape)
dtype = A.dtype
def get_ref_data():
a_np = np.random.uniform(size=a_shape).astype(dtype)
w_np = np.random.uniform(size=w_shape).astype(dtype)
b_np = np.random.uniform(size=bias_shape).astype(dtype)
dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
if add_bias:
b_np = np.random.uniform(size=bias_shape).astype(dtype)
c_np += b_np
if add_relu:
c_np = np.maximum(c_np, 0)
return a_np, w_np, b_np, c_np
a_np, w_np, b_np, c_np = get_ref_data()
def check_device(device):
ctx = tvm.context(device, 0)
if not ctx.exist:
raise SkipTest("Skip because %s is not enabled" % device)
print("Running on target: %s" % device)
C = topi.nn.conv2d(A, W, stride, padding, dilation, layout='NCHW', out_dtype=dtype)
if add_bias:
C = topi.add(C, bias)
if add_relu:
C = topi.nn.relu(C)
s = topi.generic.schedule_conv2d_nchw([C])
a = tvm.nd.array(a_np, ctx)
w = tvm.nd.array(w_np, ctx)
b = tvm.nd.array(b_np, ctx)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
if add_bias:
func =, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
func(a, w, b, c)
func =, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
func(a, w, c)
tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4)
for device in devices:
class WinogradFallback(autotvm.FallbackContext):
    """Fallback context that answers every query with a config whose template key
    is 'winograd_nnpack_fp32', keeping one config per (target, workload) pair in
    ``self.memory``."""

    def _query_inside(self, target, workload):
        lookup = (target, workload)
        if lookup not in self.memory:
            # First query for this pair: build the config and remember it.
            fallback = FallbackConfigEntity()
            fallback.template_key = 'winograd_nnpack_fp32'
            self.memory[lookup] = fallback
            return fallback
        return self.memory[lookup]
def test_conv2d_nchw():
if not tvm.get_global_func("tvm.contrib.nnpack.convolution_inference_without_weight_transform", True):
raise SkipTest("skip because extern function is not available")
if not nnpack.is_available():
raise SkipTest("skip because nnpack is not available")
devices = ['llvm -device=arm_cpu']
autotvm.DispatchContext.current.silent = True
with WinogradFallback():
# resnet 18 workloads
verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, devices=devices)
verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1, devices=devices)
verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1, devices=devices)
verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1, devices=devices)
# unet workloads
verify_conv2d_nchw(1, 3, 192, 12, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 4, 192, 12, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 12, 96, 24, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 24, 48, 48, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 48, 24, 96, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 96, 12, 180, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 180, 6, 220, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 220, 6, 180, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 180, 12, 96, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 96, 24, 48, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 48, 48, 24, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 24, 96, 12, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 12, 192, 1, 3, 1, 1, add_bias=True, devices=devices)
# relu, bias
verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True, devices=devices)
verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_relu=True, devices=devices)
verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_relu=True, add_bias=True, devices=devices)
# werid workloads
verify_conv2d_nchw(1, 3, 3, 3, 3, 1, 1, devices=devices)
verify_conv2d_nchw(1, 13, 71, 59, 3, 1, 1, devices=devices)
if __name__ == "__main__":
import nose
......@@ -122,6 +122,39 @@ def schedule_conv2d_winograd_without_weight_transform(outs):
def schedule_conv2d_winograd_nnpack_weight_transform(outs):
    """Schedule for weight transformation of winograd

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of this operator
        in the format of an array of tensors.

    Returns
    -------
    sch: Schedule
        The computation schedule for the op.
    """
    # Typically this is computed in nnvm PreCompute pass
    return tvm.create_schedule([tensor.op for tensor in outs])
def schedule_conv2d_winograd_nnpack_without_weight_transform(outs):
    """Schedule for winograd without weight transformation

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of this operator
        in the format of an array of tensors.

    Returns
    -------
    sch: Schedule
        The computation schedule for the op.
    """
    schedule = _default_schedule(outs, False)
    return schedule
def schedule_conv2d_transpose_nchw(outs):
"""Schedule for conv2d_transpose_nchw
......@@ -410,6 +410,48 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding, di
raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform")
def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, out_dtype):
    """Weight transformation for winograd

    Parameters
    ----------
    kernel: Tensor
        The raw 4-D kernel tensor, unpacked by nnpack as
        [num_filter, in_channel, kernel_h, kernel_w].
        Only 3x3 kernel is supported for now.
    convolution_algorithm: int
        The convolution algorithm for Winograd NNPACK.
    out_dtype: str
        Output data type, forwarded as ``dtype`` to
        nnpack.convolution_inference_weight_transform.

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [num_filter, in_channel, 8, 8]
    """
    # Imported lazily, inside the function, as in the original.
    from tvm.contrib import nnpack
    return nnpack.convolution_inference_weight_transform(
        kernel, algorithm=convolution_algorithm, dtype=out_dtype)
def conv2d_winograd_nnpack_without_weight_transform(
        input, filter, bias, strides, padding, dilation, layout, out_dtype):
    """Compute convolution in winograd algorithm. The filter is supposed to be transformed
    in advance.

    This body only raises; it is the fallback reached when no implementation
    has been registered.

    Parameters
    ----------
    input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]
    filter : tvm.Tensor
        4-D with shape [num_filter, in_channel, 8, 8]
    bias : tvm.Tensor
        1-D with shape [num_filter]
    strides : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]
    padding : int or str
        Padding size, or ['VALID', 'SAME']
    dilation : presumably int or a list/tuple of two ints -- TODO confirm
        Dilation size.
    layout : str
        Data layout; the relay operator description states only NCHW is supported.
    out_dtype : str
        Output data type.

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]

    Raises
    ------
    ValueError
        Always, from this body.
    """
    # The message names this operator; it previously named the non-NNPACK one.
    raise ValueError(
        "missing register for topi.nn.conv2d_winograd_nnpack_without_weight_transform")
def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None):
"""Group convolution operator in NCHW layout.
......@@ -10,7 +10,8 @@ from tvm.contrib.pickle_memoize import memoize
from topi.util import get_const_tuple
def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False,
devices=['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']):
print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
in_height = in_width = in_size
......@@ -67,7 +68,7 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p
tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
for device in ['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']:
for device in devices:
