Commit 5712ea6b
Authored Nov 12, 2018 by Yizhi Liu; committed by Tianqi Chen on Nov 12, 2018
[TOPI] depthwise-conv2d in NCHW[x]c layout for x86 (#2045)
Parent: b88065a8
Showing 13 changed files with 509 additions and 58 deletions (+509 / -58).
  nnvm/python/nnvm/top/nn.py                         +10   -1
  nnvm/src/top/nn/convolution.cc                      +2   -0
  topi/python/topi/generic/nn.py                     +17   -0
  topi/python/topi/nn/depthwise_conv2d.py            +64   -1
  topi/python/topi/x86/__init__.py                    +1   -0
  topi/python/topi/x86/conv2d.py                     +57  -41
  topi/python/topi/x86/conv2d_avx_1x1.py              +3   -1
  topi/python/topi/x86/conv2d_avx_common.py           +3   -1
  topi/python/topi/x86/depthwise_conv2d.py (new)    +203   -0
  topi/python/topi/x86/util.py (new)                 +12   -0
  topi/tests/python/test_topi_conv2d_NCHWc.py        +12  -12
  topi/tests/python/test_topi_depthwise_conv2d.py   +116   -0
  tutorials/autotvm/tune_nnvm_x86.py                  +9   -1
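For readers unfamiliar with the layout name: NCHW[x]c splits the channel axis of an NCHW tensor into an outer channel-chunk axis and an innermost channel-block axis of width x (e.g. NCHW8c), so the block can be vectorized on x86. A minimal NumPy sketch of the data transform, mirroring the _transform_data helpers added in the tests below (the array shape and block size 8 here are illustrative only):

import numpy as np

def nchw_to_nchwc(data, bn):
    # NCHW -> NCHW[x]c: split C into (C // bn, bn) and move the block innermost
    batch, channel, height, width = data.shape
    data = np.reshape(data, (batch, channel // bn, bn, height, width))
    return np.transpose(data, (0, 1, 3, 4, 2))

x = np.random.uniform(size=(1, 32, 56, 56)).astype("float32")
print(nchw_to_nchwc(x, 8).shape)  # (1, 4, 56, 56, 8)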
nnvm/python/nnvm/top/nn.py

@@ -4,7 +4,7 @@ from __future__ import absolute_import
 import tvm
 import topi
-from topi.util import get_const_int
+from topi.util import get_const_int, get_const_tuple
 from .tensor import _fschedule_broadcast, _fschedule_injective
 from . import registry as reg
 from .registry import OpPattern

@@ -164,16 +164,22 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
     padding = attrs.get_int_tuple("padding")
     strides = attrs.get_int_tuple("strides")
     dilation = attrs.get_int_tuple("dilation")
+    out_channel = attrs.get_int("channels")
     groups = attrs.get_int("groups")
     layout = attrs.get_string("layout")
     out_layout = attrs.get_string("out_layout")
     out_dtype = attrs.get_string("out_dtype")
     out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
+    _, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape)
+    in_channel = in_channel_chunk * in_channel_block
     assert dilation == (1, 1), "not support dilate now"
     if groups == 1:
         # pylint: disable=assignment-from-no-return
         out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation,
                                    layout, out_layout, out_dtype)
+    elif groups == in_channel and groups == out_channel:
+        out = topi.nn.depthwise_conv2d_NCHWc(inputs[0], inputs[1], strides, padding,
+                                             dilation, layout, out_layout, out_dtype)
         # pylint: enable=assignment-from-no-return
     else:
         raise ValueError("not support arbitrary group number > 1 for now")

@@ -187,9 +193,12 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
 def schedule_contrib_conv2d_NCHWc(attrs, outs, target):
     """Schedule definition of conv2d NCHWc"""
     groups = attrs.get_int("groups")
+    out_channel = attrs.get_int("channels")
     with tvm.target.create(target):
         if groups == 1:
             return topi.generic.schedule_conv2d_NCHWc(outs)
+        elif groups == out_channel:
+            return topi.generic.schedule_depthwise_conv2d_NCHWc(outs)
         else:
             raise ValueError("not support group number > 1 for now")
nnvm/src/top/nn/convolution.cc

@@ -82,7 +82,9 @@ inline bool Conv2DInferShape(const nnvm::NodeAttrs& attrs,
       wshape[kernel_layout.indexof('O')] *= param.groups;

+      if (in_shape->at(Conv2DParam::kWeight).ndim() == 0) {
       NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, Conv2DParam::kWeight, wshape);
+      }
       if (param.use_bias) {
         static const Layout default_bias_layout("C");
         TShape bias_shape({param.channels});
topi/python/topi/generic/nn.py

@@ -175,6 +175,23 @@ def schedule_depthwise_conv2d_nhwc(outs):
 @tvm.target.generic_func
+def schedule_depthwise_conv2d_NCHWc(outs):
+    """Schedule for depthwise_conv2d_NCHWc
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of depthwise_conv2d_nhwc
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
+@tvm.target.generic_func
 def schedule_group_conv2d_nchw(outs):
     """Schedule for conv2d_nchw
topi/python/topi/nn/depthwise_conv2d.py

-# pylint: disable=invalid-name, unused-variable, too-many-locals
+# pylint: disable=invalid-name, unused-variable, too-many-locals, unused-argument
 """Depthwise convolution operators"""
 from __future__ import absolute_import as _abs
+from collections import namedtuple
 import tvm

 from .dilate import dilate

@@ -8,6 +9,27 @@ from .pad import pad
 from .util import get_pad_tuple
 from ..util import simplify

+# workload description of depthwise-conv2d
+Workload = namedtuple('Workload',
+                      ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter',
+                       'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
+
+def _get_workload(data, kernel, stride, padding, out_dtype):
+    """ Get the workload structure. """
+    _, in_channel, height, width = [x.value for x in data.shape]
+    channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape]
+    out_channel = channel * channel_multiplier
+    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
+    if isinstance(stride, (tuple, list)):
+        HSTR, WSTR = stride
+    else:
+        HSTR, WSTR = stride, stride
+    assert (data.dtype == kernel.dtype) or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \
+        "Do not support inputs with different data types now. '\
+        '{} vs. {}".format(data.dtype, kernel.dtype)
+    return Workload(data.dtype, out_dtype, height, width, in_channel, out_channel,
+                    kh, kw, HPAD, WPAD, HSTR, WSTR)
+
 @tvm.target.generic_func
 def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):

@@ -258,3 +280,44 @@ def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, strid
         tag='depthwise_conv2d_backward_weight_nhwc')

     return Weight_grad
+
+
+@tvm.target.generic_func
+def depthwise_conv2d_NCHWc(Input, Filter, stride, padding, dilation,
+                           layout, out_layout, out_dtype=None):
+    """Depthwise convolution NCHW[x]c forward operator.
+
+    Parameters
+    ----------
+    Input : tvm.Tensor
+        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
+
+    Filter : tvm.Tensor
+        4-D with shape [out_channel_chunk, filter_height, filter_width, out_channel_block]
+        In NCHWc depthwise convolution,
+        we group kernel's in_channel and channel_multiplier together then do the tiling.
+
+    stride : tuple of two ints
+        The spatial stride along height and width
+
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
+    layout : str
+        Input data layout
+
+    out_layout : str
+        Output data layout
+
+    out_dtype: str, optional
+        Output data type
+
+    Returns
+    -------
+    Output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    raise ValueError("missing register for topi.nn.depthwise_conv2d_NCHWc")
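As a rough usage sketch of the new operator (the shapes and block size 8 are illustrative, and the schedule call mirrors how the NNVM wrapper above invokes the generic function): once a backend registration exists, as the x86 one added later in this commit, blocked placeholders can be declared and scheduled like so:

import tvm
import topi

# 5-D input [batch, C//8, H, W, 8]; 4-D filter [out_C//8, kh, kw, 8] with channel_multiplier == 1
Input = tvm.placeholder((1, 4, 56, 56, 8), name='Input')
Filter = tvm.placeholder((4, 3, 3, 8), name='Filter')

with tvm.target.create('llvm'):
    Output = topi.nn.depthwise_conv2d_NCHWc(Input, Filter, (1, 1), (1, 1), (1, 1),
                                            'NCHW8c', 'NCHW8c', 'float32')
    s = topi.generic.schedule_depthwise_conv2d_NCHWc([Output])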
topi/python/topi/x86/__init__.py

@@ -9,3 +9,4 @@ from .nn import *
 from .injective import *
 from .pooling import schedule_pool, schedule_global_pool
 from .bitserial_conv2d import schedule_bitserial_conv2d
+from .depthwise_conv2d import schedule_depthwise_conv2d_NCHWc
topi/python/topi/x86/conv2d.py

@@ -7,36 +7,30 @@ from tvm.autotvm.task import get_config
 from .. import generic, tag
 from .. import nn
 from ..util import get_const_tuple
-from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, _get_workload
+from ..nn.conv2d import conv2d, conv2d_NCHWc, \
+    conv2d_alter_layout, _get_workload as _get_conv2d_workload
 from ..nn.dilate import dilate
+from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload
+from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, depthwise_conv2d_nchw
 from ..nn.pad import pad
 from . import conv2d_avx_1x1, conv2d_avx_common

-def _get_fp32_len():
-    fp32_vec_len = 8
-    target = tvm.target.current_target()
-    if target is not None:
-        for opt in target.options:
-            if opt == '-mcpu=skylake-avx512':
-                fp32_vec_len = 16
-    return fp32_vec_len
-
-def _get_default_config(cfg, workload):
+def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False):
     """
     Get default schedule config for the workload
-
-    Parameters
-    ----------
-    workload : topi.nn.conv2d.Workload
-        Convolution workload
     """
-    fp32_vec_len = _get_fp32_len()
-    is_kernel_1x1 = workload.hkernel == 1 and workload.wkernel == 1
-    if is_kernel_1x1:
-        conv2d_avx_1x1._fallback_schedule(cfg, workload, fp32_vec_len)
-    else:
-        conv2d_avx_common._fallback_schedule(cfg, workload, fp32_vec_len)
+    if is_depthwise:
+        wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, out_dtype)
+        from depthwise_conv2d import _fallback_schedule
+        _fallback_schedule(cfg, wkl)
+    else:
+        wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype)
+        is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1
+        if is_kernel_1x1:
+            conv2d_avx_1x1._fallback_schedule(cfg, wkl)
+        else:
+            conv2d_avx_common._fallback_schedule(cfg, wkl)

 def _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout):

@@ -74,10 +68,9 @@ def _declaration_conv(cfg, data, kernel, strides, padding, dilation, layout, out
     if layout == 'NCHW':
         _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout)
         if cfg.is_fallback:
-            wkl = _get_workload(data, kernel, strides, padding, out_dtype)
-            _get_default_config(cfg, wkl)
+            _get_default_config(cfg, data, kernel, strides, padding, out_dtype)
         return _declaration_conv_impl(cfg, data, kernel, strides,
                                       padding, dilation, layout, out_dtype)
     elif layout == 'HWCN':
         return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype)
     elif layout == 'NHWC':

@@ -295,44 +288,69 @@ def _alter_conv2d_layout(attrs, inputs, tinfo):
     copy_inputs = [s for s in inputs]
     new_attrs = {k : attrs[k] for k in attrs.keys()}
     data, kernel = tinfo[0], tinfo[1]
+    batch_size, in_channel, height, width = get_const_tuple(data.shape)

-    # only optimize for NCHW, groups=1 conv
-    if attrs['layout'] != 'NCHW' or attrs.get_int("groups") != 1:
-        return None
-
-    batch_size, in_channel, height, width = get_const_tuple(data.shape)
-    out_channel, _, kh, kw = get_const_tuple(kernel.shape)
-
+    groups = attrs.get_int("groups")
+    out_channel = attrs.get_int("channels")
     padding = attrs.get_int_tuple("padding")
     strides = attrs.get_int_tuple("strides")
     layout = attrs['layout']
+    kh, kw = attrs.get_int_tuple("kernel_size")

     dtype = data.dtype
     out_dtype = dtype if attrs["out_dtype"] == "same" else attrs["out_dtype"]
+    is_depthwise = groups == in_channel and groups == out_channel

-    workload = autotvm.task.args_to_workload(
-        [data, kernel, strides, padding, layout, out_dtype], conv2d)
+    # only optimize for NCHW
+    if layout != 'NCHW':
+        return None
+    if groups != 1 and not is_depthwise:
+        return None
+
     dispatch_ctx = autotvm.task.DispatchContext.current
     target = tvm.target.current_target()
+    # query schedule and fallback if necessary
+    workload = autotvm.task.args_to_workload(
+        [data, kernel, strides, padding, out_dtype], depthwise_conv2d_nchw) \
+        if is_depthwise else \
+        autotvm.task.args_to_workload(
+            [data, kernel, strides, padding, layout, out_dtype], conv2d)
     cfg = dispatch_ctx.query(target, workload)
     if cfg.is_fallback:
-        wkl = _get_workload(data, kernel, strides, padding, out_dtype)
-        _get_default_config(cfg, wkl)
+        _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise)

     ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

     new_attrs['layout'] = 'NCHW%dc' % ic_bn
     new_attrs['out_layout'] = 'NCHW%dc' % oc_bn

-    # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
-    new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
-
-    # Store the same config for the altered operator (workload)
     new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
                                dtype=data.dtype)
-    new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, kh, kw, ic_bn, oc_bn),
-                                 dtype=kernel.dtype)
-    new_workload = autotvm.task.args_to_workload(
-        [new_data, new_kernel, strides, padding, new_attrs['layout'],
-         new_attrs['out_layout'], out_dtype], conv2d_NCHWc)
+
+    if is_depthwise:
+        # channel, channel_multiplier, kh, kw -> out_channel_chunk, kh, kw, out_channel_block
+        # in which out_channel = merge(channel, channel_multiplier)
+        kernel_sym = copy_inputs[1]
+        kernel_sym = sym.reshape(kernel_sym, shape=(out_channel//oc_bn, oc_bn, kh, kw))
+        kernel_sym = sym.transpose(kernel_sym, axes=(0, 2, 3, 1))
+        copy_inputs[1] = kernel_sym
+
+        # Store altered operator's config
+        new_kernel = tvm.placeholder((out_channel//oc_bn, kh, kw, oc_bn), dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, new_attrs['layout'],
+             new_attrs['out_layout'], out_dtype], depthwise_conv2d_NCHWc)
+    else:
+        out_channel, _, kh, kw = get_const_tuple(kernel.shape)
+        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
+        new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
+
+        # Store altered operator's config
+        new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, kh, kw, ic_bn, oc_bn),
+                                     dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, new_attrs['layout'],
+             new_attrs['out_layout'], out_dtype], conv2d_NCHWc)
+
     dispatch_ctx.update(target, new_workload, cfg)
     return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)

@@ -354,13 +372,11 @@ def _declaration_conv_NCHWc(cfg, data, kernel, strides,
     oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape)
     num_filter = oc_chunk * oc_bn

-    # get workload and related schedule config
-    wkl = _get_workload(tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype),
-                        tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width),
-                                        dtype=kernel.dtype),
-                        strides, padding, out_dtype)
     if cfg.is_fallback:
-        _get_default_config(cfg, wkl)
+        _get_default_config(cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype),
+                            tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width),
+                                            dtype=kernel.dtype),
+                            strides, padding, out_dtype)

     # output shape
     out_height = (ih + 2 * HPAD - kernel_height) // HSTR + 1

@@ -386,7 +402,7 @@ def _declaration_conv_NCHWc(cfg, data, kernel, strides,
         n_elems = 4
         assert ic_bn % n_elems == 0
-        ic_outer = tvm.reduce_axis((0, wkl.in_filter//ic_bn), name='ic_outer')
+        ic_outer = tvm.reduce_axis((0, in_channel//ic_bn), name='ic_outer')
         ic_f_inner = tvm.reduce_axis((0, ic_bn//n_elems), name='ic_f_inner')
         ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
         return tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
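In the depthwise branch of _alter_conv2d_layout above, the kernel is rewritten from (channel, channel_multiplier, kh, kw) into the blocked (out_channel_chunk, kh, kw, out_channel_block) form via symbolic reshape and transpose. A NumPy sketch of the equivalent transform, mirroring the _transform_kernel helper added in the depthwise test later in this commit (the block size 8 is illustrative):

import numpy as np

def depthwise_kernel_to_nchwc(kernel, oc_bn):
    # (channel, channel_multiplier, kh, kw) -> (out_channel_chunk, kh, kw, out_channel_block)
    # where out_channel = channel * channel_multiplier is blocked by oc_bn
    channel, channel_multiplier, kh, kw = kernel.shape
    out_channel = channel * channel_multiplier
    kernel = np.reshape(kernel, (out_channel // oc_bn, oc_bn, kh, kw))
    return np.transpose(kernel, (0, 2, 3, 1))

w = np.random.uniform(size=(32, 1, 3, 3)).astype("float32")
print(depthwise_kernel_to_nchwc(w, 8).shape)  # (4, 3, 3, 8)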
topi/python/topi/x86/conv2d_avx_1x1.py

@@ -8,8 +8,10 @@ from ..nn.util import infer_pad
 from ..util import get_const_tuple
 from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake
+from .util import get_fp32_len

-def _fallback_schedule(cfg, wkl, simd_width):
+def _fallback_schedule(cfg, wkl):
+    simd_width = get_fp32_len()
     HPAD, WPAD = wkl.hpad, wkl.wpad
     HSTR, WSTR = wkl.hstride, wkl.wstride
     out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
topi/python/topi/x86/conv2d_avx_common.py

@@ -8,8 +8,10 @@ from ..nn.util import infer_pad
 from ..util import get_const_tuple
 from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake
+from .util import get_fp32_len

-def _fallback_schedule(cfg, wkl, simd_width):
+def _fallback_schedule(cfg, wkl):
+    simd_width = get_fp32_len()
     HPAD, WPAD = wkl.hpad, wkl.wpad
     HSTR, WSTR = wkl.hstride, wkl.wstride
     out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
topi/python/topi/x86/depthwise_conv2d.py (new file, mode 100644)

# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
"""Depthwise Conv2D schedule on x86"""
import tvm
from tvm import autotvm
from tvm.autotvm.task import get_config
from tvm.autotvm.task.space import SplitEntity
from tvm.autotvm.task.nnvm_integration import deserialize_args
from .. import generic, tag
from ..nn.pad import pad
from ..util import get_const_tuple
from ..nn.util import get_pad_tuple
from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, _get_workload
from .util import get_fp32_len

def _fallback_schedule(cfg, wkl):
    """
    Get default schedule for the workload
    Parameters
    ----------
    cfg : tvm.autotvm.task.space.FallbackConfigEntity
        Fallback config to be updated
    wkl : topi.nn.depthwise_conv2d.Workload
        Convolution workload
    """
    simd_width = get_fp32_len()

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1

    oc_bn = 1
    for bn in range(simd_width, 0, -1):
        if wkl.out_filter % bn == 0:
            oc_bn = bn
            break

    ic_bn = 1
    for bn in range(oc_bn, 0, -1):
        if wkl.in_filter % bn == 0:
            ic_bn = bn
            break

    reg_n = 1
    for n in range(31, 0, -1):
        if out_width % n == 0:
            reg_n = n
            break

    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])


@autotvm.register_topi_compute(depthwise_conv2d_NCHWc, 'cpu', 'direct')
def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation,
                                layout, out_layout, out_dtype=None):
    out_dtype = data.dtype if out_dtype is None else out_dtype
    batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape)
    out_channel_chunk, filter_height, filter_width, out_channel_block \
        = get_const_tuple(kernel.shape)

    strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    HSTR, WSTR = strides
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (filter_height, filter_width))

    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
    assert (dh, dw) == (1, 1), "Does not support dilation"

    in_channel = in_channel_chunk * in_channel_block
    out_channel = out_channel_chunk * out_channel_block
    channel_multiplier = out_channel // in_channel

    out_height = (in_height - filter_height + pad_top + pad_down) // HSTR + 1
    out_width = (in_width - filter_width + pad_left + pad_right) // WSTR + 1

    # get workload and related schedule config
    wkl = _get_workload(tvm.placeholder((batch, in_channel, in_height, in_width),
                                        dtype=data.dtype),
                        tvm.placeholder((out_channel, in_channel, filter_height, filter_width),
                                        dtype=kernel.dtype),
                        strides, padding, out_dtype)
    if cfg.is_fallback:
        _fallback_schedule(cfg, wkl)

    # padding stage
    DOPAD = (pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0)
    if DOPAD:
        pad_before = [0, 0, pad_top, pad_left, 0]
        pad_after = [0, 0, pad_down, pad_right, 0]
        data_pad = pad(data, pad_before, pad_after, name="PaddedInput")
    else:
        data_pad = data

    # depthconv stage
    kh = tvm.reduce_axis((0, filter_height), name='kh')
    kw = tvm.reduce_axis((0, filter_width), name='kw')
    Output = tvm.compute(
        (batch, out_channel_chunk, out_height, out_width, out_channel_block),
        lambda b, oco, oh, ow, oci: tvm.sum(
            (data_pad[b,
                      (oco * out_channel_block + oci) // channel_multiplier // in_channel_block,
                      oh*HSTR+kh, ow*WSTR+kw,
                      ((oco * out_channel_block + oci) // channel_multiplier) % in_channel_block]
             .astype(out_dtype) *
             kernel[oco, kh, kw, oci].astype(out_dtype)),
            axis=[kh, kw]),
        name='DepthwiseConv2d', tag="depthwise_conv2d_NCHWc")
    return Output


@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_NCHWc, 'cpu', ['direct'])
def schedule_depthwise_conv2d_NCHWc(cfg, outs):
    """CPU schedule for depthwise conv2d in NCHW[x]c layout"""
    s = tvm.create_schedule([x.op for x in outs])
    scheduled_ops = []
    def traverse(op):
        """Traverse operators from computation graph"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                    traverse(tensor.op)

        if 'depthwise_conv2d_NCHWc' in op.tag:
            conv_out = op.output(0)
            data = conv_out.op.input_tensors[0]
            kernel = conv_out.op.input_tensors[1]
            _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, outs[0])
        scheduled_ops.append(op)
    traverse(outs[0].op)
    return s


def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, output):
    tile_ow = cfg["tile_ow"].size[-1]
    # schedule data
    A = data
    if isinstance(s[A].op, tvm.tensor.ComputeOp):
        batch, ic_chunk, ih, iw, ic_block = s[A].op.axis
        p = s[A].fuse(ic_chunk, ih)
        s[A].parallel(p)

    C, O = conv_out, output
    CC = s.cache_write(C, 'global')

    _, ic_chunk, oh, ow, ic_block = s[C].op.axis
    ow_chunk, ow_block = s[C].split(ow, factor=tile_ow)
    s[C].reorder(ic_chunk, oh, ow_chunk, ow_block, ic_block)
    parallel_axis = s[C].fuse(ic_chunk, oh)
    s[C].parallel(parallel_axis)
    s[CC].compute_at(s[C], ow_chunk)

    _, ic_chunk, oh, ow, ic_block = s[CC].op.axis
    kh, kw = s[CC].op.reduce_axis
    ow_chunk, ow_block = s[CC].split(ow, factor=tile_ow)
    s[CC].reorder(ic_chunk, oh, kh, kw, ow_block, ic_block)
    s[CC].vectorize(ic_block)
    s[CC].unroll(ow_block)

    if C != O:
        batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
        ow_chunk, ow_block = s[O].split(ow, factor=tile_ow)
        s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
        parallel_axis = s[O].fuse(oc_chunk, oh)
        s[C].compute_at(s[O], parallel_axis)
        s[O].vectorize(oc_block)
        s[O].parallel(parallel_axis)
    return s


@autotvm.task.register("topi_x86_depthwise_conv2d_NCHWc_from_nchw")
def _topi_nn_depthwise_conv2d_NCHWc(*args, **kwargs):
    assert not kwargs, "Do not support kwargs in template function call"
    data, kernel, strides, padding, dilation, dtype = deserialize_args(args)

    batch, in_channel, height, width = get_const_tuple(data.shape)
    filter_channel, channel_multiplier, kh, kw = get_const_tuple(kernel.shape)
    ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding)
    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    out_height = (height - kh + 2 * ph) // sh + 1
    out_width = (width - kw + 2 * pw) // sw + 1
    out_channel = filter_channel * channel_multiplier

    # get config here
    cfg = get_config()
    cfg.define_split("tile_ic", in_channel, num_outputs=2)
    cfg.define_split("tile_oc", out_channel, num_outputs=2)
    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)

    # change shape with the value in config
    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
    new_data_shape = (batch, in_channel // ic_bn, height, width, ic_bn)
    new_kernel_shape = (out_channel // oc_bn, kh, kw, oc_bn)
    new_data = tvm.placeholder(new_data_shape, data.dtype)
    new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype)

    data_layout = "NCHW%dc" % ic_bn
    out_layout = "NCHW%dc" % oc_bn

    C = _depthwise_conv2d_NCHWc_cpu(cfg, new_data, new_kernel, strides, padding, dilation,
                                    data_layout, out_layout, dtype)
    s = schedule_depthwise_conv2d_NCHWc(cfg, [C])
    return s, [new_data, new_kernel, C]
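The index arithmetic inside the compute rule above maps each blocked output channel back to the input channel it depends on. A small pure-Python sketch of that mapping (the helper name and the example numbers are illustrative, not part of the commit):

def input_block_coords(oco, oci, out_channel_block, channel_multiplier, in_channel_block):
    # Flatten the blocked output channel, undo the channel multiplier,
    # then split the resulting input channel back into (chunk, block) coordinates.
    out_c = oco * out_channel_block + oci        # flat output channel
    in_c = out_c // channel_multiplier           # flat input channel it reads from
    return in_c // in_channel_block, in_c % in_channel_block

# e.g. output channels blocked by 8, channel_multiplier == 2, input channels blocked by 8
print(input_block_coords(oco=3, oci=5, out_channel_block=8,
                         channel_multiplier=2, in_channel_block=8))  # (1, 6)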
topi/python/topi/x86/util.py (new file, mode 100644)

"""Common x86 related utilities"""
from __future__ import absolute_import as _abs

import tvm


def get_fp32_len():
    fp32_vec_len = 8
    target = tvm.target.current_target()
    if target is not None:
        for opt in target.options:
            if opt == '-mcpu=skylake-avx512':
                fp32_vec_len = 16
    return fp32_vec_len
topi/tests/python/test_topi_conv2d_NCHWc.py

@@ -13,27 +13,22 @@ from common import get_all_backend
 def _transform_data(data, bn):
     # NCHW -> NCHW[x]c
     batch_size, channel, height, width = data.shape
-    data = np.transpose(data, (0, 2, 3, 1))
-    data = np.reshape(data, (batch_size, height, width, channel//bn, bn))
-    data = np.transpose(data, (0, 3, 1, 2, 4))
+    data = np.reshape(data, (batch_size, channel//bn, bn, height, width))
+    data = np.transpose(data, (0, 1, 3, 4, 2))
     return data

 def _transform_kernel(kernel, ic_bn, oc_bn):
     # OIHW -> OIHW[x]i[x]o
     out_channel, in_channel, kh, kw = kernel.shape
-    kernel = np.transpose(kernel, (1, 2, 3, 0))
-    kernel = np.reshape(kernel, (in_channel, kh, kw, out_channel//oc_bn, oc_bn))
-    kernel = np.transpose(kernel, (1, 2, 3, 4, 0))
-    kernel = np.reshape(kernel, (kh, kw, out_channel//oc_bn, oc_bn, in_channel//ic_bn, ic_bn))
-    kernel = np.transpose(kernel, (2, 4, 0, 1, 5, 3))
+    kernel = np.reshape(kernel, (out_channel//oc_bn, oc_bn, in_channel//ic_bn, ic_bn, kh, kw))
+    kernel = np.transpose(kernel, (0, 2, 4, 5, 3, 1))
     return kernel

 def _transform_bias(bias, bn):
     # [num_filter, 1, 1] -> [num_filter//bn, 1, 1, bn]
     num_filter, h, w = bias.shape
-    bias = np.transpose(bias, (1, 2, 0))
-    bias = np.reshape(bias, (h, w, num_filter//bn, bn))
-    bias = np.transpose(bias, (2, 0, 1, 3))
+    bias = np.reshape(bias, (num_filter//bn, bn, h, w))
+    bias = np.transpose(bias, (0, 2, 3, 1))
     return bias

 def verify_conv2d_NCHWc(batch, in_channel, in_size, num_filter, kernel, stride,

@@ -86,6 +81,7 @@ def verify_conv2d_NCHWc(batch, in_channel, in_size, num_filter, kernel, stride,
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), (padding, padding),
+                                     (dilation, dilation),
                                      layout='NCHW%dc' % ic_block,
                                      out_layout="NCHW%dc" % oc_block,
                                      out_dtype=dtype)

@@ -117,7 +113,7 @@ def verify_conv2d_NCHWc(batch, in_channel, in_size, num_filter, kernel, stride,
         check_device(device)

-if __name__ == "__main__":
+def test_conv2d_NCHWc():
     # ResNet18 workloads
     verify_conv2d_NCHWc(1, 3, 224, 64, 7, 2, 3)
     verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1)

@@ -204,3 +200,6 @@ if __name__ == "__main__":
     verify_conv2d_NCHWc(1, 2048, 10, 126, 3, 1, 1)
     verify_conv2d_NCHWc(1, 512, 5, 126, 3, 1, 1)
     verify_conv2d_NCHWc(1, 256, 3, 126, 3, 1, 1)
+
+if __name__ == "__main__":
+    test_conv2d_NCHWc()
\ No newline at end of file
topi/tests/python/test_topi_depthwise_conv2d.py

@@ -203,6 +203,115 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu
         with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
             check_device(device)

+
+def _transform_data(data, bn):
+    # NCHW -> NCHW[x]c
+    batch_size, channel, height, width = data.shape
+    data = np.reshape(data, (batch_size, channel//bn, bn, height, width))
+    data = np.transpose(data, (0, 1, 3, 4, 2))
+    return data
+
+def _transform_kernel(kernel, bn):
+    # channel, channel_multiplier, kh, kw -> out_channel_chunk, kh, kw, out_channel_block
+    channel, channel_multiplier, kh, kw = kernel.shape
+    out_channel = channel * channel_multiplier
+    kernel = np.reshape(kernel, (out_channel//bn, bn, kh, kw))
+    kernel = np.transpose(kernel, (0, 2, 3, 1))
+    return kernel
+
+def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_multiplier,
+                                         filter_height, stride, padding, dilation=1):
+    in_width = in_height
+    filter_channel = in_channel
+    filter_width = filter_height
+    stride_h = stride_w = stride
+
+    assert dilation == 1, "depthwise_conv2d_NCHWc currently does not support dilation."
+    pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width))
+    padding_args = (pad_h, pad_w)
+
+    out_channel = filter_channel * channel_multiplier
+    # for testing functionality,
+    # we choose arbitrary block size that can divide the channel,
+    # regardless of the performance.
+    oc_block = 1
+    for bn in range(16, 0, -1):
+        if out_channel % bn == 0:
+            oc_block = bn
+            break
+
+    ic_block = 1
+    for bn in range(oc_block, 0, -1):
+        if in_channel % bn == 0:
+            ic_block = bn
+            break
+
+    # placeholder
+    Input = tvm.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block),
+                            name='Input')
+    Filter = tvm.placeholder((out_channel//oc_block, filter_height, filter_width, oc_block),
+                             name='Filter')
+    in_layout = "NCHW%dc" % ic_block
+    out_layout = "NCHW%dc" % oc_block
+    dtype = 'float32'
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            # declare
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_NCHWc(Input, Filter,
+                                                             (stride_h, stride_w),
+                                                             padding_args,
+                                                             (dilation, dilation),
+                                                             in_layout,
+                                                             out_layout, dtype)
+            # TODO: add scale_shift implement for NCHWc and add test here
+            Relu = topi.nn.relu(DepthwiseConv2d)
+            # schedule
+            s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
+            s2 = topi.generic.schedule_depthwise_conv2d_nchw(Relu)
+        # build the kernels
+        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
+        f2 = tvm.build(s2, [Input, Filter, Relu], device)
+
+        # Prepare pod type for test data closure
+        input_shape = (batch, in_channel, in_height, in_width)
+        filter_shape = (filter_channel, channel_multiplier, filter_height, filter_width)
+
+        # Use memoize, pickle the test data for next time use.
+        @memoize("topi.tests.test_topi_depthwise_conv2d.NCHWc")
+        def get_ref_data():
+            input_np = np.random.uniform(size=input_shape).astype(dtype)
+            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
+            # correctness with scipy
+            depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
+                input_np, filter_np, stride, padding)
+            relu_scipy = np.maximum(depthwise_conv2d_scipy, 0)
+            return (_transform_data(input_np, ic_block),
+                    _transform_kernel(filter_np, oc_block),
+                    _transform_data(depthwise_conv2d_scipy, oc_block),
+                    _transform_data(relu_scipy, oc_block))
+
+        # Get the test data
+        (input_np, filter_np, depthwise_conv2d_scipy, relu_scipy) = get_ref_data()
+
+        input_tvm = tvm.nd.array(input_np, ctx)
+        filter_tvm = tvm.nd.array(filter_np, ctx)
+        depthwise_conv2d_tvm = tvm.nd.array(
+            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx)
+        relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
+        # launch kernel 1 (depthwise_conv2d)
+        f1(input_tvm, filter_tvm, depthwise_conv2d_tvm)
+        # launch kernel 2 (depthwise_conv2d + relu)
+        f2(input_tvm, filter_tvm, relu_tvm)
+        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+
+    # test llvm only for now since depthwise_conv2d_NCHWc implement is missing in other backend.
+    for device in ["llvm"]:
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
+            check_device(device)
+
+
 def test_depthwise_conv2d():
     # mobilenet workloads

@@ -233,5 +342,12 @@ def test_depthwise_conv2d():
     # disabled because it uses too large shared memory on cuda
     # depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2)

+    # NCHW[x]c
+    depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME")
+    depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "SAME")
+    depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID")
+    depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "VALID")
+
 if __name__ == "__main__":
     test_depthwise_conv2d()
tutorials/autotvm/tune_nnvm_x86.py

@@ -117,7 +117,15 @@ def tune_kernels(tasks,
         prefix = "[Task %2d/%2d] " % (i+1, len(tasks))

         # converting conv2d tasks to conv2d_NCHWc tasks
-        task = autotvm.task.create("topi_x86_conv2d_NCHWc", args=tsk.args,
+        op_name = tsk.workload[0]
+        if op_name == 'conv2d':
+            func_create = 'topi_x86_conv2d_NCHWc'
+        elif op_name == 'depthwise_conv2d_nchw':
+            func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
+        else:
+            raise ValueError("Tuning {} is not supported on x86".format(op_name))
+
+        task = autotvm.task.create(func_create, args=tsk.args,
                                    target=target, template_key='direct')
         task.workload = tsk.workload