wenyuanbo / tic / Commits / 06f91dd2

Commit 06f91dd2, authored Oct 02, 2018 by Wuwei Lin, committed by Tianqi Chen on Oct 01, 2018.
[TOPI] Add conv2d int8 template (#1735)

Parent: 46363d0a
Showing 9 changed files with 602 additions and 8 deletions:

- nnvm/python/nnvm/top/nn.py (+12, -3)
- nnvm/src/top/nn/convolution.cc (+0, -1)
- topi/python/topi/cuda/conv2d.py (+13, -4)
- topi/python/topi/cuda/conv2d_direct.py (+5, -0)
- topi/python/topi/cuda/conv2d_int8.py (+336, -0)
- topi/python/topi/cuda/conv2d_winograd.py (+7, -0)
- topi/python/topi/generic/nn.py (+18, -0)
- topi/python/topi/nn/conv2d.py (+34, -0)
- topi/tests/python/test_topi_conv2d_int8.py (+177, -0)
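The template added by this commit packs channels in blocks of four (NCHW4c data, OIHW4o4i kernels) so that the innermost reduction over four int8 products can be tensorized with the dp4a intrinsic, which accumulates into int32. A minimal NumPy sketch of that 4-wide accumulation (illustrative only, not code from the diff):

```python
import numpy as np

# dp4a-style step: multiply four int8 pairs and accumulate into an int32 result.
def dp4a_ref(a4, b4, acc):
    return acc + int(np.dot(a4.astype(np.int32), b4.astype(np.int32)))

a = np.array([1, -2, 3, -4], dtype=np.int8)
b = np.array([5, 6, -7, 8], dtype=np.int8)
print(dp4a_ref(a, b, 0))  # -60
```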
nnvm/python/nnvm/top/nn.py
```diff
@@ -90,10 +90,12 @@ def compute_conv2d(attrs, inputs, _):
     kernel_layout = attrs["kernel_layout"]
     out_dtype = attrs["out_dtype"]
     out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
-    assert layout == "NCHW" or layout == "NHWC"
+    assert layout in ["NCHW", "NHWC", "NCHW4c"]
     (dilation_h, dilation_w) = dilation
     if dilation_h < 1 or dilation_w < 1:
         raise ValueError("dilation should be positive value")
+    elif layout == "NCHW4c" and (dilation_h > 1 or dilation_w > 1):
+        raise ValueError("not support dilate now")
     elif dilation == (1, 1):
         kernel = inputs[1]
     elif layout == "NCHW":
```
```diff
@@ -101,7 +103,12 @@ def compute_conv2d(attrs, inputs, _):
     else: #layout == NHWC
         kernel = topi.nn.dilate(inputs[1], [1, dilation_h, dilation_w, 1])

-    if groups == 1:
+    if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
+        # pylint: disable=assignment-from-no-return
+        out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], kernel, strides, padding, layout, out_dtype=out_dtype)
+        # pylint: enable=assignment-from-no-return
+    elif groups == 1:
         out = topi.nn.conv2d(inputs[0], kernel, strides, padding, layout, out_dtype=out_dtype)
     elif layout == "NCHW" and \
```
```diff
@@ -120,7 +127,7 @@ def compute_conv2d(attrs, inputs, _):
     if attrs.get_bool("use_bias"):
         bias = inputs[2]
-        expand_axis = 1 if layout == "NCHW" else 0
+        expand_axis = 1 if layout in ["NCHW", "NCHW4c"] else 0
         bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
         out = topi.add(out, bias)
     return out
```
```diff
@@ -136,6 +143,8 @@ def schedule_conv2d(attrs, outs, target):
     with tvm.target.create(target):
         if groups == 1 and layout == "NCHW":
             return topi.generic.schedule_conv2d_nchw(outs)
+        elif groups == 1 and layout == "NCHW4c":
+            return topi.generic.schedule_conv2d_NCHWc_int8_prepacked(outs)
         elif groups == 1 and layout == "NHWC":
             return topi.generic.schedule_conv2d_nhwc(outs)
         elif groups == channels and layout == "NCHW":
```
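For reference, the resulting dispatch can be summarized as below. This is a paraphrase of the branches above, not code from the commit; the helper name `pick_conv2d_impl` is made up for illustration:

```python
def pick_conv2d_impl(groups, layout, in_dtype):
    """Paraphrase of the compute_conv2d / schedule_conv2d branches above."""
    if groups == 1 and layout == 'NCHW4c' and in_dtype == 'int8':
        # new in this commit: pre-packed int8 path
        return ('topi.nn.conv2d_NCHWc_int8_prepacked',
                'topi.generic.schedule_conv2d_NCHWc_int8_prepacked')
    if groups == 1 and layout == 'NCHW':
        return ('topi.nn.conv2d', 'topi.generic.schedule_conv2d_nchw')
    if groups == 1 and layout == 'NHWC':
        return ('topi.nn.conv2d', 'topi.generic.schedule_conv2d_nhwc')
    return ('depthwise/grouped conv2d path', 'depthwise/grouped schedule')

print(pick_conv2d_impl(1, 'NCHW4c', 'int8'))
```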
nnvm/src/top/nn/convolution.cc
```cpp
@@ -344,7 +344,6 @@ NNVM_REGISTER_OP(_contrib_conv2d_NCHWc)
.set_num_inputs(UseBiasNumInputs<Conv2DParam>)
.set_support_level(2);

NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform)
.describe(R"code(Weight transformation of winograd fast convolution algorithm.

Separate this into another nnvm symbol in order to enable Precompute Pass to compute the
```
topi/python/topi/cuda/conv2d.py
```diff
@@ -9,9 +9,10 @@ from ..util import get_const_int, get_const_tuple, traverse_inline
 from .conv2d_direct import schedule_direct_cuda
 from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda
+from .conv2d_int8 import conv2d_NCHWc_int8, schedule_conv2d_NCHWc_int8


-@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd'])
+@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8'])
 def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for cuda backend.
```
```diff
@@ -21,10 +22,13 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f
         The config for this template

     data : tvm.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
+        4-D with shape [batch, in_channel, in_height, in_width] or
+        5-D with shape [batch, ic_chunk, in_height, in_width, ic_block]

     kernel : tvm.Tensor
-        4-D with shape [num_filter, in_channel, filter_height, filter_width]
+        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
+        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
+                        filter_width, num_filter_block, in_channel_block]

     strides : int or a list/tuple of two ints
         stride size, or [stride_height, stride_width]
```
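The 5-D and 6-D shapes in this docstring come from blocking both channel dimensions by four. A NumPy sketch of the packed shapes (example sizes assumed, not taken from the diff):

```python
import numpy as np

N, C, H, W, K, KH, KW = 1, 64, 56, 56, 128, 3, 3
ic_block = oc_block = 4

data = np.zeros((N, C, H, W), dtype=np.int8)
kernel = np.zeros((K, C, KH, KW), dtype=np.int8)

# NCHW -> NCHW4c: [batch, ic_chunk, in_height, in_width, ic_block]
data_nchw4c = data.reshape(N, C // ic_block, ic_block, H, W).transpose(0, 1, 3, 4, 2)
# OIHW -> OIHW4o4i: [oc_chunk, ic_chunk, kh, kw, oc_block, ic_block]
kernel_oihw4o4i = kernel.reshape(K // oc_block, oc_block,
                                 C // ic_block, ic_block,
                                 KH, KW).transpose(0, 2, 4, 5, 1, 3)

print(data_nchw4c.shape)       # (1, 16, 56, 56, 4)
print(kernel_oihw4o4i.shape)   # (32, 16, 3, 3, 4, 4)
```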
```diff
@@ -98,6 +102,9 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f
     if cfg.template_key == 'winograd':
         return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype,
                              pre_computed=False)
+    if cfg.template_key == 'int8':
+        return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, layout, out_dtype,
+                                 pre_computed=False)

     if layout == 'NCHW':
         return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype)
```
```diff
@@ -108,7 +115,7 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f
 @autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"],
-                                ["direct", 'winograd'])
+                                ["direct", 'winograd', "int8"])
 def schedule_conv2d_nchw_cuda(cfg, outs):
     """TOPI schedule callback of conv2d for cuda gpu
```
```diff
@@ -138,6 +145,8 @@ def schedule_conv2d_nchw_cuda(cfg, outs):
             schedule_direct_cuda(cfg, s, op.output(0))
         if op.tag == 'conv2d_nchw_winograd':
             schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False)
+        if op.tag == "conv2d_NCHWc_int8":
+            schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=False)

     traverse_inline(s, outs[0].op, _callback)
     return s
```
topi/python/topi/cuda/conv2d_direct.py
```diff
@@ -2,6 +2,7 @@
 """The templates for cuda conv2d operators"""
 import tvm
 from tvm import autotvm
+from ..util import get_const_tuple


 def schedule_direct_cuda(cfg, s, conv):
     """schedule optimized for batch size = 1"""
```
```diff
@@ -94,3 +95,7 @@ def schedule_direct_cuda(cfg, s, conv):
     # unroll
     s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
     s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    N, CO, OH, OW = get_const_tuple(output.shape)
+    _, KH, KW, CI = get_const_tuple(kernel.shape)
+    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
```
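The added `cfg.add_flop` call lets AutoTVM report GFLOPS for the direct template. A worked example with assumed sizes (batch 1, 64 input and 64 output channels, 56x56 output, 3x3 kernel), not taken from the diff:

```python
# each multiply-accumulate is counted as 2 FLOPs: 2 * N * OH * OW * CO * CI * KH * KW
N, CO, OH, OW, CI, KH, KW = 1, 64, 56, 56, 64, 3, 3
print(2 * N * OH * OW * CO * CI * KH * KW)  # 231211008, i.e. ~0.23 GFLOP per call
```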
topi/python/topi/cuda/conv2d_int8.py (new file)
```python
# pylint: disable=invalid-name
"""Int8 conv2d in NCHWc layout"""
import tvm
from tvm import autotvm

from .injective import _schedule_injective
from ..generic import schedule_conv2d_NCHWc_int8_prepacked
from .tensor_intrin import dp4a
from ..nn.conv2d import conv2d_NCHWc_int8_prepacked
from ..nn.pad import pad
from ..nn.util import get_pad_tuple
from ..util import get_const_tuple, get_const_int, traverse_inline


def _conv2d_NCHWc_int8_arg_to_workload(data, kernel, stride, padding, out_dtype):
    """convert argument to workload"""
    shape = get_const_tuple(data.shape)
    if len(shape) == 5:
        N, ic_chunk, H, W, ic_block = shape
        raw_data = tvm.placeholder((N, ic_chunk * ic_block, H, W), dtype=data.dtype)
    else:
        raw_data = data

    shape = get_const_tuple(kernel.shape)
    if len(shape) == 6:
        oc_chunk, ic_chunk, KH, KW, oc_block, ic_block = shape
        raw_kernel = tvm.placeholder((oc_chunk * oc_block, ic_chunk * ic_block, KH, KW),
                                     dtype=kernel.dtype)
    else:
        raw_kernel = kernel

    return ('conv2d', ) + autotvm.task.task.args_to_workload(
        [raw_data, raw_kernel, stride, padding, "NCHW", out_dtype])


def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre_computed):
    """Convolution operator in NCHW[x]c layout for int8.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template

    data : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width] or
        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]

    kernel : tvm.Tensor
        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
        filter_width, num_filter_block, in_channel_block]

    stride : int or a list/tuple of two ints
        stride size, or [stride_height, stride_width]

    padding: int or a list/tuple of two ints
        padding size, or [pad_height, pad_width]

    layout : str
        layout of data

    out_dtype : str
        The output type. This is used for mixed precision.

    pre_computed : str
        Whether packed data and kernel are pre-computed

    Returns
    -------
    output : tvm.Tensor
        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
    """
    assert layout in ["NCHW", "NCHW4c"]
    ic_block_factor = 4
    oc_block_factor = 4

    if not pre_computed:
        batch, channels, height, width = get_const_tuple(data.shape)
        assert channels % ic_block_factor == 0, \
            "Number of input channels should be multiple of {}".format(ic_block_factor)
        packed_data = tvm.compute((batch, channels // ic_block_factor, height, width,
                                   ic_block_factor),
                                  lambda n, c, h, w, vc: data[n, c * ic_block_factor + vc, h, w],
                                  name="packed_data")

        out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)
        assert out_channels % 4 == 0, \
            "Number of output channels should be multiple of {}".format(oc_block_factor)
        packed_kernel = tvm.compute(
            (out_channels // oc_block_factor, in_channels // ic_block_factor, kernel_h, kernel_w,
             oc_block_factor, ic_block_factor),
            lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block:
            kernel[oc_chunk * oc_block_factor + oc_block,
                   ic_chunk * ic_block_factor + ic_block, kh, kw],
            name="packed_kernel")
    else:
        packed_data = data
        packed_kernel = kernel

    batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple(packed_data.shape)
    oc_chunk, ic_chunk, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
        packed_kernel.shape)

    if isinstance(stride, int):
        stride_h, stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w))

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")

    # compute the output shape
    out_height = (in_height - kernel_h + pad_top + pad_down) // stride_h + 1
    out_width = (in_width - kernel_w + pad_left + pad_right) // stride_w + 1

    oshape = (batch, oc_chunk, out_height, out_width, oc_block)

    icc = tvm.reduce_axis((0, ic_chunk), name='ic_chunk')
    icb = tvm.reduce_axis((0, ic_block), name='ic_block')
    kh = tvm.reduce_axis((0, kernel_h), name='kh')
    kw = tvm.reduce_axis((0, kernel_w), name='kw')

    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                       tvm.sum(pad_data[n, icc, oh * stride_h + kh, ow * stride_w + kw, icb]
                               .astype('int32') *
                               packed_kernel[oc_chunk, icc, kh, kw, oc_block, icb]
                               .astype('int32'),
                               axis=[icc, kh, kw, icb]))

    output = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                         conv[n, oc_chunk, oh, ow, oc_block].astype(out_dtype),
                         tag="conv2d_NCHWc_int8",
                         attrs={"workload": _conv2d_NCHWc_int8_arg_to_workload(
                             data, kernel, stride, padding, out_dtype)})

    # num flop
    num_flop = batch * oc_chunk * oc_block * out_height * out_width * \
        ic_chunk * ic_block * kernel_h * kernel_w * 2
    cfg.add_flop(num_flop)

    return output


_dp4a = dp4a('shared', 'shared', 'local')


def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed):
    """Schedule conv2d int8 NCHWc template"""
    workload = output.op.attrs["workload"]
    stride = workload[3]

    conv = output.op.input_tensors[0]
    packed_data, packed_kernel = conv.op.input_tensors

    if isinstance(packed_data.op, tvm.tensor.ComputeOp) and "pad" in packed_data.op.tag:
        pad_data = packed_data
        packed_data = pad_data.op.input_tensors[0]
    else:
        pad_data = packed_data

    if not pre_computed:
        kernel, = packed_kernel.op.input_tensors
        if autotvm.GLOBAL_SCOPE.in_tuning:
            # skip this part during tuning to make recrods accurate
            # this part will be pre-computed during NNVM's pre-compute optimization pass
            s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region")
            s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region")
        else:
            _schedule_injective(packed_data.op, s)
            _schedule_injective(packed_kernel.op, s)
    else:
        kernel = packed_data

    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
        s[kernel].compute_inline()

    if pad_data != packed_data:
        s[pad_data].compute_inline()

    batch = get_const_int(packed_data.shape[0])

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    # create cache stage
    AA = s.cache_read(pad_data, 'shared', [conv])
    WW = s.cache_read(packed_kernel, 'shared', [conv])

    s[conv].set_scope('local')

    # handle bias
    if output.op not in s.outputs:
        s[output].compute_inline()
        output = s.outputs[0].output(0)

    # tile and bind spatial axes
    n, f, y, x, c = s[output].op.axis
    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)

    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)

    # this is the scope to attach global config inside this kernel
    kernel_scope, n = s[output].split(n, nparts=1)

    max_block_z = 128
    if batch > max_block_z:
        _, n = s[output].split(n, factor=max_block_z)

    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    fused_byx = s[output].fuse(by, bx)
    s[output].bind(n, tvm.thread_axis("blockIdx.z"))
    s[output].bind(bf, tvm.thread_axis("blockIdx.y"))
    s[output].bind(fused_byx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))

    s[conv].compute_at(s[output], tx)

    # tile and bind reduction axes
    n, f, y, x, c = s[conv].op.axis
    rc, ry, rx, rc_block = s[conv].op.reduce_axis
    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2)
    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2)
    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2)
    rco, rci = cfg['tile_rc'].apply(s, conv, rc)
    ryo, ryi = cfg['tile_ry'].apply(s, conv, ry)
    rxo, rxi = cfg['tile_rx'].apply(s, conv, rx)

    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block)
    _, rc_block = s[conv].split(rc_block, factor=4)
    s[conv].tensorize(rc_block, _dp4a)

    s[AA].compute_at(s[conv], rxo)
    s[WW].compute_at(s[conv], rxo)

    # cooperative fetching
    for load in [AA, WW]:
        if load == AA:
            n, f, y, x, c = s[load].op.axis
            if pad_data == packed_data and stride_h == 1 and stride_w == 1:
                s[load].vectorize(c)
                fused = s[load].fuse(n, f, y, x)
            else:
                c, _ = s[load].split(c, factor=4)
                fused = s[load].fuse(n, f, y, x, c)
        else:
            n, f, y, x, oc_chunk, c = s[load].op.axis
            fused = s[load].fuse(n, f, y, x, oc_chunk)
            s[load].vectorize(c)

        fused, tx = s[load].split(fused, factor=cfg["tile_x"].size[2])
        fused, ty = s[load].split(fused, factor=cfg["tile_y"].size[2])
        fused, tz = s[load].split(fused, factor=cfg["tile_f"].size[2])
        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

    # double buffer
    cfg.define_knob('AA_double_buffer', [0, 1])
    cfg.define_knob('WW_double_buffer', [0, 1])
    if cfg['AA_double_buffer'].val:
        s[AA].double_buffer()
    if cfg['WW_double_buffer'].val:
        s[WW].double_buffer()

    # unroll
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', False)

    return s


@conv2d_NCHWc_int8_prepacked.register(["cuda"])
@autotvm.task.dispatcher
def conv2d_NCHWc_int8_prepacked_dispatcher(data, kernel, stride, padding, layout, out_dtype):
    assert layout == 'NCHW4c'
    return _conv2d_NCHWc_int8_arg_to_workload(data, kernel, stride, padding, out_dtype)


@conv2d_NCHWc_int8_prepacked_dispatcher.register("int8")
def _decl_conv2d_NCHWc_int8_prepacked(cfg, data, kernel, stride, padding, layout, out_dtype):
    return conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype,
                             pre_computed=True)


@autotvm.register_topi_schedule(schedule_conv2d_NCHWc_int8_prepacked, ["cuda"], ["int8"])
def schedule_conv2d_NCHWc_int8_prepacked_cuda(cfg, outs):
    """TOPI schedule callback of conv2d for cuda

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template

    outs: Array of Tensor
        The computation graph description of conv2d
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for conv2d.
    """
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if 'conv2d_NCHWc_int8' in op.tag:
            schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=True)

    traverse_inline(s, outs[0].op, _callback)
    return s
```
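To see what the packed compute rule above does, the following NumPy check (illustrative, not part of the commit; stride 1 and no padding assumed) confirms that reducing over (ic_chunk, kh, kw, ic_block) in NCHW4c/OIHW4o4i layout reproduces a plain NCHW int8 convolution accumulated in int32:

```python
import numpy as np

np.random.seed(0)
N, C, H, W, K, KH, KW = 1, 8, 5, 5, 8, 3, 3
blk = 4  # ic_block_factor = oc_block_factor = 4

data = np.random.randint(-128, 128, size=(N, C, H, W)).astype(np.int8)
kernel = np.random.randint(-128, 128, size=(K, C, KH, KW)).astype(np.int8)

# pack, mirroring packed_data / packed_kernel in conv2d_NCHWc_int8
pdata = data.reshape(N, C // blk, blk, H, W).transpose(0, 1, 3, 4, 2)
pkernel = kernel.reshape(K // blk, blk, C // blk, blk, KH, KW).transpose(0, 2, 4, 5, 1, 3)

OH, OW = H - KH + 1, W - KW + 1
out = np.empty((N, K // blk, OH, OW, blk), dtype=np.int32)
for n in range(N):
    for occ in range(K // blk):
        for oh in range(OH):
            for ow in range(OW):
                for ocb in range(blk):
                    # reduction over ic_chunk, kh, kw, ic_block, as in the tvm.sum above
                    out[n, occ, oh, ow, ocb] = np.sum(
                        pdata[n, :, oh:oh + KH, ow:ow + KW, :].astype(np.int32) *
                        pkernel[occ, :, :, :, ocb, :].astype(np.int32))

# plain NCHW reference, then repacked to NCHW4c (same conversion the new test uses)
ref = np.empty((N, K, OH, OW), dtype=np.int32)
for oh in range(OH):
    for ow in range(OW):
        patch = data[:, :, oh:oh + KH, ow:ow + KW].astype(np.int32)
        ref[:, :, oh, ow] = np.tensordot(patch, kernel.astype(np.int32),
                                         axes=([1, 2, 3], [1, 2, 3]))
ref = ref.reshape(N, K // blk, blk, OH, OW).transpose(0, 1, 3, 4, 2)

assert np.array_equal(out, ref)
print("NCHW4c compute rule matches plain NCHW conv")
```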
topi/python/topi/cuda/conv2d_winograd.py
```diff
@@ -375,6 +375,13 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
     if cfg.template_key == 'direct':
         return None

+    if cfg.template_key == 'int8':
+        assert 'cuda' in tvm.target.current_target().keys
+        new_attrs['layout'] = 'NCHW4c'
+        new_attrs['out_layout'] = 'NCHW4c'
+        new_attrs['kernel_layout'] = 'OIHW4o4i'
+        return sym.conv2d(*copy_inputs, **new_attrs)
+
     # pre-compute weight transformation in winograd
     tile_size = _infer_tile_size(tinfos[0], tinfos[1])
```
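When the int8 template is selected, this pass rewrites the conv2d attributes so that NNVM's precompute pass can pack data and kernel ahead of time. A minimal sketch of that attribute rewrite (the plain dict below stands in for the real NNVM attribute object):

```python
attrs = {'layout': 'NCHW', 'kernel_layout': 'OIHW', 'out_dtype': 'int32'}

new_attrs = dict(attrs)
new_attrs['layout'] = 'NCHW4c'           # data becomes 5-D, channels blocked by 4
new_attrs['out_layout'] = 'NCHW4c'       # output stays in the packed layout
new_attrs['kernel_layout'] = 'OIHW4o4i'  # kernel becomes 6-D, both channel dims blocked by 4
print(new_attrs)
```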
topi/python/topi/generic/nn.py
```diff
@@ -140,6 +140,24 @@ def schedule_conv2d_winograd_without_weight_transform(outs):
+@tvm.target.generic_func
+def schedule_conv2d_NCHWc_int8_prepacked(outs):
+    """Schedule for conv2d NCHWc int8 with prepacked data and kernel
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of this operator
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
 @tvm.target.generic_func
 def schedule_conv2d_transpose_nchw(outs):
     """Schedule for conv2d_transpose_nchw
```
topi/python/topi/nn/conv2d.py
```diff
@@ -423,3 +423,37 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding,
         4-D with shape [batch, out_height, out_width, out_channel]
     """
     raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform")
+
+
+@tvm.target.generic_func
+def conv2d_NCHWc_int8_prepacked(data, kernel, stride, padding, layout, out_dtype):
+    """Convolution operator in NCHW[x]c layout for int8. Data and kernel should be packed in
+    advance.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
+
+    kernel : tvm.Tensor
+        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
+        filter_width, num_filter_block, in_channel_block]
+
+    stride : int or a list/tuple of two ints
+        stride size, or [stride_height, stride_width]
+
+    padding: int or a list/tuple of two ints
+        padding size, or [pad_height, pad_width]
+
+    layout : str
+        layout of data
+
+    out_dtype: str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    output : tvm.Tensor
+        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
+    """
+    raise ValueError("missing register for topi.nn.conv2d_NCHWc_int8_prepacked")
```
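conv2d_NCHWc_int8_prepacked is declared as a tvm.target.generic_func, so the CUDA registration in conv2d_int8.py overrides this fallback whenever a cuda target is active. A minimal sketch of that dispatch pattern (toy names, assuming the TVM 0.4-era API used throughout this diff):

```python
import tvm

@tvm.target.generic_func
def my_conv_like_op(x):
    # generic fallback, analogous to the "missing register" ValueError above
    raise ValueError("missing register for my_conv_like_op")

@my_conv_like_op.register(["cuda"])
def _my_conv_like_op_cuda(x):
    return "cuda implementation"

with tvm.target.create("cuda"):
    print(my_conv_like_op(None))  # dispatches to the cuda registration
```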
topi/tests/python/test_topi_conv2d_int8.py (new file)
"""Example code to do convolution."""
import
numpy
as
np
import
tvm
from
tvm
import
autotvm
from
tvm.autotvm.task.space
import
FallbackConfigEntity
import
topi
import
topi.testing
from
tvm.contrib.pickle_memoize
import
memoize
from
topi.util
import
get_const_tuple
from
common
import
get_all_backend
oc_block_factor
=
4
def
verify_conv2d_NCHWc_int8
(
batch
,
in_channel
,
in_size
,
num_filter
,
kernel
,
stride
,
padding
,
dilation
=
1
,
add_bias
=
False
,
add_relu
=
False
):
print
(
"Workload: (
%
d,
%
d,
%
d,
%
d,
%
d,
%
d,
%
d)"
%
(
batch
,
in_channel
,
in_size
,
num_filter
,
kernel
,
stride
,
padding
))
in_height
=
in_width
=
in_size
A
=
tvm
.
placeholder
((
batch
,
in_channel
,
in_height
,
in_width
),
name
=
'A'
,
dtype
=
'int8'
)
W
=
tvm
.
placeholder
((
num_filter
,
in_channel
,
kernel
,
kernel
),
name
=
'W'
,
dtype
=
'int8'
)
bias
=
tvm
.
placeholder
((
num_filter
//
oc_block_factor
,
1
,
1
,
oc_block_factor
),
name
=
'bias'
,
dtype
=
'int8'
)
a_shape
=
get_const_tuple
(
A
.
shape
)
w_shape
=
get_const_tuple
(
W
.
shape
)
bias_shape
=
get_const_tuple
(
bias
.
shape
)
dtype
=
A
.
dtype
@memoize
(
"topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw"
)
def
get_ref_data
():
a_np
=
np
.
random
.
randint
(
low
=-
128
,
high
=
127
,
size
=
a_shape
)
.
astype
(
dtype
)
w_np
=
np
.
random
.
randint
(
low
=-
128
,
high
=
128
,
size
=
w_shape
)
.
astype
(
dtype
)
b_np
=
np
.
random
.
uniform
(
size
=
bias_shape
)
.
astype
(
dtype
)
dw_np
=
topi
.
testing
.
dilate_python
(
w_np
,
(
1
,
1
,
dilation
,
dilation
))
c_np
=
topi
.
testing
.
conv2d_nchw_python
(
a_np
,
dw_np
,
stride
,
padding
)
.
astype
(
dtype
)
# convert to NCHWc
_
,
_
,
out_height
,
out_width
=
c_np
.
shape
c_np
=
c_np
.
reshape
((
batch
,
num_filter
//
oc_block_factor
,
oc_block_factor
,
\
out_height
,
out_width
))
.
transpose
(
0
,
1
,
3
,
4
,
2
)
if
add_bias
:
b_np
=
np
.
random
.
uniform
(
size
=
bias_shape
)
.
astype
(
dtype
)
c_np
+=
b_np
if
add_relu
:
c_np
=
np
.
maximum
(
c_np
,
0
)
return
a_np
,
w_np
,
b_np
,
c_np
a_np
,
w_np
,
b_np
,
c_np
=
get_ref_data
()
def
check_device
(
device
):
ctx
=
tvm
.
context
(
device
,
0
)
if
not
ctx
.
exist
:
print
(
"Skip because
%
s is not enabled"
%
device
)
return
if
device
==
"cuda"
and
not
tvm
.
contrib
.
nvcc
.
have_int8
(
ctx
.
compute_version
):
print
(
"Skip because int8 intrinsics are not available"
)
return
print
(
"Running on target:
%
s"
%
device
)
with
tvm
.
target
.
create
(
device
):
dW
=
topi
.
nn
.
dilate
(
W
,
(
1
,
1
,
dilation
,
dilation
))
C
=
topi
.
nn
.
conv2d
(
A
,
dW
,
(
stride
,
stride
),
(
padding
,
padding
),
layout
=
'NCHW'
,
out_dtype
=
dtype
)
if
add_bias
:
C
=
topi
.
add
(
C
,
bias
)
if
add_relu
:
C
=
topi
.
nn
.
relu
(
C
)
s
=
topi
.
generic
.
schedule_conv2d_nchw
([
C
])
a
=
tvm
.
nd
.
array
(
a_np
,
ctx
)
w
=
tvm
.
nd
.
array
(
w_np
,
ctx
)
b
=
tvm
.
nd
.
array
(
b_np
,
ctx
)
c
=
tvm
.
nd
.
array
(
np
.
zeros
(
get_const_tuple
(
C
.
shape
),
dtype
=
C
.
dtype
),
ctx
)
if
add_bias
:
tvm
.
build
(
s
,
[
A
,
W
,
bias
,
C
],
device
,
name
=
"relu_
%
d_
%
d_
%
d_
%
d_
%
d_
%
d_
%
d_
%
d"
%
(
batch
,
in_channel
,
in_size
,
num_filter
,
kernel
,
stride
,
padding
,
dilation
))
func
=
tvm
.
build
(
s
,
[
A
,
W
,
bias
,
C
],
device
,
name
=
"relu_
%
d_
%
d_
%
d_
%
d_
%
d_
%
d_
%
d_
%
d"
%
(
batch
,
in_channel
,
in_size
,
num_filter
,
kernel
,
stride
,
padding
,
dilation
))
func
(
a
,
w
,
b
,
c
)
else
:
func
=
tvm
.
build
(
s
,
[
A
,
W
,
C
],
device
,
name
=
"relu_
%
d_
%
d_
%
d_
%
d_
%
d_
%
d_
%
d_
%
d"
%
(
batch
,
in_channel
,
in_size
,
num_filter
,
kernel
,
stride
,
padding
,
dilation
))
func
(
a
,
w
,
c
)
np
.
testing
.
assert_allclose
(
c
.
asnumpy
(),
c_np
,
rtol
=
1e-5
)
for
device
in
[
"cuda"
]:
check_device
(
device
)
class
NCHWcInt8Fallback
(
autotvm
.
FallbackContext
):
def
_query_inside
(
self
,
target
,
workload
):
key
=
(
target
,
workload
)
if
key
in
self
.
memory
:
return
self
.
memory
[
key
]
cfg
=
FallbackConfigEntity
()
cfg
.
template_key
=
'int8'
self
.
memory
[
key
]
=
cfg
return
cfg
def
test_conv2d_nchw
():
with
NCHWcInt8Fallback
():
# ResNet18 workloads where channels in / out are multiple of oc_block_factor
verify_conv2d_NCHWc_int8
(
1
,
64
,
56
,
64
,
3
,
1
,
1
)
verify_conv2d_NCHWc_int8
(
1
,
64
,
56
,
64
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
64
,
56
,
128
,
3
,
2
,
1
)
verify_conv2d_NCHWc_int8
(
1
,
64
,
56
,
128
,
1
,
2
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
128
,
28
,
128
,
3
,
1
,
1
)
verify_conv2d_NCHWc_int8
(
1
,
128
,
28
,
256
,
3
,
2
,
1
)
verify_conv2d_NCHWc_int8
(
1
,
128
,
28
,
256
,
1
,
2
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
256
,
14
,
256
,
3
,
1
,
1
)
verify_conv2d_NCHWc_int8
(
1
,
256
,
14
,
512
,
3
,
2
,
1
)
verify_conv2d_NCHWc_int8
(
1
,
256
,
14
,
512
,
1
,
2
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
512
,
7
,
512
,
3
,
1
,
1
)
# bias, relu
verify_conv2d_NCHWc_int8
(
1
,
64
,
56
,
64
,
3
,
1
,
1
,
add_relu
=
True
)
verify_conv2d_NCHWc_int8
(
1
,
64
,
56
,
64
,
3
,
1
,
1
,
add_bias
=
True
)
verify_conv2d_NCHWc_int8
(
1
,
64
,
56
,
64
,
3
,
1
,
1
,
add_bias
=
True
,
add_relu
=
True
)
# batch size
verify_conv2d_NCHWc_int8
(
4
,
64
,
56
,
64
,
3
,
1
,
1
)
verify_conv2d_NCHWc_int8
(
9
,
64
,
56
,
64
,
3
,
1
,
1
)
# weird workloads
verify_conv2d_NCHWc_int8
(
4
,
4
,
4
,
4
,
4
,
4
,
4
)
# inception v3 workloads where channels in / out are multiple of oc_block_factor
verify_conv2d_NCHWc_int8
(
1
,
32
,
149
,
32
,
3
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
32
,
147
,
64
,
3
,
1
,
1
)
verify_conv2d_NCHWc_int8
(
1
,
64
,
73
,
80
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
80
,
73
,
192
,
3
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
192
,
35
,
64
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
192
,
35
,
48
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
48
,
35
,
64
,
5
,
1
,
2
)
verify_conv2d_NCHWc_int8
(
1
,
64
,
35
,
96
,
3
,
1
,
1
)
verify_conv2d_NCHWc_int8
(
1
,
96
,
35
,
96
,
3
,
1
,
1
)
verify_conv2d_NCHWc_int8
(
1
,
192
,
35
,
32
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
256
,
35
,
64
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
256
,
35
,
48
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
288
,
35
,
64
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
288
,
35
,
48
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
288
,
35
,
384
,
3
,
2
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
96
,
35
,
96
,
3
,
2
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
768
,
17
,
192
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
768
,
17
,
128
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
128
,
17
,
128
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
128
,
17
,
192
,
7
,
1
,
3
)
verify_conv2d_NCHWc_int8
(
1
,
128
,
17
,
128
,
7
,
1
,
3
)
verify_conv2d_NCHWc_int8
(
1
,
128
,
17
,
192
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
768
,
17
,
160
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
160
,
17
,
160
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
160
,
17
,
192
,
7
,
1
,
3
)
verify_conv2d_NCHWc_int8
(
1
,
160
,
17
,
160
,
7
,
1
,
3
)
verify_conv2d_NCHWc_int8
(
1
,
160
,
17
,
192
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
192
,
17
,
192
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
192
,
17
,
192
,
7
,
1
,
3
)
verify_conv2d_NCHWc_int8
(
1
,
192
,
17
,
320
,
3
,
2
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
192
,
17
,
192
,
3
,
2
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
1280
,
8
,
320
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
1280
,
8
,
384
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
384
,
8
,
384
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
384
,
8
,
384
,
3
,
1
,
1
)
verify_conv2d_NCHWc_int8
(
1
,
1280
,
8
,
448
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
448
,
8
,
384
,
3
,
1
,
1
)
verify_conv2d_NCHWc_int8
(
1
,
1280
,
8
,
192
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
2048
,
8
,
320
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
2048
,
8
,
384
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
2048
,
8
,
448
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
2048
,
8
,
192
,
1
,
1
,
0
)
verify_conv2d_NCHWc_int8
(
1
,
1024
,
19
,
84
,
3
,
1
,
1
)
if
__name__
==
"__main__"
:
test_conv2d_nchw
()