Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
tic
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
wenyuanbo
tic
Commits
f98035b0
Unverified
Commit
f98035b0
authored
Sep 28, 2019
by
Tianqi Chen
Committed by
GitHub
Sep 28, 2019
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[ARITH] cleanup the indexmod/div on python side (#4028)
parent
bbf82e0e
Hide whitespace changes
Inline
Side-by-side
Showing
24 changed files
with
188 additions
and
116 deletions
+188
-116
python/tvm/autotvm/task/task.py
+3
-1
python/tvm/expr.py
+10
-10
src/pass/rewrite_unsafe_select.cc
+4
-2
tests/python/relay/test_op_level3.py
+8
-6
tests/python/relay/test_op_level5.py
+2
-1
tests/python/unittest/test_autotvm_flop_calculator.py
+3
-3
tests/python/unittest/test_codegen_cuda.py
+3
-2
tests/python/unittest/test_ir_builder.py
+3
-2
tests/python/unittest/test_lang_buffer.py
+14
-14
tests/python/unittest/test_pass_rewrite_unsafe_select.py
+1
-1
tests/python/unittest/test_schedule_tensorize.py
+5
-4
topi/python/topi/arm_cpu/bitserial_conv2d.py
+11
-3
topi/python/topi/arm_cpu/conv2d.py
+23
-12
topi/python/topi/cuda/nms.py
+10
-5
topi/python/topi/cuda/rcnn/proposal.py
+9
-4
topi/python/topi/cuda/sort.py
+13
-8
topi/python/topi/cuda/ssd/multibox.py
+12
-6
topi/python/topi/nn/bitserial_conv2d.py
+10
-8
topi/python/topi/nn/sparse.py
+4
-1
topi/python/topi/util.py
+6
-3
topi/python/topi/vision/ssd/multibox.py
+2
-2
topi/python/topi/x86/conv2d_avx_1x1.py
+12
-3
tutorials/optimize/opt_gemm.py
+1
-1
vta/python/vta/ir_pass.py
+19
-14
No files found.
python/tvm/autotvm/task/task.py
View file @
f98035b0
...
@@ -350,7 +350,9 @@ def compute_flop(sch):
...
@@ -350,7 +350,9 @@ def compute_flop(sch):
return
_count_flop
(
exp
.
value
)
return
_count_flop
(
exp
.
value
)
if
isinstance
(
exp
,
expr
.
Var
):
if
isinstance
(
exp
,
expr
.
Var
):
return
0
return
0
if
isinstance
(
exp
,
(
expr
.
Add
,
expr
.
Sub
,
expr
.
Mul
,
expr
.
Div
,
expr
.
Mod
,
if
isinstance
(
exp
,
(
expr
.
Add
,
expr
.
Sub
,
expr
.
Mul
,
expr
.
Div
,
expr
.
Mod
,
expr
.
FloorDiv
,
expr
.
FloorMod
,
expr
.
Max
,
expr
.
Min
,
expr
.
Max
,
expr
.
Min
,
expr
.
EQ
,
expr
.
NE
,
expr
.
LT
,
expr
.
LE
,
expr
.
GT
,
expr
.
GE
,
expr
.
EQ
,
expr
.
NE
,
expr
.
LT
,
expr
.
LE
,
expr
.
GT
,
expr
.
GE
,
expr
.
And
,
expr
.
Or
,
expr
.
Not
)):
expr
.
And
,
expr
.
Or
,
expr
.
Not
)):
...
...
python/tvm/expr.py
View file @
f98035b0
...
@@ -72,23 +72,23 @@ class ExprOp(object):
...
@@ -72,23 +72,23 @@ class ExprOp(object):
return
_generic
.
multiply
(
other
,
self
)
return
_generic
.
multiply
(
other
,
self
)
def
__div__
(
self
,
other
):
def
__div__
(
self
,
other
):
#
if _dtype_is_int(self) and _dtype_is_int(other):
if
_dtype_is_int
(
self
)
and
_dtype_is_int
(
other
):
#
raise div_ambiguity_error()
raise
div_ambiguity_error
()
return
_generic
.
divide
(
self
,
other
)
return
_generic
.
divide
(
self
,
other
)
def
__rdiv__
(
self
,
other
):
def
__rdiv__
(
self
,
other
):
#
if _dtype_is_int(self) and _dtype_is_int(other):
if
_dtype_is_int
(
self
)
and
_dtype_is_int
(
other
):
#
raise div_ambiguity_error()
raise
div_ambiguity_error
()
return
_generic
.
divide
(
other
,
self
)
return
_generic
.
divide
(
other
,
self
)
def
__truediv__
(
self
,
other
):
def
__truediv__
(
self
,
other
):
#
if _dtype_is_int(self) and _dtype_is_int(other):
if
_dtype_is_int
(
self
)
and
_dtype_is_int
(
other
):
#
raise div_ambiguity_error()
raise
div_ambiguity_error
()
return
_generic
.
divide
(
self
,
other
)
return
_generic
.
divide
(
self
,
other
)
def
__rtruediv__
(
self
,
other
):
def
__rtruediv__
(
self
,
other
):
#
if _dtype_is_int(self) and _dtype_is_int(other):
if
_dtype_is_int
(
self
)
and
_dtype_is_int
(
other
):
#
raise div_ambiguity_error()
raise
div_ambiguity_error
()
return
_generic
.
divide
(
other
,
self
)
return
_generic
.
divide
(
other
,
self
)
def
__floordiv__
(
self
,
other
):
def
__floordiv__
(
self
,
other
):
...
@@ -100,8 +100,8 @@ class ExprOp(object):
...
@@ -100,8 +100,8 @@ class ExprOp(object):
return
_generic
.
divide
(
other
,
self
)
return
_generic
.
divide
(
other
,
self
)
def
__mod__
(
self
,
other
):
def
__mod__
(
self
,
other
):
#
raise div_ambiguity_error()
raise
div_ambiguity_error
()
return
_make
.
_OpMod
(
self
,
other
)
#
return _make._OpMod(self, other)
def
__neg__
(
self
):
def
__neg__
(
self
):
neg_one
=
_api_internal
.
_const
(
-
1
,
self
.
dtype
)
neg_one
=
_api_internal
.
_const
(
-
1
,
self
.
dtype
)
...
...
src/pass/rewrite_unsafe_select.cc
View file @
f98035b0
...
@@ -6,9 +6,9 @@
...
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* with the License. You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing,
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
...
@@ -64,6 +64,8 @@ class UnsafeExprDetector : public ExprFunctor<bool(const Expr& n)> {
...
@@ -64,6 +64,8 @@ class UnsafeExprDetector : public ExprFunctor<bool(const Expr& n)> {
bool
VisitExpr_
(
const
Mul
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
Mul
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
Div
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
Div
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
Mod
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
Mod
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
FloorDiv
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
FloorMod
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
Min
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
Min
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
Max
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
Max
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
EQ
*
op
)
final
{
return
BinaryOp
(
op
);
}
bool
VisitExpr_
(
const
EQ
*
op
)
final
{
return
BinaryOp
(
op
);
}
...
...
tests/python/relay/test_op_level3.py
View file @
f98035b0
...
@@ -373,6 +373,8 @@ def test_split_infer_type():
...
@@ -373,6 +373,8 @@ def test_split_infer_type():
yy
=
run_infer_type
(
y
.
astuple
())
yy
=
run_infer_type
(
y
.
astuple
())
assert
yy
.
checked_type
==
ret_type
assert
yy
.
checked_type
==
ret_type
idxd
=
tvm
.
indexdiv
d1
,
d2
,
d3
,
d4
=
tvm
.
var
(
"d1"
),
tvm
.
var
(
"d2"
),
tvm
.
var
(
"d3"
),
tvm
.
var
(
"d4"
)
d1
,
d2
,
d3
,
d4
=
tvm
.
var
(
"d1"
),
tvm
.
var
(
"d2"
),
tvm
.
var
(
"d3"
),
tvm
.
var
(
"d4"
)
axis
=
tvm
.
var
(
"axis"
)
axis
=
tvm
.
var
(
"axis"
)
verify_split
((
5
,
5
,
2
,
2
),
5
,
verify_split
((
5
,
5
,
2
,
2
),
5
,
...
@@ -393,15 +395,15 @@ def test_split_infer_type():
...
@@ -393,15 +395,15 @@ def test_split_infer_type():
axis
=
0
)
axis
=
0
)
verify_split
((
d1
,
d2
,
d3
,
d4
),
4
,
verify_split
((
d1
,
d2
,
d3
,
d4
),
4
,
relay
.
ty
.
TupleType
(
tvm
.
convert
([
relay
.
ty
.
TupleType
(
tvm
.
convert
([
relay
.
ty
.
TensorType
((
d1
,
d2
,
d3
/
4
,
d4
),
"float32"
),
relay
.
ty
.
TensorType
((
d1
,
d2
,
idxd
(
d3
,
4
)
,
d4
),
"float32"
),
relay
.
ty
.
TensorType
((
d1
,
d2
,
d3
/
4
,
d4
),
"float32"
),
relay
.
ty
.
TensorType
((
d1
,
d2
,
idxd
(
d3
,
4
)
,
d4
),
"float32"
),
relay
.
ty
.
TensorType
((
d1
,
d2
,
d3
/
4
,
d4
),
"float32"
),
relay
.
ty
.
TensorType
((
d1
,
d2
,
idxd
(
d3
,
4
)
,
d4
),
"float32"
),
relay
.
ty
.
TensorType
((
d1
,
d2
,
d3
/
4
,
d4
),
"float32"
)])),
relay
.
ty
.
TensorType
((
d1
,
d2
,
idxd
(
d3
,
4
)
,
d4
),
"float32"
)])),
axis
=
2
)
axis
=
2
)
verify_split
((
d1
,
d2
,
d3
,
d4
),
2
,
verify_split
((
d1
,
d2
,
d3
,
d4
),
2
,
relay
.
ty
.
TupleType
(
tvm
.
convert
([
relay
.
ty
.
TupleType
(
tvm
.
convert
([
relay
.
ty
.
TensorType
((
d1
/
2
,
d2
,
d3
,
d4
),
"float32"
),
relay
.
ty
.
TensorType
((
idxd
(
d1
,
2
)
,
d2
,
d3
,
d4
),
"float32"
),
relay
.
ty
.
TensorType
((
d1
/
2
,
d2
,
d3
,
d4
),
"float32"
)])),
relay
.
ty
.
TensorType
((
idxd
(
d1
,
2
)
,
d2
,
d3
,
d4
),
"float32"
)])),
axis
=
0
)
axis
=
0
)
verify_split
((
d1
,
d2
,
d3
,
d4
),
(
2
,
4
,
7
),
verify_split
((
d1
,
d2
,
d3
,
d4
),
(
2
,
4
,
7
),
relay
.
ty
.
TupleType
(
tvm
.
convert
([
relay
.
ty
.
TupleType
(
tvm
.
convert
([
...
...
tests/python/relay/test_op_level5.py
View file @
f98035b0
...
@@ -487,8 +487,9 @@ def test_yolo_reorg_infer_shape():
...
@@ -487,8 +487,9 @@ def test_yolo_reorg_infer_shape():
assert
zz
.
checked_type
==
relay
.
ty
.
TensorType
(
out_shape
,
"float32"
)
assert
zz
.
checked_type
==
relay
.
ty
.
TensorType
(
out_shape
,
"float32"
)
n
,
c
,
h
,
w
=
tvm
.
var
(
"n"
),
tvm
.
var
(
"c"
),
tvm
.
var
(
"h"
),
tvm
.
var
(
"w"
)
n
,
c
,
h
,
w
=
tvm
.
var
(
"n"
),
tvm
.
var
(
"c"
),
tvm
.
var
(
"h"
),
tvm
.
var
(
"w"
)
idxd
=
tvm
.
indexdiv
verify_yolo_reorg
((
n
,
c
,
20
,
20
),
10
,
(
n
,
c
*
10
*
10
,
2
,
2
))
verify_yolo_reorg
((
n
,
c
,
20
,
20
),
10
,
(
n
,
c
*
10
*
10
,
2
,
2
))
verify_yolo_reorg
((
n
,
c
,
h
,
w
),
2
,
(
n
,
c
*
2
*
2
,
h
/
2
,
w
/
2
))
verify_yolo_reorg
((
n
,
c
,
h
,
w
),
2
,
(
n
,
c
*
2
*
2
,
idxd
(
h
,
2
),
idxd
(
w
,
2
)
))
def
test_yolo_reorg
():
def
test_yolo_reorg
():
def
verify_yolo_reorg
(
shape
,
stride
):
def
verify_yolo_reorg
(
shape
,
stride
):
...
...
tests/python/unittest/test_autotvm_flop_calculator.py
View file @
f98035b0
...
@@ -60,14 +60,14 @@ def test_pack_gemm():
...
@@ -60,14 +60,14 @@ def test_pack_gemm():
k
=
tvm
.
reduce_axis
((
0
,
L
))
k
=
tvm
.
reduce_axis
((
0
,
L
))
bn
=
4
bn
=
4
fld
=
tvm
.
floor
div
idxd
=
tvm
.
index
div
flm
=
tvm
.
floor
mod
idxm
=
tvm
.
index
mod
A_pack
=
tvm
.
compute
((
N
//
bn
,
L
,
bn
),
lambda
i
,
j
,
k
:
A
[
i
*
bn
+
k
][
j
])
A_pack
=
tvm
.
compute
((
N
//
bn
,
L
,
bn
),
lambda
i
,
j
,
k
:
A
[
i
*
bn
+
k
][
j
])
B_pack
=
tvm
.
compute
((
M
//
bn
,
L
,
bn
),
lambda
i
,
j
,
k
:
B
[
i
*
bn
+
k
][
j
])
B_pack
=
tvm
.
compute
((
M
//
bn
,
L
,
bn
),
lambda
i
,
j
,
k
:
B
[
i
*
bn
+
k
][
j
])
C_pack
=
tvm
.
compute
((
N
//
bn
,
M
//
bn
,
bn
,
bn
),
lambda
i
,
j
,
ii
,
jj
:
C_pack
=
tvm
.
compute
((
N
//
bn
,
M
//
bn
,
bn
,
bn
),
lambda
i
,
j
,
ii
,
jj
:
tvm
.
sum
(
A_pack
[
i
,
k
,
ii
]
.
astype
(
acc_dtype
)
*
B_pack
[
j
,
k
,
jj
]
.
astype
(
acc_dtype
),
axis
=
[
k
]))
tvm
.
sum
(
A_pack
[
i
,
k
,
ii
]
.
astype
(
acc_dtype
)
*
B_pack
[
j
,
k
,
jj
]
.
astype
(
acc_dtype
),
axis
=
[
k
]))
C
=
tvm
.
compute
((
N
,
M
),
lambda
i
,
j
:
C_pack
[
fld
(
i
,
bn
)][
fld
(
j
,
bn
)][
flm
(
i
,
bn
)][
fl
m
(
j
,
bn
)])
C
=
tvm
.
compute
((
N
,
M
),
lambda
i
,
j
:
C_pack
[
idxd
(
i
,
bn
)][
idxd
(
j
,
bn
)][
idxm
(
i
,
bn
)][
idx
m
(
j
,
bn
)])
s
=
tvm
.
create_schedule
([
C
.
op
])
s
=
tvm
.
create_schedule
([
C
.
op
])
assert
compute_flop
(
s
)
==
2
*
N
*
L
*
M
assert
compute_flop
(
s
)
==
2
*
N
*
L
*
M
...
...
tests/python/unittest/test_codegen_cuda.py
View file @
f98035b0
...
@@ -37,7 +37,7 @@ def test_cuda_vectorize_add():
...
@@ -37,7 +37,7 @@ def test_cuda_vectorize_add():
print
(
"skip because gpu does not support int8"
)
print
(
"skip because gpu does not support int8"
)
return
return
A
=
tvm
.
placeholder
((
n
,),
name
=
'A'
,
dtype
=
"
%
sx
%
d"
%
(
dtype
,
lanes
))
A
=
tvm
.
placeholder
((
n
,),
name
=
'A'
,
dtype
=
"
%
sx
%
d"
%
(
dtype
,
lanes
))
B
=
tvm
.
compute
((
n
,),
lambda
i
:
A
[
i
]
+
tvm
.
const
(
1
,
A
.
dtype
),
name
=
'B'
)
B
=
tvm
.
compute
((
n
,),
lambda
i
:
A
[
i
]
+
tvm
.
const
(
1
,
A
.
dtype
),
name
=
'B'
)
s
=
tvm
.
create_schedule
(
B
.
op
)
s
=
tvm
.
create_schedule
(
B
.
op
)
xo
,
xi
=
s
[
B
]
.
split
(
B
.
op
.
axis
[
0
],
factor
=
num_thread
)
xo
,
xi
=
s
[
B
]
.
split
(
B
.
op
.
axis
[
0
],
factor
=
num_thread
)
s
[
B
]
.
bind
(
xo
,
bx
)
s
[
B
]
.
bind
(
xo
,
bx
)
...
@@ -165,9 +165,10 @@ def test_cuda_shuffle():
...
@@ -165,9 +165,10 @@ def test_cuda_shuffle():
print
(
"skip because cuda is not enabled.."
)
print
(
"skip because cuda is not enabled.."
)
return
return
idxm
=
tvm
.
indexmod
a
=
tvm
.
placeholder
((
64
,
),
'int32'
)
a
=
tvm
.
placeholder
((
64
,
),
'int32'
)
b
=
tvm
.
placeholder
((
64
,
),
'int32'
)
b
=
tvm
.
placeholder
((
64
,
),
'int32'
)
c
=
tvm
.
compute
((
64
,
),
lambda
x
:
a
[
x
]
+
b
[
x
-
(
x
%
4
)
+
(
3
-
x
%
4
)])
c
=
tvm
.
compute
((
64
,
),
lambda
x
:
a
[
x
]
+
b
[
x
-
idxm
(
x
,
4
)
+
(
3
-
idxm
(
x
,
4
)
)])
sch
=
tvm
.
create_schedule
(
c
.
op
)
sch
=
tvm
.
create_schedule
(
c
.
op
)
x
=
c
.
op
.
axis
[
0
]
x
=
c
.
op
.
axis
[
0
]
xo
,
xi
=
sch
[
c
]
.
split
(
x
,
4
)
xo
,
xi
=
sch
[
c
]
.
split
(
x
,
4
)
...
...
tests/python/unittest/test_ir_builder.py
View file @
f98035b0
...
@@ -109,14 +109,15 @@ def test_gpu():
...
@@ -109,14 +109,15 @@ def test_gpu():
dtype
=
"float32"
dtype
=
"float32"
A
=
tvm
.
placeholder
((
n
,),
name
=
'A'
)
A
=
tvm
.
placeholder
((
n
,),
name
=
'A'
)
B
=
tvm
.
placeholder
((
n
,),
name
=
'B'
)
B
=
tvm
.
placeholder
((
n
,),
name
=
'B'
)
fld
=
tvm
.
floordiv
idxd
=
tvm
.
indexdiv
def
test_device_ir
(
A
,
B
,
C
):
def
test_device_ir
(
A
,
B
,
C
):
n
=
A
.
shape
[
0
]
n
=
A
.
shape
[
0
]
max_threads
=
32
max_threads
=
32
ib
=
tvm
.
ir_builder
.
create
()
ib
=
tvm
.
ir_builder
.
create
()
bx
=
tvm
.
thread_axis
(
"blockIdx.x"
)
bx
=
tvm
.
thread_axis
(
"blockIdx.x"
)
tx
=
tvm
.
thread_axis
(
"threadIdx.x"
)
tx
=
tvm
.
thread_axis
(
"threadIdx.x"
)
ib
.
scope_attr
(
bx
,
"thread_extent"
,
fl
d
(
n
+
max_threads
-
1
,
max_threads
))
ib
.
scope_attr
(
bx
,
"thread_extent"
,
idx
d
(
n
+
max_threads
-
1
,
max_threads
))
ib
.
scope_attr
(
tx
,
"thread_extent"
,
max_threads
)
ib
.
scope_attr
(
tx
,
"thread_extent"
,
max_threads
)
idx
=
bx
.
var
*
max_threads
+
tx
.
var
idx
=
bx
.
var
*
max_threads
+
tx
.
var
Aptr
=
ib
.
buffer_ptr
(
A
)
Aptr
=
ib
.
buffer_ptr
(
A
)
...
...
tests/python/unittest/test_lang_buffer.py
View file @
f98035b0
...
@@ -94,31 +94,31 @@ def test_buffer_index_merge_mult_mod():
...
@@ -94,31 +94,31 @@ def test_buffer_index_merge_mult_mod():
def
assert_simplified_equal
(
index_simplified
,
index_direct
):
def
assert_simplified_equal
(
index_simplified
,
index_direct
):
assert
tvm
.
ir_pass
.
Equal
(
index_simplified
,
index_direct
),
\
assert
tvm
.
ir_pass
.
Equal
(
index_simplified
,
index_direct
),
\
"index_simplified=
%
s, index_direct=
%
s"
%
(
index_simplified
,
index_direct
)
"index_simplified=
%
s, index_direct=
%
s"
%
(
index_simplified
,
index_direct
)
idxd
iv
=
tvm
.
indexdiv
idxd
=
tvm
.
indexdiv
idxm
od
=
tvm
.
indexmod
idxm
=
tvm
.
indexmod
# Test Case1
# Test Case1
index_simplified
=
A_stride
.
vload
(
index_simplified
=
A_stride
.
vload
(
(
idxd
iv
(
idxmod
(
k0
,
k1
),
s
),
idxmod
(
idxmod
(
k0
,
k1
),
s
)
+
idxdiv
(
k0
,
k1
)
*
k1
))
(
idxd
(
idxm
(
k0
,
k1
),
s
),
idxm
(
idxm
(
k0
,
k1
),
s
)
+
idxd
(
k0
,
k1
)
*
k1
))
index_direct
=
A_stride
.
vload
((
0
,
k0
))
index_direct
=
A_stride
.
vload
((
0
,
k0
))
assert_simplified_equal
(
index_simplified
,
index_direct
)
assert_simplified_equal
(
index_simplified
,
index_direct
)
# Test Case2
# Test Case2
index_simplified
=
A
.
vload
((
idxd
iv
(
idxmod
(
k0
,
idxdiv
(
k1
,
s
)),
n
),
index_simplified
=
A
.
vload
((
idxd
(
idxm
(
k0
,
idxd
(
k1
,
s
)),
n
),
idxm
od
(
idxmod
(
k0
,
idxdiv
(
k1
,
s
)),
n
)
+
idxmod
(
k0
,
k1
)))
idxm
(
idxm
(
k0
,
idxd
(
k1
,
s
)),
n
)
+
idxm
(
k0
,
k1
)))
index_direct
=
A
.
vload
((
0
,
idxm
od
(
k0
,
k1
)
+
idxmod
(
k0
,
idxdiv
(
k1
,
s
))))
index_direct
=
A
.
vload
((
0
,
idxm
(
k0
,
k1
)
+
idxm
(
k0
,
idxd
(
k1
,
s
))))
assert_simplified_equal
(
index_simplified
,
index_direct
)
assert_simplified_equal
(
index_simplified
,
index_direct
)
# Test Case3
# Test Case3
index_simplified
=
A
.
vload
((
idxd
iv
((
idxdiv
(
k0
,
idxdiv
(
k1
,
s
))
*
idxdiv
(
k1
,
s
)),
n
)
+
index_simplified
=
A
.
vload
((
idxd
((
idxd
(
k0
,
idxd
(
k1
,
s
))
*
idxd
(
k1
,
s
)),
n
)
+
idxd
iv
(
idxmod
(
k0
,
idxdiv
(
k1
,
s
)),
n
),
idxd
(
idxm
(
k0
,
idxd
(
k1
,
s
)),
n
),
idxm
od
((
idxdiv
(
k0
,
idxdiv
(
k1
,
s
))
*
idxdiv
(
k1
,
s
)),
n
)
+
idxm
((
idxd
(
k0
,
idxd
(
k1
,
s
))
*
idxd
(
k1
,
s
)),
n
)
+
idxm
od
(
idxmod
(
k0
,
idxdiv
(
k1
,
s
)),
n
)))
idxm
(
idxm
(
k0
,
idxd
(
k1
,
s
)),
n
)))
index_direct
=
A
.
vload
((
0
,
k0
))
index_direct
=
A
.
vload
((
0
,
k0
))
assert_simplified_equal
(
index_simplified
,
index_direct
)
assert_simplified_equal
(
index_simplified
,
index_direct
)
# Test Case4 (not able to simplify)
# Test Case4 (not able to simplify)
index_simplified
=
A
.
vload
((
idxd
iv
(
idxmod
(
k0
,
idxdiv
(
k1
,
s
)),
n
),
index_simplified
=
A
.
vload
((
idxd
(
idxm
(
k0
,
idxd
(
k1
,
s
)),
n
),
idxm
od
(
idxmod
(
k0
,
idxdiv
(
k1
,
n
)),
n
)
+
idxmod
(
k0
,
k1
)))
idxm
(
idxm
(
k0
,
idxd
(
k1
,
n
)),
n
)
+
idxm
(
k0
,
k1
)))
index_direct
=
A
.
vload
((
0
,
idxd
iv
(
idxmod
(
k0
,
idxdiv
(
k1
,
s
)),
n
)
*
n
+
index_direct
=
A
.
vload
((
0
,
idxd
(
idxm
(
k0
,
idxd
(
k1
,
s
)),
n
)
*
n
+
(
idxm
od
(
idxmod
(
k0
,
idxdiv
(
k1
,
n
)),
n
)
+
idxmod
(
k0
,
k1
))))
(
idxm
(
idxm
(
k0
,
idxd
(
k1
,
n
)),
n
)
+
idxm
(
k0
,
k1
))))
assert_simplified_equal
(
index_simplified
,
index_direct
)
assert_simplified_equal
(
index_simplified
,
index_direct
)
...
...
tests/python/unittest/test_pass_rewrite_unsafe_select.py
View file @
f98035b0
...
@@ -28,7 +28,7 @@ def test_rewrite_Select():
...
@@ -28,7 +28,7 @@ def test_rewrite_Select():
tvm
.
expr
.
Select
(
i
>
1
,
A
[
i
-
1
],
1.0
)
>
0.0
,
A
[
i
],
0.1
)
tvm
.
expr
.
Select
(
i
>
1
,
A
[
i
-
1
],
1.0
)
>
0.0
,
A
[
i
],
0.1
)
zz
=
tvm
.
ir_pass
.
RewriteUnsafeSelect
(
tvm
.
make
.
Evaluate
(
z
))
.
value
zz
=
tvm
.
ir_pass
.
RewriteUnsafeSelect
(
tvm
.
make
.
Evaluate
(
z
))
.
value
a
=
tvm
.
expr
.
Select
(
i
>
10
,
y
,
z
)
a
=
tvm
.
expr
.
Select
(
tvm
.
floordiv
(
i
,
4
)
>
10
,
y
,
z
)
aa
=
tvm
.
ir_pass
.
RewriteUnsafeSelect
(
tvm
.
make
.
Evaluate
(
a
))
.
value
aa
=
tvm
.
ir_pass
.
RewriteUnsafeSelect
(
tvm
.
make
.
Evaluate
(
a
))
.
value
assert
yy
.
name
==
"tvm_if_then_else"
assert
yy
.
name
==
"tvm_if_then_else"
assert
zz
.
name
==
"tvm_if_then_else"
assert
zz
.
name
==
"tvm_if_then_else"
...
...
tests/python/unittest/test_schedule_tensorize.py
View file @
f98035b0
...
@@ -221,14 +221,15 @@ def test_tensorize_matmul():
...
@@ -221,14 +221,15 @@ def test_tensorize_matmul():
# This tests whether algorithm and intrinsics expressions are simplified
# This tests whether algorithm and intrinsics expressions are simplified
# as much as possible first and then checked for equality. See Issue #696
# as much as possible first and then checked for equality. See Issue #696
def
test_tensorize_op
():
def
test_tensorize_op
():
tdiv
=
tvm
.
truncdiv
idxd
=
tvm
.
indexdiv
tmod
=
tvm
.
truncmod
idxm
=
tvm
.
indexmod
def
op_intrin
():
def
op_intrin
():
bh
=
9
bh
=
9
bw
=
9
bw
=
9
x
=
tvm
.
placeholder
((
5
,
5
),
name
=
'A'
)
x
=
tvm
.
placeholder
((
5
,
5
),
name
=
'A'
)
y
=
tvm
.
compute
((
bh
,
bw
),
y
=
tvm
.
compute
((
bh
,
bw
),
lambda
i
,
j
:
x
[
tdiv
(
j
,
3
)
+
tmod
(
i
,
3
),
tmod
(
j
,
3
)
+
tdiv
(
i
,
3
)])
lambda
i
,
j
:
x
[
idxd
(
j
,
3
)
+
idxm
(
i
,
3
),
idxm
(
j
,
3
)
+
idxd
(
i
,
3
)])
def
intrin_func
(
ins
,
outs
):
def
intrin_func
(
ins
,
outs
):
xx
,
=
ins
xx
,
=
ins
...
@@ -239,7 +240,7 @@ def test_tensorize_op():
...
@@ -239,7 +240,7 @@ def test_tensorize_op():
return
tvm
.
decl_tensor_intrin
(
y
.
op
,
intrin_func
)
return
tvm
.
decl_tensor_intrin
(
y
.
op
,
intrin_func
)
A
=
tvm
.
placeholder
((
5
,
5
),
name
=
'A'
)
A
=
tvm
.
placeholder
((
5
,
5
),
name
=
'A'
)
B
=
tvm
.
compute
((
9
,
9
),
lambda
i
,
j
:
A
[
tdiv
(
j
,
3
)
+
tmod
(
i
,
3
),
tmod
(
j
,
3
)
+
tdiv
(
i
,
3
)])
B
=
tvm
.
compute
((
9
,
9
),
lambda
i
,
j
:
A
[
idxd
(
j
,
3
)
+
idxm
(
i
,
3
),
idxm
(
j
,
3
)
+
idxd
(
i
,
3
)])
bt
=
op_intrin
()
bt
=
op_intrin
()
s
=
tvm
.
create_schedule
(
B
.
op
)
s
=
tvm
.
create_schedule
(
B
.
op
)
...
...
topi/python/topi/arm_cpu/bitserial_conv2d.py
View file @
f98035b0
...
@@ -70,6 +70,9 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weigh
...
@@ -70,6 +70,9 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weigh
OW
=
(
PAD_W
-
KW
)
//
WSTR
+
1
OW
=
(
PAD_W
-
KW
)
//
WSTR
+
1
oshape
=
(
1
,
OH
,
OW
,
CO
)
oshape
=
(
1
,
OH
,
OW
,
CO
)
idxd
=
tvm
.
indexdiv
idxm
=
tvm
.
indexmod
# Pad input channels of weights and data when it is not a multiple of 8
# Pad input channels of weights and data when it is not a multiple of 8
if
CI_packed
%
8
!=
0
:
if
CI_packed
%
8
!=
0
:
CI_PAD
=
CI_packed
%
8
CI_PAD
=
CI_packed
%
8
...
@@ -106,7 +109,8 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weigh
...
@@ -106,7 +109,8 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weigh
data_q
=
bitpack
(
data
,
activation_bits
,
pack_axis
=
3
,
bit_axis
=
3
,
pack_type
=
'uint8'
)
data_q
=
bitpack
(
data
,
activation_bits
,
pack_axis
=
3
,
bit_axis
=
3
,
pack_type
=
'uint8'
)
kernel_vec
=
_kernel_vec_spatial_pack_nhwc
(
kernel
,
weight_bits
,
VC
,
len
(
kernel
.
shape
)
==
4
)
kernel_vec
=
_kernel_vec_spatial_pack_nhwc
(
kernel
,
weight_bits
,
VC
,
len
(
kernel
.
shape
)
==
4
)
if
kernel_vec
.
shape
[
-
1
]
%
8
!=
0
and
CI_PAD
!=
0
:
idxm
=
tvm
.
indexmod
if
idxm
(
kernel_vec
.
shape
[
-
1
],
8
)
!=
0
and
CI_PAD
!=
0
:
kernel_vec
=
pad
(
kernel_vec
,
[
0
,
0
,
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
,
0
,
CI_PAD
])
kernel_vec
=
pad
(
kernel_vec
,
[
0
,
0
,
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
,
0
,
CI_PAD
])
N
,
H
,
W
,
IB
,
CI
=
data_q
.
shape
N
,
H
,
W
,
IB
,
CI
=
data_q
.
shape
...
@@ -147,8 +151,12 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weigh
...
@@ -147,8 +151,12 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weigh
else
:
else
:
conv_vec
=
tvm
.
compute
(
ovshape
,
_bipolar_conv
,
name
=
'conv_vec'
,
tag
=
'bipolar'
)
conv_vec
=
tvm
.
compute
(
ovshape
,
_bipolar_conv
,
name
=
'conv_vec'
,
tag
=
'bipolar'
)
conv
=
tvm
.
compute
(
oshape
,
lambda
n
,
h
,
w
,
co
:
conv_vec
[
n
][
h
//
VH
][
w
//
VW
][
co
//
VC
][
h
%
VH
][
w
%
VW
][
co
%
VC
]
.
astype
(
out_dtype
),
conv
=
tvm
.
compute
(
oshape
,
lambda
n
,
h
,
w
,
co
:
conv_vec
[
n
,
idxd
(
h
,
VH
),
idxd
(
w
,
VW
),
idxd
(
co
,
VC
),
idxm
(
h
,
VH
),
idxm
(
w
,
VW
),
idxm
(
co
,
VC
)]
.
astype
(
out_dtype
),
name
=
'conv'
,
tag
=
'spatial_bitserial_conv_nhwc'
)
name
=
'conv'
,
tag
=
'spatial_bitserial_conv_nhwc'
)
return
conv
return
conv
...
...
topi/python/topi/arm_cpu/conv2d.py
View file @
f98035b0
...
@@ -171,6 +171,9 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt
...
@@ -171,6 +171,9 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt
assert
KH
==
3
and
KW
==
3
and
HSTR
==
1
and
WSTR
==
1
assert
KH
==
3
and
KW
==
3
and
HSTR
==
1
and
WSTR
==
1
data_pad
=
pad
(
data
,
(
0
,
0
,
HPAD
,
WPAD
),
name
=
"data_pad"
)
data_pad
=
pad
(
data
,
(
0
,
0
,
HPAD
,
WPAD
),
name
=
"data_pad"
)
idxd
=
tvm
.
indexdiv
idxm
=
tvm
.
indexmod
r
=
KW
r
=
KW
m
=
tile_size
m
=
tile_size
alpha
=
m
+
r
-
1
alpha
=
m
+
r
-
1
...
@@ -190,10 +193,11 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt
...
@@ -190,10 +193,11 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt
VK
=
cfg
[
'tile_k'
]
.
size
[
-
1
]
VK
=
cfg
[
'tile_k'
]
.
size
[
-
1
]
# pack input tile
# pack input tile
input_tile
=
tvm
.
compute
((
C
,
P
//
VP
,
alpha
,
alpha
,
VP
),
input_tile
=
tvm
.
compute
((
C
,
idxd
(
P
,
VP
)
,
alpha
,
alpha
,
VP
),
lambda
c
,
b
,
eps
,
nu
,
bb
:
lambda
c
,
b
,
eps
,
nu
,
bb
:
data_pad
[(
b
*
VP
+
bb
)
//
(
nH
*
nW
)][
c
][(
b
*
VP
+
bb
)
//
nW
%
nH
*
m
+
eps
]
data_pad
[
idxd
(
b
*
VP
+
bb
,
nH
*
nW
),
c
,
[(
b
*
VP
+
bb
)
%
nW
*
m
+
nu
],
idxm
(
idxd
(
b
*
VP
+
bb
,
nW
),
nH
)
*
m
+
eps
,
idxm
(
b
*
VP
+
bb
,
nW
)
*
m
+
nu
],
name
=
'd'
)
name
=
'd'
)
# transform kernel
# transform kernel
...
@@ -202,22 +206,22 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt
...
@@ -202,22 +206,22 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt
else
:
else
:
r_kh
=
tvm
.
reduce_axis
((
0
,
KH
),
'r_kh'
)
r_kh
=
tvm
.
reduce_axis
((
0
,
KH
),
'r_kh'
)
r_kw
=
tvm
.
reduce_axis
((
0
,
KW
),
'r_kw'
)
r_kw
=
tvm
.
reduce_axis
((
0
,
KW
),
'r_kw'
)
U
=
tvm
.
compute
((
alpha
,
alpha
,
K
//
VK
,
C
,
VK
),
lambda
eps
,
nu
,
k
,
c
,
kk
:
U
=
tvm
.
compute
((
alpha
,
alpha
,
idxd
(
K
,
VK
)
,
C
,
VK
),
lambda
eps
,
nu
,
k
,
c
,
kk
:
tvm
.
sum
(
kernel
[
k
*
VK
+
kk
][
c
][
r_kh
][
r_kw
]
.
astype
(
out_dtype
)
*
tvm
.
sum
(
kernel
[
k
*
VK
+
kk
][
c
][
r_kh
][
r_kw
]
.
astype
(
out_dtype
)
*
G
[
eps
][
r_kh
]
*
G
[
nu
][
r_kw
],
axis
=
[
r_kh
,
r_kw
]),
name
=
'U'
)
G
[
eps
][
r_kh
]
*
G
[
nu
][
r_kw
],
axis
=
[
r_kh
,
r_kw
]),
name
=
'U'
)
# transform image
# transform image
r_eps
=
tvm
.
reduce_axis
((
0
,
alpha
),
'r_eps'
)
r_eps
=
tvm
.
reduce_axis
((
0
,
alpha
),
'r_eps'
)
r_nu
=
tvm
.
reduce_axis
((
0
,
alpha
),
'r_nu'
)
r_nu
=
tvm
.
reduce_axis
((
0
,
alpha
),
'r_nu'
)
V
=
tvm
.
compute
((
alpha
,
alpha
,
P
//
VP
,
C
,
VP
),
lambda
eps
,
nu
,
b
,
c
,
bb
:
V
=
tvm
.
compute
((
alpha
,
alpha
,
idxd
(
P
,
VP
)
,
C
,
VP
),
lambda
eps
,
nu
,
b
,
c
,
bb
:
tvm
.
sum
(
input_tile
[
c
][
b
][
r_eps
][
r_nu
][
bb
]
.
astype
(
out_dtype
)
*
tvm
.
sum
(
input_tile
[
c
][
b
][
r_eps
][
r_nu
][
bb
]
.
astype
(
out_dtype
)
*
B
[
r_eps
][
eps
]
*
B
[
r_nu
][
nu
],
axis
=
[
r_eps
,
r_nu
]),
name
=
'V'
)
B
[
r_eps
][
eps
]
*
B
[
r_nu
][
nu
],
axis
=
[
r_eps
,
r_nu
]),
name
=
'V'
)
# batch gemm
# batch gemm
c
=
tvm
.
reduce_axis
((
0
,
C
),
name
=
'c'
)
c
=
tvm
.
reduce_axis
((
0
,
C
),
name
=
'c'
)
M
=
tvm
.
compute
((
alpha
,
alpha
,
K
,
P
),
lambda
eps
,
nu
,
k
,
b
:
M
=
tvm
.
compute
((
alpha
,
alpha
,
K
,
P
),
lambda
eps
,
nu
,
k
,
b
:
tvm
.
sum
(
U
[
eps
][
nu
][
k
//
VK
][
c
][
k
%
VK
]
*
tvm
.
sum
(
U
[
eps
][
nu
][
idxd
(
k
,
VK
)][
c
][
idxm
(
k
,
VK
)
]
*
V
[
eps
][
nu
][
b
//
VP
][
c
][
b
%
VP
],
axis
=
c
),
name
=
'M'
)
V
[
eps
][
nu
][
idxd
(
b
,
VP
)][
c
][
idxm
(
b
,
VP
)
],
axis
=
c
),
name
=
'M'
)
# inverse transform
# inverse transform
r_eps
=
tvm
.
reduce_axis
((
0
,
alpha
),
'r_eps'
)
r_eps
=
tvm
.
reduce_axis
((
0
,
alpha
),
'r_eps'
)
...
@@ -228,7 +232,8 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt
...
@@ -228,7 +232,8 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt
# unpack output
# unpack output
output
=
tvm
.
compute
((
N
,
K
,
H
,
W
),
lambda
n
,
k
,
h
,
w
:
output
=
tvm
.
compute
((
N
,
K
,
H
,
W
),
lambda
n
,
k
,
h
,
w
:
Y
[
k
][
n
*
nH
*
nW
+
(
h
//
m
)
*
nW
+
w
//
m
][
h
%
m
][
w
%
m
],
Y
[
k
][
n
*
nH
*
nW
+
idxd
(
h
,
m
)
*
nW
+
idxd
(
w
,
m
),
idxm
(
h
,
m
),
idxm
(
w
,
m
)],
name
=
'output'
,
tag
=
'winograd_conv2d_output'
)
name
=
'output'
,
tag
=
'winograd_conv2d_output'
)
# we have to manually assign effective GFLOP for winograd
# we have to manually assign effective GFLOP for winograd
...
@@ -517,6 +522,8 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
...
@@ -517,6 +522,8 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
N
,
CI
,
H
,
W
=
get_const_tuple
(
data
.
shape
)
N
,
CI
,
H
,
W
=
get_const_tuple
(
data
.
shape
)
CO
,
_
,
KH
,
KW
=
get_const_tuple
(
kernel
.
shape
)
CO
,
_
,
KH
,
KW
=
get_const_tuple
(
kernel
.
shape
)
idxd
=
tvm
.
indexdiv
if
groups
==
1
:
if
groups
==
1
:
# query config of this workload
# query config of this workload
workload
=
autotvm
.
task
.
args_to_workload
(
workload
=
autotvm
.
task
.
args_to_workload
(
...
@@ -535,7 +542,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
...
@@ -535,7 +542,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
# Store the same config for the altered operator (workload)
# Store the same config for the altered operator (workload)
new_data
=
data
new_data
=
data
new_kernel
=
tvm
.
placeholder
((
CO
//
VC
,
CI
,
KH
,
KW
,
VC
),
dtype
=
kernel
.
dtype
)
new_kernel
=
tvm
.
placeholder
((
idxd
(
CO
,
VC
)
,
CI
,
KH
,
KW
,
VC
),
dtype
=
kernel
.
dtype
)
new_workload
=
autotvm
.
task
.
args_to_workload
(
new_workload
=
autotvm
.
task
.
args_to_workload
(
[
new_data
,
new_kernel
,
strides
,
padding
,
dilation
,
'NCHW'
,
out_dtype
],
conv2d
)
[
new_data
,
new_kernel
,
strides
,
padding
,
dilation
,
'NCHW'
,
out_dtype
],
conv2d
)
dispatch_ctx
.
update
(
target
,
new_workload
,
cfg
)
dispatch_ctx
.
update
(
target
,
new_workload
,
cfg
)
...
@@ -553,7 +560,9 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
...
@@ -553,7 +560,9 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
weight
=
F
.
nn
.
contrib_conv2d_winograd_weight_transform
(
copy_inputs
[
1
],
weight
=
F
.
nn
.
contrib_conv2d_winograd_weight_transform
(
copy_inputs
[
1
],
tile_size
=
tile_size
)
tile_size
=
tile_size
)
weight
=
F
.
reshape
(
weight
,
weight
=
F
.
reshape
(
weight
,
newshape
=
(
KH
+
tile_size
-
1
,
KW
+
tile_size
-
1
,
CO
//
VC
,
VC
,
CI
))
newshape
=
(
KH
+
tile_size
-
1
,
KW
+
tile_size
-
1
,
idxd
(
CO
,
VC
),
VC
,
CI
))
weight
=
F
.
transpose
(
weight
,
axes
=
[
0
,
1
,
2
,
4
,
3
])
weight
=
F
.
transpose
(
weight
,
axes
=
[
0
,
1
,
2
,
4
,
3
])
copy_inputs
[
1
]
=
weight
copy_inputs
[
1
]
=
weight
...
@@ -561,7 +570,9 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
...
@@ -561,7 +570,9 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
# Store the same config for the altered operator (workload)
# Store the same config for the altered operator (workload)
new_data
=
data
new_data
=
data
new_weight
=
tvm
.
placeholder
((
KH
+
tile_size
-
1
,
KH
+
tile_size
-
1
,
CO
//
VC
,
CI
,
VC
),
new_weight
=
tvm
.
placeholder
((
KH
+
tile_size
-
1
,
KH
+
tile_size
-
1
,
idxd
(
CO
,
VC
),
CI
,
VC
),
kernel
.
dtype
)
kernel
.
dtype
)
new_workload
=
autotvm
.
task
.
args_to_workload
(
new_workload
=
autotvm
.
task
.
args_to_workload
(
[
new_data
,
new_weight
,
strides
,
padding
,
dilation
,
[
new_data
,
new_weight
,
strides
,
padding
,
dilation
,
...
@@ -612,7 +623,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
...
@@ -612,7 +623,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
# Store the same config for the altered operator (workload)
# Store the same config for the altered operator (workload)
new_data
=
data
new_data
=
data
CO
,
M
,
KH
,
KW
=
get_const_tuple
(
kernel
.
shape
)
CO
,
M
,
KH
,
KW
=
get_const_tuple
(
kernel
.
shape
)
new_kernel
=
tvm
.
placeholder
((
CO
//
VC
,
M
,
KH
,
KW
,
VC
),
dtype
=
kernel
.
dtype
)
new_kernel
=
tvm
.
placeholder
((
idxd
(
CO
,
VC
)
,
M
,
KH
,
KW
,
VC
),
dtype
=
kernel
.
dtype
)
new_workload
=
autotvm
.
task
.
args_to_workload
(
new_workload
=
autotvm
.
task
.
args_to_workload
(
[
new_data
,
new_kernel
,
strides
,
padding
,
dilation
,
out_dtype
],
[
new_data
,
new_kernel
,
strides
,
padding
,
dilation
,
out_dtype
],
depthwise_conv2d_nchw
)
depthwise_conv2d_nchw
)
...
...
topi/python/topi/cuda/nms.py
View file @
f98035b0
...
@@ -243,14 +243,16 @@ def get_valid_counts_downsweep(data, idx_in, partial, idx):
...
@@ -243,14 +243,16 @@ def get_valid_counts_downsweep(data, idx_in, partial, idx):
ib
.
scope_attr
(
bx
,
"thread_extent"
,
nthread_bx
)
ib
.
scope_attr
(
bx
,
"thread_extent"
,
nthread_bx
)
tid
=
bx
*
max_threads
+
tx
tid
=
bx
*
max_threads
+
tx
new_range
=
num_anchors
//
elem_per_thread
+
1
new_range
=
num_anchors
//
elem_per_thread
+
1
idxd
=
tvm
.
indexdiv
idxm
=
tvm
.
indexmod
# Scan: Downsweep:
# Scan: Downsweep:
with
ib
.
if_scope
(
tid
<
batch_size
*
num_anchors
):
with
ib
.
if_scope
(
tid
<
batch_size
*
num_anchors
):
i
=
tid
//
num_anchors
# number of batches
i
=
idxd
(
tid
,
num_anchors
)
# number of batches
j
=
tid
%
num_anchors
# number of anchors
j
=
idxm
(
tid
,
num_anchors
)
# number of anchors
with
ib
.
if_scope
(
j
<
elem_per_thread
):
with
ib
.
if_scope
(
j
<
elem_per_thread
):
idx
[
tid
]
=
idx_in
[
tid
]
idx
[
tid
]
=
idx_in
[
tid
]
with
ib
.
else_scope
():
with
ib
.
else_scope
():
idx
[
tid
]
=
idx_in
[
tid
]
+
partial
[
i
*
new_range
+
j
//
elem_per_thread
-
1
]
idx
[
tid
]
=
idx_in
[
tid
]
+
partial
[
i
*
new_range
+
idxd
(
j
,
elem_per_thread
)
-
1
]
return
ib
.
get
()
return
ib
.
get
()
...
@@ -303,9 +305,12 @@ def get_valid_counts_ir(data, flag, idx, valid_count, out):
...
@@ -303,9 +305,12 @@ def get_valid_counts_ir(data, flag, idx, valid_count, out):
ib
.
scope_attr
(
bx
,
"thread_extent"
,
nthread_bx
)
ib
.
scope_attr
(
bx
,
"thread_extent"
,
nthread_bx
)
tid
=
bx
*
max_threads
+
tx
tid
=
bx
*
max_threads
+
tx
idxd
=
tvm
.
indexdiv
idxm
=
tvm
.
indexmod
with
ib
.
if_scope
(
tid
<
batch_size
*
num_anchors
):
with
ib
.
if_scope
(
tid
<
batch_size
*
num_anchors
):
i
=
tid
//
num_anchors
i
=
idxd
(
tid
,
num_anchors
)
j
=
tid
%
num_anchors
j
=
idxm
(
tid
,
num_anchors
)
base_idx
=
i
*
num_anchors
*
elem_length
base_idx
=
i
*
num_anchors
*
elem_length
with
ib
.
if_scope
(
flag
[
tid
]
>
0
):
with
ib
.
if_scope
(
flag
[
tid
]
>
0
):
with
ib
.
for_range
(
0
,
elem_length
)
as
k
:
with
ib
.
for_range
(
0
,
elem_length
)
as
k
:
...
...
topi/python/topi/cuda/rcnn/proposal.py
View file @
f98035b0
...
@@ -79,10 +79,13 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r
...
@@ -79,10 +79,13 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r
p_im_info
=
ib
.
buffer_ptr
(
im_info_buf
)
p_im_info
=
ib
.
buffer_ptr
(
im_info_buf
)
p_out
=
ib
.
buffer_ptr
(
out_buf
)
p_out
=
ib
.
buffer_ptr
(
out_buf
)
idxm
=
tvm
.
indexmod
idxd
=
tvm
.
indexdiv
with
ib
.
if_scope
(
tid
<
batch
*
height
*
width
):
with
ib
.
if_scope
(
tid
<
batch
*
height
*
width
):
w
=
tid
%
width
w
=
idxm
(
tid
,
width
)
h
=
(
tid
//
width
)
%
height
h
=
idxm
(
idxd
(
tid
,
width
),
height
)
b
=
tid
//
width
//
height
b
=
idxd
(
idxd
(
tid
,
width
),
height
)
for
k
in
range
(
num_anchors
):
for
k
in
range
(
num_anchors
):
out_index
=
tid
*
num_anchors
+
k
out_index
=
tid
*
num_anchors
+
k
...
@@ -163,6 +166,8 @@ def argsort_ir(data_buf, out_index_buf):
...
@@ -163,6 +166,8 @@ def argsort_ir(data_buf, out_index_buf):
temp_data
=
ib
.
allocate
(
"float32"
,
(
1
,),
name
=
"temp_data"
,
scope
=
"local"
)
temp_data
=
ib
.
allocate
(
"float32"
,
(
1
,),
name
=
"temp_data"
,
scope
=
"local"
)
temp_index
=
ib
.
allocate
(
"int32"
,
(
1
,),
name
=
"temp_index"
,
scope
=
"local"
)
temp_index
=
ib
.
allocate
(
"int32"
,
(
1
,),
name
=
"temp_index"
,
scope
=
"local"
)
idxm
=
tvm
.
indexmod
with
ib
.
for_range
(
0
,
batch
,
for_type
=
"unroll"
)
as
b
:
with
ib
.
for_range
(
0
,
batch
,
for_type
=
"unroll"
)
as
b
:
start
=
b
*
num_bbox
start
=
b
*
num_bbox
for
i
in
range
(
2
):
for
i
in
range
(
2
):
...
@@ -170,7 +175,7 @@ def argsort_ir(data_buf, out_index_buf):
...
@@ -170,7 +175,7 @@ def argsort_ir(data_buf, out_index_buf):
with
ib
.
if_scope
(
bbox_id
<
num_bbox
):
with
ib
.
if_scope
(
bbox_id
<
num_bbox
):
index_out
[
start
+
bbox_id
]
=
bbox_id
index_out
[
start
+
bbox_id
]
=
bbox_id
with
ib
.
for_range
(
0
,
num_bbox
)
as
k
:
with
ib
.
for_range
(
0
,
num_bbox
)
as
k
:
offset
=
start
+
2
*
tid
+
(
k
%
2
)
offset
=
start
+
2
*
tid
+
idxm
(
k
,
2
)
with
ib
.
if_scope
(
with
ib
.
if_scope
(
tvm
.
all
(
offset
+
1
<
num_bbox
,
p_data
[
offset
]
<
p_data
[
offset
+
1
])):
tvm
.
all
(
offset
+
1
<
num_bbox
,
p_data
[
offset
]
<
p_data
[
offset
+
1
])):
temp_data
[
0
]
=
p_data
[
offset
]
temp_data
[
0
]
=
p_data
[
offset
]
...
...
topi/python/topi/cuda/sort.py
View file @
f98035b0
...
@@ -115,6 +115,8 @@ def sort_ir(data, values_out, axis, is_ascend, indices_out=None):
...
@@ -115,6 +115,8 @@ def sort_ir(data, values_out, axis, is_ascend, indices_out=None):
ib
.
emit
(
tvm
.
make
.
Call
(
None
,
'tvm_storage_sync'
,
ib
.
emit
(
tvm
.
make
.
Call
(
None
,
'tvm_storage_sync'
,
tvm
.
convert
([
'shared'
]),
tvm
.
convert
([
'shared'
]),
tvm
.
expr
.
Call
.
Intrinsic
,
None
,
0
))
tvm
.
expr
.
Call
.
Intrinsic
,
None
,
0
))
idxd
=
tvm
.
indexdiv
idxm
=
tvm
.
indexmod
with
ib
.
for_range
(
0
,
axis_mul_before
)
as
i
:
with
ib
.
for_range
(
0
,
axis_mul_before
)
as
i
:
with
ib
.
for_range
(
0
,
axis_mul_after
)
as
j
:
with
ib
.
for_range
(
0
,
axis_mul_after
)
as
j
:
...
@@ -122,13 +124,13 @@ def sort_ir(data, values_out, axis, is_ascend, indices_out=None):
...
@@ -122,13 +124,13 @@ def sort_ir(data, values_out, axis, is_ascend, indices_out=None):
base_idx
=
i
*
shape
[
axis
]
*
axis_mul_after
+
j
base_idx
=
i
*
shape
[
axis
]
*
axis_mul_after
+
j
# OddEvenTransposeSort
# OddEvenTransposeSort
with
ib
.
for_range
(
0
,
current_sort_num
)
as
k
:
with
ib
.
for_range
(
0
,
current_sort_num
)
as
k
:
with
ib
.
if_scope
(
tid
<
(
current_sort_num
+
1
)
//
2
):
with
ib
.
if_scope
(
tid
<
idxd
(
current_sort_num
+
1
,
2
)
):
offset
=
base_idx
+
(
2
*
tid
+
(
k
%
2
))
*
axis_mul_after
offset
=
base_idx
+
(
2
*
tid
+
idxm
(
k
,
2
))
*
axis_mul_after
if
is_ascend
:
if
is_ascend
:
cond
=
tvm
.
all
(
2
*
tid
+
(
k
%
2
)
+
1
<
current_sort_num
,
cond
=
tvm
.
all
(
2
*
tid
+
idxm
(
k
,
2
)
+
1
<
current_sort_num
,
values_out
[
offset
]
>
values_out
[
offset
+
axis_mul_after
])
values_out
[
offset
]
>
values_out
[
offset
+
axis_mul_after
])
else
:
else
:
cond
=
tvm
.
all
(
2
*
tid
+
(
k
%
2
)
+
1
<
current_sort_num
,
cond
=
tvm
.
all
(
2
*
tid
+
idxm
(
k
,
2
)
+
1
<
current_sort_num
,
values_out
[
offset
]
<
values_out
[
offset
+
axis_mul_after
])
values_out
[
offset
]
<
values_out
[
offset
+
axis_mul_after
])
with
ib
.
if_scope
(
cond
):
with
ib
.
if_scope
(
cond
):
temp_data
[
0
]
=
values_out
[
offset
]
temp_data
[
0
]
=
values_out
[
offset
]
...
@@ -199,6 +201,9 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend):
...
@@ -199,6 +201,9 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend):
temp_index
=
ib
.
allocate
(
"int32"
,
(
1
,),
name
=
"temp_index"
,
scope
=
"local"
)
temp_index
=
ib
.
allocate
(
"int32"
,
(
1
,),
name
=
"temp_index"
,
scope
=
"local"
)
is_ascend
=
tvm
.
make
.
node
(
"IntImm"
,
dtype
=
"int32"
,
value
=
is_ascend
)
is_ascend
=
tvm
.
make
.
node
(
"IntImm"
,
dtype
=
"int32"
,
value
=
is_ascend
)
idxd
=
tvm
.
indexdiv
idxm
=
tvm
.
indexmod
with
ib
.
for_range
(
0
,
axis_mul_before
)
as
i
:
with
ib
.
for_range
(
0
,
axis_mul_before
)
as
i
:
with
ib
.
for_range
(
0
,
axis_mul_after
)
as
j
:
with
ib
.
for_range
(
0
,
axis_mul_after
)
as
j
:
current_sort_num
=
valid_count
[
i
*
axis_mul_after
+
j
]
current_sort_num
=
valid_count
[
i
*
axis_mul_after
+
j
]
...
@@ -207,10 +212,10 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend):
...
@@ -207,10 +212,10 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend):
output
[
base_idx
+
tid
*
axis_mul_after
]
=
tid
output
[
base_idx
+
tid
*
axis_mul_after
]
=
tid
# OddEvenTransposeSort
# OddEvenTransposeSort
with
ib
.
for_range
(
0
,
current_sort_num
)
as
k
:
with
ib
.
for_range
(
0
,
current_sort_num
)
as
k
:
with
ib
.
if_scope
(
tid
<
(
current_sort_num
+
1
)
//
2
):
with
ib
.
if_scope
(
tid
<
idxd
(
current_sort_num
+
1
,
2
)
):
offset
=
base_idx
+
(
2
*
tid
+
(
k
%
2
))
*
axis_mul_after
offset
=
base_idx
+
(
2
*
tid
+
idxm
(
k
,
2
))
*
axis_mul_after
with
ib
.
if_scope
(
tvm
.
all
(
is_ascend
==
1
,
\
with
ib
.
if_scope
(
tvm
.
all
(
is_ascend
==
1
,
\
2
*
tid
+
(
k
%
2
)
+
1
<
current_sort_num
,
\
2
*
tid
+
idxm
(
k
,
2
)
+
1
<
current_sort_num
,
\
data
[
offset
]
>
data
[
offset
+
axis_mul_after
])):
data
[
offset
]
>
data
[
offset
+
axis_mul_after
])):
temp_data
[
0
]
=
data
[
offset
]
temp_data
[
0
]
=
data
[
offset
]
data
[
offset
]
=
data
[
offset
+
axis_mul_after
]
data
[
offset
]
=
data
[
offset
+
axis_mul_after
]
...
@@ -219,7 +224,7 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend):
...
@@ -219,7 +224,7 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend):
output
[
offset
]
=
output
[
offset
+
axis_mul_after
]
output
[
offset
]
=
output
[
offset
+
axis_mul_after
]
output
[
offset
+
axis_mul_after
]
=
temp_index
[
0
]
output
[
offset
+
axis_mul_after
]
=
temp_index
[
0
]
with
ib
.
if_scope
(
tvm
.
all
(
is_ascend
==
0
,
\
with
ib
.
if_scope
(
tvm
.
all
(
is_ascend
==
0
,
\
2
*
tid
+
(
k
%
2
)
+
1
<
current_sort_num
,
\
2
*
tid
+
idxm
(
k
,
2
)
+
1
<
current_sort_num
,
\
data
[
offset
]
<
data
[
offset
+
axis_mul_after
])):
data
[
offset
]
<
data
[
offset
+
axis_mul_after
])):
temp_data
[
0
]
=
data
[
offset
]
temp_data
[
0
]
=
data
[
offset
]
data
[
offset
]
=
data
[
offset
+
axis_mul_after
]
data
[
offset
]
=
data
[
offset
+
axis_mul_after
]
...
...
topi/python/topi/cuda/ssd/multibox.py
View file @
f98035b0
...
@@ -95,8 +95,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
...
@@ -95,8 +95,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
for
k
in
range
(
num_sizes
+
num_ratios
-
1
):
for
k
in
range
(
num_sizes
+
num_ratios
-
1
):
w
=
if_then_else
(
k
<
num_sizes
,
w
=
if_then_else
(
k
<
num_sizes
,
size_ratio_concat
[
k
]
*
in_height
/
in_width
/
2.0
,
float
(
size_ratio_concat
[
k
])
*
in_height
/
in_width
/
2.0
,
size_ratio_concat
[
0
]
*
in_height
/
in_width
*
float
(
size_ratio_concat
[
0
])
*
in_height
/
in_width
*
math
.
sqrt
(
size_ratio_concat
[
k
+
1
])
/
2.0
)
math
.
sqrt
(
size_ratio_concat
[
k
+
1
])
/
2.0
)
h
=
if_then_else
(
h
=
if_then_else
(
k
<
num_sizes
,
size_ratio_concat
[
k
]
/
2.0
,
k
<
num_sizes
,
size_ratio_concat
[
k
]
/
2.0
,
...
@@ -204,10 +204,12 @@ def transform_loc_pre(cls_prob, valid_count, temp_valid_count, temp_cls_id, temp
...
@@ -204,10 +204,12 @@ def transform_loc_pre(cls_prob, valid_count, temp_valid_count, temp_cls_id, temp
ib
.
scope_attr
(
tx
,
"thread_extent"
,
nthread_tx
)
ib
.
scope_attr
(
tx
,
"thread_extent"
,
nthread_tx
)
ib
.
scope_attr
(
bx
,
"thread_extent"
,
nthread_bx
)
ib
.
scope_attr
(
bx
,
"thread_extent"
,
nthread_bx
)
tid
=
bx
*
max_threads
+
tx
tid
=
bx
*
max_threads
+
tx
idxd
=
tvm
.
indexdiv
idxm
=
tvm
.
indexmod
with
ib
.
if_scope
(
tid
<
batch_size
*
num_anchors
):
with
ib
.
if_scope
(
tid
<
batch_size
*
num_anchors
):
i
=
tid
/
num_anchors
i
=
idxd
(
tid
,
num_anchors
)
j
=
tid
%
num_anchors
j
=
idxm
(
tid
,
num_anchors
)
valid_count
[
i
]
=
0
valid_count
[
i
]
=
0
score
[
tid
]
=
-
1.0
score
[
tid
]
=
-
1.0
cls_id
[
tid
]
=
0
cls_id
[
tid
]
=
0
...
@@ -314,9 +316,13 @@ def transform_loc_ir(loc_pred, anchor, temp_valid_count, temp_cls_id, temp_score
...
@@ -314,9 +316,13 @@ def transform_loc_ir(loc_pred, anchor, temp_valid_count, temp_cls_id, temp_score
ib
.
scope_attr
(
bx
,
"thread_extent"
,
nthread_bx
)
ib
.
scope_attr
(
bx
,
"thread_extent"
,
nthread_bx
)
tid
=
bx
*
max_threads
+
tx
tid
=
bx
*
max_threads
+
tx
idxd
=
tvm
.
indexdiv
idxm
=
tvm
.
indexmod
with
ib
.
if_scope
(
tid
<
batch_size
*
num_anchors
):
with
ib
.
if_scope
(
tid
<
batch_size
*
num_anchors
):
i
=
tid
//
num_anchors
i
=
idxd
(
tid
,
num_anchors
)
j
=
tid
%
num_anchors
j
=
idxm
(
tid
,
num_anchors
)
with
ib
.
if_scope
(
cls_id
[
tid
]
>
0
):
with
ib
.
if_scope
(
cls_id
[
tid
]
>
0
):
with
ib
.
if_scope
(
tid
==
0
):
with
ib
.
if_scope
(
tid
==
0
):
out_base_idx
=
i
*
num_anchors
*
6
out_base_idx
=
i
*
num_anchors
*
6
...
...
topi/python/topi/nn/bitserial_conv2d.py
View file @
f98035b0
...
@@ -313,13 +313,14 @@ def spatial_pack_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bits,
...
@@ -313,13 +313,14 @@ def spatial_pack_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bits,
axis
=
[
ci
,
dh
,
dw
,
b1
,
b2
])
axis
=
[
ci
,
dh
,
dw
,
b1
,
b2
])
conv
=
tvm
.
compute
(
ovshape
,
_conv
,
name
=
'conv_out'
)
conv
=
tvm
.
compute
(
ovshape
,
_conv
,
name
=
'conv_out'
)
idxd
iv
=
tvm
.
indexdiv
idxd
=
tvm
.
indexdiv
idxm
od
=
tvm
.
indexmod
idxm
=
tvm
.
indexmod
return
tvm
.
compute
(
return
tvm
.
compute
(
oshape
,
lambda
n
,
co
,
h
,
w
:
oshape
,
lambda
n
,
co
,
h
,
w
:
conv
[
n
][
idxdiv
(
co
,
VC
)][
idxdiv
(
h
,
VH
)][
idxdiv
(
conv
[
n
,
w
,
VW
)][
idxmod
(
h
,
VH
)][
idxmod
(
w
,
VW
)][
idxmod
(
co
,
VC
)],
idxd
(
co
,
VC
),
idxd
(
h
,
VH
),
idxd
(
w
,
VW
),
idxm
(
h
,
VH
),
idxm
(
w
,
VW
),
idxm
(
co
,
VC
)],
name
=
'conv_vec'
,
tag
=
'spatial_bitserial_conv_nchw'
)
name
=
'conv_vec'
,
tag
=
'spatial_bitserial_conv_nchw'
)
@autotvm.register_topi_compute
(
bitserial_conv2d_nhwc
,
'cpu'
,
'direct'
)
@autotvm.register_topi_compute
(
bitserial_conv2d_nhwc
,
'cpu'
,
'direct'
)
...
@@ -419,12 +420,13 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, in_bits, weight_bits,
...
@@ -419,12 +420,13 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, in_bits, weight_bits,
conv
=
tvm
.
compute
(
ovshape
,
_conv
,
name
=
'conv'
)
conv
=
tvm
.
compute
(
ovshape
,
_conv
,
name
=
'conv'
)
idxd
iv
=
tvm
.
indexdiv
idxd
=
tvm
.
indexdiv
idxm
od
=
tvm
.
indexmod
idxm
=
tvm
.
indexmod
return
tvm
.
compute
(
return
tvm
.
compute
(
oshape
,
lambda
n
,
h
,
w
,
co
:
oshape
,
lambda
n
,
h
,
w
,
co
:
conv
[
n
][
idxdiv
(
h
,
VH
)][
idxdiv
(
w
,
VW
)][
idxdiv
(
conv
[
n
,
co
,
VC
)][
idxmod
(
h
,
VH
)][
idxmod
(
w
,
VW
)][
idxmod
(
co
,
VC
)],
idxd
(
h
,
VH
),
idxd
(
w
,
VW
),
idxd
(
co
,
VC
),
idxm
(
h
,
VH
),
idxm
(
w
,
VW
),
idxm
(
co
,
VC
)],
name
=
'output_unpack'
,
tag
=
'spatial_bitserial_conv_nhwc'
)
name
=
'output_unpack'
,
tag
=
'spatial_bitserial_conv_nhwc'
)
@tvm.target.generic_func
@tvm.target.generic_func
...
...
topi/python/topi/nn/sparse.py
View file @
f98035b0
...
@@ -94,12 +94,15 @@ def _sparse_dense_bsrmm(data, weight_data, weight_indices, weight_indptr):
...
@@ -94,12 +94,15 @@ def _sparse_dense_bsrmm(data, weight_data, weight_indices, weight_indptr):
x_val
=
data
[
i
,
bs_c
*
block_j
+
c
]
x_val
=
data
[
i
,
bs_c
*
block_j
+
c
]
return
tvm
.
sum
(
block_ij_val
*
x_val
,
axis
=
[
elem_idx
,
c
])
return
tvm
.
sum
(
block_ij_val
*
x_val
,
axis
=
[
elem_idx
,
c
])
idxd
=
tvm
.
indexdiv
idxm
=
tvm
.
indexmod
bsrmm_block
=
tvm
.
compute
(
bsrmm_block
=
tvm
.
compute
(
(
m
,
num_blocks
,
bs_r
),
_compute_block
,
(
m
,
num_blocks
,
bs_r
),
_compute_block
,
tag
=
"sparse_dense_bsrmm_block"
)
tag
=
"sparse_dense_bsrmm_block"
)
return
tvm
.
compute
(
return
tvm
.
compute
(
(
m
,
num_blocks
*
bs_r
),
(
m
,
num_blocks
*
bs_r
),
lambda
m
,
n
:
bsrmm_block
[
m
,
n
//
bs_r
,
n
%
bs_r
],
lambda
m
,
n
:
bsrmm_block
[
m
,
idxd
(
n
,
bs_r
),
idxm
(
n
,
bs_r
)
],
tag
=
"sparse_dense_bsrmm"
)
tag
=
"sparse_dense_bsrmm"
)
@tvm.target.generic_func
@tvm.target.generic_func
...
...
topi/python/topi/util.py
View file @
f98035b0
...
@@ -232,10 +232,12 @@ def unravel_index(idx, shape):
...
@@ -232,10 +232,12 @@ def unravel_index(idx, shape):
indices : tuple of int or tvm.expr.IntImm
indices : tuple of int or tvm.expr.IntImm
Corresponding coordinate of the 1D index
Corresponding coordinate of the 1D index
"""
"""
idxd
=
tvm
.
indexdiv
idxm
=
tvm
.
indexmod
indices
=
[]
indices
=
[]
for
i
in
range
(
len
(
shape
)
-
1
,
-
1
,
-
1
):
for
i
in
range
(
len
(
shape
)
-
1
,
-
1
,
-
1
):
indices
.
append
(
idx
%
shape
[
i
]
)
indices
.
append
(
idx
m
(
idx
,
shape
[
i
])
)
idx
=
idx
//
shape
[
i
]
idx
=
idx
d
(
idx
,
shape
[
i
])
indices
=
indices
[::
-
1
]
indices
=
indices
[::
-
1
]
return
indices
return
indices
...
@@ -257,12 +259,13 @@ def const_matrix(matrix, name="const_matrix"):
...
@@ -257,12 +259,13 @@ def const_matrix(matrix, name="const_matrix"):
"""
"""
row
,
col
=
matrix
.
shape
row
,
col
=
matrix
.
shape
dtype
=
str
(
matrix
.
dtype
)
dtype
=
str
(
matrix
.
dtype
)
idxm
=
tvm
.
indexmod
def
select_array
(
i
,
j
):
def
select_array
(
i
,
j
):
now
=
tvm
.
const
(
0.0
,
dtype
)
now
=
tvm
.
const
(
0.0
,
dtype
)
for
ii
in
range
(
row
):
for
ii
in
range
(
row
):
for
jj
in
range
(
col
):
for
jj
in
range
(
col
):
now
=
tvm
.
expr
.
Select
(
tvm
.
all
(
i
%
row
==
ii
,
j
%
col
==
jj
),
now
=
tvm
.
expr
.
Select
(
tvm
.
all
(
i
dxm
(
i
,
row
)
==
ii
,
idxm
(
j
,
col
)
==
jj
),
tvm
.
const
(
matrix
[
ii
][
jj
],
dtype
),
tvm
.
const
(
matrix
[
ii
][
jj
],
dtype
),
now
)
now
)
return
now
return
now
...
...
topi/python/topi/vision/ssd/multibox.py
View file @
f98035b0
...
@@ -73,10 +73,10 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets):
...
@@ -73,10 +73,10 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets):
center_w
=
(
j
+
offset_w
)
*
steps_w
center_w
=
(
j
+
offset_w
)
*
steps_w
for
k
in
const_range
(
num_sizes
+
num_ratios
-
1
):
for
k
in
const_range
(
num_sizes
+
num_ratios
-
1
):
if
k
<
num_sizes
:
if
k
<
num_sizes
:
w
=
sizes
[
k
]
*
in_height
/
in_width
/
2.0
w
=
float32
(
sizes
[
k
]
*
in_height
)
/
in_width
/
2.0
h
=
sizes
[
k
]
/
2.0
h
=
sizes
[
k
]
/
2.0
else
:
else
:
w
=
sizes
[
0
]
*
in_height
/
in_width
\
w
=
float32
(
sizes
[
0
]
*
in_height
)
/
in_width
\
*
sqrt
(
ratios
[
k
-
num_sizes
+
1
]
*
1.0
)
/
2.0
*
sqrt
(
ratios
[
k
-
num_sizes
+
1
]
*
1.0
)
/
2.0
h
=
sizes
[
0
]
/
sqrt
(
ratios
[
k
-
num_sizes
+
1
]
*
1.0
)
/
2.0
h
=
sizes
[
0
]
/
sqrt
(
ratios
[
k
-
num_sizes
+
1
]
*
1.0
)
/
2.0
count
=
i
*
in_width
*
(
num_sizes
+
num_ratios
-
1
)
\
count
=
i
*
in_width
*
(
num_sizes
+
num_ratios
-
1
)
\
...
...
topi/python/topi/x86/conv2d_avx_1x1.py
View file @
f98035b0
...
@@ -309,8 +309,15 @@ def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, o
...
@@ -309,8 +309,15 @@ def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, o
# packing the Filter to let memory access be consecutive for AVX512 intrinsic
# packing the Filter to let memory access be consecutive for AVX512 intrinsic
# Done in pre-compute stage
# Done in pre-compute stage
packw_shape
=
(
kernel_h
,
kernel_w
,
num_filter
/
16
,
16
*
(
channel
/
4
),
4
)
idxd
=
tvm
.
indexdiv
PackW
=
tvm
.
compute
(
packw_shape
,
lambda
a
,
b
,
c
,
d
,
e
:
Filter
[
a
][
b
][
c
*
16
+
d
%
16
][
d
/
16
*
4
+
e
],
idxm
=
tvm
.
indexmod
packw_shape
=
(
kernel_h
,
kernel_w
,
idxd
(
num_filter
,
16
),
16
*
idxd
(
channel
,
4
),
4
)
PackW
=
tvm
.
compute
(
packw_shape
,
lambda
a
,
b
,
c
,
d
,
e
:
Filter
[
a
,
b
,
c
*
16
+
idxm
(
d
,
16
),
idxd
(
d
,
16
)
*
4
+
e
],
name
=
"packed_filter"
)
name
=
"packed_filter"
)
rc
=
tvm
.
reduce_axis
((
0
,
in_channel
),
name
=
'rc'
)
rc
=
tvm
.
reduce_axis
((
0
,
in_channel
),
name
=
'rc'
)
...
@@ -321,7 +328,9 @@ def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, o
...
@@ -321,7 +328,9 @@ def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, o
lambda
nn
,
yy
,
xx
,
ff
:
tvm
.
sum
(
lambda
nn
,
yy
,
xx
,
ff
:
tvm
.
sum
(
PaddedInput
[
nn
,
yy
*
stride_h
+
ry
*
dilation_h
,
PaddedInput
[
nn
,
yy
*
stride_h
+
ry
*
dilation_h
,
xx
*
stride_w
+
rx
*
dilation_w
,
rc
]
.
astype
(
out_dtype
)
*
xx
*
stride_w
+
rx
*
dilation_w
,
rc
]
.
astype
(
out_dtype
)
*
PackW
[
ry
,
rx
,
ff
/
16
,
(
rc
/
4
)
*
16
+
ff
%
16
,
rc
%
4
]
.
astype
(
out_dtype
),
axis
=
[
ry
,
rx
,
rc
]),
PackW
[
ry
,
rx
,
idxd
(
ff
,
16
),
idxd
(
rc
,
4
)
*
16
+
idxm
(
ff
,
16
),
idxm
(
rc
,
4
)]
.
astype
(
out_dtype
),
axis
=
[
ry
,
rx
,
rc
]),
name
=
"Conv2d_1x1_Output_int8"
,
tag
=
"conv2d_nhwc_pack_int8"
)
name
=
"Conv2d_1x1_Output_int8"
,
tag
=
"conv2d_nhwc_pack_int8"
)
return
Output
return
Output
...
...
tutorials/optimize/opt_gemm.py
View file @
f98035b0
...
@@ -247,7 +247,7 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
...
@@ -247,7 +247,7 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
# We have to re-write the algorithm slightly.
# We have to re-write the algorithm slightly.
packedB
=
tvm
.
compute
((
N
/
bn
,
K
,
bn
),
lambda
x
,
y
,
z
:
B
[
y
,
x
*
bn
+
z
],
name
=
'packedB'
)
packedB
=
tvm
.
compute
((
N
/
bn
,
K
,
bn
),
lambda
x
,
y
,
z
:
B
[
y
,
x
*
bn
+
z
],
name
=
'packedB'
)
C
=
tvm
.
compute
((
M
,
N
),
C
=
tvm
.
compute
((
M
,
N
),
lambda
x
,
y
:
tvm
.
sum
(
A
[
x
,
k
]
*
packedB
[
y
/
bn
,
k
,
y
%
bn
],
axis
=
k
),
lambda
x
,
y
:
tvm
.
sum
(
A
[
x
,
k
]
*
packedB
[
y
/
/
bn
,
k
,
tvm
.
indexmod
(
y
,
bn
)
],
axis
=
k
),
name
=
'C'
)
name
=
'C'
)
s
=
tvm
.
create_schedule
(
C
.
op
)
s
=
tvm
.
create_schedule
(
C
.
op
)
...
...
vta/python/vta/ir_pass.py
View file @
f98035b0
...
@@ -335,6 +335,9 @@ def inject_dma_intrin(stmt_in):
...
@@ -335,6 +335,9 @@ def inject_dma_intrin(stmt_in):
Transformed statement
Transformed statement
"""
"""
env
=
get_env
()
env
=
get_env
()
idxd
=
tvm
.
indexdiv
idxm
=
tvm
.
indexmod
def
_check_compact
(
buf
):
def
_check_compact
(
buf
):
ndim
=
len
(
buf
.
shape
)
ndim
=
len
(
buf
.
shape
)
size
=
tvm
.
const
(
1
,
buf
.
shape
[
0
]
.
dtype
)
size
=
tvm
.
const
(
1
,
buf
.
shape
[
0
]
.
dtype
)
...
@@ -369,7 +372,7 @@ def inject_dma_intrin(stmt_in):
...
@@ -369,7 +372,7 @@ def inject_dma_intrin(stmt_in):
x_size
=
1
x_size
=
1
x_stride
=
buf
.
strides
[
ndim
-
base
]
x_stride
=
buf
.
strides
[
ndim
-
base
]
next_base
=
base
next_base
=
base
if
not
util
.
equal_const_int
(
x_stride
%
elem_block
,
0
):
if
not
util
.
equal_const_int
(
idxm
(
x_stride
,
elem_block
)
,
0
):
raise
RuntimeError
(
raise
RuntimeError
(
"scope
%
s need to have block=
%
d, shape=
%
s, strides=
%
s"
%
(
"scope
%
s need to have block=
%
d, shape=
%
s, strides=
%
s"
%
(
scope
,
elem_block
,
buf
.
shape
,
buf
.
strides
))
scope
,
elem_block
,
buf
.
shape
,
buf
.
strides
))
...
@@ -394,7 +397,7 @@ def inject_dma_intrin(stmt_in):
...
@@ -394,7 +397,7 @@ def inject_dma_intrin(stmt_in):
raise
RuntimeError
(
"Expect buffer type to be
%
s instead of
%
s"
%
raise
RuntimeError
(
"Expect buffer type to be
%
s instead of
%
s"
%
(
dtype
,
buf
.
dtype
))
(
dtype
,
buf
.
dtype
))
shape
,
strides
=
buf
.
shape
,
buf
.
strides
shape
,
strides
=
buf
.
shape
,
buf
.
strides
if
not
util
.
equal_const_int
(
buf
.
elem_offset
%
elem_block
,
0
):
if
not
util
.
equal_const_int
(
idxm
(
buf
.
elem_offset
,
elem_block
)
,
0
):
raise
RuntimeError
(
"scope
%
s need to have block=
%
d"
%
(
scope
,
elem_block
))
raise
RuntimeError
(
"scope
%
s need to have block=
%
d"
%
(
scope
,
elem_block
))
if
allow_fold
:
if
allow_fold
:
shape
,
strides
=
_fold_buffer_dim
(
buf
,
scope
,
elem_block
)
shape
,
strides
=
_fold_buffer_dim
(
buf
,
scope
,
elem_block
)
...
@@ -421,7 +424,7 @@ def inject_dma_intrin(stmt_in):
...
@@ -421,7 +424,7 @@ def inject_dma_intrin(stmt_in):
x_size
=
1
x_size
=
1
x_stride
=
1
x_stride
=
1
y_size
=
1
y_size
=
1
return
x_size
,
y_size
,
x_stride
,
buf
.
elem_offset
/
elem_block
return
x_size
,
y_size
,
x_stride
,
idxd
(
buf
.
elem_offset
,
elem_block
)
if
not
util
.
equal_const_int
(
strides
[
-
2
]
-
elem_block
,
0
):
if
not
util
.
equal_const_int
(
strides
[
-
2
]
-
elem_block
,
0
):
raise_error
()
raise_error
()
...
@@ -429,15 +432,15 @@ def inject_dma_intrin(stmt_in):
...
@@ -429,15 +432,15 @@ def inject_dma_intrin(stmt_in):
x_size
=
shape
[
-
2
]
x_size
=
shape
[
-
2
]
x_stride
=
shape
[
-
2
]
x_stride
=
shape
[
-
2
]
y_size
=
1
y_size
=
1
return
x_size
,
y_size
,
x_stride
,
buf
.
elem_offset
/
elem_block
return
x_size
,
y_size
,
x_stride
,
idxd
(
buf
.
elem_offset
,
elem_block
)
if
not
util
.
equal_const_int
(
strides
[
-
3
]
%
elem_block
,
0
):
if
not
util
.
equal_const_int
(
idxm
(
strides
[
-
3
],
elem_block
)
,
0
):
raise_error
()
raise_error
()
if
ndim
==
3
:
if
ndim
==
3
:
x_size
=
shape
[
-
2
]
x_size
=
shape
[
-
2
]
x_stride
=
strides
[
-
3
]
/
elem_block
x_stride
=
idxd
(
strides
[
-
3
],
elem_block
)
y_size
=
shape
[
-
3
]
y_size
=
shape
[
-
3
]
return
x_size
,
y_size
,
x_stride
,
buf
.
elem_offset
/
elem_block
return
x_size
,
y_size
,
x_stride
,
idxd
(
buf
.
elem_offset
,
elem_block
)
else
:
else
:
if
not
util
.
equal_const_int
(
strides
[
-
1
],
1
):
if
not
util
.
equal_const_int
(
strides
[
-
1
],
1
):
...
@@ -451,7 +454,7 @@ def inject_dma_intrin(stmt_in):
...
@@ -451,7 +454,7 @@ def inject_dma_intrin(stmt_in):
x_size
=
1
x_size
=
1
x_stride
=
1
x_stride
=
1
y_size
=
1
y_size
=
1
return
x_size
,
y_size
,
x_stride
,
buf
.
elem_offset
/
elem_block
return
x_size
,
y_size
,
x_stride
,
idxd
(
buf
.
elem_offset
,
elem_block
)
if
not
util
.
equal_const_int
(
strides
[
-
3
],
elem_block
):
if
not
util
.
equal_const_int
(
strides
[
-
3
],
elem_block
):
raise_error
()
raise_error
()
...
@@ -459,15 +462,15 @@ def inject_dma_intrin(stmt_in):
...
@@ -459,15 +462,15 @@ def inject_dma_intrin(stmt_in):
x_size
=
shape
[
-
3
]
x_size
=
shape
[
-
3
]
x_stride
=
shape
[
-
3
]
x_stride
=
shape
[
-
3
]
y_size
=
1
y_size
=
1
return
x_size
,
y_size
,
x_stride
,
buf
.
elem_offset
/
elem_block
return
x_size
,
y_size
,
x_stride
,
idxd
(
buf
.
elem_offset
,
elem_block
)
if
not
util
.
equal_const_int
(
strides
[
-
4
]
%
elem_block
,
0
):
if
not
util
.
equal_const_int
(
idxm
(
strides
[
-
4
],
elem_block
)
,
0
):
raise_error
()
raise_error
()
if
ndim
==
4
:
if
ndim
==
4
:
x_size
=
shape
[
-
3
]
x_size
=
shape
[
-
3
]
x_stride
=
strides
[
-
4
]
/
elem_block
x_stride
=
idxd
(
strides
[
-
4
],
elem_block
)
y_size
=
shape
[
-
4
]
y_size
=
shape
[
-
4
]
return
x_size
,
y_size
,
x_stride
,
buf
.
elem_offset
/
elem_block
return
x_size
,
y_size
,
x_stride
,
idxd
(
buf
.
elem_offset
,
elem_block
)
raise_error
()
raise_error
()
...
@@ -765,6 +768,8 @@ def inject_alu_intrin(stmt_in):
...
@@ -765,6 +768,8 @@ def inject_alu_intrin(stmt_in):
Transformed statement
Transformed statement
"""
"""
env
=
get_env
()
env
=
get_env
()
idxm
=
tvm
.
indexmod
def
_do_fold
(
stmt
):
def
_do_fold
(
stmt
):
def
_equal
(
x
,
y
):
def
_equal
(
x
,
y
):
return
tvm
.
ir_pass
.
Equal
(
tvm
.
ir_pass
.
Simplify
(
x
-
y
),
0
)
return
tvm
.
ir_pass
.
Equal
(
tvm
.
ir_pass
.
Simplify
(
x
-
y
),
0
)
...
@@ -910,10 +915,10 @@ def inject_alu_intrin(stmt_in):
...
@@ -910,10 +915,10 @@ def inject_alu_intrin(stmt_in):
assert
len
(
extents
)
!=
0
assert
len
(
extents
)
!=
0
assert
tvm
.
ir_pass
.
Equal
(
assert
tvm
.
ir_pass
.
Equal
(
tvm
.
ir_pass
.
Simplify
(
tvm
.
ir_pass
.
Simplify
(
src_coeff
[
-
1
]
%
(
env
.
BATCH
*
env
.
BLOCK_OUT
)),
0
)
idxm
(
src_coeff
[
-
1
],
env
.
BATCH
*
env
.
BLOCK_OUT
)),
0
)
assert
tvm
.
ir_pass
.
Equal
(
assert
tvm
.
ir_pass
.
Equal
(
tvm
.
ir_pass
.
Simplify
(
tvm
.
ir_pass
.
Simplify
(
dst_coeff
[
-
1
]
%
(
env
.
BATCH
*
env
.
BLOCK_OUT
)),
0
)
idxm
(
dst_coeff
[
-
1
],
env
.
BATCH
*
env
.
BLOCK_OUT
)),
0
)
assert
tvm
.
ir_pass
.
Equal
(
src_coeff
[
-
2
],
1
)
assert
tvm
.
ir_pass
.
Equal
(
src_coeff
[
-
2
],
1
)
assert
tvm
.
ir_pass
.
Equal
(
dst_coeff
[
-
2
],
1
)
assert
tvm
.
ir_pass
.
Equal
(
dst_coeff
[
-
2
],
1
)
if
env
.
BATCH
>
1
:
if
env
.
BATCH
>
1
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment