Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
tic
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
wenyuanbo
tic
Commits
a279dd0e
Commit
a279dd0e
authored
Aug 01, 2019
by
Jian Weng
Committed by
Tianqi Chen
Aug 01, 2019
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add shuffle support to TVM (#3633)
parent
9ae01e0b
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
141 additions
and
5 deletions
+141
-5
include/tvm/ir_functor_ext.h
+1
-0
include/tvm/ir_visitor.h
+1
-0
src/codegen/build_common.h
+1
-0
src/codegen/codegen_c.cc
+4
-0
src/codegen/codegen_c.h
+1
-0
src/codegen/codegen_cuda.cc
+20
-3
src/codegen/codegen_cuda.h
+1
-0
src/codegen/llvm/codegen_llvm.cc
+27
-2
src/codegen/llvm/codegen_llvm.h
+1
-0
src/pass/ir_visitor.cc
+8
-0
tests/python/unittest/test_codegen_cuda.py
+44
-0
tests/python/unittest/test_codegen_llvm.py
+32
-0
No files found.
include/tvm/ir_functor_ext.h
View file @
a279dd0e
...
...
@@ -199,6 +199,7 @@ class ExprFunctor<R(const Expr& n, Args...)> {
IR_EXPR_FUNCTOR_DISPATCH
(
Not
);
IR_EXPR_FUNCTOR_DISPATCH
(
Select
);
IR_EXPR_FUNCTOR_DISPATCH
(
Ramp
);
IR_EXPR_FUNCTOR_DISPATCH
(
Shuffle
);
IR_EXPR_FUNCTOR_DISPATCH
(
Broadcast
);
IR_EXPR_FUNCTOR_DISPATCH
(
IntImm
);
IR_EXPR_FUNCTOR_DISPATCH
(
UIntImm
);
...
...
include/tvm/ir_visitor.h
View file @
a279dd0e
...
...
@@ -131,6 +131,7 @@ class TVM_DLL IRVisitor {
virtual
void
Visit_
(
const
Not
*
op
);
virtual
void
Visit_
(
const
Select
*
op
);
virtual
void
Visit_
(
const
Ramp
*
op
);
virtual
void
Visit_
(
const
Shuffle
*
op
);
virtual
void
Visit_
(
const
Broadcast
*
op
);
virtual
void
Visit_
(
const
AssertStmt
*
op
);
virtual
void
Visit_
(
const
ProducerConsumer
*
op
);
...
...
src/codegen/build_common.h
View file @
a279dd0e
...
...
@@ -26,6 +26,7 @@
#define TVM_CODEGEN_BUILD_COMMON_H_
#include <tvm/codegen.h>
#include <tvm/ir.h>
#include <unordered_map>
#include <string>
#include "../runtime/meta_data.h"
...
...
src/codegen/codegen_c.cc
View file @
a279dd0e
...
...
@@ -728,6 +728,10 @@ void CodeGenC::VisitExpr_(const Ramp* op, std::ostream& os) { // NOLINT(*)
os
<<
"))"
;
}
// Shuffle has no generic lowering in plain C, so the base C codegen rejects
// it outright. Backends that can express shuffles (CUDA, LLVM) override this.
void CodeGenC::VisitExpr_(const Shuffle* op, std::ostream& os) {
  LOG(FATAL) << "Shuffle: not supported ";
}
// Broadcast likewise has no generic C lowering; vector-capable backends
// override this with a real implementation.
void CodeGenC::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
  LOG(FATAL) << "Broadcast: not supported ";
}
...
...
src/codegen/codegen_c.h
View file @
a279dd0e
...
...
@@ -126,6 +126,7 @@ class CodeGenC :
void
VisitExpr_
(
const
Not
*
op
,
std
::
ostream
&
os
)
override
;
// NOLINT(*)
void
VisitExpr_
(
const
Select
*
op
,
std
::
ostream
&
os
)
override
;
// NOLINT(*)
void
VisitExpr_
(
const
Ramp
*
op
,
std
::
ostream
&
os
)
override
;
// NOLINT(*)
void
VisitExpr_
(
const
Shuffle
*
op
,
std
::
ostream
&
os
)
override
;
// NOLINT(*)
void
VisitExpr_
(
const
Broadcast
*
op
,
std
::
ostream
&
os
)
override
;
// NOLINT(*)
void
VisitExpr_
(
const
IntImm
*
op
,
std
::
ostream
&
os
)
override
;
// NOLINT(*)
void
VisitExpr_
(
const
UIntImm
*
op
,
std
::
ostream
&
os
)
override
;
// NOLINT(*)
...
...
src/codegen/codegen_cuda.cc
View file @
a279dd0e
...
...
@@ -205,7 +205,7 @@ void CodeGenCUDA::PrintVecBinaryOp(
// Print the access of one lane of a CUDA short vector, e.g. "v.y" for i == 1.
// CUDA vector types expose at most four components (.x/.y/.z/.w), hence the
// CHECK below.  NOTE(review): the scraped diff carried both the pre-commit
// `const` and post-commit `static const` declarations of `access`; only the
// post-commit `static const` table (computed once, shared by all calls) is kept.
void CodeGenCUDA::PrintVecElemLoad(
    const std::string& vec, Type t, int i, std::ostream& os) {  // NOLINT(*)
  static const char access[] = {'x', 'y', 'z', 'w'};
  CHECK(i >= 0 && i < 4);
  os << vec << "." << access[i];
}
...
...
@@ -213,7 +213,7 @@ void CodeGenCUDA::PrintVecElemLoad(
// Emit an assignment into one lane of a CUDA short vector:
//   "<vec>.<x|y|z|w> = <value>;\n"
// Only 4-lane-or-fewer vectors are representable, hence the CHECK.
// NOTE(review): diff residue duplicated the `access` declaration (old `const`
// plus new `static const`); the post-commit `static const` form is kept.
void CodeGenCUDA::PrintVecElemStore(
    const std::string& vec, Type t, int i, const std::string& value) {
  this->PrintIndent();
  static const char access[] = {'x', 'y', 'z', 'w'};
  CHECK(i >= 0 && i < 4);
  stream << vec << "." << access[i] << " = " << value << ";\n";
}
...
...
@@ -308,7 +308,7 @@ void CodeGenCUDA::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLIN
std
::
string
v
=
PrintExpr
(
op
->
value
);
os
<<
"make_"
;
PrintType
(
op
->
type
,
os
);
os
<<
"("
;
os
<<
'('
;
for
(
int
i
=
0
;
i
<
op
->
lanes
;
++
i
)
{
if
(
i
!=
0
)
os
<<
", "
;
os
<<
v
;
...
...
@@ -316,6 +316,23 @@ void CodeGenCUDA::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLIN
os
<<
')'
;
}
// Lower a Shuffle by printing each scalar source expression once and then
// emitting a make_<vectype>(...) constructor that lists the sources in the
// order given by op->indices.  CUDA has no general shuffle instruction at
// the C level, so only shuffles of scalar sources are supported.
// Changes vs. original: C-style casts replaced with static_cast, the
// size_t -> int loop-bound narrowing made explicit, and a message added to
// the index-range CHECK.
void CodeGenCUDA::VisitExpr_(const Shuffle* op, std::ostream& os) {
  std::vector<std::string> to_shuffle(op->vectors.size());
  for (int i = 0, e = static_cast<int>(op->vectors.size()); i < e; ++i) {
    CHECK(op->vectors[i].type().lanes() == 1) << "Only scalars can be shuffled in CUDA!";
    to_shuffle[i] = PrintExpr(op->vectors[i]);
  }
  os << "make_";
  PrintType(op->type, os);
  os << '(';
  for (int i = 0, e = static_cast<int>(op->indices.size()); i < e; ++i) {
    // Indices must be compile-time constants in range of the source list.
    const int64_t* val = as_const_int(op->indices[i]);
    CHECK(val && *val >= 0 && *val < static_cast<int64_t>(to_shuffle.size()))
        << "Shuffle index must be a constant int within range!";
    if (i != 0) os << ", ";
    os << to_shuffle[*val];
  }
  os << ')';
}
inline
void
PrintConst
(
const
FloatImm
*
op
,
std
::
ostream
&
os
,
CodeGenCUDA
*
p
)
{
// NOLINT(*)
switch
(
op
->
type
.
bits
())
{
...
...
src/codegen/codegen_cuda.h
View file @
a279dd0e
...
...
@@ -57,6 +57,7 @@ class CodeGenCUDA final : public CodeGenC {
void
BindThreadIndex
(
const
IterVar
&
iv
)
final
;
// NOLINT(*)
// overload visitor
void
VisitExpr_
(
const
Ramp
*
op
,
std
::
ostream
&
os
)
final
;
// NOLINT(*)
void
VisitExpr_
(
const
Shuffle
*
op
,
std
::
ostream
&
os
)
final
;
// NOLINT(*)
void
VisitExpr_
(
const
Broadcast
*
op
,
std
::
ostream
&
os
)
final
;
// NOLINT(*)
void
VisitExpr_
(
const
FloatImm
*
op
,
std
::
ostream
&
os
)
final
;
void
VisitStmt_
(
const
Evaluate
*
op
)
final
;
...
...
src/codegen/llvm/codegen_llvm.cc
View file @
a279dd0e
...
...
@@ -30,6 +30,7 @@
#include "codegen_llvm.h"
#include "codegen_cpu.h"
#include "../build_common.h"
#include "../../pass/ir_util.h"
#include "../../arithmetic/compute_expr.h"
...
...
@@ -446,6 +447,7 @@ llvm::Value* CodeGenLLVM::CreateBroadcast(llvm::Value* value, int lanes) {
llvm
::
Value
*
CodeGenLLVM
::
CreateVecSlice
(
llvm
::
Value
*
vec
,
int
begin
,
int
extent
)
{
int
num_elems
=
static_cast
<
int
>
(
vec
->
getType
()
->
getVectorNumElements
());
if
(
extent
==
num_elems
&&
begin
==
0
)
return
vec
;
CHECK
(
begin
>=
0
&&
extent
<=
num_elems
)
<<
"Slicing out of bound!
\n
"
;
std
::
vector
<
llvm
::
Constant
*>
indices
;
indices
.
reserve
(
extent
);
for
(
int
i
=
0
;
i
<
extent
;
++
i
)
{
...
...
@@ -481,6 +483,7 @@ llvm::Value* CodeGenLLVM::CreateVecPad(llvm::Value* vec, int target_lanes) {
llvm
::
Value
*
CodeGenLLVM
::
CreateVecConcat
(
std
::
vector
<
llvm
::
Value
*>
vecs
)
{
// concat vector, tree shape reduction
int
total_lanes
=
0
;
for
(
llvm
::
Value
*
v
:
vecs
)
{
total_lanes
+=
static_cast
<
int
>
(
v
->
getType
()
->
getVectorNumElements
());
...
...
@@ -652,12 +655,14 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) {
CHECK_GE
(
op
->
args
.
size
(),
2U
);
llvm
::
Intrinsic
::
ID
id
=
static_cast
<
llvm
::
Intrinsic
::
ID
>
(
op
->
args
[
0
].
as
<
UIntImm
>
()
->
value
);
uint64_t
num_signature
=
op
->
args
[
1
].
as
<
UIntImm
>
()
->
value
;
const
uint64_t
*
num_signature
=
as_const_uint
(
op
->
args
[
1
]);
CHECK
(
num_signature
)
<<
"The second argument should be a uint represents number of arguments, "
<<
"but "
<<
op
->
args
[
1
]
<<
" got!
\n
"
;
std
::
vector
<
llvm
::
Value
*>
arg_value
;
std
::
vector
<
llvm
::
Type
*>
sig_type
;
for
(
size_t
i
=
2
;
i
<
op
->
args
.
size
();
++
i
)
{
arg_value
.
push_back
(
MakeValue
(
op
->
args
[
i
]));
if
(
i
-
2
<
num_signature
)
{
if
(
i
-
2
<
*
num_signature
)
{
sig_type
.
push_back
(
arg_value
.
back
()
->
getType
());
}
}
...
...
@@ -1002,6 +1007,26 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Ramp* op) {
return
vec
;
}
// Lower a Shuffle to LLVM IR: concatenate all source vectors into one value,
// then issue a single shufflevector with the (constant) index mask.
// Changes vs. original: fixed the typo'd error message ("Shuffled indeces are
// suppose to be int ... got!") and made the int64 -> uint32 index narrowing
// an explicit static_cast (safe: the CHECK bounds *val to [0, total_lanes)).
llvm::Value* CodeGenLLVM::VisitExpr_(const Shuffle* op) {
  std::vector<llvm::Value*> vecs(op->vectors.size());
  int total_lanes = 0;
  for (int i = 0, e = static_cast<int>(op->vectors.size()); i < e; ++i) {
    vecs[i] = VisitExpr(op->vectors[i]);
    total_lanes += op->vectors[i].type().lanes();
  }
  // Concatenate every source into one wide vector to shuffle out of.
  llvm::Value* v0 = CreateVecConcat(vecs);
  std::vector<uint32_t> idx(op->indices.size());
  for (int i = 0, e = static_cast<int>(op->indices.size()); i < e; ++i) {
    const int64_t* val = as_const_int(op->indices[i]);
    CHECK(val && *val >= 0 && *val < total_lanes)
        << "Shuffle indices are supposed to be constant ints, "
        << "but got " << op->indices[i] << "\n";
    idx[i] = static_cast<uint32_t>(*val);
  }
  llvm::Value* mask = llvm::ConstantDataVector::get(builder_->getContext(), idx);
  // Second operand is unused: every mask index points into v0.
  auto res = builder_->CreateShuffleVector(
      v0, llvm::UndefValue::get(v0->getType()), mask);
  return res;
}
// Lower a Broadcast: evaluate the scalar once, then splat it across
// op->lanes vector lanes via CreateBroadcast.
llvm::Value* CodeGenLLVM::VisitExpr_(const Broadcast* op) {
  return CreateBroadcast(MakeValue(op->value), op->lanes);
}
...
...
src/codegen/llvm/codegen_llvm.h
View file @
a279dd0e
...
...
@@ -131,6 +131,7 @@ class CodeGenLLVM :
llvm
::
Value
*
VisitExpr_
(
const
Load
*
op
)
override
;
llvm
::
Value
*
VisitExpr_
(
const
Call
*
op
)
override
;
llvm
::
Value
*
VisitExpr_
(
const
Ramp
*
op
)
override
;
llvm
::
Value
*
VisitExpr_
(
const
Shuffle
*
op
)
override
;
llvm
::
Value
*
VisitExpr_
(
const
Broadcast
*
op
)
override
;
// stmt
void
VisitStmt_
(
const
Store
*
op
)
override
;
...
...
src/pass/ir_visitor.cc
View file @
a279dd0e
...
...
@@ -177,6 +177,13 @@ void IRVisitor::Visit_(const Ramp *op) {
this
->
Visit
(
op
->
stride
);
}
// Visit a Shuffle node: recurse into every index expression, then into
// every source vector, so derived visitors see all child expressions.
void IRVisitor::Visit_(const Shuffle *op) {
  for (const auto &elem : op->indices)
    this->Visit(elem);
  for (const auto &elem : op->vectors)
    this->Visit(elem);
}
// Visit a Broadcast node: its only child is the scalar value being splatted.
void IRVisitor::Visit_(const Broadcast *op) {
  this->Visit(op->value);
}
...
...
@@ -269,6 +276,7 @@ TVM_STATIC_IR_FUNCTOR(IRVisitor, vtable)
.
DISPATCH_TO_VISIT
(
Not
)
.
DISPATCH_TO_VISIT
(
Select
)
.
DISPATCH_TO_VISIT
(
Ramp
)
.
DISPATCH_TO_VISIT
(
Shuffle
)
.
DISPATCH_TO_VISIT
(
Broadcast
)
.
DISPATCH_TO_VISIT
(
AssertStmt
)
.
DISPATCH_TO_VISIT
(
ProducerConsumer
)
...
...
tests/python/unittest/test_codegen_cuda.py
View file @
a279dd0e
...
...
@@ -154,9 +154,53 @@ def test_cuda_inf_nan():
check_inf_nan
(
ctx
,
1
,
float
(
'nan'
),
'float64'
)
def test_cuda_shuffle():
    """End-to-end test of Shuffle lowering through the CUDA backend:
    computes c[x] = a[x] + b[x with its position inside each 4-lane
    group reversed], where the reversed read of b is expressed via an
    explicit tvm.make.Shuffle injected by a custom lower pass."""
    # Requires a physical GPU and the CUDA runtime to be enabled.
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    a = tvm.placeholder((64, ), 'int32')
    b = tvm.placeholder((64, ), 'int32')
    # x - (x % 4) + (3 - x % 4) reverses x's offset within its 4-element group.
    c = tvm.compute((64, ), lambda x: a[x] + b[x - (x % 4) + (3 - x % 4)])
    sch = tvm.create_schedule(c.op)
    x = c.op.axis[0]
    xo, xi = sch[c].split(x, 4)
    thrx = tvm.thread_axis("threadIdx.x")
    sch[c].bind(xo, thrx)
    sch[c].vectorize(xi)
    def my_vectorize(stmt):
        # Custom lower pass: rewrite the vectorized inner loop into a single
        # 4-lane vector store whose b operand is built with a Shuffle.
        def vectorizer(op):
            if op.for_type == tvm.stmt.For.Vectorized:
                four = tvm.const(4, 'int32')
                idx = tvm.make.Ramp(thrx.var * four, tvm.const(1, 'int32'), 4)
                all_ones = tvm.const(1, 'int32x4')
                store = op.body
                value = store.value
                new_a = tvm.make.Load('int32x4', value.a.buffer_var, idx, all_ones)
                bs, ids = [], []
                for i in range(4):
                    # Load the four scalars of b for this thread's group...
                    bs.append(tvm.make.Load('int32', value.b.buffer_var, thrx.var * four + tvm.const(i, 'int32')))
                    # ...and select them back in reverse order (3,2,1,0).
                    ids.append(tvm.const(3 - i, 'int32'))
                new_b = tvm.make.Shuffle(bs, ids)
                return tvm.make.Store(store.buffer_var, new_a + new_b, idx, all_ones)
            # Leave non-vectorized loops untouched.
            return None
        return tvm.ir_pass.IRTransform(stmt, None, vectorizer, ['For'])
    with tvm.build_config(add_lower_pass=[(1, my_vectorize)]):
        module = tvm.build(sch, [a, b, c], target='cuda')
        a_ = np.array(list(range(64)), dtype='int32')
        # b_ holds each 4-block reversed, so the shuffled read recovers
        # the ascending 0,1,2,3 pattern repeated 16 times (see ref).
        b_ = np.array((list(range(4))[::-1]) * 16, dtype='int32')
        c_ = np.zeros((64, ), dtype='int32')
        ref = a_ + np.array((list(range(4))) * 16, dtype='int32')
        nda, ndb, ndc = [tvm.ndarray.array(i, tvm.gpu(0)) for i in [a_, b_, c_]]
        module(nda, ndb, ndc)
        tvm.testing.assert_allclose(ndc.asnumpy(), ref)
if
__name__
==
"__main__"
:
test_cuda_vectorize_add
()
test_cuda_multiply_add
()
test_cuda_vectorize_load
()
test_cuda_make_int8x4
()
test_cuda_inf_nan
()
test_cuda_shuffle
()
tests/python/unittest/test_codegen_llvm.py
View file @
a279dd0e
...
...
@@ -548,6 +548,37 @@ def test_dwarf_debug_information():
check_llvm_object
()
check_llvm_ir
()
def test_llvm_shuffle():
    """End-to-end test of Shuffle lowering through the LLVM backend:
    computes c[x] = a[x] + b[7 - x], where b is read via an explicitly
    shuffled (reversed) 8-lane vector load injected by a lower pass.

    Change vs. original: removed the dead ``value = new_a + new_b``
    reassignment — the Store below recomputed ``new_a + new_b`` itself,
    so the rebound local was never read.
    """
    a = tvm.placeholder((8, ), 'int32')
    b = tvm.placeholder((8, ), 'int32')
    c = tvm.compute((8, ), lambda x: a[x] + b[7-x])
    sch = tvm.create_schedule(c.op)
    def my_vectorize(stmt):
        # Lower pass: replace the loop body with one 8-lane vector store.
        def vectorizer(op):
            store = op.body
            idx = tvm.make.Ramp(tvm.const(0, 'int32'), tvm.const(1, 'int32'), 8)
            all_ones = tvm.const(1, 'int32x8')
            value = store.value
            # Reverse the ramp indices so new_b reads b back-to-front.
            b_idx = tvm.make.Shuffle([idx], [tvm.const(i, 'int32') for i in range(7, -1, -1)])
            new_a = tvm.make.Load('int32x8', value.a.buffer_var, idx, all_ones)
            new_b = tvm.make.Load('int32x8', value.b.buffer_var, b_idx, all_ones)
            return tvm.make.Store(store.buffer_var, new_a + new_b, idx, all_ones)
        return tvm.ir_pass.IRTransform(stmt, None, vectorizer, ['For'])
    with tvm.build_config(add_lower_pass=[(1, my_vectorize)]):
        ir = tvm.lower(sch, [a, b, c], simple_mode=True)
        module = tvm.build(sch, [a, b, c])
        a_ = tvm.ndarray.array(np.arange(1, 9, dtype='int32'))
        b_ = tvm.ndarray.array(np.arange(8, 0, -1, dtype='int32'))
        c_ = tvm.ndarray.array(np.zeros((8, ), dtype='int32'))
        module(a_, b_, c_)
        # a = [1..8] and b = [8..1], so b[7-x] == a[x] and c == 2 * a.
        tvm.testing.assert_allclose(c_.asnumpy(), (a_.asnumpy() * 2).astype('int32'))
if
__name__
==
"__main__"
:
test_llvm_import
()
test_alignment
()
...
...
@@ -567,3 +598,4 @@ if __name__ == "__main__":
test_llvm_div
()
test_llvm_fp_math
()
test_dwarf_debug_information
()
test_llvm_shuffle
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment