Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
tic
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
wenyuanbo
tic
Commits
47e57be4
Commit
47e57be4
authored
Sep 29, 2018
by
Zhi
Committed by
Tianqi Chen
Sep 29, 2018
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
support of multiple devices for tvm.build (#1773)
parent
bea0b00f
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
161 additions
and
110 deletions
+161
-110
python/tvm/build_module.py
+156
-85
tests/python/unittest/test_runtime_heterogeneous.py
+5
-25
No files found.
python/tvm/build_module.py
View file @
47e57be4
...
...
@@ -379,24 +379,94 @@ def lower(sch,
return
stmt
return
ir_pass
.
MakeAPI
(
stmt
,
name
,
arg_list
,
0
,
cfg
.
restricted_func
)
def
build
(
sch
,
def _build_for_device(flist, target, target_host):
    """Lower a list of functions for one compilation target.

    The functions are partitioned into host-side and device-side lists,
    each list is run through the lowering passes below, and the device
    functions (if any) are compiled into a module. Host code generation is
    left to the caller.

    Parameters
    ----------
    flist : list of LoweredFunc
        The lowered functions to be built for this target.

    target : str or :any:`tvm.target.Target`
        The target and option of the compilation.

    target_host : str or :any:`tvm.target.Target`
        The host compilation target.

    Returns
    -------
    fhost : list of LoweredFunc
        A list of lowered functions for the host.

    mdev : tvm.module
        A module that contains device code, or None when no device
        functions were produced.

    Raises
    ------
    ValueError
        If memory verification fails for a function, or a function has an
        unrecognized ``func_type``.
    """
    target = _target.create(target)
    device_type = ndarray.context(target.target_name, 0).device_type

    host_funcs = []
    device_funcs = []
    for func in flist:
        if not ir_pass.VerifyMemory(func, device_type):
            raise ValueError(
                "Direct host side access to device memory is detected in %s. "
                "Did you forget to bind?" % func.name)

        kind = func.func_type
        if kind == container.LoweredFunc.MixedFunc:
            # The "global" sync pass is only applied when the active build
            # config requests it; "shared" and "warp" are always applied.
            if current_build_config().detect_global_barrier:
                func = ir_pass.ThreadSync(func, "global")
            for scope in ("shared", "warp"):
                func = ir_pass.ThreadSync(func, scope)
            func = ir_pass.LowerThreadAllreduce(func, target.thread_warp_size)
            # First split result goes to the host list, the rest to the
            # device list.
            pieces = list(ir_pass.SplitHostDevice(func))
            host_funcs.append(pieces[0])
            device_funcs.extend(pieces[1:])
        elif kind == container.LoweredFunc.HostFunc:
            host_funcs.append(func)
        elif kind == container.LoweredFunc.DeviceFunc:
            device_funcs.append(func)
        else:
            raise ValueError("unknown function type %d" % kind)

    device_funcs = [
        ir_pass.LowerWarpMemory(dev_func, target.thread_warp_size)
        for dev_func in device_funcs
    ]

    if not device_funcs and "gpu" in target.keys:
        warnings.warn(
            "Specified target %s, but cannot find device code, did you do "
            "bind?" % target)

    host_funcs = [
        ir_pass.LowerTVMBuiltin(ir_pass.BindDeviceType(host_func, device_type))
        for host_func in host_funcs
    ]

    # A CPU target that is also the host target must not yield device code.
    # The comparison is made before target_host is normalized below.
    is_cpu = device_type == ndarray.cpu(0).device_type
    if is_cpu and target_host == target:
        assert not device_funcs

    target_host = _target.create(target_host)
    device_funcs = [
        ir_pass.LowerIntrin(dev_func, target.target_name)
        for dev_func in device_funcs
    ]
    host_funcs = [
        ir_pass.CombineContextCall(
            ir_pass.LowerIntrin(host_func, target_host.target_name))
        for host_func in host_funcs
    ]

    if device_funcs:
        device_module = codegen.build_module(device_funcs, str(target))
    else:
        device_module = None

    return host_funcs, device_module
def
build
(
inputs
,
args
=
None
,
target
=
None
,
target_host
=
None
,
name
=
"default_function"
,
binds
=
None
,
postpone_host_codegen
=
False
):
binds
=
None
):
"""Build a function with arguments as signature. Code will be generated
for a device specified by the target. For homogeneous execution, a module
that contains both host and device code is returned. For heterogeneous
execution, a list of lowered functions for the host and a module containing
device code are returned, but actual code generation for the host module is
postponed after code generation is finished for all devices.
for devices coupled with target information.
Parameters
----------
sch : tvm.Schedule, or LoweredFunc
The schedule to be buil
ded
inputs : tvm.Schedule, LoweredFunc, or dict of target to LoweredFunc list
The schedule to be buil
t
args : list of Buffer or Tensor or Var, optional
The argument lists to the function.
...
...
@@ -420,107 +490,108 @@ def build(sch,
Dictionary that maps the binding of symbolic buffer to Tensor.
By default, a new buffer is created for each tensor in the argument.
postpone_host_codegen : bool, optional
A bool value that indicates if code generation for the host module
should be postponed. This variable is set to be true for heterogeneous
execution. Otherwise, it is defaulted to false.
Returns
-------
ret : tvm.module, or (list of LoweredFunc, tvm.module) tuple
A module that combines both host and device code is returned when
postpone_host_codegen is not set. Otherwise, a list of lowered
functions for the host and a module contains only device code are
returned.
ret : tvm.module
A module that combines both host and device code.
Examples
________
There are two typical example uses of this function depending on the type
of the argument `inputs`:
1. it is a list of lowered functions:
.. code-block:: python
n = 2
A = tvm.placeholder((n,), name='A')
B = tvm.placeholder((n,), name='B')
C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
s = tvm.create_schedule(C.op)
f = tvm.lower(s, [A, B, C], name="test_add")
m = tvm.build(f, target="llvm")
2. it is a dict of compilation target to list of lowered functions:
.. code-block:: python
n = 2
A = tvm.placeholder((n,), name='A')
B = tvm.placeholder((n,), name='B')
C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
s1 = tvm.create_schedule(C.op)
s2 = topi.cpp.cuda.schedule_injective("cuda", [C])
f1 = tvm.lower(s1, [A, B, C], name="test_add1")
f2 = tvm.lower(s2, [A, B, C], name="test_add2")
m = tvm.build({"llvm": [f1], "cuda": [f2]}, target_host="llvm")
Note
----
See the note on :any:`tvm.target` on target string format.
"""
if
isinstance
(
sch
,
schedule
.
Schedule
):
if
isinstance
(
inputs
,
schedule
.
Schedule
):
if
args
is
None
:
raise
ValueError
(
"args must be given for build from schedule"
)
flist
=
lower
(
sch
,
args
,
flist
=
lower
(
inputs
,
args
,
name
=
name
,
binds
=
binds
)
if
isinstance
(
flist
,
container
.
LoweredFunc
):
flist
=
[
flist
]
elif
isinstance
(
sch
,
container
.
LoweredFunc
):
elif
isinstance
(
inputs
,
container
.
LoweredFunc
):
if
args
:
raise
ValueError
(
"args must be done when build from LoweredFunc"
)
flist
=
[
sch
]
elif
isinstance
(
sch
,
(
list
,
tuple
,
container
.
Array
)):
flist
=
sch
raise
ValueError
(
"args must be done when build from LoweredFunc."
)
flist
=
[
inputs
]
elif
isinstance
(
inputs
,
(
list
,
tuple
,
container
.
Array
)):
flist
=
inputs
elif
not
isinstance
(
inputs
,
(
dict
,
container
.
Map
)):
raise
ValueError
(
"inputs must be Schedule, LoweredFunc, list of "
"LoweredFunc, or dict of target to list of "
"LoweredFunc."
)
if
not
isinstance
(
inputs
,
(
dict
,
container
.
Map
)):
target
=
_target
.
current_target
()
if
target
is
None
else
target
target
=
target
if
target
else
"llvm"
target_flist
=
{
target
:
flist
}
else
:
raise
ValueError
(
"sch have to be Schedule, LoweredFunc or list of LoweredFunc"
)
target_flist
=
inputs
for
tar
,
flist
in
target_flist
.
items
():
if
not
isinstance
(
tar
,
(
str
,
_target
.
Target
)):
raise
ValueError
(
"The key of inputs must be str or "
"_target.Target when inputs is dict."
)
fname_set
=
set
()
for
x
in
flist
:
if
not
isinstance
(
x
,
container
.
LoweredFunc
):
raise
ValueError
(
"sch have to be Schedule, LoweredFunc or list of LoweredFunc"
)
raise
ValueError
(
"inputs must be Schedule, LoweredFunc, list "
"of LoweredFunc, or dict of str to list of "
"LoweredFunc."
)
if
x
.
name
in
fname_set
:
raise
ValueError
(
"Duplicate function name
%
s"
%
x
.
name
)
fname_set
.
add
(
x
.
name
)
target
=
_target
.
current_target
()
if
target
is
None
else
target
target
=
_target
.
create
(
target
)
if
target
else
_target
.
create
(
"llvm"
)
device_type
=
ndarray
.
context
(
target
.
target_name
,
0
)
.
device_type
fhost
=
[]
fdevice
=
[]
for
func
in
flist
:
if
not
ir_pass
.
VerifyMemory
(
func
,
device_type
):
raise
ValueError
(
"Direct host side access to device memory is detected in
%
s. "
"Did you forget to bind?"
%
func
.
name
)
if
func
.
func_type
==
container
.
LoweredFunc
.
MixedFunc
:
if
current_build_config
()
.
detect_global_barrier
:
func
=
ir_pass
.
ThreadSync
(
func
,
"global"
)
func
=
ir_pass
.
ThreadSync
(
func
,
"shared"
)
func
=
ir_pass
.
ThreadSync
(
func
,
"warp"
)
warp_size
=
target
.
thread_warp_size
func
=
ir_pass
.
LowerThreadAllreduce
(
func
,
warp_size
)
fsplits
=
[
s
for
s
in
ir_pass
.
SplitHostDevice
(
func
)]
fhost
.
append
(
fsplits
[
0
])
for
x
in
fsplits
[
1
:]:
fdevice
.
append
(
x
)
elif
func
.
func_type
==
container
.
LoweredFunc
.
HostFunc
:
fhost
.
append
(
func
)
elif
func
.
func_type
==
container
.
LoweredFunc
.
DeviceFunc
:
fdevice
.
append
(
func
)
else
:
raise
ValueError
(
"unknown function type
%
d"
%
func
.
func_type
)
for
i
,
func
in
enumerate
(
fdevice
):
warp_size
=
target
.
thread_warp_size
fdevice
[
i
]
=
ir_pass
.
LowerWarpMemory
(
func
,
warp_size
)
if
"gpu"
in
target
.
keys
and
not
fdevice
:
warnings
.
warn
(
"Specified target
%
s, but cannot find device code, did you do bind?"
%
target
)
fhost
=
[
ir_pass
.
BindDeviceType
(
x
,
device_type
)
for
x
in
fhost
]
fhost
=
[
ir_pass
.
LowerTVMBuiltin
(
x
)
for
x
in
fhost
]
if
not
target_host
:
for
tar
,
_
in
target_flist
.
items
():
tar
=
_target
.
create
(
tar
)
device_type
=
ndarray
.
context
(
tar
.
target_name
,
0
)
.
device_type
if
device_type
==
ndarray
.
cpu
(
0
)
.
device_type
:
target_host
=
target
assert
not
fdevice
else
:
target_host
=
tar
break
if
not
target_host
:
target_host
=
"llvm"
if
module
.
enabled
(
"llvm"
)
else
"stackvm"
target_host
=
_target
.
create
(
target_host
)
target_device
=
target
fdevice
=
[
ir_pass
.
LowerIntrin
(
x
,
target_device
.
target_name
)
for
x
in
fdevice
]
fhost
=
[
ir_pass
.
LowerIntrin
(
x
,
target_host
.
target_name
)
for
x
in
fhost
]
fhost
=
[
ir_pass
.
CombineContextCall
(
x
)
for
x
in
fhost
]
# Append fhost to the device module and return the updated module. All
# device modules will be imported to the host module after all of them are
# collected.
mdev
=
codegen
.
build_module
(
fdevice
,
str
(
target_device
))
if
fdevice
else
None
if
postpone_host_codegen
:
return
fhost
,
mdev
fhost_all
=
[]
device_modules
=
[]
for
tar
,
flist
in
target_flist
.
items
():
fhost
,
mdev
=
_build_for_device
(
flist
,
tar
,
target_host
)
# Save the current lowered functions of the host and the device module.
fhost_all
+=
fhost
device_modules
.
append
(
mdev
)
# Generate a unified host module.
mhost
=
codegen
.
build_module
(
fhost_all
,
str
(
target_host
))
mhost
=
codegen
.
build_module
(
fhost
,
str
(
target_host
))
if
fdevice
:
# Import all modules.
for
mdev
in
device_modules
:
if
mdev
:
mhost
.
import_module
(
mdev
)
return
mhost
tests/python/unittest/test_runtime_heterogeneous.py
View file @
47e57be4
...
...
@@ -124,9 +124,6 @@ def test_simplex_data_transferring():
schedule_add
=
topi
.
cpp
.
cuda
.
schedule_injective
(
target
,
[
elemwise_add
])
lower_add
=
tvm
.
lower
(
schedule_add
,
[
tensor_a
,
tensor_b
,
elemwise_add
],
name
=
"elemwise_add"
)
host_funcs_add
,
lib_add
=
tvm
.
build
(
lower_add
,
target
=
target_device
,
name
=
"elemwise_add"
,
postpone_host_codegen
=
True
)
# Insert copy. Neither compute nor schedule is required for the copy
# node. The compute will be performed at runtime which is just data
...
...
@@ -142,16 +139,8 @@ def test_simplex_data_transferring():
elemwise_sub
],
name
=
"elemwise_sub"
)
host_funcs_sub
,
lib_sub
=
tvm
.
build
(
lower_sub
,
target
=
target_host
,
name
=
"elemwise_sub"
,
postpone_host_codegen
=
True
)
host_funcs
=
host_funcs_add
+
host_funcs_sub
mhost
=
tvm
.
codegen
.
build_module
(
host_funcs
,
target_host
)
if
lib_add
:
mhost
.
import_module
(
lib_add
)
if
lib_sub
:
mhost
.
import_module
(
lib_sub
)
target_flist
=
{
target_device
:
[
lower_add
],
target_host
:
[
lower_sub
]}
mhost
=
tvm
.
build
(
target_flist
,
target_host
=
target_host
)
ctx
=
[
host_ctx
,
device_ctx
]
mod
=
graph_runtime
.
create
(
graph
,
mhost
,
ctx
)
params
=
{}
...
...
@@ -338,10 +327,6 @@ def test_duplex_data_transferring():
lower_add1
=
tvm
.
lower
(
add_schedule1
,
[
tensor_d
,
copy_sub_add
,
elemwise_add1
],
name
=
"elemwise_add1"
)
host_funcs_add
,
lib_add
=
tvm
.
build
([
lower_add0
,
lower_add1
],
target
=
target_device
,
postpone_host_codegen
=
True
)
# Create module for sub whose target is the host.
tensor_c
=
tvm
.
placeholder
(
shape
,
name
=
"C"
)
elemwise_sub
=
tvm
.
compute
(
shape
,
lambda
*
i
:
copy_add_sub
(
*
i
)
...
...
@@ -350,15 +335,10 @@ def test_duplex_data_transferring():
lower_sub
=
tvm
.
lower
(
sub_schedule
,
[
copy_add_sub
,
tensor_c
,
elemwise_sub
],
name
=
"elemwise_sub"
)
host_funcs_sub
,
lib_sub
=
tvm
.
build
(
lower_sub
,
target
=
target_host
,
postpone_host_codegen
=
True
)
host_funcs
=
host_funcs_add
+
host_funcs_sub
mhost
=
tvm
.
codegen
.
build_module
(
host_funcs
,
target_host
)
if
lib_add
:
mhost
.
import_module
(
lib_add
)
if
lib_sub
:
mhost
.
import_module
(
lib_sub
)
target_flist
=
{
target_device
:
[
lower_add0
,
lower_add1
],
target_host
:
[
lower_sub
]}
mhost
=
tvm
.
build
(
target_flist
,
target_host
=
target_host
)
ctx
=
[
host_ctx
,
device_ctx
]
params
=
{}
params
[
"A"
]
=
tensor_a
=
np
.
random
.
uniform
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment