Commit 47e57be4 by Zhi, committed by Tianqi Chen

support of multiple devices for tvm.build (#1773)

parent bea0b00f
...@@ -379,92 +379,32 @@ def lower(sch, ...@@ -379,92 +379,32 @@ def lower(sch,
return stmt return stmt
return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func) return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func)
def build(sch,
args=None, def _build_for_device(flist, target, target_host):
target=None, """Build the lowered functions for a device with the given compilation
target_host=None, target.
name="default_function",
binds=None,
postpone_host_codegen=False):
"""Build a function with arguments as signature. Code will be generated
for a device specified by the target. For homogeneous execution, a module
that contains both host and device code is returned. For heterogeneous
execution, a list of lowered functions for the host and a module containing
device code are returned, but actual code generation for the host module is
postponed after code generation is finished for all devices.
Parameters Parameters
---------- ----------
sch : tvm.Schedule, or LoweredFunc flist : list of LoweredFunc
The schedule to be builded The schedule to be built.
args : list of Buffer or Tensor or Var, optional
The argument lists to the function.
target : str or :any:`tvm.target.Target`, optional target : str or :any:`tvm.target.Target`
The target and option of the compilation. The target and option of the compilation.
target_host : str or :any:`tvm.target.Target` optional target_host : str or :any:`tvm.target.Target`
Host compilation target, if target is device. The host compilation target.
When TVM compiles device specific program such as CUDA,
we also need host(CPU) side code to interact with the driver
setup the dimensions and parameters correctly.
target_host is used to specify the host side codegen target.
By default, llvm is used if it is enabled,
otherwise a stackvm interpreter is used.
name : str, optional
The name of result function.
binds : dict, optional
Dictionary that maps the binding of symbolic buffer to Tensor.
By default, a new buffer is created for each tensor in the argument.
postpone_host_codegen : bool, optional
A bool value that indicates if code generation for the host module
should be postponed. This variable is set to be true for heterogeneous
execution. Otherwise, it is defaulted to false.
Returns Returns
------- -------
ret : tvm.module, or (list of LoweredFunc, tvm.module) tuple fhost : list of LoweredFunc
A module that combines both host and device code is returned when A list of lowered functions for the host.
postpone_host_codegen is not set. Otherwise, a list of lowered
functions for the host and a module contains only device code are
returned.
Note mdev : tvm.module
---- A module that contains device code.
See the note on :any:`tvm.target` on target string format.
""" """
if isinstance(sch, schedule.Schedule): target = _target.create(target)
if args is None:
raise ValueError("args must be given for build from schedule")
flist = lower(sch, args,
name=name,
binds=binds)
if isinstance(flist, container.LoweredFunc):
flist = [flist]
elif isinstance(sch, container.LoweredFunc):
if args:
raise ValueError("args must be done when build from LoweredFunc")
flist = [sch]
elif isinstance(sch, (list, tuple, container.Array)):
flist = sch
else:
raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc")
fname_set = set()
for x in flist:
if not isinstance(x, container.LoweredFunc):
raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc")
if x.name in fname_set:
raise ValueError("Duplicate function name %s" % x.name)
fname_set.add(x.name)
target = _target.current_target() if target is None else target
target = _target.create(target) if target else _target.create("llvm")
device_type = ndarray.context(target.target_name, 0).device_type device_type = ndarray.context(target.target_name, 0).device_type
fhost = [] fhost = []
fdevice = [] fdevice = []
for func in flist: for func in flist:
...@@ -496,31 +436,162 @@ def build(sch, ...@@ -496,31 +436,162 @@ def build(sch,
if "gpu" in target.keys and not fdevice: if "gpu" in target.keys and not fdevice:
warnings.warn( warnings.warn(
"Specified target %s, but cannot find device code, did you do bind?" % target) "Specified target %s, but cannot find device code, did you do "
"bind?" % target)
fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost] fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost]
fhost = [ir_pass.LowerTVMBuiltin(x) for x in fhost] fhost = [ir_pass.LowerTVMBuiltin(x) for x in fhost]
if not target_host: if device_type == ndarray.cpu(0).device_type and target_host == target:
if device_type == ndarray.cpu(0).device_type: assert not fdevice
target_host = target
assert not fdevice
else:
target_host = "llvm" if module.enabled("llvm") else "stackvm"
target_host = _target.create(target_host) target_host = _target.create(target_host)
target_device = target fdevice = [ir_pass.LowerIntrin(x, target.target_name) for x in fdevice]
fdevice = [ir_pass.LowerIntrin(x, target_device.target_name) for x in fdevice]
fhost = [ir_pass.LowerIntrin(x, target_host.target_name) for x in fhost] fhost = [ir_pass.LowerIntrin(x, target_host.target_name) for x in fhost]
fhost = [ir_pass.CombineContextCall(x) for x in fhost] fhost = [ir_pass.CombineContextCall(x) for x in fhost]
mdev = codegen.build_module(fdevice, str(target)) if fdevice else None
return fhost, mdev
def build(inputs,
          args=None,
          target=None,
          target_host=None,
          name="default_function",
          binds=None):
    """Build a function with arguments as signature. Code will be generated
    for devices coupled with target information.

    Parameters
    ----------
    inputs : tvm.Schedule, LoweredFunc, or dict of target to LoweredFunc list
        The schedule to be built.

    args : list of Buffer or Tensor or Var, optional
        The argument lists to the function.

    target : str or :any:`tvm.target.Target`, optional
        The target and option of the compilation.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host (CPU) side code to interact with the driver
        to set up the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    name : str, optional
        The name of the result function.

    binds : dict, optional
        Dictionary that maps the binding of symbolic buffer to Tensor.
        By default, a new buffer is created for each tensor in the argument.

    Returns
    -------
    ret : tvm.module
        A module that combines both host and device code.

    Examples
    --------
    There are two typical example uses of this function depending on the type
    of the argument `inputs`:

    1. it is a list of lowered functions:

    .. code-block:: python

        n = 2
        A = tvm.placeholder((n,), name='A')
        B = tvm.placeholder((n,), name='B')
        C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
        s = tvm.create_schedule(C.op)
        f = tvm.lower(s, [A, B, C], name="test_add")
        m = tvm.build(f, target="llvm")

    2. it is a dict of compilation target to list of lowered functions:

    .. code-block:: python

        n = 2
        A = tvm.placeholder((n,), name='A')
        B = tvm.placeholder((n,), name='B')
        C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
        s1 = tvm.create_schedule(C.op)
        s2 = topi.cpp.cuda.schedule_injective("cuda", [C])
        f1 = tvm.lower(s1, [A, B, C], name="test_add1")
        f2 = tvm.lower(s2, [A, B, C], name="test_add2")
        m = tvm.build({"llvm": [f1], "cuda": [f2]}, target_host="llvm")

    Note
    ----
    See the note on :any:`tvm.target` on target string format.
    """
    # Normalize every accepted input form (Schedule, single LoweredFunc,
    # list of LoweredFunc, or target->functions dict) into flist/target_flist.
    if isinstance(inputs, schedule.Schedule):
        if args is None:
            raise ValueError("args must be given for build from schedule")
        flist = lower(inputs, args,
                      name=name,
                      binds=binds)
        if isinstance(flist, container.LoweredFunc):
            flist = [flist]
    elif isinstance(inputs, container.LoweredFunc):
        if args:
            raise ValueError("args must be done when build from LoweredFunc.")
        flist = [inputs]
    elif isinstance(inputs, (list, tuple, container.Array)):
        flist = inputs
    elif not isinstance(inputs, (dict, container.Map)):
        raise ValueError("inputs must be Schedule, LoweredFunc, list of "
                         "LoweredFunc, or dict of target to list of "
                         "LoweredFunc.")

    # A non-dict input targets a single device; fall back to the current
    # target context, then to "llvm", when no target is given explicitly.
    if not isinstance(inputs, (dict, container.Map)):
        target = _target.current_target() if target is None else target
        target = target if target else "llvm"
        target_flist = {target: flist}
    else:
        target_flist = inputs

    # Validate the dict keys and reject duplicate function names per target.
    for tar, flist in target_flist.items():
        if not isinstance(tar, (str, _target.Target)):
            raise ValueError("The key of inputs must be str or "
                             "_target.Target when inputs is dict.")
        fname_set = set()
        for x in flist:
            if not isinstance(x, container.LoweredFunc):
                raise ValueError("inputs must be Schedule, LoweredFunc, list "
                                 "of LoweredFunc, or dict of str to list of "
                                 "LoweredFunc.")
            if x.name in fname_set:
                raise ValueError("Duplicate function name %s" % x.name)
            fname_set.add(x.name)

    # Pick a host target: prefer a CPU target among the inputs, otherwise
    # llvm if enabled, otherwise the stackvm interpreter.
    if not target_host:
        for tar, _ in target_flist.items():
            tar = _target.create(tar)
            device_type = ndarray.context(tar.target_name, 0).device_type
            if device_type == ndarray.cpu(0).device_type:
                target_host = tar
                break
    if not target_host:
        target_host = "llvm" if module.enabled("llvm") else "stackvm"

    # Build each device and collect its host-side functions and device module.
    fhost_all = []
    device_modules = []
    for tar, flist in target_flist.items():
        fhost, mdev = _build_for_device(flist, tar, target_host)
        # Save the current lowered functions of the host and the device module.
        fhost_all += fhost
        device_modules.append(mdev)

    # Generate a unified host module.
    mhost = codegen.build_module(fhost_all, str(target_host))
    # Import all modules.
    for mdev in device_modules:
        if mdev:
            mhost.import_module(mdev)
    return mhost
...@@ -124,9 +124,6 @@ def test_simplex_data_transferring(): ...@@ -124,9 +124,6 @@ def test_simplex_data_transferring():
schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add]) schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add])
lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add], lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add],
name="elemwise_add") name="elemwise_add")
host_funcs_add, lib_add = tvm.build(lower_add, target=target_device,
name="elemwise_add",
postpone_host_codegen=True)
# Insert copy. Neither compute nor schedule is required for the copy # Insert copy. Neither compute nor schedule is required for the copy
# node. The compute will be performed at runtime which is just data # node. The compute will be performed at runtime which is just data
...@@ -142,16 +139,8 @@ def test_simplex_data_transferring(): ...@@ -142,16 +139,8 @@ def test_simplex_data_transferring():
elemwise_sub], elemwise_sub],
name="elemwise_sub") name="elemwise_sub")
host_funcs_sub, lib_sub = tvm.build(lower_sub, target=target_host, target_flist = {target_device: [lower_add], target_host: [lower_sub]}
name="elemwise_sub", mhost = tvm.build(target_flist, target_host=target_host)
postpone_host_codegen=True)
host_funcs = host_funcs_add + host_funcs_sub
mhost = tvm.codegen.build_module(host_funcs, target_host)
if lib_add:
mhost.import_module(lib_add)
if lib_sub:
mhost.import_module(lib_sub)
ctx = [host_ctx, device_ctx] ctx = [host_ctx, device_ctx]
mod = graph_runtime.create(graph, mhost, ctx) mod = graph_runtime.create(graph, mhost, ctx)
params = {} params = {}
...@@ -338,10 +327,6 @@ def test_duplex_data_transferring(): ...@@ -338,10 +327,6 @@ def test_duplex_data_transferring():
lower_add1 = tvm.lower( lower_add1 = tvm.lower(
add_schedule1, [tensor_d, copy_sub_add, elemwise_add1], add_schedule1, [tensor_d, copy_sub_add, elemwise_add1],
name="elemwise_add1") name="elemwise_add1")
host_funcs_add, lib_add = tvm.build([lower_add0, lower_add1],
target=target_device,
postpone_host_codegen=True)
# Create module for sub whose target is the host. # Create module for sub whose target is the host.
tensor_c = tvm.placeholder(shape, name="C") tensor_c = tvm.placeholder(shape, name="C")
elemwise_sub = tvm.compute(shape, lambda *i: copy_add_sub(*i) elemwise_sub = tvm.compute(shape, lambda *i: copy_add_sub(*i)
...@@ -350,15 +335,10 @@ def test_duplex_data_transferring(): ...@@ -350,15 +335,10 @@ def test_duplex_data_transferring():
lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c, lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c,
elemwise_sub], elemwise_sub],
name="elemwise_sub") name="elemwise_sub")
host_funcs_sub, lib_sub = tvm.build(lower_sub, target=target_host,
postpone_host_codegen=True)
host_funcs = host_funcs_add + host_funcs_sub
mhost = tvm.codegen.build_module(host_funcs, target_host)
if lib_add:
mhost.import_module(lib_add)
if lib_sub:
mhost.import_module(lib_sub)
target_flist = {target_device: [lower_add0, lower_add1], target_host:
[lower_sub]}
mhost = tvm.build(target_flist, target_host=target_host)
ctx = [host_ctx, device_ctx] ctx = [host_ctx, device_ctx]
params = {} params = {}
params["A"] = tensor_a = np.random.uniform( params["A"] = tensor_a = np.random.uniform(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment