Commit 16694815 by Lianmin Zheng, committed by Tianqi Chen

[TOPI] add schedule for ARM Mali GPU (#786)

* add schedule for ARM Mali GPU

* fix lint

* fix lint
parent 8d263e37
Subproject commit 674a662c22b900b76e8a3c9b77987a2c5563ba71
Subproject commit c0871823b518093a0d04d6cba0a3291bc7b31401
@@ -264,6 +264,19 @@ def rasp(options=None):
    return Target("llvm", opts)

def mali(options=None):
    """Returns an ARM Mali GPU target.

    Parameters
    ----------
    options : list of str
        Additional options
    """
    opts = ["-device=mali"]
    opts = _merge_opts(opts, options)
    return Target("opencl", opts)
def create(target_str):
    """Get a target given target string.
...
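A minimal usage sketch (mine, not part of the commit), assuming the TVM Python API of this period, where both tvm.target.mali() and tvm.target.create() return a Target:

# hypothetical example: two equivalent ways to get the Mali target
import tvm

target = tvm.target.mali()
target = tvm.target.create("opencl -device=mali")
print(target)  # expected to print something like "opencl -device=mali"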
@@ -17,6 +17,7 @@ from . import nn
from . import x86
from . import cuda
from . import rasp
from . import mali
from . import testing
from . import util
from . import rocm
# pylint: disable=redefined-builtin, wildcard-import
"""ARM Mali GPU specific declaration and schedules."""
from __future__ import absolute_import as _abs
from .conv2d import *
from .depthwise_conv2d import *
from .dense import *
# pylint: disable=invalid-name,unused-variable
"""dense schedule on ARM Mali GPU"""
from __future__ import absolute_import as _abs

import tvm

from .. import generic
from .. import util
from .. import tag


@generic.schedule_dense.register(["mali"])
def schedule_dense(outs):
    """Schedule for dense operator.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of dense
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for dense.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(dense):
        data = s[dense].op.input_tensors[0]
        weight = s[dense].op.input_tensors[1]

        hidden = util.get_const_int(weight.shape[1])
        out = util.get_const_int(weight.shape[0])

        # set tunable parameter
        tune_config = getattr(tvm.target.current_target(), "tune_config", None)
        if tune_config is None:
            if hidden > 8192:
                num_thread = 32
                unroll_step = 32
            else:
                if out <= 1024:
                    num_thread = 32
                    unroll_step = 16
                else:
                    num_thread = 256
                    unroll_step = 32

            if data.dtype == 'float16':
                if hidden > 8192:
                    num_thread = 2
                    unroll_step = 32
                else:
                    num_thread = 8
                    unroll_step = 256
        else:
            num_thread = tune_config['num_thread']
            unroll_step = tune_config['unroll_step']

        def fuse_and_bind(s, tensor, axis=None, num_thread=None):
            """ fuse all the axis and bind to GPU threads """
            axis = axis or s[tensor].op.axis
            fused = s[tensor].fuse(*axis)
            max_threads = tvm.target.current_target(allow_none=False).max_num_threads
            bx, tx = s[tensor].split(fused, num_thread or max_threads)
            s[tensor].bind(bx, tvm.thread_axis("blockIdx.x"))
            s[tensor].bind(tx, tvm.thread_axis("threadIdx.x"))
            return bx, tx

        output = outs[0]
        bx, tx = fuse_and_bind(s, output, num_thread=num_thread)

        k = s[dense].op.reduce_axis[0]
        k, k_unroll = s[dense].split(k, unroll_step)
        s[dense].unroll(k_unroll)

        if dense.op not in s.outputs:
            s[dense].compute_at(s[output], tx)

        # bias = s[outs[0]].op.input_tensors[1]
        # print(tvm.lower(s, [data, weight, bias, outs[0]], simple_mode=True))

    def traverse(OP):
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(OP.tag):
            if OP not in s.outputs:
                s[OP].compute_inline()
            for tensor in OP.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)
        # schedule dense
        elif OP.tag == 'dense':
            dense = OP.output(0)
            _schedule(dense)
        else:
            raise RuntimeError("Unsupported operator: %s" % OP.tag)

    traverse(outs[0].op)
    return s
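The sketch below (not from the commit; shapes and variable names are illustrative) shows how this schedule would be picked up through the generic dispatcher once the Mali target is active, assuming the TOPI/TVM API of this era:

# hypothetical dense workload built for the Mali target
import tvm
import topi

data = tvm.placeholder((1, 1024), name='data')
weight = tvm.placeholder((1000, 1024), name='weight')
dense = topi.nn.dense(data, weight)

target = tvm.target.mali()
with target:
    # the register(["mali"]) decorator above routes this call to the Mali schedule
    s = topi.generic.schedule_dense(dense)
    func = tvm.build(s, [data, weight, dense], target=target, target_host='llvm')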
# pylint: disable=invalid-name,unused-variable,unused-argument
"""depthwise_conv2d schedule on ARM Mali GPU"""
from __future__ import absolute_import as _abs

import tvm

from .. import generic
from .. import util
from .. import tag


@generic.schedule_depthwise_conv2d_nchw.register(["mali"])
def schedule_depthwise_conv2d_nchw(outs):
    """Schedule for depthwise_conv2d nchw forward.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of depthwise_conv2d
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for depthwise_conv2d nchw.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(pad_data, kernel, conv):
        raw_data = s[pad_data].op.input_tensors[0]

        if conv.op not in s.outputs:  # has bias or relu
            output = outs[0]
        else:                         # no bias or relu
            output = conv

        def tile_and_bind3d(tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
            """ tile and bind 3d """
            y_factor = y_factor or z_factor
            x_factor = x_factor or y_factor
            zo, zi = s[tensor].split(z, z_factor)
            yo, yi = s[tensor].split(y, y_factor)
            xo, xi = s[tensor].split(x, x_factor)
            s[tensor].bind(zo, tvm.thread_axis("blockIdx.z"))
            s[tensor].bind(zi, tvm.thread_axis("threadIdx.z"))
            s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
            s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
            s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
            s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
            return zo, zi, yo, yi, xo, xi

        # set tunable parameters
        VH = 1
        VW = 1
        num_thread = 4
        while util.get_const_int(conv.shape[3]) % (VW * 2) == 0 and VW * 2 <= 4:
            VW = VW * 2
        while util.get_const_int(conv.shape[2]) % (VH * 2) == 0 and VH * 2 <= 2:
            VH = VH * 2
        if raw_data.dtype == 'float16':
            if util.get_const_int(conv.shape[3]) % (VW * 2) == 0:
                VW *= 2
                num_thread *= 2
            else:
                num_thread *= 2

        # schedule padding
        _, c, y, x = s[pad_data].op.axis
        tile_and_bind3d(pad_data, c, y, x, num_thread, 1, 1)

        # schedule conv
        di, dj = s[conv].op.reduce_axis
        s[conv].unroll(di)
        s[conv].unroll(dj)

        _, c, y, x = s[output].op.axis
        y, x, yi, xi = s[output].tile(y, x, VH, VW)
        s[output].unroll(yi)
        s[output].vectorize(xi)

        _, _, _, _, _, ji = tile_and_bind3d(output, c, y, x, num_thread, 1, 1)

        if conv.op not in s.outputs:
            _, c, y, x = s[conv].op.axis
            y, x, yi, xi = s[conv].tile(y, x, VH, VW)
            s[conv].unroll(yi)
            s[conv].vectorize(xi)
            s[conv].compute_at(s[output], ji)

    def traverse(op):
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)

        # schedule depthwise_conv2d
        if op.tag == 'depthwise_conv2d_nchw':
            pad_data = op.input_tensors[0]
            kernel = op.input_tensors[1]
            conv = op.output(0)
            _schedule(pad_data, kernel, conv)

    traverse(outs[0].op)
    return s
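Similarly, a hedged sketch (not part of the commit; shapes, stride, and padding are made up) of driving the depthwise schedule end to end, under the same API assumptions as above:

# hypothetical depthwise_conv2d workload built for the Mali target
import tvm
import topi

data = tvm.placeholder((1, 32, 56, 56), name='data')
kernel = tvm.placeholder((32, 1, 3, 3), name='kernel')
conv = topi.nn.depthwise_conv2d_nchw(data, kernel, 1, 1)  # stride=1, padding=1

target = tvm.target.mali()
with target:
    # dispatched to the Mali schedule registered above
    s = topi.generic.schedule_depthwise_conv2d_nchw(conv)
    func = tvm.build(s, [data, kernel, conv], target=target, target_host='llvm')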