Commit 16694815 by Lianmin Zheng, committed by Tianqi Chen

[TOPI] add schedule for ARM Mali GPU (#786)

* add schedule for ARM Mali GPU

* fix lint

* fix lint
parent 8d263e37
@@ -264,6 +264,19 @@ def rasp(options=None):
    return Target("llvm", opts)


def mali(options=None):
    """Returns an ARM Mali GPU target.

    Parameters
    ----------
    options : list of str
        Additional options
    """
    opts = ["-device=mali"]
    opts = _merge_opts(opts, options)
    return Target("opencl", opts)


def create(target_str):
    """Get a target given target string.
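For reference, a minimal usage sketch (not part of this diff) of the new target helper; it assumes the Target string form simply joins the target name with its options:

import tvm

# Construct the Mali target added above; equivalent to
# tvm.target.create("opencl -device=mali").
target = tvm.target.mali()
print(target)  # opencl -device=mali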
@@ -17,6 +17,7 @@ from . import nn
from . import x86
from . import cuda
from . import rasp
from . import mali
from . import testing
from . import util
from . import rocm
# pylint: disable=redefined-builtin, wildcard-import
"""ARM Mali GPU specific declaration and schedules."""
from __future__ import absolute_import as _abs
from .conv2d import *
from .depthwise_conv2d import *
from .dense import *
# pylint: disable=invalid-name,unused-variable
"""dense schedule on ARM Mali GPU"""
from __future__ import absolute_import as _abs

import tvm

from .. import generic
from .. import util
from .. import tag


@generic.schedule_dense.register(["mali"])
def schedule_dense(outs):
    """Schedule for dense operator.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of dense
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for dense.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(dense):
        data = s[dense].op.input_tensors[0]
        weight = s[dense].op.input_tensors[1]

        hidden = util.get_const_int(weight.shape[1])
        out = util.get_const_int(weight.shape[0])

        # set tunable parameters (thread count and reduction unroll step),
        # either from the target's tune_config or from layer-size heuristics
        tune_config = getattr(tvm.target.current_target(), "tune_config", None)
        if tune_config is None:
            if hidden > 8192:
                num_thread = 32
                unroll_step = 32
            else:
                if out <= 1024:
                    num_thread = 32
                    unroll_step = 16
                else:
                    num_thread = 256
                    unroll_step = 32

            if data.dtype == 'float16':
                if hidden > 8192:
                    num_thread = 2
                    unroll_step = 32
                else:
                    num_thread = 8
                    unroll_step = 256
        else:
            num_thread = tune_config['num_thread']
            unroll_step = tune_config['unroll_step']

        def fuse_and_bind(s, tensor, axis=None, num_thread=None):
            """ fuse all the axis and bind to GPU threads """
            axis = axis or s[tensor].op.axis
            fused = s[tensor].fuse(*axis)
            max_threads = tvm.target.current_target(allow_none=False).max_num_threads
            bx, tx = s[tensor].split(fused, num_thread or max_threads)
            s[tensor].bind(bx, tvm.thread_axis("blockIdx.x"))
            s[tensor].bind(tx, tvm.thread_axis("threadIdx.x"))
            return bx, tx

        # fuse all output axes and bind them to GPU blocks/threads
        output = outs[0]
        bx, tx = fuse_and_bind(s, output, num_thread=num_thread)

        # split the reduction axis so the inner chunk can be unrolled
        k = s[dense].op.reduce_axis[0]
        k, k_unroll = s[dense].split(k, unroll_step)
        s[dense].unroll(k_unroll)

        # if an epilogue (e.g. bias/relu) follows, compute dense inside the fused output
        if dense.op not in s.outputs:
            s[dense].compute_at(s[output], tx)

        # bias = s[outs[0]].op.input_tensors[1]
        # print(tvm.lower(s, [data, weight, bias, outs[0]], simple_mode=True))

    def traverse(OP):
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(OP.tag):
            if OP not in s.outputs:
                s[OP].compute_inline()
            for tensor in OP.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)
        # schedule dense
        elif OP.tag == 'dense':
            dense = OP.output(0)
            _schedule(dense)
        else:
            raise RuntimeError("Unsupported operator: %s" % OP.tag)

    traverse(outs[0].op)
    return s
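For context, a minimal sketch of how this schedule is reached through the generic dispatcher; the shapes, the topi.nn.dense call, and using the target as a context manager are illustrative assumptions rather than part of this commit:

import tvm
import topi

data = tvm.placeholder((1, 1024), name='data')
weight = tvm.placeholder((1000, 1024), name='weight')
D = topi.nn.dense(data, weight)

# Inside the Mali target scope, generic.schedule_dense dispatches to the
# "mali" registration above.
with tvm.target.mali():
    s = topi.generic.schedule_dense(D)

print(tvm.lower(s, [data, weight, D], simple_mode=True))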
# pylint: disable=invalid-name,unused-variable,unused-argument
"""depthwise_conv2d schedule on ARM Mali GPU"""
from __future__ import absolute_import as _abs

import tvm

from .. import generic
from .. import util
from .. import tag


@generic.schedule_depthwise_conv2d_nchw.register(["mali"])
def schedule_depthwise_conv2d_nchw(outs):
    """Schedule for depthwise_conv2d nchw forward.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of depthwise_conv2d
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for depthwise_conv2d nchw.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(pad_data, kernel, conv):
        raw_data = s[pad_data].op.input_tensors[0]

        if conv.op not in s.outputs:  # has bias or relu
            output = outs[0]
        else:                         # no bias or relu
            output = conv

        def tile_and_bind3d(tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
            """ tile and bind 3d """
            y_factor = y_factor or z_factor
            x_factor = x_factor or y_factor
            zo, zi = s[tensor].split(z, z_factor)
            yo, yi = s[tensor].split(y, y_factor)
            xo, xi = s[tensor].split(x, x_factor)
            s[tensor].bind(zo, tvm.thread_axis("blockIdx.z"))
            s[tensor].bind(zi, tvm.thread_axis("threadIdx.z"))
            s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
            s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
            s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
            s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
            return zo, zi, yo, yi, xo, xi

        # set tunable parameters: VH x VW is the spatial tile computed per thread,
        # grown while it still divides the output height/width
        VH = 1
        VW = 1
        num_thread = 4
        while util.get_const_int(conv.shape[3]) % (VW * 2) == 0 and VW * 2 <= 4:
            VW = VW * 2
        while util.get_const_int(conv.shape[2]) % (VH * 2) == 0 and VH * 2 <= 2:
            VH = VH * 2
        if raw_data.dtype == 'float16':
            # for fp16, widen the vector when possible, otherwise just use more threads
            if util.get_const_int(conv.shape[3]) % (VW * 2) == 0:
                VW *= 2
                num_thread *= 2
            else:
                num_thread *= 2

        # schedule padding
        _, c, y, x = s[pad_data].op.axis
        tile_and_bind3d(pad_data, c, y, x, num_thread, 1, 1)

        # schedule conv: fully unroll the small reduction (kernel window) axes
        di, dj = s[conv].op.reduce_axis
        s[conv].unroll(di)
        s[conv].unroll(dj)

        # tile the output spatially by (VH, VW), unroll the height tile and
        # vectorize the width tile, then bind the outer axes to GPU threads
        _, c, y, x = s[output].op.axis
        y, x, yi, xi = s[output].tile(y, x, VH, VW)
        s[output].unroll(yi)
        s[output].vectorize(xi)

        _, _, _, _, _, ji = tile_and_bind3d(output, c, y, x, num_thread, 1, 1)

        # when an epilogue (bias/relu) follows, tile conv the same way and
        # compute it inside the fused output
        if conv.op not in s.outputs:
            _, c, y, x = s[conv].op.axis
            y, x, yi, xi = s[conv].tile(y, x, VH, VW)
            s[conv].unroll(yi)
            s[conv].vectorize(xi)
            s[conv].compute_at(s[output], ji)

    def traverse(op):
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)
        # schedule depthwise_conv2d
        if op.tag == 'depthwise_conv2d_nchw':
            pad_data = op.input_tensors[0]
            kernel = op.input_tensors[1]
            conv = op.output(0)
            _schedule(pad_data, kernel, conv)

    traverse(outs[0].op)
    return s
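And a similar sketch for the depthwise schedule; the shapes and the topi.nn.depthwise_conv2d_nchw signature (stride/padding as plain integers) are assumptions for illustration, not part of this diff:

import tvm
import topi

data = tvm.placeholder((1, 32, 56, 56), name='data')
kernel = tvm.placeholder((32, 1, 3, 3), name='kernel')
conv = topi.nn.depthwise_conv2d_nchw(data, kernel, stride=1, padding=1)

# Dispatches to the "mali" registration above while the Mali target is active.
with tvm.target.mali():
    s = topi.generic.schedule_depthwise_conv2d_nchw(conv)

print(tvm.lower(s, [data, kernel, conv], simple_mode=True))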