[TOPI] NCHWc added input shape 4 condition, intel graphics conv2d schedule debugged for inception_v3 workloads (#2265)

[TOPI] NCHWc added input shape 4 condition, intel graphics conv2d schedule debugged for inception_v3 workloads (#2265)

[TOPI] NCHWc added input shape 4 condition, intel graphics conv2d schedule debugged for inception_v3 workloads (#2265)
[TOPI] NCHWc added input shape 4 condition, intel graphics conv2d schedule debugged for inception_v3 workloads (#2265)
cdd7f37f · Leyuan Wang · Tianqi Chen · 43433fa1 · cdd7f37f · cdd7f37f
Commit cdd7f37f, authored Dec 14, 2018 by Leyuan Wang; committed by Tianqi Chen on Dec 14, 2018.
Hide whitespace changes
Inline Side-by-side

Showing with 34 additions and 58 deletions

nnvm/python/nnvm/top/nn.py
+6 -3

topi/python/topi/intel_graphics/conv2d.py
+28 -55

No files found.
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
-# pylint: disable=invalid-name, unused-argument
+# pylint: disable=invalid-name, unused-argument, missing-docstring, no-else-return
 """Definition of nn ops"""
 from __future__ import absolute_import
@@ -170,8 +170,11 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
    out_layout = attrs.get_string("out_layout")
    out_dtype = attrs.get_string("out_dtype")
    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
-    _, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape)
+    if layout == "NCHW":
-    in_channel = in_channel_chunk * in_channel_block
+        _, in_channel, _, _ = get_const_tuple(inputs[0].shape)
+    else:
+        _, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape)
+        in_channel = in_channel_chunk * in_channel_block
    assert dilation == (1, 1), "not support dilate now"
    if groups == 1:
        # pylint: disable=assignment-from-no-return

--- a/topi/python/topi/intel_graphics/conv2d.py
+++ b/topi/python/topi/intel_graphics/conv2d.py
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return, too-many-arguments, too-many-locals, too-many-statements, no-member, too-many-branches
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return, too-many-arguments, too-many-locals, too-many-statements, no-member, too-many-branches, too-many-boolean-expressions
 """conv2d schedule on Intel Graphics"""
 from __future__ import absolute_import as _abs
@@ -61,7 +61,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
    return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
 @conv2d_NCHWc.register(["intel_graphics"])
-def _decl_conv2d(data, kernel, stride, padding, layout, out_layout, out_dtype='float32'):
+def _decl_conv2d(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'):
    """Conv2D operator for Intel Graphics backend.
    Parameters
@@ -126,8 +126,7 @@ def schedule_conv2d_NCHWc(outs):
            for tensor in op.input_tensors:
                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                    traverse(tensor.op)
-        if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \
+        if 'conv2d' in op.tag:
-           or "1_16" in op.tag:
            _schedule_cl_spatialpack_NCHWc(s, op)
        scheduled_ops.append(op)
@@ -156,31 +155,30 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')
-    block_w = 0
+    block_w = 1
-    block_h = 0
+    block_h = 1
    if stride_h == 2:
        if num_filter + kernel_h == 515:
-            conv_tag = "4_4"
            block_h = 4
            block_w = 4
        else:
-            conv_tag = "4_5"
            block_h = 4
            block_w = 5
    elif kernel_h == 3:
        if num_filter == 512:
-            conv_tag = "2_7"
            block_h = 2
            block_w = 7
        else:
-            conv_tag = "2_14"
            block_h = 2
            block_w = 14
+    elif kernel_h == 7 and padding == 3 and stride == 1:
+        block_h = 3
+        block_w = 4
    else:
-        conv_tag = "1_16"
        block_h = 1
        block_w = 16
+    attrs = {'block_h': block_h, 'block_w' : block_w}
    c_h = out_height
    c_w = out_width
@@ -202,13 +200,13 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
          tvm.sum(
              temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
              kernel[ff, rc, ry, rx, vc].astype(out_dtype),
-              axis=[rc, ry, rx]), tag=conv_tag, name='conv')
+              axis=[rc, ry, rx]), name='conv', attrs=attrs)
    output = tvm.compute(
        oshape,
        lambda nn, ff, yy, xx:
        conv[nn][ff//nv][yy][xx][ff%nv],
-        name='output_unpack', tag=conv_tag)
+        name='output_unpack', tag='conv2d')
    return output
@@ -224,21 +222,10 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
    kernel_L = s.cache_read(kernel, "local", [conv_L])
    _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape]
-    if "1_16" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 1
+    attrs = s[conv].op.attrs
-        OUTPUT_BLOCK_WIDTH = 16
+    OUTPUT_BLOCK_HEIGHT = attrs['block_h']
-    elif "2_14" in s[conv].op.tag:
+    OUTPUT_BLOCK_WIDTH = attrs['block_w']
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 14
-    elif "2_7" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 7
-    elif "4_5" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 5
-    elif "4_4" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 4
    # schedule conv
    z_factor = 1
@@ -308,7 +295,7 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
 @conv2d.register(["intel_graphics"])
-def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
+def decl_conv2d(data, kernel, stride, padding, dilation, layout='NCHW', out_dtype='float32'):
    """Conv2D operator for Intel Graphics backend.
    Parameters
@@ -368,8 +355,7 @@ def schedule_conv2d_nchw(outs):
            for tensor in op.input_tensors:
                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                    traverse(tensor.op)
-        if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \
+        if 'conv2d' in op.tag:
-           or "1_16" in op.tag:
            _schedule_cl_spatialpack(s, op)
        scheduled_ops.append(op)
@@ -396,31 +382,30 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')
-    block_w = 0
+    block_w = 1
-    block_h = 0
+    block_h = 1
    if stride_h == 2:
        if num_filter + kernel_h == 515:
-            conv_tag = "4_4"
            block_h = 4
            block_w = 4
        else:
-            conv_tag = "4_5"
            block_h = 4
            block_w = 5
    elif kernel_h == 3:
        if num_filter == 512:
-            conv_tag = "2_7"
            block_h = 2
            block_w = 7
        else:
-            conv_tag = "2_14"
            block_h = 2
            block_w = 14
+    elif kernel_h == 7 and padding == 3 and stride == 1:
+        block_h = 3
+        block_w = 4
    else:
-        conv_tag = "1_16"
        block_h = 1
        block_w = 16
+    attrs = {'block_h': block_h, 'block_w' : block_w}
    c_h = out_height
    c_w = out_width
@@ -453,13 +438,13 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
          tvm.sum(
              temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
              kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype),
-              axis=[rc, ry, rx]), tag=conv_tag, name='conv')
+              axis=[rc, ry, rx]), name='conv', attrs=attrs)
    output = tvm.compute(
        oshape,
        lambda nn, ff, yy, xx:
        conv[nn][ff//nv][yy][xx][ff%nv],
-        name='output_unpack', tag=conv_tag)
+        name='output_unpack', tag='conv2d')
    return output
@@ -477,21 +462,9 @@ def _schedule_cl_spatialpack(s, op):
    kernel_L = s.cache_read(kernel_vec, "local", [conv_L])
    _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape]
-    if "1_16" in s[conv].op.tag:
+    attrs = s[conv].op.attrs
-        OUTPUT_BLOCK_HEIGHT = 1
+    OUTPUT_BLOCK_HEIGHT = attrs['block_h']
-        OUTPUT_BLOCK_WIDTH = 16
+    OUTPUT_BLOCK_WIDTH = attrs['block_w']
-    elif "2_14" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 14
-    elif "2_7" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 7
-    elif "4_5" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 5
-    elif "4_4" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 4
    # schedule conv
    z_factor = 1