Commit cdd7f37f, authored by Leyuan Wang and committed by Tianqi Chen

[TOPI] NCHWc added input shape 4 condition, intel graphics conv2d schedule…

[TOPI] NCHWc added input shape 4 condition, intel graphics conv2d schedule debugged for inception_v3 workloads (#2265)
parent 43433fa1
# pylint: disable=invalid-name, unused-argument # pylint: disable=invalid-name, unused-argument, missing-docstring, no-else-return
"""Definition of nn ops""" """Definition of nn ops"""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -170,8 +170,11 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _): ...@@ -170,8 +170,11 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
out_layout = attrs.get_string("out_layout") out_layout = attrs.get_string("out_layout")
out_dtype = attrs.get_string("out_dtype") out_dtype = attrs.get_string("out_dtype")
out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
_, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape) if layout == "NCHW":
in_channel = in_channel_chunk * in_channel_block _, in_channel, _, _ = get_const_tuple(inputs[0].shape)
else:
_, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape)
in_channel = in_channel_chunk * in_channel_block
assert dilation == (1, 1), "not support dilate now" assert dilation == (1, 1), "not support dilate now"
if groups == 1: if groups == 1:
# pylint: disable=assignment-from-no-return # pylint: disable=assignment-from-no-return
......
# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return, too-many-arguments, too-many-locals, too-many-statements, no-member, too-many-branches # pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return, too-many-arguments, too-many-locals, too-many-statements, no-member, too-many-branches, too-many-boolean-expressions
"""conv2d schedule on Intel Graphics""" """conv2d schedule on Intel Graphics"""
from __future__ import absolute_import as _abs from __future__ import absolute_import as _abs
...@@ -61,7 +61,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): ...@@ -61,7 +61,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
@conv2d_NCHWc.register(["intel_graphics"]) @conv2d_NCHWc.register(["intel_graphics"])
def _decl_conv2d(data, kernel, stride, padding, layout, out_layout, out_dtype='float32'): def _decl_conv2d(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'):
"""Conv2D operator for Intel Graphics backend. """Conv2D operator for Intel Graphics backend.
Parameters Parameters
...@@ -126,8 +126,7 @@ def schedule_conv2d_NCHWc(outs): ...@@ -126,8 +126,7 @@ def schedule_conv2d_NCHWc(outs):
for tensor in op.input_tensors: for tensor in op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops: if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op) traverse(tensor.op)
if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \ if 'conv2d' in op.tag:
or "1_16" in op.tag:
_schedule_cl_spatialpack_NCHWc(s, op) _schedule_cl_spatialpack_NCHWc(s, op)
scheduled_ops.append(op) scheduled_ops.append(op)
...@@ -156,31 +155,30 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16 ...@@ -156,31 +155,30 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
ry = tvm.reduce_axis((0, kernel_h), name='ry') ry = tvm.reduce_axis((0, kernel_h), name='ry')
rx = tvm.reduce_axis((0, kernel_w), name='rx') rx = tvm.reduce_axis((0, kernel_w), name='rx')
block_w = 0 block_w = 1
block_h = 0 block_h = 1
if stride_h == 2: if stride_h == 2:
if num_filter + kernel_h == 515: if num_filter + kernel_h == 515:
conv_tag = "4_4"
block_h = 4 block_h = 4
block_w = 4 block_w = 4
else: else:
conv_tag = "4_5"
block_h = 4 block_h = 4
block_w = 5 block_w = 5
elif kernel_h == 3: elif kernel_h == 3:
if num_filter == 512: if num_filter == 512:
conv_tag = "2_7"
block_h = 2 block_h = 2
block_w = 7 block_w = 7
else: else:
conv_tag = "2_14"
block_h = 2 block_h = 2
block_w = 14 block_w = 14
elif kernel_h == 7 and padding == 3 and stride == 1:
block_h = 3
block_w = 4
else: else:
conv_tag = "1_16"
block_h = 1 block_h = 1
block_w = 16 block_w = 16
attrs = {'block_h': block_h, 'block_w' : block_w}
c_h = out_height c_h = out_height
c_w = out_width c_w = out_width
...@@ -202,13 +200,13 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16 ...@@ -202,13 +200,13 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
tvm.sum( tvm.sum(
temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
kernel[ff, rc, ry, rx, vc].astype(out_dtype), kernel[ff, rc, ry, rx, vc].astype(out_dtype),
axis=[rc, ry, rx]), tag=conv_tag, name='conv') axis=[rc, ry, rx]), name='conv', attrs=attrs)
output = tvm.compute( output = tvm.compute(
oshape, oshape,
lambda nn, ff, yy, xx: lambda nn, ff, yy, xx:
conv[nn][ff//nv][yy][xx][ff%nv], conv[nn][ff//nv][yy][xx][ff%nv],
name='output_unpack', tag=conv_tag) name='output_unpack', tag='conv2d')
return output return output
...@@ -224,21 +222,10 @@ def _schedule_cl_spatialpack_NCHWc(s, op): ...@@ -224,21 +222,10 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
kernel_L = s.cache_read(kernel, "local", [conv_L]) kernel_L = s.cache_read(kernel, "local", [conv_L])
_, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape]
if "1_16" in s[conv].op.tag:
OUTPUT_BLOCK_HEIGHT = 1 attrs = s[conv].op.attrs
OUTPUT_BLOCK_WIDTH = 16 OUTPUT_BLOCK_HEIGHT = attrs['block_h']
elif "2_14" in s[conv].op.tag: OUTPUT_BLOCK_WIDTH = attrs['block_w']
OUTPUT_BLOCK_HEIGHT = 2
OUTPUT_BLOCK_WIDTH = 14
elif "2_7" in s[conv].op.tag:
OUTPUT_BLOCK_HEIGHT = 2
OUTPUT_BLOCK_WIDTH = 7
elif "4_5" in s[conv].op.tag:
OUTPUT_BLOCK_HEIGHT = 4
OUTPUT_BLOCK_WIDTH = 5
elif "4_4" in s[conv].op.tag:
OUTPUT_BLOCK_HEIGHT = 4
OUTPUT_BLOCK_WIDTH = 4
# schedule conv # schedule conv
z_factor = 1 z_factor = 1
...@@ -308,7 +295,7 @@ def _schedule_cl_spatialpack_NCHWc(s, op): ...@@ -308,7 +295,7 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
@conv2d.register(["intel_graphics"]) @conv2d.register(["intel_graphics"])
def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): def decl_conv2d(data, kernel, stride, padding, dilation, layout='NCHW', out_dtype='float32'):
"""Conv2D operator for Intel Graphics backend. """Conv2D operator for Intel Graphics backend.
Parameters Parameters
...@@ -368,8 +355,7 @@ def schedule_conv2d_nchw(outs): ...@@ -368,8 +355,7 @@ def schedule_conv2d_nchw(outs):
for tensor in op.input_tensors: for tensor in op.input_tensors:
if tensor.op.input_tensors and tensor.op not in scheduled_ops: if tensor.op.input_tensors and tensor.op not in scheduled_ops:
traverse(tensor.op) traverse(tensor.op)
if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \ if 'conv2d' in op.tag:
or "1_16" in op.tag:
_schedule_cl_spatialpack(s, op) _schedule_cl_spatialpack(s, op)
scheduled_ops.append(op) scheduled_ops.append(op)
...@@ -396,31 +382,30 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float ...@@ -396,31 +382,30 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
ry = tvm.reduce_axis((0, kernel_h), name='ry') ry = tvm.reduce_axis((0, kernel_h), name='ry')
rx = tvm.reduce_axis((0, kernel_w), name='rx') rx = tvm.reduce_axis((0, kernel_w), name='rx')
block_w = 0 block_w = 1
block_h = 0 block_h = 1
if stride_h == 2: if stride_h == 2:
if num_filter + kernel_h == 515: if num_filter + kernel_h == 515:
conv_tag = "4_4"
block_h = 4 block_h = 4
block_w = 4 block_w = 4
else: else:
conv_tag = "4_5"
block_h = 4 block_h = 4
block_w = 5 block_w = 5
elif kernel_h == 3: elif kernel_h == 3:
if num_filter == 512: if num_filter == 512:
conv_tag = "2_7"
block_h = 2 block_h = 2
block_w = 7 block_w = 7
else: else:
conv_tag = "2_14"
block_h = 2 block_h = 2
block_w = 14 block_w = 14
elif kernel_h == 7 and padding == 3 and stride == 1:
block_h = 3
block_w = 4
else: else:
conv_tag = "1_16"
block_h = 1 block_h = 1
block_w = 16 block_w = 16
attrs = {'block_h': block_h, 'block_w' : block_w}
c_h = out_height c_h = out_height
c_w = out_width c_w = out_width
...@@ -453,13 +438,13 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float ...@@ -453,13 +438,13 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
tvm.sum( tvm.sum(
temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype),
axis=[rc, ry, rx]), tag=conv_tag, name='conv') axis=[rc, ry, rx]), name='conv', attrs=attrs)
output = tvm.compute( output = tvm.compute(
oshape, oshape,
lambda nn, ff, yy, xx: lambda nn, ff, yy, xx:
conv[nn][ff//nv][yy][xx][ff%nv], conv[nn][ff//nv][yy][xx][ff%nv],
name='output_unpack', tag=conv_tag) name='output_unpack', tag='conv2d')
return output return output
...@@ -477,21 +462,9 @@ def _schedule_cl_spatialpack(s, op): ...@@ -477,21 +462,9 @@ def _schedule_cl_spatialpack(s, op):
kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) kernel_L = s.cache_read(kernel_vec, "local", [conv_L])
_, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape]
if "1_16" in s[conv].op.tag: attrs = s[conv].op.attrs
OUTPUT_BLOCK_HEIGHT = 1 OUTPUT_BLOCK_HEIGHT = attrs['block_h']
OUTPUT_BLOCK_WIDTH = 16 OUTPUT_BLOCK_WIDTH = attrs['block_w']
elif "2_14" in s[conv].op.tag:
OUTPUT_BLOCK_HEIGHT = 2
OUTPUT_BLOCK_WIDTH = 14
elif "2_7" in s[conv].op.tag:
OUTPUT_BLOCK_HEIGHT = 2
OUTPUT_BLOCK_WIDTH = 7
elif "4_5" in s[conv].op.tag:
OUTPUT_BLOCK_HEIGHT = 4
OUTPUT_BLOCK_WIDTH = 5
elif "4_4" in s[conv].op.tag:
OUTPUT_BLOCK_HEIGHT = 4
OUTPUT_BLOCK_WIDTH = 4
# schedule conv # schedule conv
z_factor = 1 z_factor = 1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment