[TOPI][CUDA] Fix Winograd Kernel Size Support (#4276)

* fix_winograd_cuda_kernel_size * add unit test

[TOPI][CUDA] Fix Winograd Kernel Size Support (#4276)
* fix_winograd_cuda_kernel_size * add unit test
76b79671 · Cody Hao Yu · Wuwei Lin · 5bcd3313 · 76b79671 · 76b79671
Commit 76b79671 authored Nov 07, 2019 by Cody Hao Yu Committed by Wuwei Lin Nov 08, 2019
Hide whitespace changes
Inline Side-by-side

Showing with 75 additions and 3 deletions

tests/python/relay/test_op_level2.py
+72 -0

topi/python/topi/cuda/conv2d_winograd.py
+3 -3

No files found.
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -18,6 +18,7 @@
 """
 import numpy as np
 import tvm
+from tvm import autotvm
 from tvm import relay
 from tvm.relay import transform
 from tvm.relay.testing import ctx_list
@@ -174,6 +175,76 @@ def test_conv2d_run():
    run_test_conv2d("float32", "float32", 1, dshape, kshape,
                    padding=(1, 1), channels=10, kernel_size=(3 ,3), dilation=(3, 3))

+def test_conv2d_winograd():  # runs relay conv2d on 'cuda' targets under a fixed 'winograd' config
+    class WinogradFallback(autotvm.FallbackContext):  # answers every query with one hand-built config
+        def _query_inside(self, target, workload):
+            key = (target, workload)
+            if key in self.memory:  # reuse the config already built for this (target, workload)
+                return self.memory[key]
+            cfg = autotvm.task.space.FallbackConfigEntity()
+            cfg.template_key = 'winograd'
+            cfg.is_fallback = False  # presumably keeps the template from applying its own fallback - confirm
+            cfg['tile_b'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
+            cfg['tile_y'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
+            cfg['tile_x'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
+            cfg['tile_rc'] = autotvm.task.space.SplitEntity([-1, 1])
+            cfg['auto_unroll_max_step'] = autotvm.task.space.OtherOptionEntity(1500)  # key was misspelled 'auto_unroll_max_setp'
+            cfg['unroll_explicit'] = autotvm.task.space.OtherOptionEntity(1)
+            self.memory[key] = cfg  # cache so later queries return the same object
+            return cfg
+
+    def run_test_conv2d_cuda(dtype, out_dtype, scale, dshape, kshape,
+                             padding=(1, 1),
+                             groups=1,
+                             dilation=(1, 1),
+                             **attrs):
+        """Build relay conv2d(x, w), run it on each 'cuda' target and compare with the topi NumPy reference."""
+        x = relay.var("x", shape=dshape, dtype=dtype)
+        w = relay.var("w", shape=kshape, dtype=dtype)
+        y = relay.nn.conv2d(x, w,
+                            padding=padding,
+                            dilation=dilation,
+                            groups=groups,
+                            **attrs)
+        func = relay.Function([x, w], y)
+        mod = relay.Module()
+        mod['main'] = func
+        mod = relay.transform.InferType()(mod)
+
+        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
+        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
+        ref_res = topi.testing.conv2d_nchw_python(
+            data.astype(out_dtype), kernel.astype(out_dtype), 1, padding,
+            groups=groups)  # NOTE(review): dilation is not forwarded here; callers below only use (1, 1)
+
+        with WinogradFallback(), relay.build_config(opt_level=3):
+            for target, ctx in ctx_list():
+                if target != 'cuda':  # only the CUDA target is exercised by this test
+                    continue
+                params = {'w': tvm.nd.array(kernel)}
+                graph, lib, params = relay.build_module.build(mod, target=target, params=params)
+                module = tvm.contrib.graph_runtime.create(graph, lib, ctx)
+                module.set_input('x', tvm.nd.array(data))
+                module.set_input(**params)
+                module.run()
+                op_res1 = module.get_output(0)
+                tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-3, atol=1e-3)
+
+    # normal winograd: stride 1, padding 1, kernel 3x3
+    dshape = (1, 80, 73, 73)
+    kshape = (192, 80, 3, 3)
+    run_test_conv2d_cuda("float32", "float32", 1, dshape, kshape,
+                         padding=(1, 1), channels=192, kernel_size=(3, 3))
+    # extended winograd: stride 1, padding N, kernel 3x3
+    run_test_conv2d_cuda("float32", "float32", 1, dshape, kshape,
+                         padding=(0, 0), channels=192, kernel_size=(3, 3))
+    run_test_conv2d_cuda("float32", "float32", 1, dshape, kshape,
+                         padding=(2, 2), channels=192, kernel_size=(3, 3))
+    # extended winograd: stride 1, padding N, kernel NxN
+    kshape = (192, 80, 7, 7)
+    run_test_conv2d_cuda("float32", "float32", 1, dshape, kshape,
+                         padding=(2, 2), channels=192, kernel_size=(7, 7))
+

 def test_conv2d_transpose_infer_type():
    # symbolic in batch dimension
@@ -702,6 +773,7 @@ if __name__ == "__main__":
    test_conv2d_transpose_infer_type()
    test_conv2d_transpose_run()
    test_conv2d_run()
+    test_conv2d_winograd()
    test_bitserial_conv2d_infer_type()
    test_batch_flatten()
    test_upsampling()

--- a/topi/python/topi/cuda/conv2d_winograd.py
+++ b/topi/python/topi/cuda/conv2d_winograd.py
@@ -55,12 +55,13 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dty
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilation(kernel, (1, 1, dilation_h, dilation_w))
        CO, CI, KH, KW = get_const_tuple(kernel.shape)
+        alpha = KW + tile_size - 1
        assert HSTR == 1 and WSTR == 1 and KH == KW
    else:
        # kernel tensor is pre-transformed. this op is created by alter op layout.
        # dilation is not supported
-        _, _, CI, CO = get_const_tuple(kernel.shape)
-        KH = KW = 3
+        alpha, _, CI, CO = get_const_tuple(kernel.shape)
+        KH = KW = alpha + 1 - tile_size
        assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1

    HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel)
@@ -68,7 +69,6 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dty

    r = KW
    m = tile_size
-    alpha = m + r - 1
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    H = (H + 2 * HPAD - KH) // HSTR + 1