Commit 396bd562 by Leyuan Wang, committed by Tianqi Chen

[TOPI] Conv2d Added and Optimized for Intel HD Graphics (#1290)

parent fb88b74e
@@ -157,9 +157,9 @@ EXPORT Target rasp(const std::vector<std::string>& options =
 EXPORT Target mali(const std::vector<std::string>& options =
                    std::vector<std::string>());
-/*! \return A target for Intel GPU */
-EXPORT Target intel_gpu(const std::vector<std::string>& options =
-                        std::vector<std::string>());
+/*! \return A target for Intel Graphics */
+EXPORT Target intel_graphics(const std::vector<std::string>& options =
+                             std::vector<std::string>());
 /*! \return A target for stackvm */
 EXPORT Target stackvm(const std::vector<std::string>& options =
                       std::vector<std::string>());
@@ -76,7 +76,7 @@ class Target(NodeBase):
     - :any:`tvm.target.cuda` create CUDA target
     - :any:`tvm.target.rocm` create ROCM target
     - :any:`tvm.target.mali` create Mali target
-    - :any:`tvm.target.intel_gpu` create Intel GPU target
+    - :any:`tvm.target.intel_graphics` create Intel Graphics target
     """
     def __init__(self, handle):
         super(Target, self).__init__(handle)
@@ -402,15 +402,15 @@ def mali(options=None):
     return _api_internal._TargetCreate("opencl", *opts)


-def intel_gpu(options=None):
-    """Returns an Intel GPU target.
+def intel_graphics(options=None):
+    """Returns an Intel Graphics target.

     Parameters
     ----------
     options : str or list of str
         Additional options
     """
-    opts = ["-device=intel_gpu"]
+    opts = ["-device=intel_graphics"]
     opts = _merge_opts(opts, options)
     return _api_internal._TargetCreate("opencl", *opts)
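For reference, a minimal usage sketch of the renamed Python helper (not part of the diff). The round-trip assert mirrors the cuda pattern in the test further below and is an assumption about str() formatting:

    import tvm

    # After this commit the helper injects "-device=intel_graphics";
    # the same target can also be built from a target string.
    tgt = tvm.target.intel_graphics()
    same = tvm.target.create("opencl -device=intel_graphics")

    assert tgt.device_name == "intel_graphics"
    # Assumed to round-trip the same way cuda targets do in the tests:
    assert str(tgt) == str(same)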
@@ -76,7 +76,7 @@ Target CreateTarget(const std::string& target_name,
     t->keys_array.push_back(ir::StringImm::make("rocm"));
     t->keys_array.push_back(ir::StringImm::make("gpu"));
     t->max_num_threads = 256;
-    if (t->device_name == "intel_gpu") {
+    if (t->device_name == "intel_graphics") {
       t->thread_warp_size = 16;
     }
   } else if (target_name == "metal" || target_name == "vulkan") {
@@ -274,9 +274,9 @@ Target mali(const std::vector<std::string>& options) {
   }));
}

-Target intel_gpu(const std::vector<std::string>& options) {
+Target intel_graphics(const std::vector<std::string>& options) {
   return CreateTarget("opencl", MergeOptions(options, {
-    "-device=intel_gpu"
+    "-device=intel_graphics"
  }));
}
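A short sketch of what the CreateTarget change means from Python (not part of the diff; reading thread_warp_size through the Target node is assumed to work the same as for other TargetNode fields):

    import tvm

    # CreateTarget sets thread_warp_size = 16 only when the device name
    # is "intel_graphics"; other OpenCL targets keep the default of 1.
    tgt = tvm.target.create("opencl -device=intel_graphics")
    assert tgt.thread_warp_size == 16
    assert tvm.target.create("opencl").thread_warp_size == 1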
@@ -159,7 +159,7 @@ void CodeGenOpenCL::PrintStorageSync(const Call* op) {
   const std::string& sync = op->args[0].as<StringImm>()->value;
   if (sync == "warp") {
     this->PrintIndent();
-    this->stream << "sub_group_barrier(CLK_LOCAL_MEM_FENCE);\n";
+    this->stream << "barrier(CLK_LOCAL_MEM_FENCE);\n";
   } else if (sync == "shared") {
     this->PrintIndent();
     this->stream << "barrier(CLK_LOCAL_MEM_FENCE);\n";
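One hedged way to see which barrier call codegen emits: build a small kernel that stages data through "shared" memory and dump the generated OpenCL source. Shapes and the split factor below are illustrative, and an OpenCL-enabled build is assumed; the build/module calls are the standard tvm.build path:

    import tvm

    # Stage A through shared memory so the lowered kernel contains a
    # storage sync, which codegen prints as barrier(CLK_LOCAL_MEM_FENCE).
    n = 1024
    A = tvm.placeholder((n,), name="A")
    B = tvm.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = tvm.create_schedule(B.op)
    bx, tx = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
    AA = s.cache_read(A, "shared", [B])
    s[AA].compute_at(s[B], bx)
    if tvm.module.enabled("opencl"):
        f = tvm.build(s, [A, B], "opencl -device=intel_graphics")
        print(f.imported_modules[0].get_source())  # the generated .cl kernel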
@@ -40,7 +40,7 @@ void OpenCLWorkspace::GetAttr(
     }
     case kWarpSize: {
       /* TODO: the warp size of OpenCL device is not always 1
-         e.g. Intel GPU has a sub group concept which contains 8 - 32 work items,
+         e.g. Intel Graphics has a sub group concept which contains 8 - 32 work items,
          corresponding to the number of SIMD entries the hardware configures.
          We need to figure out a way to query this information from the hardware.
       */
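The kWarpSize attribute handled above is what ctx.warp_size returns on the Python side; per the TODO it is still a hard-coded 1 for OpenCL devices. A minimal sketch, assuming an OpenCL device is present:

    import tvm

    ctx = tvm.context("opencl", 0)
    if ctx.exist:
        # Stays 1 until the sub-group width can be queried from the driver.
        print(ctx.warp_size)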
@@ -34,7 +34,7 @@ def test_exp():
     np.testing.assert_allclose(
         b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)

-    check_device("opencl -device=intel_gpu")
+    check_device("opencl -device=intel_graphics")
     check_device("cuda", "llvm")
     check_device("vulkan")
@@ -47,7 +47,7 @@ def test_target_string_parse():
     assert str(target) == str(tvm.target.cuda("-libs=cublas,cudnn"))

-    assert tvm.target.intel_gpu().device_name == "intel_gpu"
+    assert tvm.target.intel_graphics().device_name == "intel_graphics"


 if __name__ == "__main__":
     test_target_dispatch()
@@ -26,6 +26,7 @@ from . import x86
 from . import cuda
 from . import rasp
 from . import mali
+from . import intel_graphics
 from . import opengl
 from . import util
 from . import rocm
@@ -33,9 +33,8 @@ def schedule_global_pool(outs):
         else:
             Out = outs[0].op.output(0)
             s[Pool].set_scope("local")
-        i, c, h, w = s[Out].op.axis
-        by, ty = s[Out].split(i, factor=num_thread)
-        bx, tx = s[Out].split(c, factor=num_thread)
+        by, ty = s[Out].split(s[Out].op.axis[0], factor=num_thread)
+        bx, tx = s[Out].split(s[Out].op.axis[1], factor=num_thread)
         s[Out].reorder(by, bx, ty, tx)
         s[Out].bind(ty, thread_y)
         s[Out].bind(tx, thread_x)
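The rewrite above indexes s[Out].op.axis directly instead of unpacking exactly four axes, presumably so the schedule no longer assumes a 4-D output. A self-contained sketch of the same split/reorder/bind pattern (shapes and num_thread are illustrative, not from the diff):

    import tvm

    num_thread = 8
    A = tvm.placeholder((16, 64, 32, 32), name="A")
    Out = tvm.compute(A.shape, lambda *i: A(*i) * 2.0, name="Out")
    s = tvm.create_schedule(Out.op)
    # Split the two outermost axes, put block axes before thread axes,
    # then bind them to the GPU grid.
    by, ty = s[Out].split(s[Out].op.axis[0], factor=num_thread)
    bx, tx = s[Out].split(s[Out].op.axis[1], factor=num_thread)
    s[Out].reorder(by, bx, ty, tx)
    s[Out].bind(by, tvm.thread_axis("blockIdx.y"))
    s[Out].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[Out].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[Out].bind(tx, tvm.thread_axis("threadIdx.x"))
    print(tvm.lower(s, [A, Out], simple_mode=True))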
# pylint: disable=redefined-builtin, wildcard-import
"""Intel Gen9 GPU specific declaration and schedules."""
from __future__ import absolute_import as _abs
from .conv2d import *
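A hedged sketch of how this new package would be reached through TOPI's generic dispatch, assuming conv2d.py registers its declaration and schedule under the intel_graphics target key (shapes, stride, and padding below are illustrative):

    import tvm
    import topi

    data = tvm.placeholder((1, 3, 224, 224), name="data")
    kernel = tvm.placeholder((64, 3, 3, 3), name="kernel")
    # Under this target, generic dispatch can pick the Intel Graphics
    # conv2d implementation registered by this package.
    with tvm.target.intel_graphics():
        conv = topi.nn.conv2d(data, kernel, 1, 1)
        s = topi.generic.schedule_conv2d_nchw([conv])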