Commit fab4f9cc authored by masahi, committed by Tianqi Chen

[NVPTX] libdevice support, enable NVPTX backend in topi tests (#1365)

parent 211ab978
......@@ -25,7 +25,7 @@ def main():
choices=['resnet', 'mobilenet'],
help="The model type.")
parser.add_argument('--target', type=str, required=True,
- choices=['cuda', 'rocm', 'opencl', 'metal'],
+ choices=['cuda', 'rocm', 'opencl', 'metal', 'nvptx'],
help="Compilation target.")
parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
parser.add_argument('--num-iter', type=int, default=1000, help="Number of iteration during benchmark.")
......
......@@ -121,6 +121,20 @@ class CodeGenNVPTX : public CodeGenLLVM {
// Additional optimization hook to tweak the builder.
}
  void Optimize() final {
    for (auto& f : *module_) {
      auto fname = static_cast<std::string>(f.getName());
      if (fname.substr(0, 4) != "__nv") continue;
      // This is to strip off unused __nv_* functions from the final module.
      // The ones that are actually used will be inlined at their call sites.
      // Adapted from Halide's runtime linker.
      if (!f.isDeclaration() && !f.hasFnAttribute(llvm::Attribute::NoInline)) {
        f.setLinkage(llvm::GlobalValue::AvailableExternallyLinkage);
      }
    }
    CodeGenLLVM::Optimize();
  }
protected:
void InitTarget(llvm::TargetMachine* tm) final {
// Maximum vector lane = float4
......@@ -179,8 +193,7 @@ runtime::Module BuildNVPTX(Array<LoweredFunc> funcs, std::string target) {
}
mlib->setTargetTriple(tm->getTargetTriple().str());
mlib->setDataLayout(tm->createDataLayout());
- // TODO(tqchen) libdevice linking not yet working.
- // cg->AddLinkModule(std::move(mlib));
+ cg->AddLinkModule(std::move(mlib));
}
}
std::unique_ptr<llvm::Module> module = cg->Finish();
......
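With libdevice now parsed and linked into the NVPTX module via AddLinkModule (and unused __nv_* definitions stripped in Optimize() above), math intrinsics can resolve to libdevice routines at codegen time. The sketch below is an illustration only, not part of this commit: it builds a small exp kernel for the nvptx target with the TVM Python API of this era; the shape, schedule, and thread split are arbitrary assumptions.

# Usage sketch (not part of this commit): build an exp kernel with the
# nvptx target so tvm.exp can lower to libdevice's __nv_expf.
import numpy as np
import tvm

n = 1024
A = tvm.placeholder((n,), name="A", dtype="float32")
B = tvm.compute((n,), lambda i: tvm.exp(A[i]), name="B")
s = tvm.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

f = tvm.build(s, [A, B], target="nvptx")

ctx = tvm.context("nvptx", 0)  # assumed to map to the GPU device
a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
b = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
f(a, b)
np.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)

The check_device helpers in the topi tests further down follow the same pattern, obtaining the context via tvm.context(device, 0).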
/*!
* Copyright (c) 2017 by Contributors
* \file intrin_rule_nvptx.cc
*/
#ifdef TVM_LLVM_VERSION
#include <tvm/ir.h>
#include <tvm/expr.h>
#include <tvm/api_registry.h>
#include <sstream>
namespace tvm {
namespace codegen {
inline void DispatchExternLibDevice(const TVMArgs& args, TVMRetValue* rv) {
  Expr e = args[0];
  using namespace ir;
  const Call* call = e.as<Call>();
  CHECK(call != nullptr);
  CHECK(call->type.bits() == 32 || call->type.bits() == 64) << "Only support float32 or float64.";
  std::ostringstream intrinsic_name;
  intrinsic_name << "__nv_" << call->name;
  if (call->type.bits() == 32) intrinsic_name << "f";
  *rv = Call::make(call->type, intrinsic_name.str(), call->args,
                   Call::PureExtern);
}
namespace llvm {
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.floor")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.ceil")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.round")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.trunc")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.exp")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.fma")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.log")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.sqrt")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.pow")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.tanh")
.set_body(DispatchExternLibDevice);
} // namespace llvm
} // namespace codegen
} // namespace tvm
#endif  // TVM_LLVM_VERSION
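The dispatch rule above derives the libdevice symbol purely from the intrinsic name and the floating-point width: prefix the name with __nv_ and append f for float32, so exp maps to __nv_expf for float32 and __nv_exp for float64. A rough Python restatement of that mapping, for illustration only (libdevice_symbol is a made-up helper, not part of TVM):

# Illustrative restatement of DispatchExternLibDevice's name mangling.
def libdevice_symbol(name, bits):
    assert bits in (32, 64), "Only float32 or float64 are supported."
    return "__nv_" + name + ("f" if bits == 32 else "")

assert libdevice_symbol("exp", 32) == "__nv_expf"
assert libdevice_symbol("sqrt", 64) == "__nv_sqrt"
assert libdevice_symbol("tanh", 32) == "__nv_tanhf"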
/*!
* Copyright (c) 2017 by Contributors
- * \file intrin_rule_llvm.cc
+ * \file intrin_rule_rocm.cc
*/
#ifdef TVM_LLVM_VERSION
......
......@@ -158,7 +158,7 @@ def schedule_depthwise_conv2d_nhwc(outs):
# num_thread here could be 728, it is larger than cuda.max_num_threads
num_thread = tvm.ir_pass.Simplify(temp.shape[3]).value
target = tvm.target.current_target()
- if target and target.target_name != "cuda":
+ if target and (target.target_name not in ["cuda", "nvptx"]):
num_thread = target.max_num_threads
xoc, xic = s[Output].split(c, factor=num_thread)
s[Output].reorder(xoc, b, h, w, xic)
......
......@@ -30,6 +30,7 @@ def verify_broadcast_to_ele(in_shape, out_shape, fbcast):
check_device("cuda")
check_device("metal")
check_device("rocm")
check_device("nvptx")
def verify_broadcast_binary_ele(lhs_shape, rhs_shape,
......@@ -85,6 +86,7 @@ def verify_broadcast_binary_ele(lhs_shape, rhs_shape,
check_device("cuda")
check_device("metal")
check_device("rocm")
check_device("nvptx")
def test_broadcast_to():
verify_broadcast_to_ele((1,), (10,), topi.broadcast_to)
......
......@@ -52,7 +52,7 @@ def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, p
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
......
......@@ -44,8 +44,9 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p
w = tvm.nd.array(w_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+ no_unroll_explicit = device in ["cuda", "nvptx", "rocm"]
with tvm.build_config(auto_unroll_max_step=1400,
- unroll_explicit=(device != "cuda")):
+ unroll_explicit=not no_unroll_explicit):
func1 = tvm.build(s1, [A, W, B], device, name="conv2d_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
func1(a, w, b)
......@@ -53,7 +54,7 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
......
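The conv2d test above now disables explicit unrolling for cuda, nvptx, and rocm, presumably leaving loop unrolling to the downstream NVCC/LLVM compilers rather than expanding the loops in the generated code. A minimal sketch of the same configuration pattern (the device string and step count are just example values taken from the test):

# Hedged sketch of the unrolling policy used in the conv2d test above.
import tvm

device = "nvptx"  # example target string
no_unroll_explicit = device in ["cuda", "nvptx", "rocm"]

with tvm.build_config(auto_unroll_max_step=1400,
                      unroll_explicit=not no_unroll_explicit):
    # Any tvm.build(...) issued inside this block inherits the unrolling
    # policy from the active build config.
    pass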
......@@ -51,7 +51,7 @@ def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel,
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
......
......@@ -45,7 +45,7 @@ def verify_dense(batch, in_dim, out_dim, use_bias=True):
f(a, b, c, d)
np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_dense():
......
......@@ -93,6 +93,7 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu
check_device("metal")
check_device("rocm")
check_device("vulkan")
check_device("nvptx")
def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding, dilation=1):
......@@ -184,6 +185,7 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu
check_device("metal")
check_device("rocm")
check_device("vulkan")
check_device("nvptx")
def test_depthwise_conv2d():
print("testing nchw")
......
......@@ -87,6 +87,7 @@ def verify_depthwise_conv2d_back_input(batch, in_channel, in_h, channel_multipli
check_device("metal")
check_device("rocm")
check_device("vulkan")
check_device("nvptx")
def test_topi_depthwise_conv2d_backward_input_nhwc():
verify_depthwise_conv2d_back_input(16, 256, 56, 1, 3, 1, 1)
......
......@@ -80,6 +80,7 @@ def verify_depthwise_conv2d_back_weight(batch, in_channel, in_h, channel_multipl
check_device("metal")
check_device("rocm")
check_device("vulkan")
check_device("nvptx")
def test_topi_depthwise_conv2d_backward_weight_nhwc():
verify_depthwise_conv2d_back_weight(16, 256, 56, 1, 3, 1, 1)
......
......@@ -31,7 +31,7 @@ def verify_l2_normalize(ishape, eps, axis=None):
f(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_l2_normalize():
......
......@@ -30,7 +30,7 @@ def verify_lrn(shape, size, axis, bias, alpha, beta):
f(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_lrn():
......
......@@ -39,7 +39,7 @@ def test_ewise():
foo(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx']:
check_device(device)
......
......@@ -63,7 +63,7 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_
f(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_pool():
......@@ -104,7 +104,7 @@ def verify_global_pool(n, c, h, w, pool_type):
f(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_global_pool():
......
......@@ -25,12 +25,11 @@ def _my_npy_argmin(arr, axis, keepdims):
return arr.argmin(axis=axis).reshape(out_shape)
- def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
+ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum", dtype="float32"):
# Build the logic and compile the function
- dat_dtype = "float32"
- A = tvm.placeholder(shape=in_shape, name="A", dtype=dat_dtype)
+ A = tvm.placeholder(shape=in_shape, name="A", dtype=dtype)
A1 = topi.sqrt(topi.exp(A))
- out_dtype = "float32"
+ out_dtype = dtype
if type == "sum":
B = topi.sum(A1, axis=axis, keepdims=keepdims)
elif type == "max":
......@@ -57,8 +56,8 @@ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
foo = tvm.build(s, [A, B], device, name=type)
# Test
- in_npy = np.random.uniform(size=in_shape).astype(np.float32)
- in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32)
+ in_npy = np.random.uniform(size=in_shape).astype(dtype)
+ in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype)
if type == "sum":
out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
elif type == "max":
......@@ -91,7 +90,7 @@ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
else:
np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
for device in ["cuda", "opencl", "metal", "llvm", "rocm", "vulkan"]:
for device in ["cuda", "opencl", "metal", "llvm", "rocm", "vulkan", "nvptx"]:
check_device(device)
......@@ -128,6 +127,11 @@ def test_reduce_map():
axis=None,
keepdims=False,
type="sum")
+ verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
+ axis=(1, 2, 3),
+ keepdims=True,
+ type="sum",
+ dtype="float64")
if __name__ == "__main__":
test_reduce_map()
......@@ -27,7 +27,7 @@ def verify_relu(m, n):
foo(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
......
......@@ -40,7 +40,7 @@ def verify_bilinear_scale(batch, in_channel, in_height, in_width, out_height, ou
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)
- for device in ['llvm', 'cuda', 'vulkan']:
+ for device in ['llvm', 'cuda', 'vulkan', 'nvptx']:
check_device(device)
def test_resize():
......
......@@ -7,8 +7,8 @@ import topi.testing
import logging
from topi.util import get_const_tuple
- def verify_softmax(m, n):
- A = tvm.placeholder((m, n), name='A')
+ def verify_softmax(m, n, dtype="float32"):
+ A = tvm.placeholder((m, n), dtype=dtype, name='A')
B = topi.nn.softmax(A)
# confirm lower works
s = tvm.create_schedule([B.op])
......@@ -32,16 +32,16 @@ def verify_softmax(m, n):
foo(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_softmax():
verify_softmax(32, 10)
verify_softmax(3, 4)
verify_softmax(32, 10, "float64")
- def verify_log_softmax(m, n):
- A = tvm.placeholder((m, n), name='A')
+ def verify_log_softmax(m, n, dtype="float32"):
+ A = tvm.placeholder((m, n), dtype=dtype, name='A')
B = topi.nn.log_softmax(A)
# confirm lower works
s = tvm.create_schedule([B.op])
......@@ -63,13 +63,14 @@ def verify_log_softmax(m, n):
foo(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ["cuda", "opencl", "metal", "rocm", "vulkan"]:
for device in ["cuda", "opencl", "metal", "rocm", "vulkan", "nvptx"]:
check_device(device)
def test_log_softmax():
verify_log_softmax(32, 10)
verify_log_softmax(3, 4)
verify_log_softmax(32, 10, "float64")
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
......
......@@ -41,7 +41,7 @@ def verify_upsampling(batch, in_channel, in_height, in_width, scale, layout='NCH
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['llvm', 'cuda', 'vulkan']:
+ for device in ['llvm', 'cuda', 'vulkan', 'nvptx']:
check_device(device)
def test_upsampling():
......