Commit fab4f9cc authored by masahi, committed by Tianqi Chen

[NVPTX] libdevice support, enable NVPTX backend in topi tests (#1365)

parent 211ab978
......@@ -25,7 +25,7 @@ def main():
choices=['resnet', 'mobilenet'],
help="The model type.")
parser.add_argument('--target', type=str, required=True,
- choices=['cuda', 'rocm', 'opencl', 'metal'],
+ choices=['cuda', 'rocm', 'opencl', 'metal', 'nvptx'],
help="Compilation target.")
parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
parser.add_argument('--num-iter', type=int, default=1000, help="Number of iteration during benchmark.")
......
......@@ -121,6 +121,20 @@ class CodeGenNVPTX : public CodeGenLLVM {
// Additional optimization hook to tweak the builder.
}
  void Optimize() final {
    for (auto& f : *module_) {
      auto fname = static_cast<std::string>(f.getName());
      if (fname.substr(0, 4) != "__nv") continue;
      // This is to strip off unused __nv_* functions from the final module.
      // The ones that are actually used will be inlined at their call sites.
      // Adapted from Halide's runtime linker.
      if (!f.isDeclaration() && !f.hasFnAttribute(llvm::Attribute::NoInline)) {
        f.setLinkage(llvm::GlobalValue::AvailableExternallyLinkage);
      }
    }
    CodeGenLLVM::Optimize();
  }
protected:
void InitTarget(llvm::TargetMachine* tm) final {
// Maximum vector lane = float4
......@@ -179,8 +193,7 @@ runtime::Module BuildNVPTX(Array<LoweredFunc> funcs, std::string target) {
}
mlib->setTargetTriple(tm->getTargetTriple().str());
mlib->setDataLayout(tm->createDataLayout());
- // TODO(tqchen) libdevice linking not yet working.
- // cg->AddLinkModule(std::move(mlib));
+ cg->AddLinkModule(std::move(mlib));
}
}
std::unique_ptr<llvm::Module> module = cg->Finish();
......
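With libdevice now parsed and linked into the NVPTX module via AddLinkModule (and unused __nv_* definitions stripped in Optimize() above), math intrinsics can resolve to libdevice routines at codegen time. The sketch below is an illustration only, not part of this commit: it builds a small exp kernel for the nvptx target with the TVM Python API of this era; the shape, schedule, and thread split are arbitrary assumptions.

# Usage sketch (not part of this commit): build an exp kernel with the
# nvptx target so tvm.exp can lower to libdevice's __nv_expf.
import numpy as np
import tvm

n = 1024
A = tvm.placeholder((n,), name="A", dtype="float32")
B = tvm.compute((n,), lambda i: tvm.exp(A[i]), name="B")
s = tvm.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

f = tvm.build(s, [A, B], target="nvptx")

ctx = tvm.context("nvptx", 0)  # assumed to map to the GPU device
a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
b = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
f(a, b)
np.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)

The check_device helpers in the topi tests further down follow the same pattern, obtaining the context via tvm.context(device, 0).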
/*!
* Copyright (c) 2017 by Contributors
* \file intrin_rule_nvptx.cc
*/
#ifdef TVM_LLVM_VERSION
#include <tvm/ir.h>
#include <tvm/expr.h>
#include <tvm/api_registry.h>
#include <sstream>
namespace tvm {
namespace codegen {
inline void DispatchExternLibDevice(const TVMArgs& args, TVMRetValue* rv) {
  Expr e = args[0];
  using namespace ir;
  const Call* call = e.as<Call>();
  CHECK(call != nullptr);
  CHECK(call->type.bits() == 32 || call->type.bits() == 64) << "Only support float32 or float64.";
  std::ostringstream intrinsic_name;
  intrinsic_name << "__nv_" << call->name;
  if (call->type.bits() == 32) intrinsic_name << "f";
  *rv = Call::make(call->type, intrinsic_name.str(), call->args,
                   Call::PureExtern);
}
namespace llvm {
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.floor")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.ceil")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.round")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.trunc")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.exp")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.fma")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.log")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.sqrt")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.pow")
.set_body(DispatchExternLibDevice);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.nvptx.tanh")
.set_body(DispatchExternLibDevice);
} // namespace llvm
} // namespace codegen
} // namespace tvm
#endif  // TVM_LLVM_VERSION
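The dispatch rule above derives the libdevice symbol purely from the intrinsic name and the floating-point width: prefix the name with __nv_ and append f for float32, so exp maps to __nv_expf for float32 and __nv_exp for float64. A rough Python restatement of that mapping, for illustration only (libdevice_symbol is a made-up helper, not part of TVM):

# Illustrative restatement of DispatchExternLibDevice's name mangling.
def libdevice_symbol(name, bits):
    assert bits in (32, 64), "Only float32 or float64 are supported."
    return "__nv_" + name + ("f" if bits == 32 else "")

assert libdevice_symbol("exp", 32) == "__nv_expf"
assert libdevice_symbol("sqrt", 64) == "__nv_sqrt"
assert libdevice_symbol("tanh", 32) == "__nv_tanhf"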
/*!
* Copyright (c) 2017 by Contributors
- * \file intrin_rule_llvm.cc
+ * \file intrin_rule_rocm.cc
*/
#ifdef TVM_LLVM_VERSION
......
......@@ -158,7 +158,7 @@ def schedule_depthwise_conv2d_nhwc(outs):
# num_thread here could be 728, it is larger than cuda.max_num_threads
num_thread = tvm.ir_pass.Simplify(temp.shape[3]).value
target = tvm.target.current_target()
- if target and target.target_name != "cuda":
+ if target and (target.target_name not in ["cuda", "nvptx"]):
num_thread = target.max_num_threads
xoc, xic = s[Output].split(c, factor=num_thread)
s[Output].reorder(xoc, b, h, w, xic)
......
......@@ -30,6 +30,7 @@ def verify_broadcast_to_ele(in_shape, out_shape, fbcast):
check_device("cuda")
check_device("metal")
check_device("rocm")
check_device("nvptx")
def verify_broadcast_binary_ele(lhs_shape, rhs_shape,
......@@ -85,6 +86,7 @@ def verify_broadcast_binary_ele(lhs_shape, rhs_shape,
check_device("cuda")
check_device("metal")
check_device("rocm")
check_device("nvptx")
def test_broadcast_to():
verify_broadcast_to_ele((1,), (10,), topi.broadcast_to)
......
......@@ -52,7 +52,7 @@ def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, p
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
......
......@@ -44,8 +44,9 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p
w = tvm.nd.array(w_np, ctx)
b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+ no_unroll_explicit = device in ["cuda", "nvptx", "rocm"]
with tvm.build_config(auto_unroll_max_step=1400,
- unroll_explicit=(device != "cuda")):
+ unroll_explicit=not no_unroll_explicit):
func1 = tvm.build(s1, [A, W, B], device, name="conv2d_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
func1(a, w, b)
......@@ -53,7 +54,7 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
......
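The conv2d test above now disables explicit unrolling for cuda, nvptx, and rocm, presumably leaving loop unrolling to the downstream NVCC/LLVM compilers rather than expanding the loops in the generated code. A minimal sketch of the same configuration pattern (the device string and step count are just example values taken from the test):

# Hedged sketch of the unrolling policy used in the conv2d test above.
import tvm

device = "nvptx"  # example target string
no_unroll_explicit = device in ["cuda", "nvptx", "rocm"]

with tvm.build_config(auto_unroll_max_step=1400,
                      unroll_explicit=not no_unroll_explicit):
    # Any tvm.build(...) issued inside this block inherits the unrolling
    # policy from the active build config.
    pass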
......@@ -51,7 +51,7 @@ def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel,
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
......
......@@ -45,7 +45,7 @@ def verify_dense(batch, in_dim, out_dim, use_bias=True):
f(a, b, c, d)
np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_dense():
......
......@@ -93,6 +93,7 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu
check_device("metal")
check_device("rocm")
check_device("vulkan")
check_device("nvptx")
def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding, dilation=1):
......@@ -184,6 +185,7 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu
check_device("metal")
check_device("rocm")
check_device("vulkan")
check_device("nvptx")
def test_depthwise_conv2d():
print("testing nchw")
......
......@@ -87,6 +87,7 @@ def verify_depthwise_conv2d_back_input(batch, in_channel, in_h, channel_multipli
check_device("metal")
check_device("rocm")
check_device("vulkan")
check_device("nvptx")
def test_topi_depthwise_conv2d_backward_input_nhwc():
verify_depthwise_conv2d_back_input(16, 256, 56, 1, 3, 1, 1)
......
......@@ -80,6 +80,7 @@ def verify_depthwise_conv2d_back_weight(batch, in_channel, in_h, channel_multipl
check_device("metal")
check_device("rocm")
check_device("vulkan")
check_device("nvptx")
def test_topi_depthwise_conv2d_backward_weight_nhwc():
verify_depthwise_conv2d_back_weight(16, 256, 56, 1, 3, 1, 1)
......
......@@ -31,7 +31,7 @@ def verify_l2_normalize(ishape, eps, axis=None):
f(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_l2_normalize():
......
......@@ -30,7 +30,7 @@ def verify_lrn(shape, size, axis, bias, alpha, beta):
f(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_lrn():
......
......@@ -39,7 +39,7 @@ def test_ewise():
foo(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx']:
check_device(device)
......
......@@ -63,7 +63,7 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_
f(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_pool():
......@@ -104,7 +104,7 @@ def verify_global_pool(n, c, h, w, pool_type):
f(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_global_pool():
......
......@@ -25,12 +25,11 @@ def _my_npy_argmin(arr, axis, keepdims):
return arr.argmin(axis=axis).reshape(out_shape)
- def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
+ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum", dtype="float32"):
# Build the logic and compile the function
- dat_dtype = "float32"
- A = tvm.placeholder(shape=in_shape, name="A", dtype=dat_dtype)
+ A = tvm.placeholder(shape=in_shape, name="A", dtype=dtype)
A1 = topi.sqrt(topi.exp(A))
- out_dtype = "float32"
+ out_dtype = dtype
if type == "sum":
B = topi.sum(A1, axis=axis, keepdims=keepdims)
elif type == "max":
......@@ -57,8 +56,8 @@ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
foo = tvm.build(s, [A, B], device, name=type)
# Test
- in_npy = np.random.uniform(size=in_shape).astype(np.float32)
- in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32)
+ in_npy = np.random.uniform(size=in_shape).astype(dtype)
+ in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype)
if type == "sum":
out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
elif type == "max":
......@@ -91,7 +90,7 @@ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
else:
np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
for device in ["cuda", "opencl", "metal", "llvm", "rocm", "vulkan"]:
for device in ["cuda", "opencl", "metal", "llvm", "rocm", "vulkan", "nvptx"]:
check_device(device)
......@@ -128,6 +127,11 @@ def test_reduce_map():
axis=None,
keepdims=False,
type="sum")
+ verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
+ axis=(1, 2, 3),
+ keepdims=True,
+ type="sum",
+ dtype="float64")
if __name__ == "__main__":
test_reduce_map()
......@@ -27,7 +27,7 @@ def verify_relu(m, n):
foo(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
......
......@@ -40,7 +40,7 @@ def verify_bilinear_scale(batch, in_channel, in_height, in_width, out_height, ou
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)
- for device in ['llvm', 'cuda', 'vulkan']:
+ for device in ['llvm', 'cuda', 'vulkan', 'nvptx']:
check_device(device)
def test_resize():
......
......@@ -7,8 +7,8 @@ import topi.testing
import logging
from topi.util import get_const_tuple
- def verify_softmax(m, n):
- A = tvm.placeholder((m, n), name='A')
+ def verify_softmax(m, n, dtype="float32"):
+ A = tvm.placeholder((m, n), dtype=dtype, name='A')
B = topi.nn.softmax(A)
# confirm lower works
s = tvm.create_schedule([B.op])
......@@ -32,16 +32,16 @@ def verify_softmax(m, n):
foo(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
+ for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
check_device(device)
def test_softmax():
verify_softmax(32, 10)
verify_softmax(3, 4)
verify_softmax(32, 10, "float64")
- def verify_log_softmax(m, n):
- A = tvm.placeholder((m, n), name='A')
+ def verify_log_softmax(m, n, dtype="float32"):
+ A = tvm.placeholder((m, n), dtype=dtype, name='A')
B = topi.nn.log_softmax(A)
# confirm lower works
s = tvm.create_schedule([B.op])
......@@ -63,13 +63,14 @@ def verify_log_softmax(m, n):
foo(a, b)
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
for device in ["cuda", "opencl", "metal", "rocm", "vulkan"]:
for device in ["cuda", "opencl", "metal", "rocm", "vulkan", "nvptx"]:
check_device(device)
def test_log_softmax():
verify_log_softmax(32, 10)
verify_log_softmax(3, 4)
verify_log_softmax(32, 10, "float64")
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
......
......@@ -41,7 +41,7 @@ def verify_upsampling(batch, in_channel, in_height, in_width, scale, layout='NCH
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
- for device in ['llvm', 'cuda', 'vulkan']:
+ for device in ['llvm', 'cuda', 'vulkan', 'nvptx']:
check_device(device)
def test_upsampling():
......