Commit 2afe0248 by Dayananda V Committed by Tianqi Chen

Vulkan TVM Android Support (#1571)

parent 764516a6
...@@ -123,18 +123,25 @@ export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++ ...@@ -123,18 +123,25 @@ export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++
python android_rpc_test.py python android_rpc_test.py
``` ```
This will compile TVM IR to shared libraries (CPU and OpenCL) and run vector addition on your Android device. On my test device, it gives following results. This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. To verify compiled TVM IR shared libraries on OpenCL target set [`'test_opencl = True'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L25) and on Vulkan target set [`'test_vulkan = False'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L27) in [tests/android_rpc_test.py](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py), by default on CPU target will execute.
On my test device, it gives following results.
```bash ```bash
TVM: Initializing cython mode...
[01:21:43] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64
[01:21:43] src/runtime/opencl/opencl_device_api.cc:194: Initialize OpenCL platform 'Apple'
[01:21:43] src/runtime/opencl/opencl_device_api.cc:214: opencl(0)='Iris' cl_device_id=0x1024500
[01:21:44] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64
Run GPU test ...
0.000155807 secs/op
Run CPU test ... Run CPU test ...
0.00139824 secs/op 0.000962932 secs/op
Run GPU(OpenCL Flavor) test ...
0.000155807 secs/op
[23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:674: Cannot initialize vulkan: [23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:512: Check failed: __e == VK_SUCCESS Vulan Error, code=-9: VK_ERROR_INCOMPATIBLE_DRIVER
Stack trace returned 10 entries:
[bt] (0) /home/user/.local/lib/python3.6/site-packages/tvm-0.4.0-py3.6-linux-x86_64.egg/tvm/libtvm.so(dmlc::StackTrace[abi:cxx11]()+0x53) [0x7f477f5399f3]
.........
You can still compile vulkan module but cannot run locally
Run GPU(Vulkan Flavor) test ...
0.000225198 secs/op
``` ```
You can define your own TVM operators and test via this RPC app on your Android device to find the most optimized TVM schedule. You can define your own TVM operators and test via this RPC app on your Android device to find the most optimized TVM schedule.
ifndef config ifndef config
ifneq ("$(wildcard ./config.mk)","") ifneq ("$(wildcard ./config.mk)","")
config ?= config.mk config ?= config.mk
else else
config ?= make/config.mk config ?= make/config.mk
endif endif
endif endif
include $(config) include $(config)
...@@ -16,10 +16,10 @@ APP_STL := c++_static ...@@ -16,10 +16,10 @@ APP_STL := c++_static
APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti
ifeq ($(USE_OPENCL), 1) ifeq ($(USE_OPENCL), 1)
APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
endif endif
ifeq ($(USE_VULKAN), 1) ifeq ($(USE_VULKAN), 1)
APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1 APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1
APP_LDFLAGS += -lvulkan APP_LDFLAGS += -lvulkan
endif endif
...@@ -21,59 +21,92 @@ key = "android" ...@@ -21,59 +21,92 @@ key = "android"
arch = "arm64" arch = "arm64"
target = "llvm -target=%s-linux-android" % arch target = "llvm -target=%s-linux-android" % arch
# whether enable to execute test on OpenCL target
test_opencl = False
# whether enable to execute test on Vulkan target
test_vulkan = False
def test_rpc_module(): def test_rpc_module():
# graph # graph
n = tvm.convert(1024) n = tvm.convert(1024)
A = tvm.placeholder((n,), name='A') A = tvm.placeholder((n,), name='A')
B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
a_np = np.random.uniform(size=1024).astype(A.dtype)
temp = util.tempdir() temp = util.tempdir()
s = tvm.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=64)
s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
# Build the dynamic lib.
# If we don't want to do metal and only use cpu, just set target to be target
f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
path_dso1 = temp.relpath("dev_lib2.so")
f.export_library(path_dso1, ndk.create_shared)
# Establish remote connection with target hardware
tracker = rpc.connect_tracker(tracker_host, tracker_port)
remote = tracker.request(key, priority=0,
session_timeout=60)
# Compile the Graph for CPU target
s = tvm.create_schedule(B.op) s = tvm.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=64) xo, xi = s[B].split(B.op.axis[0], factor=64)
s[B].parallel(xi) s[B].parallel(xi)
s[B].pragma(xo, "parallel_launch_point") s[B].pragma(xo, "parallel_launch_point")
s[B].pragma(xi, "parallel_barrier_when_finish") s[B].pragma(xi, "parallel_barrier_when_finish")
f = tvm.build(s, [A, B], target, name="myadd_cpu") f = tvm.build(s, [A, B], target, name="myadd_cpu")
path_dso2 = temp.relpath("cpu_lib.so") path_dso_cpu = temp.relpath("cpu_lib.so")
f.export_library(path_dso2, ndk.create_shared) f.export_library(path_dso_cpu, ndk.create_shared)
tracker = rpc.connect_tracker(tracker_host, tracker_port)
remote = tracker.request(key, priority=0,
session_timeout=60)
# Execute the portable graph on cpu target
print('Run CPU test ...') print('Run CPU test ...')
ctx = remote.cpu(0) ctx = remote.cpu(0)
remote.upload(path_dso2) remote.upload(path_dso_cpu)
f2 = remote.load_module("cpu_lib.so") f2 = remote.load_module("cpu_lib.so")
a_np = np.random.uniform(size=1024).astype(A.dtype)
a = tvm.nd.array(a_np, ctx) a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
time_f = f2.time_evaluator(f2.entry_name, ctx, number=10) time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
cost = time_f(a, b).mean cost = time_f(a, b).mean
print('%g secs/op' % cost) print('%g secs/op\n' % cost)
np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
# Compile the Graph for OpenCL target
if test_opencl:
s = tvm.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=64)
s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
# Build the dynamic lib.
# If we don't want to do metal and only use cpu, just set target to be target
f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
path_dso_cl = temp.relpath("dev_lib_cl.so")
f.export_library(path_dso_cl, ndk.create_shared)
print('Run GPU(OpenCL Flavor) test ...')
ctx = remote.cl(0)
remote.upload(path_dso_cl)
f1 = remote.load_module("dev_lib_cl.so")
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
cost = time_f(a, b).mean
print('%g secs/op\n' % cost)
np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
# Compile the Graph for Vulkan target
if test_vulkan:
s = tvm.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=64)
s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
# Build the dynamic lib.
# If we don't want to do metal and only use cpu, just set target to be target
f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd")
path_dso_vulkan = temp.relpath("dev_lib_vulkan.so")
f.export_library(path_dso_vulkan, ndk.create_shared)
print('Run GPU(Vulkan Flavor) test ...')
ctx = remote.vulkan(0)
remote.upload(path_dso_vulkan)
f1 = remote.load_module("dev_lib_vulkan.so")
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
cost = time_f(a, b).mean
print('%g secs/op\n' % cost)
np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
print('Run GPU test ...')
ctx = remote.cl(0)
remote.upload(path_dso1)
f1 = remote.load_module("dev_lib2.so")
a_np = np.random.uniform(size=1024).astype(A.dtype)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
cost = time_f(a, b).mean
print('%g secs/op' % cost)
np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
if __name__ == "__main__": if __name__ == "__main__":
test_rpc_module() test_rpc_module()
...@@ -30,6 +30,7 @@ public class TVMContext { ...@@ -30,6 +30,7 @@ public class TVMContext {
MASK2STR.put(1, "cpu"); MASK2STR.put(1, "cpu");
MASK2STR.put(2, "gpu"); MASK2STR.put(2, "gpu");
MASK2STR.put(4, "opencl"); MASK2STR.put(4, "opencl");
MASK2STR.put(7, "vulkan");
MASK2STR.put(8, "metal"); MASK2STR.put(8, "metal");
MASK2STR.put(9, "vpi"); MASK2STR.put(9, "vpi");
...@@ -38,6 +39,7 @@ public class TVMContext { ...@@ -38,6 +39,7 @@ public class TVMContext {
STR2MASK.put("cuda", 2); STR2MASK.put("cuda", 2);
STR2MASK.put("cl", 4); STR2MASK.put("cl", 4);
STR2MASK.put("opencl", 4); STR2MASK.put("opencl", 4);
STR2MASK.put("vulkan", 7);
STR2MASK.put("metal", 8); STR2MASK.put("metal", 8);
STR2MASK.put("vpi", 9); STR2MASK.put("vpi", 9);
} }
...@@ -82,6 +84,19 @@ public class TVMContext { ...@@ -82,6 +84,19 @@ public class TVMContext {
} }
/** /**
* Construct a Vulkan device.
* @param devId The device id
* @return The created context
*/
public static TVMContext vulkan(int devId) {
return new TVMContext(7, devId);
}
public static TVMContext vulkan() {
return vulkan(0);
}
/**
* Construct a metal device. * Construct a metal device.
* @param devId The device id * @param devId The device id
* @return The created context * @return The created context
......
...@@ -144,6 +144,24 @@ public class RPCSession { ...@@ -144,6 +144,24 @@ public class RPCSession {
} }
/** /**
* Construct remote OpenCL device.
* @param devId device id.
* @return Remote OpenCL context.
*/
public TVMContext vulkan(int devId) {
return context(7, devId);
}
/**
* Construct remote OpenCL device.
* @return Remote OpenCL context.
*/
public TVMContext vulkan() {
return vulkan(0);
}
/**
* Construct remote Metal device. * Construct remote Metal device.
* @param devId device id. * @param devId device id.
* @return Remote metal context. * @return Remote metal context.
......
...@@ -130,6 +130,10 @@ class RPCSession(object): ...@@ -130,6 +130,10 @@ class RPCSession(object):
"""Construct OpenCL device.""" """Construct OpenCL device."""
return self.context(4, dev_id) return self.context(4, dev_id)
def vulkan(self, dev_id=0):
"""Construct Vulkan device."""
return self.context(7, dev_id)
def metal(self, dev_id=0): def metal(self, dev_id=0):
"""Construct Metal device.""" """Construct Metal device."""
return self.context(8, dev_id) return self.context(8, dev_id)
......
...@@ -696,6 +696,7 @@ var tvm_runtime = tvm_runtime || {}; ...@@ -696,6 +696,7 @@ var tvm_runtime = tvm_runtime || {};
1 : "cpu", 1 : "cpu",
2 : "gpu", 2 : "gpu",
4 : "opencl", 4 : "opencl",
7 : "vulkan",
8 : "metal", 8 : "metal",
9 : "vpi", 9 : "vpi",
11 : "opengl", 11 : "opengl",
...@@ -706,6 +707,7 @@ var tvm_runtime = tvm_runtime || {}; ...@@ -706,6 +707,7 @@ var tvm_runtime = tvm_runtime || {};
"cuda": 2, "cuda": 2,
"cl": 4, "cl": 4,
"opencl": 4, "opencl": 4,
"vulkan": 7,
"metal": 8, "metal": 8,
"vpi": 9, "vpi": 9,
"opengl": 11, "opengl": 11,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment