Commit cf81f9f9 by Tianqi Chen Committed by GitHub

[CUDA] Enable int64 (#683)

* [CUDA] Enable int64

* [PYTHON] Fix rpc tutorial with opencl

* OK

* update
parent f5a6e5e2
...@@ -15,6 +15,8 @@ import socket ...@@ -15,6 +15,8 @@ import socket
import struct import struct
import logging import logging
import multiprocessing import multiprocessing
import subprocess
import time
from . import util, cc, tar from . import util, cc, tar
from ..module import load as _load_module from ..module import load as _load_module
from .._ffi.function import _init_api, register_func from .._ffi.function import _init_api, register_func
...@@ -117,6 +119,17 @@ def _connect_proxy_loop(addr, key): ...@@ -117,6 +119,17 @@ def _connect_proxy_loop(addr, key):
process.join() process.join()
def _popen(cmd):
    """Run *cmd* to completion and raise if it exits with a non-zero code.

    Parameters
    ----------
    cmd : list of str
        The command and its arguments, passed directly to subprocess.Popen
        (no shell involved).

    Raises
    ------
    RuntimeError
        If the command's return code is non-zero; the combined
        stdout/stderr output of the child is included in the message.
    """
    # NOTE(review): relies on `os` being imported at module top in the
    # original file — confirm, as it is not visible in this chunk.
    proc = subprocess.Popen(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            env=os.environ)
    (out, _) = proc.communicate()
    if proc.returncode != 0:
        # communicate() returns bytes on Python 3; decode before joining
        # with the str prefix (the original `msg += out` raised TypeError
        # here, masking the real server error).
        if isinstance(out, bytes):
            out = out.decode("utf-8", errors="replace")
        raise RuntimeError("Server invoke error:\n" + out)
class Server(object): class Server(object):
"""Start RPC server on a seperate process. """Start RPC server on a seperate process.
...@@ -140,15 +153,36 @@ class Server(object): ...@@ -140,15 +153,36 @@ class Server(object):
If this is true, the host and port actually corresponds to the If this is true, the host and port actually corresponds to the
address of the proxy server. address of the proxy server.
use_popen : bool, optional
Whether to use Popen to start a fresh new process instead of fork.
Switching this on is recommended for local RPC demonstration
with GPU devices, to avoid fork safety issues.
key : str, optional key : str, optional
The key used to identify the server in Proxy connection. The key used to identify the server in Proxy connection.
""" """
def __init__(self, host, port=9091, port_end=9199, is_proxy=False, key=""): def __init__(self,
host,
port=9091,
port_end=9199,
is_proxy=False,
use_popen=False,
key=""):
self.host = host self.host = host
self.port = port self.port = port
self.libs = [] self.libs = []
if not is_proxy: if use_popen:
cmd = ["python",
"-m", "tvm.exec.rpc_server",
"--host=%s" % host,
"--port=%s" % port]
self.proc = multiprocessing.Process(
target=subprocess.check_call, args=(cmd,))
self.proc.deamon = True
self.proc.start()
time.sleep(1)
elif not is_proxy:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.port = None self.port = None
for my_port in range(port, port_end): for my_port in range(port, port_end):
...@@ -168,11 +202,15 @@ class Server(object): ...@@ -168,11 +202,15 @@ class Server(object):
self.sock = sock self.sock = sock
self.proc = multiprocessing.Process( self.proc = multiprocessing.Process(
target=_listen_loop, args=(self.sock,)) target=_listen_loop, args=(self.sock,))
self.proc.deamon = True
self.proc.start()
else: else:
self.proc = multiprocessing.Process( self.proc = multiprocessing.Process(
target=_connect_proxy_loop, args=((host, port), key)) target=_connect_proxy_loop, args=((host, port), key))
self.proc.deamon = True self.proc.deamon = True
self.proc.start() self.proc.start()
def terminate(self): def terminate(self):
"""Terminate the server process""" """Terminate the server process"""
......
...@@ -66,7 +66,11 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) const { // NOLINT(*) ...@@ -66,7 +66,11 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) const { // NOLINT(*)
} }
} else if (t.is_uint() || t.is_int()) { } else if (t.is_uint() || t.is_int()) {
if (t.is_uint()) { if (t.is_uint()) {
os << 'u'; if (t.lanes() != 1) {
os << "u";
} else {
os << "unsigned ";
}
} }
if (t.bits() == 8 && t.lanes() == 4) { if (t.bits() == 8 && t.lanes() == 4) {
// directly 4 8 bit int in integer. // directly 4 8 bit int in integer.
...@@ -77,16 +81,16 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) const { // NOLINT(*) ...@@ -77,16 +81,16 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) const { // NOLINT(*)
case 16: os << "short"; break; case 16: os << "short"; break;
case 32: os << "int"; break; case 32: os << "int"; break;
case 64: { case 64: {
if (lanes != 1 && sizeof(long) == 64) { // NOLINT(*) CHECK(sizeof(long) == 8) // NOLINT(*)
os << "long"; break; << "CUDA not support int64 int in 32 bit system";
} else { os << "long"; break;
os << "int64_t"; break;
}
} }
case 1: os << "int"; break; case 1: os << "int"; break;
default: fail = true; break; default: fail = true; break;
} }
if (!fail && lanes == 1) return; if (!fail && lanes == 1) {
return;
}
if (!fail && (lanes >= 2 && lanes <= 4)) { if (!fail && (lanes >= 2 && lanes <= 4)) {
os << lanes; return; os << lanes; return;
} }
......
...@@ -80,53 +80,58 @@ def test_popcount_llvm(): ...@@ -80,53 +80,58 @@ def test_popcount_llvm():
b.asnumpy(), list(map(lambda x: bin(x).count('1'), a.asnumpy())), rtol=1e-5) b.asnumpy(), list(map(lambda x: bin(x).count('1'), a.asnumpy())), rtol=1e-5)
def test_add(): def test_add():
# graph def run(dtype):
n = tvm.var('n') # graph
A = tvm.placeholder((n,), name='A') n = tvm.var('n')
B = tvm.placeholder((n,), name='B') A = tvm.placeholder((n,), name='A', dtype=dtype)
bias = tvm.var("bias", dtype="float32") B = tvm.placeholder((n,), name='B', dtype=dtype)
scale = tvm.var("scale", dtype="float32") bias = tvm.var("bias", dtype=dtype)
C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i) * scale + bias, name='C') scale = tvm.var("scale", dtype=dtype)
# schedule C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
s = tvm.create_schedule(C.op) # schedule
# create iter var and assign them tags. s = tvm.create_schedule(C.op)
num_thread = 32 # create iter var and assign them tags.
bx, x = s[C].split(C.op.axis[0], factor=num_thread*4) num_thread = 16
tx, x = s[C].split(x, nparts=num_thread) bx, x = s[C].split(C.op.axis[0], factor=num_thread*4)
_, x = s[C].split(x, factor=4) tx, x = s[C].split(x, nparts=num_thread)
s[C].bind(bx, tvm.thread_axis("blockIdx.x")) _, x = s[C].split(x, factor=4)
s[C].bind(tx, tvm.thread_axis("threadIdx.x")) s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
s[C].vectorize(x) s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
s[C].vectorize(x)
# one line to build the function. # one line to build the function.
def check_device(device): def check_device(device):
if not tvm.module.enabled(device): if not tvm.module.enabled(device):
print("skip because %s is not enabled.." % device) print("skip because %s is not enabled.." % device)
return return
fadd = tvm.build(s, [A, B, C, bias, scale], fadd = tvm.build(s, [A, B, C],
device, device,
name="myadd") name="myadd")
ctx = tvm.context(device, 0) print(fadd.imported_modules[0].get_source())
# launch the kernel. ctx = tvm.context(device, 0)
n = 1024 # launch the kernel.
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) n = 1024
b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), ctx)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), ctx)
vbias = np.random.uniform() c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
vscale = np.random.uniform() ftimer = fadd.time_evaluator(fadd.entry_name, ctx, number=1)
ftimer = fadd.time_evaluator(fadd.entry_name, ctx, number=10) tcost = ftimer(a, b, c).mean
tcost = ftimer(a, b, c, vbias, vscale).mean np.testing.assert_allclose(
np.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy(), rtol=1e-6)
c.asnumpy(), a.asnumpy() + b.asnumpy() * vscale + vbias, rtol=1e-6)
check_device("opencl") check_device("opencl")
check_device("metal") check_device("metal")
check_device("cuda") check_device("cuda")
run("float32")
run("int32")
run("int64")
run("uint64")
if __name__ == "__main__": if __name__ == "__main__":
test_add()
test_log_pow_llvm() test_log_pow_llvm()
test_popcount_llvm() test_popcount_llvm()
test_exp() test_exp()
test_add()
...@@ -11,7 +11,7 @@ mv out docs/_build/html/jsdoc || exit -1 ...@@ -11,7 +11,7 @@ mv out docs/_build/html/jsdoc || exit -1
rm -rf python/tvm/*.pyc python/tvm/*/*.pyc rm -rf python/tvm/*.pyc python/tvm/*/*.pyc
cd docs cd docs
PYTHONPATH=../python make html || exit -1 PYTHONPATH=`pwd`/../python make html || exit -1
cd _build/html cd _build/html
tar czf docs.tgz * tar czf docs.tgz *
mv docs.tgz ../../../ mv docs.tgz ../../../
...@@ -101,7 +101,7 @@ from tvm.contrib import rpc, util ...@@ -101,7 +101,7 @@ from tvm.contrib import rpc, util
# same machine, for demonstration. This line can be omitted if we # same machine, for demonstration. This line can be omitted if we
# started an remote server. # started an remote server.
# #
server = rpc.Server(host='0.0.0.0', port=9090) server = rpc.Server(host='0.0.0.0', port=9090, use_popen=True)
###################################################################### ######################################################################
# Declare and Cross Compile Kernel on Local Machine # Declare and Cross Compile Kernel on Local Machine
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment