[TUTORIAL] Cross Compilation and RPC (#184)

* [TUTORIAL] Add tutorial for RPC * [TUTORIAL] Update tutorial * [TUTORIAL] Update tutorial * trigger update * [TUTORIAL] Improve build

[TUTORIAL] Cross Compilation and RPC (#184)
* [TUTORIAL] Add tutorial for RPC * [TUTORIAL] Update tutorial * [TUTORIAL] Update tutorial * trigger update * [TUTORIAL] Improve build
fcfec961 · ziheng · GitHub · d0041efd · fcfec961 · fcfec961
Commit fcfec961 authored Jun 16, 2017 by ziheng Committed by GitHub Jun 16, 2017
8 changed files
--- a/Makefile
+++ b/Makefile
@@ -14,6 +14,7 @@ include $(config)
 BUILD_TARGETS ?= lib/libtvm.so lib/libtvm_runtime.so
 all: ${BUILD_TARGETS}
+runtime: lib/libtvm_runtime.so
 ifndef DMLC_CORE_PATH
  DMLC_CORE_PATH = $(ROOTDIR)/dmlc-core

--- a/docs/api/python/contrib.rst
+++ b/docs/api/python/contrib.rst
@@ -12,6 +12,11 @@ tvm.contrib.cc_compiler
 .. automodule:: tvm.contrib.cc_compiler
    :members:
+tvm.contrib.rpc
+~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.rpc
+    :members:
 tvm.contrib.util
 ~~~~~~~~~~~~~~~~
 .. automodule:: tvm.contrib.util

--- a/python/tvm/_ffi/libinfo.py
+++ b/python/tvm/_ffi/libinfo.py
@@ -38,12 +38,19 @@ def find_lib_path():
        lib_dll_path = [os.path.join(p, 'libtvm.so') for p in dll_path]
        runtime_dll_path = [os.path.join(p, 'libtvm_runtime.so') for p in dll_path]
-    dll_path = runtime_dll_path if use_runtime else lib_dll_path
+    if not use_runtime:
-    lib_found = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
+        # try to find lib_dll_path
+        lib_found = [p for p in lib_dll_path if os.path.exists(p) and os.path.isfile(p)]
+    if use_runtime or not lib_found:
+        # try to find runtime_dll_path
+        use_runtime = True
+        lib_found = [p for p in runtime_dll_path if os.path.exists(p) and os.path.isfile(p)]
    if not lib_found:
        raise RuntimeError('Cannot find the files.\n' +
-                           'List of candidates:\n' + str('\n'.join(dll_path)))
+                           'List of candidates:\n' +
+                           str('\n'.join(lib_dll_path + runtime_dll_path)))
    if use_runtime:
        sys.stderr.write("Loading runtime library %s... exec only\n" % lib_found[0])
        sys.stderr.flush()

--- a/python/tvm/build.py
+++ b/python/tvm/build.py
@@ -194,7 +194,25 @@ def build(sch,
        The argument lists to the function.
    target : str, optional
-        The target of the compilation.
+        The target and options of the compilation.
+        When the target is llvm, you can set options like:
+          * **-mtriple=<target triple>** or **-target**
+            Specify the target triple, which is useful for cross
+            compilation.
+          * **-mcpu=<cpuname>**
+            Specify a specific chip in the current architecture to
+            generate code for. By default this is inferred from the
+            target triple and autodetected to the current architecture.
+          * **-mattr=a1,+a2,-a3,...**
+            Override or control specific attributes of the target,
+            such as whether SIMD operations are enabled or not. The
+            default set of attributes is set by the current CPU.
    target_host : str, optional
        Host compilation target, if target is device.

--- a/python/tvm/contrib/rpc.py
+++ b/python/tvm/contrib/rpc.py
@@ -278,7 +278,10 @@ def connect(url, port):
    sess : RPCSession
        The connected session.
    """
-    sess = _Connect(url, port)
+    try:
+        sess = _Connect(url, port)
+    except NameError:
+        raise RuntimeError('Please compile with USE_RPC=1')
    return RPCSession(sess)
 _init_api("tvm.contrib.rpc")
--- a/src/codegen/llvm/llvm_common.cc
+++ b/src/codegen/llvm/llvm_common.cc
@@ -45,7 +45,7 @@ GetLLVMTargetMachine(const std::string& target_str) {
  // simple parser
  std::string target_triple = "";
  std::string cpu = "generic";
-  std::string features = "";
+  std::string attr = "";
  std::string key, value;
  if (target_str.length() > 5) {
    std::istringstream is(target_str.substr(5, target_str.length() - 5));
@@ -65,8 +65,8 @@ GetLLVMTargetMachine(const std::string& target_str) {
        target_triple = value;
      } else if (key == "-mcpu") {
        cpu = value;
-      } else if (key == "-features") {
+      } else if (key == "-mattr") {
-        features = value;
+        attr = value;
      } else {
        LOG(FATAL) << "unknown option " << key;
      }
@@ -83,7 +83,7 @@ GetLLVMTargetMachine(const std::string& target_str) {
  llvm::TargetOptions opt;
  auto rmodel = llvm::Reloc::PIC_;
  llvm::TargetMachine* tm =
-      target->createTargetMachine(target_triple, cpu, features, opt, rmodel);
+      target->createTargetMachine(target_triple, cpu, attr, opt, rmodel);
  return tm;
 }

--- a/tutorials/python/cross_compilation_and_rpc.py
+++ b/tutorials/python/cross_compilation_and_rpc.py
+"""
+Cross Compilation and RPC
+=========================
+**Author**: `Ziheng Jiang <https://github.com/ZihengJiang/>`_
+This tutorial introduces cross compilation and remote device
+execution with RPC in TVM.
+With cross compilation and RPC, you can compile a program on your
+local machine and then run it on a remote device. This is useful when
+the resources of the remote device are limited, as on a Raspberry Pi
+or mobile platforms, and you do not want to run the compilation on
+the device itself, which saves both time and space.
+In this tutorial, I will use the Raspberry Pi as the example target
+platform.
+"""
+from __future__ import absolute_import, print_function
+import tvm
+import numpy as np
+from tvm.contrib import rpc, util
+######################################################################
+# Set Up RPC Server on Device
+# ---------------------------
+# To set up a TVM RPC server on the board, we have prepared a script
+# so you only need to run this command after following the
+# installation guide to install TVM on your device:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
+#
+# In the following code block, we simply start an RPC server on the
+# same machine for demonstration. This line can be omitted if a
+# remote server has already been started.
+#
+server = rpc.Server(host='0.0.0.0', port=9090)
+######################################################################
+# .. note::
+#
+#   Usually the device has limited resources, so we only need to build
+#   the runtime. The idea is to use the TVM compiler on the local
+#   machine to compile the program, upload the compiled program to the
+#   device, and run the device function remotely.
+#
+#   .. code-block:: bash
+#
+#     make runtime
+#
+#   Also make sure that you have set :code:`USE_RPC=1` in your
+#   :code:`config.mk`.
+#
+######################################################################
+# Declare and Cross Compile Kernel on Local Machine
+# -------------------------------------------------
+# Here we will declare a simple kernel with TVM on the local machine:
+#
+n = tvm.convert(1024)
+A = tvm.placeholder((n,), name='A')
+B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
+s = tvm.create_schedule(B.op)
+######################################################################
+# Then we cross compile the kernel:
+#
+# The target here should be 'llvm -target=armv7l-none-linux-gnueabihf',
+# but we use 'llvm' to make the example run locally; see the detailed
+# note in the following block.
+f = tvm.build(s, [A, B], target='llvm', name='myadd')
+# save the lib at local temp folder
+temp = util.tempdir()
+path = temp.relpath('mylib.o')
+f.save(path)
+######################################################################
+# .. note::
+#
+#   The argument :code:`target` in :code:`build` should be changed from
+#   :code:`'llvm'` to a target string that includes the target triple of
+#   your device, which may differ from device to device. For example, it is
+#   :code:`'llvm -target=armv7l-none-linux-gnueabihf'` for my Raspberry
+#   Pi. Here we use :code:`'llvm'` directly to make the tutorial runnable.
+#
+#   Usually, you can query the target by executing :code:`gcc -v` on your
+#   device, although the result may still be a loose configuration.
+#
+#   Besides :code:`-target`, you can also set other compilation options
+#   like:
+#
+#   * -mtriple=<target triple>
+#       Specify the target triple, same as '-target'.
+#   * -mcpu=<cpuname>
+#       Specify a specific chip in the current architecture to generate code for. By default this is inferred from the target triple and autodetected to the current architecture.
+#   * -mattr=a1,+a2,-a3,...
+#       Override or control specific attributes of the target, such as whether SIMD operations are enabled or not. The default set of attributes is set by the current CPU.
+#       To get the list of available attributes, you can do:
+#
+#       .. code-block:: bash
+#
+#         llc -mtriple=<your device target triple> -mattr=help
+#
+#   These options are consistent with `llc <http://llvm.org/docs/CommandGuide/llc.html>`_.
+#   So for my board, to get the best performance, the complete compilation
+#   option would be:
+#
+#   .. code-block:: bash
+#
+#     llvm -mtriple=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon
+#
+#   It is recommended to set the target triple and the feature set to
+#   match the specific features available on the board, so that we can
+#   take full advantage of them.
+#   You can find more details about cross compilation attributes from
+#   `LLVM guide of cross compilation <https://clang.llvm.org/docs/CrossCompilation.html>`_.
+######################################################################
+# Run Kernel Remotely by RPC
+# --------------------------
+# Here we will show you how to run the kernel on the remote device:
+# replace host with the ip address of your device
+host = '0.0.0.0'
+port = 9090
+# connect the remote device
+remote = rpc.connect(host, port)
+######################################################################
+# Here we upload the lib to the remote device, then invoke a device-local
+# compiler to build a shared lib and load it into device memory. Now `f`
+# is a remote module object.
+remote.upload(path)
+f = remote.load_module('mylib.o')
+# create array on the remote device
+ctx = remote.cpu(0)
+a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
+b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+# the function will run on the remote device
+f(a, b)
+np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
+######################################################################
+# When you want to evaluate the performance of the kernel on the remote
+# device, it is important to avoid the overhead of remote function calls.
+# :code:`time_evaluator` returns a remote function that runs the
+# function :code:`number` times, measures the cost per run on the remote
+# device, and returns the measured cost.
+#
+time_f = f.time_evaluator(f.entry_name, ctx, number=10)
+cost = time_f(a, b)
+print('%g secs/op' % cost)
+# terminate the server after experiment
+server.terminate()
+######################################################################
+# Summary
+# -------
+# This tutorial provides a walkthrough of the cross compilation and RPC
+# features in TVM.
+#
+# - Set up RPC server on the remote device.
+# - Set up target device configuration to cross compile kernel on the
+#   local machine.
+# - Upload and run the kernel remotely by RPC API.
--- a/tutorials/python/tuple_inputs_operation.py
+++ b/tutorials/python/tuple_inputs_operation.py
 """
-Compute and Reduction with Tuple Inputs
+Compute and Reduce with Tuple Inputs
 =======================================
 **Author**: `Ziheng Jiang <https://github.com/ZihengJiang>`_