Commit 3957926e by Tianqi Chen (committed by GitHub)

[RUNTIME] Make runtime DLPack compatible, allow new device plugin (#71)

* [RUNTIME] Refactor runtime to be DLPack compatible. Enable plugin of new runtime.

* fix mac compile

* ok
parent 9e660dbe
......@@ -4,3 +4,6 @@
[submodule "HalideIR"]
path = HalideIR
url = ssh://git@github.com/tqchen/HalideIR
[submodule "dlpack"]
path = dlpack
url = https://github.com/dmlc/dlpack
......@@ -14,6 +14,9 @@ tvm_option(USE_MSVC_MT "Build with MT" OFF)
include_directories("include")
include_directories("HalideIR/src")
include_directories("dlpack/include")
set(TVM_LINKER_LIBS "")
set(TVM_RUNTIME_LINKER_LIBS "")
......
......@@ -28,7 +28,7 @@ ALL_DEP = $(ALL_OBJ) $(LIB_HALIDE_IR)
export LDFLAGS = -pthread -lm
export CFLAGS = -std=c++11 -Wall -O2 -fno-rtti\
-Iinclude -Idmlc-core/include -IHalideIR/src -fPIC -DDMLC_ENABLE_RTTI=0
-Iinclude -Idlpack/include -Idmlc-core/include -IHalideIR/src -fPIC -DDMLC_ENABLE_RTTI=0
ifdef CUDA_PATH
NVCC=$(CUDA_PATH)/bin/nvcc
......
Subproject commit 9f433c5ecfdd47184339cdd2b99706d24fae3aa1
......@@ -70,7 +70,13 @@ class BufferNode : public Node {
Array<Expr> strides;
/*! \brief data type in the content of the tensor */
Type dtype;
// Maybe need more information(alignment) later
/*!
* \brief The offset in bytes from the beginning of the data pointer.
* Can be undefined, in which case it is treated as zero.
*/
Expr byte_offset;
/*! \brief Alignment of byte_offset, in bytes */
int offset_alignment;
/*! \brief constructor */
BufferNode() {}
......@@ -80,13 +86,17 @@ class BufferNode : public Node {
v->Visit("shape", &shape);
v->Visit("strides", &strides);
v->Visit("dtype", &dtype);
v->Visit("byte_offset", &byte_offset);
v->Visit("offset_alignment", &offset_alignment);
}
static Buffer make(std::string name,
Var ptr,
Array<Expr> shape,
Array<Expr> strides,
Type dtype);
Type dtype,
Expr byte_offset,
int offset_alignment);
static constexpr const char* _type_key = "Buffer";
TVM_DECLARE_NODE_TYPE_INFO(BufferNode, Node);
......
......@@ -41,6 +41,14 @@ using Halide::Internal::const_true;
using Halide::Internal::const_false;
using Halide::Internal::is_no_op;
inline Type TVMShapeIndexType() {
if (std::is_signed<tvm_index_t>::value) {
return Int(sizeof(tvm_index_t) * 8);
} else {
return UInt(sizeof(tvm_index_t) * 8);
}
}
inline Type TVMType2Type(TVMType t) {
return Type(static_cast<halide_type_code_t>(t.code), t.bits, t.lanes);
}
......
......@@ -167,7 +167,8 @@ enum TVMArrayFieldKind {
kStrides = 3,
kTypeCode = 4,
kTypeBits = 5,
kTypeLanes = 6
kTypeLanes = 6,
kByteOffset = 7
};
} // namespace intrinsic
......
......@@ -31,22 +31,22 @@
#include <stdint.h>
#include <stddef.h>
// TVM Runtime is DLPack compatible.
#include <dlpack/dlpack.h>
TVM_EXTERN_C {
/*! \brief type of array index. */
typedef uint32_t tvm_index_t;
typedef int64_t tvm_index_t;
/*!
* \brief The type code in TVMType
* \note TVMType is used in two places.
*/
typedef enum {
kInt = 0U,
kUInt = 1U,
kFloat = 2U,
kHandle = 3U,
// The type codes above are compatible with DLPack.
// The next few codes are extension types
// used by TVM API calls.
kHandle = 3U,
kNull = 4U,
kArrayHandle = 5U,
kTVMType = 6U,
......@@ -67,14 +67,17 @@ typedef enum {
*
* \note Arguments to TVM API functions always take bits=64 and lanes=1
*/
typedef struct {
/*! \brief type code, in TVMTypeCode */
uint8_t code;
/*! \brief number of bits of the type */
uint8_t bits;
/*! \brief number of lanes, */
uint16_t lanes;
} TVMType;
typedef DLDataType TVMType;
/*!
* \brief The Device information, abstract away common device types.
*/
typedef DLContext TVMContext;
/*!
* \brief The tensor array structure passed to TVM API.
*/
typedef DLTensor TVMArray;
/*!
* \brief Union type of values
......@@ -97,50 +100,6 @@ typedef struct {
size_t size;
} TVMByteArray;
/*!
* \brief The device type
*/
typedef enum {
/*! \brief CPU device */
kCPU = 1,
/*! \brief NVidia GPU device(CUDA) */
kGPU = 2,
/*! \brief opencl device */
kOpenCL = 4
} TVMDeviceMask;
/*!
* \brief The Device information, abstract away common device types.
*/
typedef struct {
/*! \brief The device type mask */
int dev_mask;
/*! \brief the device id */
int dev_id;
} TVMContext;
/*!
* \brief Data structure representing a n-dimensional array(tensor).
* This is used to pass data specification into TVM.
*/
typedef struct {
/*! \brief The data field pointer on specified device */
void* data;
/*! \brief The shape pointers of the array */
const tvm_index_t* shape;
/*!
* \brief The stride data about each dimension of the array, can be NULL
* When strides is NULL, it indicates that the array is empty.
*/
const tvm_index_t* strides;
/*! \brief number of dimensions of the array */
tvm_index_t ndim;
/*! \brief The data type flag */
TVMType dtype;
/*! \brief The device context this array sits on */
TVMContext ctx;
} TVMArray;
/*! \brief Handle to TVM runtime modules. */
typedef void* TVMModuleHandle;
/*! \brief Handle to packed function handle. */
......
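With this hunk, TVMType, TVMContext and TVMArray become plain aliases of the DLPack structs, so every consumer of the C API now reads the DLTensor field layout. Below is a minimal sketch of the layout assumed here (field order inferred from dlpack/dlpack.h at this commit and confirmed by the LLVM struct and the ctypes mirror later in this diff); the struct names are illustrative, not the dlpack declarations themselves.

    #include <stdint.h>

    /* Illustrative mirrors of the dlpack structs aliased above. */
    typedef struct { int device_id; int device_type; } ExampleContext;           /* TVMContext */
    typedef struct { uint8_t code; uint8_t bits; uint16_t lanes; } ExampleDType; /* TVMType    */
    typedef struct {
      void* data;            /* field 0: base pointer on the device */
      ExampleContext ctx;    /* field 1: device_id + device_type    */
      int ndim;              /* field 2: number of dimensions       */
      ExampleDType dtype;    /* field 3: code / bits / lanes        */
      int64_t* shape;        /* field 4: shape, now int64_t         */
      int64_t* strides;      /* field 5: strides, may be NULL       */
      uint64_t byte_offset;  /* field 6: offset into data, in bytes */
    } ExampleTensor;         /* TVMArray == DLTensor                */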
......@@ -8,35 +8,37 @@ import numpy as np
from .._base import _LIB, check_call
from .._base import c_array
from ._types import TVMType, tvm_index_t
from ._types import TVMType, tvm_shape_index_t
class TVMContext(ctypes.Structure):
"""TVM context strucure."""
_fields_ = [("dev_mask", ctypes.c_int),
("dev_id", ctypes.c_int)]
_fields_ = [("device_id", ctypes.c_int),
("device_type", ctypes.c_int)]
MASK2STR = {
1 : 'cpu',
2 : 'gpu',
4 : 'opencl'
}
def __init__(self, dev_mask, dev_id):
def __init__(self, device_id, device_type):
super(TVMContext, self).__init__()
self.dev_mask = dev_mask
self.dev_id = dev_id
self.device_id = device_id
self.device_type = device_type
def __repr__(self):
return "%s(%d)" % (
TVMContext.MASK2STR[self.dev_mask], self.dev_id)
TVMContext.MASK2STR[self.device_type], self.device_id)
class TVMArray(ctypes.Structure):
"""TVMValue in C API"""
_fields_ = [("data", ctypes.c_void_p),
("shape", ctypes.POINTER(tvm_index_t)),
("strides", ctypes.POINTER(tvm_index_t)),
("ndim", tvm_index_t),
("ctx", TVMContext),
("ndim", ctypes.c_int),
("dtype", TVMType),
("ctx", TVMContext)]
("shape", ctypes.POINTER(tvm_shape_index_t)),
("strides", ctypes.POINTER(tvm_shape_index_t)),
("byte_offset", ctypes.c_size_t)]
TVMArrayHandle = ctypes.POINTER(TVMArray)
......@@ -50,7 +52,7 @@ def cpu(dev_id=0):
dev_id : int, optional
The integer device id
"""
return TVMContext(1, dev_id)
return TVMContext(dev_id, 1)
def gpu(dev_id=0):
......@@ -61,7 +63,7 @@ def gpu(dev_id=0):
dev_id : int, optional
The integer device id
"""
return TVMContext(2, dev_id)
return TVMContext(dev_id, 2)
def opencl(dev_id=0):
......@@ -72,7 +74,7 @@ def opencl(dev_id=0):
dev_id : int, optional
The integer device id
"""
return TVMContext(4, dev_id)
return TVMContext(dev_id, 4)
def numpyasarray(np_data):
......@@ -81,7 +83,7 @@ def numpyasarray(np_data):
data = np_data
assert data.flags['C_CONTIGUOUS']
arr = TVMArray()
shape = c_array(tvm_index_t, data.shape)
shape = c_array(tvm_shape_index_t, data.shape)
arr.data = data.ctypes.data_as(ctypes.c_void_p)
arr.shape = shape
arr.strides = None
......@@ -114,8 +116,8 @@ def empty(shape, dtype="float32", ctx=cpu(0)):
arr : tvm.nd.NDArray
The array tvm supported.
"""
shape = c_array(tvm_index_t, shape)
ndim = tvm_index_t(len(shape))
shape = c_array(tvm_shape_index_t, shape)
ndim = ctypes.c_int(len(shape))
handle = TVMArrayHandle()
dtype = TVMType(dtype)
check_call(_LIB.TVMArrayAlloc(
......
......@@ -6,7 +6,7 @@ import ctypes
import numpy as np
from .._base import py_str
tvm_index_t = ctypes.c_uint32
tvm_shape_index_t = ctypes.c_int64
class TypeCode(object):
"""Type code used in API calls"""
......
......@@ -261,8 +261,10 @@ def call_packed(*args):
def Buffer(shape, dtype=None,
name="buffer",
ptr=None,
strides=None):
data=None,
strides=None,
byte_offset=None,
offset_alignment=0):
"""Create a new symbolic buffer
Parameters
......@@ -276,12 +278,18 @@ def Buffer(shape, dtype=None,
name : str, optional
The name of the buffer.
ptr : Var, optional
data : Var, optional
The data pointer in the buffer.
strides: array of Expr
The stride of the buffer.
byte_offset: Expr, optional
The offset in bytes relative to the data pointer.
offset_alignment: int, optional
The alignment of the offset in bytes.
Returns
-------
buffer : Buffer
......@@ -290,11 +298,11 @@ def Buffer(shape, dtype=None,
shape = (shape,) if isinstance(shape, _expr.Expr) else shape
dtype = float32 if dtype is None else dtype
strides = () if strides is None else strides
if ptr is None:
ptr = Var(name, "handle")
if data is None:
data = Var(name, "handle")
return _api_internal._Buffer(
name, ptr, shape, strides, dtype)
name, data, shape, strides, dtype, byte_offset, offset_alignment)
def _IterVar(dom, name, iter_type, thread_tag=''):
......
......@@ -138,7 +138,9 @@ TVM_REGISTER_API(_Buffer)
args[1],
args[2],
args[3],
args[4]);
args[4],
args[5],
args[6]);
});
TVM_REGISTER_API(_Tensor)
......
......@@ -33,17 +33,18 @@ void CodeGenLLVM::Init(const std::string& module_name,
t_int32_ = llvm::Type::getInt32Ty(*ctx);
t_int64_ = llvm::Type::getInt64Ty(*ctx);
t_float64_ = llvm::Type::getDoubleTy(*ctx);
t_tvm_index_ = llvm::Type::getIntNTy(*ctx, sizeof(tvm_index_t) * 8);
t_tvm_shape_index_ = llvm::Type::getIntNTy(*ctx, TVMShapeIndexType().bits());
t_tvm_context_ = llvm::StructType::create({t_int_, t_int_});
t_tvm_type_ = llvm::StructType::create({t_int8_, t_int8_, t_int16_});
t_tvm_func_handle_ = t_void_p_;
t_tvm_array_ = llvm::StructType::create(
{t_void_p_,
t_tvm_index_->getPointerTo(),
t_tvm_index_->getPointerTo(),
t_tvm_index_,
t_tvm_context_,
t_int_,
t_tvm_type_,
t_tvm_context_});
t_tvm_shape_index_->getPointerTo(),
t_tvm_shape_index_->getPointerTo(),
t_int64_});
t_tvm_value_ = llvm::StructType::create({t_float64_});
t_f_tvm_par_for_lambda_ = llvm::FunctionType::get(
t_int_, {t_int64_, t_int64_, t_void_p_}, false);
......@@ -663,25 +664,29 @@ llvm::Value* CodeGenLLVM::CreateIntrinstic(const Call* op) {
ret = builder_->CreateInBoundsGEP(arr, {zero, ConstInt32(0)}); break;
}
case intrinsic::kShape: {
ret = builder_->CreateInBoundsGEP(arr, {zero, ConstInt32(1)}); break;
ret = builder_->CreateInBoundsGEP(arr, {zero, ConstInt32(4)}); break;
}
case intrinsic::kStrides: {
ret = builder_->CreateInBoundsGEP(arr, {zero, ConstInt32(2)}); break;
ret = builder_->CreateInBoundsGEP(arr, {zero, ConstInt32(5)}); break;
}
case intrinsic::kNDim: {
ret = builder_->CreateInBoundsGEP(arr, {zero, ConstInt32(3)}); break;
ret = builder_->CreateInBoundsGEP(arr, {zero, ConstInt32(2)}); break;
}
case intrinsic::kTypeCode: {
ret = builder_->CreateInBoundsGEP(
arr, {zero, ConstInt32(4), ConstInt32(0)}); break;
arr, {zero, ConstInt32(3), ConstInt32(0)}); break;
}
case intrinsic::kTypeBits: {
ret = builder_->CreateInBoundsGEP(
arr, {zero, ConstInt32(4), ConstInt32(1)}); break;
arr, {zero, ConstInt32(3), ConstInt32(1)}); break;
}
case intrinsic::kTypeLanes: {
ret = builder_->CreateInBoundsGEP(
arr, {zero, ConstInt32(4), ConstInt32(2)}); break;
arr, {zero, ConstInt32(3), ConstInt32(2)}); break;
}
case intrinsic::kByteOffset: {
ret = builder_->CreateInBoundsGEP(
arr, {zero, ConstInt32(6)}); break;
}
default: LOG(FATAL) << "unknown field code";
}
......
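The GEP constants above switch from the old TVMArray layout to the DLTensor member order. For reference, the field indices assumed by the rewritten cases (matching the struct created in Init above):

    // kData       -> field 0 (void* data)
    // kNDim       -> field 2 (int ndim)
    // kTypeCode   -> field 3, element 0 (dtype.code)
    // kTypeBits   -> field 3, element 1 (dtype.bits)
    // kTypeLanes  -> field 3, element 2 (dtype.lanes)
    // kShape      -> field 4 (shape index pointer)
    // kStrides    -> field 5 (shape index pointer)
    // kByteOffset -> field 6 (int64 byte offset)
    // field 1 (TVMContext) has no intrinsic accessor here.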
......@@ -160,7 +160,7 @@ class CodeGenLLVM :
llvm::MDNode* md_very_likely_branch_{nullptr};
llvm::MDNode* md_tbaa_root_{nullptr};
// TVM related data types
llvm::Type* t_tvm_index_{nullptr};
llvm::Type* t_tvm_shape_index_{nullptr};
llvm::Type* t_tvm_func_handle_{nullptr};
llvm::StructType* t_tvm_context_{nullptr};
llvm::StructType* t_tvm_type_{nullptr};
......
......@@ -166,6 +166,7 @@ void CodeGenStackVM::VisitExpr_(const Call* op) {
case intrinsic::kTypeCode: PushOp(StackVM::TVM_ARRAY_GET_TYPE_CODE); break;
case intrinsic::kTypeBits: PushOp(StackVM::TVM_ARRAY_GET_TYPE_BITS); break;
case intrinsic::kTypeLanes: PushOp(StackVM::TVM_ARRAY_GET_TYPE_LANES); break;
case intrinsic::kByteOffset: PushOp(StackVM::TVM_ARRAY_GET_BYTE_OFFSET); break;
default: LOG(FATAL) << "unknown field code";
}
} else if (op->is_intrinsic(intrinsic::tvm_call_packed)) {
......@@ -227,15 +228,12 @@ void CodeGenStackVM::PushBinary(StackVM::OpCode op_int64,
void CodeGenStackVM::PushCast(Type dst, Type src) {
if (dst.is_int()) {
if (src.is_int()) return;
if (src.is_uint() && src.bits() <= 32) return;
} else if (dst.is_uint() && dst.bits() <= 32) {
if (src.is_int()) return;
if (src.is_uint() && src.bits() <= 32) return;
if (src.is_int() || src.is_uint()) return;
} else if (dst.is_uint()) {
if (src.is_int() || src.is_uint()) return;
} else if (dst.is_float()) {
if (src.is_float()) return;
}
LOG(FATAL) << "Cannot handle cast " << src << " to " << dst;
}
void CodeGenStackVM::VisitExpr_(const StringImm *op) {
......
......@@ -139,6 +139,7 @@ int64_t StackVM::PrintCode(std::ostream& os, int64_t pc) const {
STACK_VM_PRINT_CODE0(TVM_ARRAY_GET_SHAPE);
STACK_VM_PRINT_CODE0(TVM_ARRAY_GET_STRIDES);
STACK_VM_PRINT_CODE0(TVM_ARRAY_GET_NDIM);
STACK_VM_PRINT_CODE0(TVM_ARRAY_GET_BYTE_OFFSET);
STACK_VM_PRINT_CODE0(TVM_ARRAY_GET_TYPE_CODE);
STACK_VM_PRINT_CODE0(TVM_ARRAY_GET_TYPE_BITS);
STACK_VM_PRINT_CODE0(TVM_ARRAY_GET_TYPE_LANES);
......@@ -352,6 +353,9 @@ void StackVM::Run(State* s) const {
case TVM_ARRAY_GET_NDIM: {
STACK_VM_TVM_ARRARY_GET(v_int64, int64_t, ndim); break;
}
case TVM_ARRAY_GET_BYTE_OFFSET: {
STACK_VM_TVM_ARRARY_GET(v_int64, int64_t, byte_offset); break;
}
case TVM_ARRAY_GET_TYPE_CODE: {
STACK_VM_TVM_ARRARY_GET(v_int64, int64_t, dtype.code); break;
}
......
......@@ -199,7 +199,8 @@ class StackVM {
TVM_ARRAY_GET_NDIM,
TVM_ARRAY_GET_TYPE_CODE,
TVM_ARRAY_GET_TYPE_BITS,
TVM_ARRAY_GET_TYPE_LANES
TVM_ARRAY_GET_TYPE_LANES,
TVM_ARRAY_GET_BYTE_OFFSET
};
/*! \brief The code structure */
union Code {
......
......@@ -22,7 +22,8 @@ Buffer::Buffer(Array<Expr> shape,
: Buffer(BufferNode::make(
name,
Var(name, Type(Type::Handle, 0, 0)),
shape, Array<Expr>(), dtype)) {
shape, Array<Expr>(), dtype,
Expr(), 0)) {
}
inline Expr BufferOffset(const BufferNode* n, Array<Expr> index) {
......@@ -40,6 +41,9 @@ inline Expr BufferOffset(const BufferNode* n, Array<Expr> index) {
base = base + index[i] * n->strides[i];
}
}
if (!is_zero(n->byte_offset)) {
base = base + (n->byte_offset / n->dtype.bytes());
}
return base;
}
......@@ -58,13 +62,27 @@ Buffer BufferNode::make(std::string name,
Var data,
Array<Expr> shape,
Array<Expr> strides,
Type dtype) {
Type dtype,
Expr byte_offset,
int offset_alignment) {
auto n = std::make_shared<BufferNode>();
n->name = name;
n->data = data;
n->shape = shape;
n->strides = strides;
n->dtype = dtype;
if (!byte_offset.defined()) {
byte_offset = make_const(shape[0].type(), 0);
}
if (offset_alignment != 0) {
CHECK_EQ(offset_alignment % dtype.bytes(), 0)
<< "Offset alignments must be at least " << dtype.bytes();
} else {
offset_alignment = dtype.bytes();
}
n->byte_offset = byte_offset;
n->offset_alignment = offset_alignment;
return Buffer(n);
}
......
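BufferOffset now folds the byte offset into the element index, so a buffer view can start partway into its backing allocation. A small worked example with assumed values (not taken from this commit):

    // dtype = float32            -> dtype.bytes() == 4
    // byte_offset = 64           -> 64 / 4 == 16 extra elements
    // index = {i, j}, strides = {n, 1}
    // BufferOffset returns i * n + j + 16, i.e. the view addresses data[16 + i*n + j].
    // offset_alignment defaults to dtype.bytes() (4 here) and, when supplied,
    // must be a multiple of dtype.bytes().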
......@@ -36,7 +36,8 @@ LoweredFunc MakeAPI(Stmt body,
std::string name,
Array<NodeRef> api_args,
int num_unpacked_args) {
const Type tvm_index_type = UInt(32);
const Type tvm_shape_type = TVMShapeIndexType();
const Type tvm_ndim_type = Int(32);
const Stmt nop = Evaluate::make(0);
int num_args = static_cast<int>(api_args.size());
CHECK_LE(num_unpacked_args, num_args);
......@@ -120,13 +121,15 @@ LoweredFunc MakeAPI(Stmt body,
<< "api_args can only be Buffer or Var";
Buffer buf(api_args[i].node_);
// dimension checks
Expr v_ndim = TVMArrayGet(tvm_index_type, v_arg, intrinsic::kNDim);
Expr v_ndim = TVMArrayGet(tvm_ndim_type, v_arg, intrinsic::kNDim);
std::ostringstream ndim_err_msg;
ndim_err_msg << "arg_" << i
<< ".ndim is expected to equal "
<< buf->shape.size();
seq_init.emplace_back(
MakeAssertEQ(v_ndim, UIntImm::make(tvm_index_type, buf->shape.size()),
MakeAssertEQ(v_ndim,
make_const(tvm_ndim_type,
static_cast<int64_t>(buf->shape.size())),
ndim_err_msg.str()));
// type checks
Type dtype = buf->dtype;
......@@ -147,7 +150,7 @@ LoweredFunc MakeAPI(Stmt body,
}
// shape field
Var v_shape(v_arg->name_hint + ".shape", Handle());
handle_data_type.Set(v_shape, UIntImm::make(tvm_index_type, 0));
handle_data_type.Set(v_shape, make_const(tvm_shape_type, 0));
seq_init.emplace_back(LetStmt::make(
v_shape, TVMArrayGet(Handle(), v_arg, intrinsic::kShape), nop));
for (size_t k = 0; k < buf->shape.size(); ++k) {
......@@ -155,12 +158,12 @@ LoweredFunc MakeAPI(Stmt body,
field_name << v_shape->name_hint << '[' << k << ']';
f_push(buf->shape[k],
cast(buf->shape[k].type(),
Load::make(tvm_index_type, v_shape, IntImm::make(Int(32), k))),
Load::make(tvm_shape_type, v_shape, IntImm::make(Int(32), k))),
field_name.str());
}
// strides field
Var v_strides(v_arg->name_hint + ".strides", Handle());
handle_data_type.Set(v_strides, UIntImm::make(tvm_index_type, 0));
handle_data_type.Set(v_strides, make_const(tvm_shape_type, 0));
seq_init.emplace_back(LetStmt::make(
v_strides, TVMArrayGet(Handle(), v_arg, intrinsic::kStrides), nop));
if (buf->strides.size() == 0) {
......@@ -174,10 +177,13 @@ LoweredFunc MakeAPI(Stmt body,
field_name << v_strides->name_hint << '[' << k << ']';
f_push(buf->strides[k],
cast(buf->shape[k].type(),
Load::make(tvm_index_type, v_strides, IntImm::make(Int(32), k))),
Load::make(tvm_shape_type, v_strides, IntImm::make(Int(32), k))),
field_name.str());
}
}
// Byte_offset field.
f_push(buf->byte_offset, TVMArrayGet(UInt(64), v_arg, intrinsic::kByteOffset),
v_arg->name_hint + ".byte_offset");
}
}
......
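With the shape index widened to int64 and ndim read as int32, the argument-checking prologue that MakeAPI emits for a Buffer argument loads the DLTensor fields at those widths. A rough pseudocode sketch of the emitted checks for a 2-D buffer (illustrative only, not the literal lowered IR):

    // int32_t  ndim    = arg->ndim;           // kNDim, asserted == 2
    // int64_t* shape   = arg->shape;          // kShape handle, loads typed Int(64)
    // int64_t* strides = arg->strides;        // kStrides handle
    // bind or assert buf->shape[k] against shape[k] for each k
    // uint64_t off     = arg->byte_offset;    // kByteOffset, bound to buf->byte_offset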
......@@ -7,17 +7,56 @@
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/registry.h>
#include <dmlc/timer.h>
#include <array>
#include <algorithm>
#include <string>
#include <cstdlib>
#include <thread>
#include <mutex>
#include "./runtime_base.h"
#include "./device_api.h"
namespace tvm {
namespace runtime {
class DeviceAPIManager {
public:
static const int kMaxDeviceAPI = 16;
// Get API
static DeviceAPI* Get(TVMContext ctx) {
return Global()->GetAPI(ctx.device_type);
}
private:
std::array<DeviceAPI*, kMaxDeviceAPI> api_;
std::mutex mutex_;
// constructor
DeviceAPIManager() {
std::fill(api_.begin(), api_.end(), nullptr);
}
// Global static variable.
static DeviceAPIManager* Global() {
static DeviceAPIManager inst;
return &inst;
}
// Get or initialize API.
DeviceAPI* GetAPI(DLDeviceType type) {
if (api_[type] != nullptr) return api_[type];
std::lock_guard<std::mutex> lock(mutex_);
if (api_[type] != nullptr) return api_[type];
std::string factory = "_device_api_" + DeviceName(type);
auto* f = Registry::Get(factory);
CHECK(f != nullptr)
<< "Device API " << DeviceName(type) << " is not enabled.";
void* ptr = (*f)();
api_[type] = static_cast<DeviceAPI*>(ptr);
return api_[type];
}
};
inline TVMArray* TVMArrayCreate_() {
TVMArray* arr = new TVMArray();
arr->shape = nullptr;
......@@ -33,9 +72,8 @@ inline void TVMArrayFree_(TVMArray* arr) {
delete[] arr->shape;
delete[] arr->strides;
if (arr->data != nullptr) {
TVM_DEVICE_SWITCH(arr->ctx, {
FreeDataSpace<xpu>(arr->ctx, arr->data);
});
DeviceAPIManager::Get(arr->ctx)->FreeDataSpace(
arr->ctx, arr->data);
}
}
delete arr;
......@@ -282,10 +320,8 @@ int TVMArrayAlloc(const tvm_index_t* shape,
arr->ctx = ctx;
size_t size = GetDataSize(arr);
size_t alignment = GetDataAlignment(arr);
// ctx data pointer
TVM_DEVICE_SWITCH(ctx, {
arr->data = AllocDataSpace<xpu>(ctx, size, alignment);
});
arr->data = DeviceAPIManager::Get(ctx)->AllocDataSpace(
ctx, size, alignment);
*out = arr;
API_END_HANDLE_ERROR(TVMArrayFree_(arr));
}
......@@ -306,28 +342,21 @@ int TVMArrayCopyFromTo(TVMArrayHandle from,
CHECK_EQ(from_size, to_size)
<< "TVMArrayCopyFromTo: The size must exactly match";
TVMContext ctx = from->ctx;
if (ctx.dev_mask == kCPU) {
if (ctx.device_type == kCPU) {
ctx = to->ctx;
} else {
CHECK(to->ctx.dev_mask == kCPU ||
to->ctx.dev_mask == from->ctx.dev_mask)
CHECK(to->ctx.device_type == kCPU ||
to->ctx.device_type == from->ctx.device_type)
<< "Can not copy across different ctx types directly";
}
TVM_DEVICE_SWITCH(ctx, {
CopyDataFromTo<xpu>(from->data, to->data,
from_size,
from->ctx,
to->ctx,
stream);
});
DeviceAPIManager::Get(ctx)->CopyDataFromTo(
from->data, to->data, from_size,
from->ctx, to->ctx, stream);
API_END();
}
int TVMSynchronize(TVMContext ctx, TVMStreamHandle stream) {
API_BEGIN();
TVM_DEVICE_SWITCH(ctx, {
StreamSync<xpu>(ctx, stream);
});
DeviceAPIManager::Get(ctx)->StreamSync(ctx, stream);
API_END();
}
......@@ -7,6 +7,7 @@
#define TVM_RUNTIME_DEVICE_API_CPU_H_
#include <dmlc/logging.h>
#include <tvm/runtime/registry.h>
#include <cstdlib>
#include <cstring>
#include "./device_api.h"
......@@ -14,41 +15,47 @@
namespace tvm {
namespace runtime {
template<>
void* AllocDataSpace<kCPU>(TVMContext ctx, size_t size, size_t alignment) {
void* ptr;
class CPUDeviceAPI : public DeviceAPI {
public:
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
void* ptr;
#if _MSC_VER
ptr = _aligned_malloc(size, alignment);
if (ptr == nullptr) throw std::bad_alloc();
ptr = _aligned_malloc(size, alignment);
if (ptr == nullptr) throw std::bad_alloc();
#else
int ret = posix_memalign(&ptr, alignment, size);
if (ret != 0) throw std::bad_alloc();
int ret = posix_memalign(&ptr, alignment, size);
if (ret != 0) throw std::bad_alloc();
#endif
return ptr;
}
return ptr;
}
template<>
void FreeDataSpace<kCPU>(TVMContext ctx, void* ptr) {
void FreeDataSpace(TVMContext ctx, void* ptr) final {
#if _MSC_VER
_aligned_free(ptr);
_aligned_free(ptr);
#else
free(ptr);
free(ptr);
#endif
}
}
template<>
void CopyDataFromTo<kCPU>(const void* from,
void* to,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream) {
memcpy(to, from, size);
}
void CopyDataFromTo(const void* from,
void* to,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream) final {
memcpy(to, from, size);
}
template<>
void StreamSync<kCPU>(TVMContext ctx, TVMStreamHandle stream) {
}
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
}
};
TVM_REGISTER_GLOBAL(_device_api_cpu)
.set_body([](TVMArgs args, TVMRetValue* rv) {
static CPUDeviceAPI inst;
DeviceAPI* ptr = &inst;
*rv = static_cast<void*>(ptr);
});
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_DEVICE_API_CPU_H_
/*!
* Copyright (c) 2017 by Contributors
* \file device_api_cuda.h
* \file cuda_device_api.cc
* \brief GPU specific API
*/
#ifndef TVM_RUNTIME_CUDA_DEVICE_API_CUDA_H_
#define TVM_RUNTIME_CUDA_DEVICE_API_CUDA_H_
#include "./cuda_common.h"
#include <tvm/runtime/config.h>
#if TVM_CUDA_RUNTIME
#include <dmlc/logging.h>
#include <tvm/runtime/registry.h>
#include <cuda_runtime.h>
#include "./cuda_common.h"
#include "../device_api.h"
namespace tvm {
namespace runtime {
template<>
inline void* AllocDataSpace<kGPU>(TVMContext ctx, size_t size, size_t alignment) {
CUDA_CALL(cudaSetDevice(ctx.dev_id));
CHECK_EQ(256 % alignment, 0U)
<< "CUDA space is aligned at 256 bytes";
void *ret;
CUDA_CALL(cudaMalloc(&ret, size));
return ret;
}
class CUDADeviceAPI : public DeviceAPI {
public:
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CHECK_EQ(256 % alignment, 0U)
<< "CUDA space is aligned at 256 bytes";
void *ret;
CUDA_CALL(cudaMalloc(&ret, size));
return ret;
}
void FreeDataSpace(TVMContext ctx, void* ptr) final {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CUDA_CALL(cudaFree(ptr));
}
template<>
inline void FreeDataSpace<kGPU>(TVMContext ctx, void* ptr) {
CUDA_CALL(cudaSetDevice(ctx.dev_id));
CUDA_CALL(cudaFree(ptr));
}
void CopyDataFromTo(const void* from,
void* to,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream) final {
cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
if (ctx_from.device_type == kGPU && ctx_to.device_type == kGPU) {
CUDA_CALL(cudaSetDevice(ctx_from.device_id));
if (ctx_from.device_id == ctx_to.device_id) {
GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
} else {
cudaMemcpyPeerAsync(to, ctx_to.device_id,
from, ctx_from.device_id,
size, cu_stream);
}
} else if (ctx_from.device_type == kGPU && ctx_to.device_type == kCPU) {
CUDA_CALL(cudaSetDevice(ctx_from.device_id));
GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
} else if (ctx_from.device_type == kCPU && ctx_to.device_type == kGPU) {
CUDA_CALL(cudaSetDevice(ctx_to.device_id));
GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
} else {
LOG(FATAL) << "expect copy from/to GPU or between GPU";
}
}
inline void GPUCopy(const void* from,
void* to,
size_t size,
cudaMemcpyKind kind,
cudaStream_t stream) {
if (stream != 0) {
CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
} else {
CUDA_CALL(cudaMemcpy(to, from, size, kind));
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
}
}
template<>
inline void CopyDataFromTo<kGPU>(const void* from,
void* to,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream) {
cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
if (ctx_from.dev_mask == kGPU && ctx_to.dev_mask == kGPU) {
CUDA_CALL(cudaSetDevice(ctx_from.dev_id));
if (ctx_from.dev_id == ctx_to.dev_id) {
GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
private:
static void GPUCopy(const void* from,
void* to,
size_t size,
cudaMemcpyKind kind,
cudaStream_t stream) {
if (stream != 0) {
CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
} else {
cudaMemcpyPeerAsync(to, ctx_to.dev_id,
from, ctx_from.dev_id,
size, cu_stream);
CUDA_CALL(cudaMemcpy(to, from, size, kind));
}
} else if (ctx_from.dev_mask == kGPU && ctx_to.dev_mask == kCPU) {
CUDA_CALL(cudaSetDevice(ctx_from.dev_id));
GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
} else if (ctx_from.dev_mask == kCPU && ctx_to.dev_mask == kGPU) {
CUDA_CALL(cudaSetDevice(ctx_to.dev_id));
GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
} else {
LOG(FATAL) << "expect copy from/to GPU or between GPU";
}
}
};
template<>
inline void StreamSync<kGPU>(TVMContext ctx, TVMStreamHandle stream) {
CUDA_CALL(cudaSetDevice(ctx.dev_id));
CUDA_CALL(cudaStreamSynchronize(
static_cast<cudaStream_t>(stream)));
}
TVM_REGISTER_GLOBAL(_device_api_gpu)
.set_body([](TVMArgs args, TVMRetValue* rv) {
static CUDADeviceAPI inst;
DeviceAPI* ptr = &inst;
*rv = static_cast<void*>(ptr);
});
} // namespace runtime
} // namespace tvm
#endif // TVM_CUDA_RUNTIME
#endif // TVM_RUNTIME_CUDA_DEVICE_API_CUDA_H_
......@@ -50,9 +50,9 @@ class CUDAModuleNode : public runtime::ModuleNode {
}
void PreCompile(const std::string& name, TVMContext ctx) final {
CUDA_CALL(cudaSetDevice(ctx.dev_id));
CUDA_CALL(cudaSetDevice(ctx.device_id));
cudaFree(nullptr);
this->GetFunc(ctx.dev_id, name);
this->GetFunc(ctx.device_id, name);
}
PackedFunc GetFunction(
......@@ -79,15 +79,15 @@ class CUDAModuleNode : public runtime::ModuleNode {
}
}
// get a CUfunction from primary context in dev_id
CUfunction GetFunc(int dev_id, const std::string& func_name) {
// get a CUfunction from primary context in device_id
CUfunction GetFunc(int device_id, const std::string& func_name) {
std::lock_guard<std::mutex> lock(mutex_);
// must recheck under the lock scope
if (module_[dev_id] == nullptr) {
CUDA_DRIVER_CALL(cuModuleLoadData(&(module_[dev_id]), data_.c_str()));
if (module_[device_id] == nullptr) {
CUDA_DRIVER_CALL(cuModuleLoadData(&(module_[device_id]), data_.c_str()));
}
CUfunction func;
CUresult result = cuModuleGetFunction(&func, module_[dev_id], func_name.c_str());
CUresult result = cuModuleGetFunction(&func, module_[device_id], func_name.c_str());
if (result != CUDA_SUCCESS) {
const char *msg;
cuGetErrorName(result, &msg);
......@@ -132,14 +132,14 @@ class CUDAWrappedFunc {
void operator()(TVMArgs args,
TVMRetValue* rv,
void** void_args) const {
int dev_id;
CUDA_CALL(cudaGetDevice(&dev_id));
if (fcache_[dev_id] == nullptr) {
fcache_[dev_id] = m_->GetFunc(dev_id, func_name_);
int device_id;
CUDA_CALL(cudaGetDevice(&device_id));
if (fcache_[device_id] == nullptr) {
fcache_[device_id] = m_->GetFunc(device_id, func_name_);
}
ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
CUDA_DRIVER_CALL(cuLaunchKernel(
fcache_[dev_id],
fcache_[device_id],
wl.grid_dim(0),
wl.grid_dim(1),
wl.grid_dim(2),
......@@ -169,23 +169,23 @@ void AutoSetCUDADevice(const TVMArgs& args, TVMRetValue* rv) {
int* type_codes = static_cast<int*>(args[1].operator void*());
int num_args = args[2].operator int();
int dev_id = -1;
int device_id = -1;
for (int i = 0; i < num_args; ++i) {
if (type_codes[i] == kArrayHandle) {
TVMContext ctx = static_cast<TVMArray*>(values[i].v_handle)->ctx;
CHECK_EQ(ctx.dev_mask, kGPU)
CHECK_EQ(ctx.device_type, kGPU)
<< "All operands need to be GPU";
if (dev_id == -1) {
dev_id = ctx.dev_id;
if (device_id == -1) {
device_id = ctx.device_id;
} else {
CHECK_EQ(dev_id, ctx.dev_id)
CHECK_EQ(device_id, ctx.device_id)
<< "Operands comes from different devices ";
}
}
}
CHECK_NE(dev_id, -1)
CHECK_NE(device_id, -1)
<< "Cannot detect device id from list";
CUDA_CALL(cudaSetDevice(dev_id));
CUDA_CALL(cudaSetDevice(device_id));
}
PackedFunc CUDAModuleNode::GetFunction(
......
......@@ -8,83 +8,65 @@
#include <tvm/base.h>
#include <tvm/runtime/c_runtime_api.h>
#include <string>
namespace tvm {
namespace runtime {
/*!
* \brief Allocate a data space on device.
* \param ctx The device context to perform operation.
* \param size The size of the memory
* \param alignment The alignment of the memory.
* \return The allocated device pointer
* \tparam xpu The device mask.
*/
template<TVMDeviceMask xpu>
inline void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment);
/*!
* \brief Free a data space on device.
* \param ctx The device context to perform operation.
* \param ptr The data space.
* \tparam xpu The device mask.
*/
template<TVMDeviceMask xpu>
inline void FreeDataSpace(TVMContext ctx, void* ptr);
class DeviceAPI {
public:
/*! \brief virtual destructor */
virtual ~DeviceAPI() {}
/*!
* \brief Allocate a data space on device.
* \param ctx The device context to perform operation.
* \param size The size of the memory
* \param alignment The alignment of the memory.
* \return The allocated device pointer
*/
virtual void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) = 0;
/*!
* \brief Free a data space on device.
* \param ctx The device context to perform operation.
* \param ptr The data space.
* \tparam xpu The device mask.
*/
virtual void FreeDataSpace(TVMContext ctx, void* ptr) = 0;
/*!
* \brief copy data from one place to another
* \param dev The device to perform operation.
* \param from The source array.
* \param to The target array.
* \param size The size of the memory
* \param ctx_from The source context
* \param ctx_to The target context
*/
virtual void CopyDataFromTo(const void* from,
void* to,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream) = 0;
/*!
* \brief Synchronize the stream
* \param ctx The context to perform operation.
* \param stream The stream to be sync.
*/
virtual void StreamSync(TVMContext ctx, TVMStreamHandle stream) = 0;
};
/*!
* \brief copy data from one place to another
* \param dev The device to perform operation.
* \param from The source array.
* \param to The target array.
* \param size The size of the memory
* \param ctx_from The source context
* \param ctx_to The target context
* \tparam xpu The device mask.
*/
template<TVMDeviceMask xpu>
inline void CopyDataFromTo(const void* from,
void* to,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream);
/*!
* \brief Synchronize the stream
* \param ctx The context to perform operation.
* \param stream The stream to be sync.
* \tparam xpu The device mask.
* \brief The name of Device API factory.
* \param type The device type.
*/
template<TVMDeviceMask xpu>
inline void StreamSync(TVMContext ctx, TVMStreamHandle stream);
// macro to run cuda related code
#if TVM_CUDA_RUNTIME
#define TVM_RUN_CUDA(OP) { const TVMDeviceMask xpu = kGPU; OP; }
#else
#define TVM_RUN_CUDA(OP) LOG(FATAL) << "CUDA is not enabled";
#endif
// macro to run opencl related code
#if TVM_OPENCL_RUNTIME
#define TVM_RUN_OPENCL(OP) { const TVMDeviceMask xpu = kOpenCL; OP; }
#else
#define TVM_RUN_OPENCL(OP) LOG(FATAL) << "OpenCL is not enabled";
#endif
// macro to switch options between devices
#define TVM_DEVICE_SWITCH(ctx, OP) \
switch (ctx.dev_mask) { \
case kCPU: { const TVMDeviceMask xpu = kCPU; OP; break; } \
case kGPU: TVM_RUN_CUDA(OP); break; \
case kOpenCL: TVM_RUN_OPENCL(OP); break; \
default: LOG(FATAL) << "unknown device_mask " << ctx.dev_mask; \
inline std::string DeviceName(DLDeviceType type) {
switch (static_cast<int>(type)) {
case kCPU: return "cpu";
case kGPU: return "gpu";
case kOpenCL: return "opencl";
default: LOG(FATAL) << "unknown type =" << type; return "Unknown";
}
}
} // namespace runtime
} // namespace tvm
#include "./device_api_cpu.h"
#include "./cuda/device_api_cuda.h"
#include "./opencl/device_api_opencl.h"
#endif // TVM_RUNTIME_DEVICE_API_H_
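DeviceAPIManager (in c_runtime_api.cc) resolves a backend lazily by looking up a global packed function named "_device_api_" + DeviceName(device_type), which is how the CPU, CUDA and OpenCL backends in this commit register themselves. A hedged sketch of what a hypothetical new backend could look like; the class name, device name, device type value and trivial malloc-based bodies are made up for illustration, and DeviceName() above would also need a case returning the new name:

    #include <tvm/runtime/registry.h>
    #include <cstdlib>
    #include <cstring>
    #include "./device_api.h"

    namespace tvm {
    namespace runtime {

    // Hypothetical backend used only for illustration. A real plugin would pick
    // an unused device type value (< DeviceAPIManager::kMaxDeviceAPI).
    class MyDeviceAPI : public DeviceAPI {
     public:
      void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
        return malloc(size);     // stand-in for the device allocator
      }
      void FreeDataSpace(TVMContext ctx, void* ptr) final {
        free(ptr);
      }
      void CopyDataFromTo(const void* from, void* to, size_t size,
                          TVMContext ctx_from, TVMContext ctx_to,
                          TVMStreamHandle stream) final {
        memcpy(to, from, size);  // stand-in for the device copy engine
      }
      void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {}
    };

    // The factory name must be "_device_api_" + DeviceName(type) so that
    // DeviceAPIManager::GetAPI can find it in the global registry.
    TVM_REGISTER_GLOBAL(_device_api_mydevice)
    .set_body([](TVMArgs args, TVMRetValue* rv) {
        static MyDeviceAPI inst;
        DeviceAPI* ptr = &inst;
        *rv = static_cast<void*>(ptr);
      });

    }  // namespace runtime
    }  // namespace tvm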
/*!
* Copyright (c) 2017 by Contributors
* \file device_api_opencl.h
* \brief OpenCL specific API
*/
#ifndef TVM_RUNTIME_OPENCL_DEVICE_API_OPENCL_H_
#define TVM_RUNTIME_OPENCL_DEVICE_API_OPENCL_H_
#include <tvm/runtime/config.h>
#if TVM_OPENCL_RUNTIME
#include <string>
#include <vector>
#include "./opencl_common.h"
namespace tvm {
namespace runtime {
template<>
inline void* AllocDataSpace<kOpenCL>(TVMContext ctx, size_t size, size_t alignment) {
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
cl_int err_code;
cl_mem mptr = clCreateBuffer(
w->context, CL_MEM_READ_WRITE, size, nullptr, &err_code);
OPENCL_CHECK_ERROR(err_code);
return mptr;
}
template<>
inline void FreeDataSpace<kOpenCL>(TVMContext ctx, void* ptr) {
cl_mem mptr = static_cast<cl_mem>(ptr);
OPENCL_CALL(clReleaseMemObject(mptr));
}
template<>
inline void CopyDataFromTo<kOpenCL>(const void* from,
void* to,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream) {
CHECK(stream == nullptr);
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
if (ctx_from.dev_mask == kOpenCL && ctx_to.dev_mask == kOpenCL) {
OPENCL_CALL(clEnqueueCopyBuffer(
w->GetQueue(ctx_to),
static_cast<cl_mem>((void*)from), // NOLINT(*)
static_cast<cl_mem>(to),
0, 0, size, 0, nullptr, nullptr));
} else if (ctx_from.dev_mask == kOpenCL && ctx_to.dev_mask == kCPU) {
OPENCL_CALL(clEnqueueReadBuffer(
w->GetQueue(ctx_from),
static_cast<cl_mem>((void*)from), // NOLINT(*)
CL_FALSE, 0, size, to,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(w->GetQueue(ctx_from)));
} else if (ctx_from.dev_mask == kCPU && ctx_to.dev_mask == kOpenCL) {
OPENCL_CALL(clEnqueueWriteBuffer(
w->GetQueue(ctx_to),
static_cast<cl_mem>(to),
CL_FALSE, 0, size, from,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(w->GetQueue(ctx_to)));
} else {
LOG(FATAL) << "Expect copy from/to GPU or between GPU";
}
}
template<>
inline void StreamSync<kOpenCL>(TVMContext ctx, TVMStreamHandle stream) {
CHECK(stream == nullptr);
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
OPENCL_CALL(clFinish(w->GetQueue(ctx)));
}
} // namespace runtime
} // namespace tvm
#endif // TVM_OPENCL_RUNTIME
#endif // TVM_RUNTIME_OPENCL_DEVICE_API_OPENCL_H_
......@@ -10,8 +10,8 @@
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/packed_func.h>
#include <dmlc/logging.h>
#if TVM_OPENCL_RUNTIME
#include "../device_api.h"
#ifdef __APPLE__
#include <OpenCL/opencl.h>
......@@ -101,7 +101,7 @@ inline const char* CLGetErrorString(cl_int error) {
/*!
* \brief Process global OpenCL workspace.
*/
class OpenCLWorkspace {
class OpenCLWorkspace : public DeviceAPI {
public:
// global platform id
cl_platform_id platform_id;
......@@ -132,13 +132,23 @@ class OpenCLWorkspace {
}
// get the queue of the context
cl_command_queue GetQueue(TVMContext ctx) const {
CHECK_EQ(ctx.dev_mask, kOpenCL);
CHECK_EQ(ctx.device_type, kOpenCL);
CHECK(initialized())
<< "The OpenCL is not initialized";
CHECK(ctx.dev_id >= 0 && static_cast<size_t>(ctx.dev_id) < queues.size())
<< "Invalid OpenCL dev_id=" << ctx.dev_id;
return queues[ctx.dev_id];
CHECK(ctx.device_id >= 0 && static_cast<size_t>(ctx.device_id) < queues.size())
<< "Invalid OpenCL device_id=" << ctx.device_id;
return queues[ctx.device_id];
}
// override device API
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final;
void FreeDataSpace(TVMContext ctx, void* ptr) final;
void CopyDataFromTo(const void* from,
void* to,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream) final;
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
// get the global workspace
static OpenCLWorkspace* Global();
};
......@@ -160,8 +170,8 @@ class OpenCLThreadEntry {
std::vector<KTEntry> kernel_table;
OpenCLThreadEntry() {
context.dev_id = 0;
context.dev_mask = kOpenCL;
context.device_id = 0;
context.device_type = kOpenCL;
}
// get the global workspace
static OpenCLThreadEntry* ThreadLocal();
......
/*!
* Copyright (c) 2017 by Contributors
* \file opencl_workspace.cc
* \file opencl_device_api.cc
*/
#include "./opencl_common.h"
......@@ -18,6 +18,57 @@ OpenCLWorkspace* OpenCLWorkspace::Global() {
return &inst;
}
void* OpenCLWorkspace::AllocDataSpace(
TVMContext ctx, size_t size, size_t alignment) {
cl_int err_code;
cl_mem mptr = clCreateBuffer(
this->context, CL_MEM_READ_WRITE, size, nullptr, &err_code);
OPENCL_CHECK_ERROR(err_code);
return mptr;
}
void OpenCLWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) {
cl_mem mptr = static_cast<cl_mem>(ptr);
OPENCL_CALL(clReleaseMemObject(mptr));
}
void OpenCLWorkspace::CopyDataFromTo(const void* from,
void* to,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream) {
CHECK(stream == nullptr);
if (ctx_from.device_type == kOpenCL && ctx_to.device_type == kOpenCL) {
OPENCL_CALL(clEnqueueCopyBuffer(
this->GetQueue(ctx_to),
static_cast<cl_mem>((void*)from), // NOLINT(*)
static_cast<cl_mem>(to),
0, 0, size, 0, nullptr, nullptr));
} else if (ctx_from.device_type == kOpenCL && ctx_to.device_type == kCPU) {
OPENCL_CALL(clEnqueueReadBuffer(
this->GetQueue(ctx_from),
static_cast<cl_mem>((void*)from), // NOLINT(*)
CL_FALSE, 0, size, to,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(this->GetQueue(ctx_from)));
} else if (ctx_from.device_type == kCPU && ctx_to.device_type == kOpenCL) {
OPENCL_CALL(clEnqueueWriteBuffer(
this->GetQueue(ctx_to),
static_cast<cl_mem>(to),
CL_FALSE, 0, size, from,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(this->GetQueue(ctx_to)));
} else {
LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
}
}
void OpenCLWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) {
CHECK(stream == nullptr);
OPENCL_CALL(clFinish(this->GetQueue(ctx)));
}
typedef dmlc::ThreadLocalStore<OpenCLThreadEntry> OpenCLThreadStore;
OpenCLThreadEntry* OpenCLThreadEntry::ThreadLocal() {
......@@ -141,6 +192,12 @@ bool InitOpenCL(TVMArgs args, TVMRetValue* rv) {
TVM_REGISTER_GLOBAL(_module_init_opencl)
.set_body(InitOpenCL);
TVM_REGISTER_GLOBAL(_device_api_opencl)
.set_body([](TVMArgs args, TVMRetValue* rv) {
DeviceAPI* ptr = OpenCLWorkspace::Global();
*rv = static_cast<void*>(ptr);
});
} // namespace cl
} // namespace runtime
} // namespace tvm
......
......@@ -123,11 +123,11 @@ class OpenCLModuleNode : public ModuleNode {
const std::string& func_name,
const KTRefEntry& e) {
std::lock_guard<std::mutex> lock(build_lock_);
int dev_id = t->context.dev_id;
if (!device_built_flag_[dev_id]) {
int device_id = t->context.device_id;
if (!device_built_flag_[device_id]) {
// build program
cl_int err;
cl_device_id dev = w->devices[dev_id];
cl_device_id dev = w->devices[device_id];
err = clBuildProgram(program_, 1, &dev, nullptr, nullptr, nullptr);
if (err != CL_SUCCESS) {
size_t len;
......@@ -139,7 +139,7 @@ class OpenCLModuleNode : public ModuleNode {
program_, dev, CL_PROGRAM_BUILD_LOG, len, &log[0], nullptr);
LOG(FATAL) << "OpenCL build error for device=" << dev << log;
}
device_built_flag_[dev_id] = true;
device_built_flag_[device_id] = true;
}
// build kernel
cl_int err;
......@@ -246,23 +246,23 @@ void AutoSetOpenCLDevice(const TVMArgs& args, TVMRetValue* rv) {
int num_args = args[2].operator int();
// TODO(tqchen): merge this with CUDA logic.
int dev_id = -1;
int device_id = -1;
for (int i = 0; i < num_args; ++i) {
if (type_codes[i] == kArrayHandle) {
TVMContext ctx = static_cast<TVMArray*>(values[i].v_handle)->ctx;
CHECK_EQ(ctx.dev_mask, kOpenCL)
CHECK_EQ(ctx.device_type, kOpenCL)
<< "All operands need to be OpenCL";
if (dev_id == -1) {
dev_id = ctx.dev_id;
if (device_id == -1) {
device_id = ctx.device_id;
} else {
CHECK_EQ(dev_id, ctx.dev_id)
CHECK_EQ(device_id, ctx.device_id)
<< "Operands comes from different devices ";
}
}
}
CHECK_NE(dev_id, -1)
CHECK_NE(device_id, -1)
<< "Cannot detect device id from list";
cl::OpenCLThreadEntry::ThreadLocal()->context.dev_id = dev_id;
cl::OpenCLThreadEntry::ThreadLocal()->context.device_id = device_id;
}
PackedFunc OpenCLModuleNode::GetFunction(
......