Commit cb68c82c by MORITA Kazutaka, committed by Tianqi Chen

Add support for multiple OpenCL platforms (#1345)

parent f927e1f3
@@ -39,6 +39,9 @@ set(USE_CUDA OFF)
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)
# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)
# Whether enable OpenCL runtime
set(USE_OPENCL OFF)
...
@@ -7,6 +7,18 @@ if(OpenCL_FOUND)
include_directories(${OpenCL_INCLUDE_DIRS})
endif(OpenCL_FOUND)
if(USE_SDACCEL)
message(STATUS "Build with SDAccel support")
file(GLOB RUNTIME_SDACCEL_SRCS src/runtime/opencl/sdaccel/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_SDACCEL_SRCS})
if(NOT USE_OPENCL)
message(STATUS "Enable OpenCL support required for SDAccel")
set(USE_OPENCL ON)
endif()
else()
list(APPEND COMPILER_SRCS src/codegen/opt/build_sdaccel_off.cc)
endif(USE_SDACCEL)
if(USE_OPENCL)
find_package(OpenCL REQUIRED)
message(STATUS "Build with OpenCL support")
...
@@ -60,6 +60,7 @@ typedef int64_t tvm_index_t;
/*! \brief Extension device types in TVM */
typedef enum {
kDLSDAccel = 6,
kDLVulkan = 7,
kOpenGL = 11,
// Extension DRAM type, used for quickly test extension device
...
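With kDLSDAccel added as an extension device type, an SDAccel context is constructed like any other TVM context. A minimal sketch (not part of this commit; the helper name is illustrative), using the same cast that SDAccelThreadEntry applies further down:

#include <tvm/runtime/c_runtime_api.h>

// Build a TVMContext that targets an SDAccel device.
// kDLSDAccel (= 6) is the enum value introduced by this change.
TVMContext MakeSDAccelContext(int device_id) {
  TVMContext ctx;
  ctx.device_type = static_cast<DLDeviceType>(kDLSDAccel);
  ctx.device_id = device_id;
  return ctx;
}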
@@ -95,6 +95,7 @@ class TVMContext(ctypes.Structure):
1 : 'cpu',
2 : 'gpu',
4 : 'opencl',
6 : 'sdaccel',
7 : 'vulkan',
8 : 'metal',
9 : 'vpi',
@@ -111,7 +112,7 @@ class TVMContext(ctypes.Structure):
'nvptx': 2,
'cl': 4,
'opencl': 4,
'sdaccel': 6,
'vulkan': 7,
'metal': 8,
'vpi': 9,
...
"""Utility for Interacting with SDAccel Tools""" """Utility for Interacting with SDAccel Tools"""
import subprocess import subprocess
import os import os
import re
from . import util from . import util
from ..api import register_func from ..api import register_func
def _vhls_to_opencl(code):
    """Convert source code from Vivado HLS to OpenCL."""
    out = ''
    for line in code.split('\n'):
        if re.match(r'#include', line):
            # OpenCL doesn't support include.
            continue
        if re.match(r'#pragma', line):
            # Remove Vivado HLS specific pragmas.
            continue
        if re.match(r'extern "C"', line):
            line = re.sub(r'^extern "C"', "__kernel", line)
            # Add __global to pointer parameters.
            line = re.sub(r'(\w+)\s*\*', r"__global \1*", line)
        out += line + '\n'
    return out


def _fake_compile_vhls(code):
    """Fake compile Vivado HLS code for SDAccel.

    Compile the Vivado HLS code as an OpenCL code, and generate a program
    binary for GPU which can be used instead of xclbin.

    Parameters
    ----------
    code : str
        The Vivado HLS code.

    Return
    ------
    binary : bytearray
        The program binary which can be passed to clCreateProgramWithBinary
    """
    try:
        import pyopencl as cl
    except ImportError:
        raise ImportError('PyOpenCL is required for testing SDAccel backend.')
    ctx = cl.Context(dev_type=cl.device_type.GPU)
    program = cl.Program(ctx, _vhls_to_opencl(code)).build()
    binary = bytearray(program.binaries[0])
    return binary
@register_func("tvm_callback_sdaccel_compile")
def compile_vhls(code, kernel):
    """Compile Vivado HLS code for SDAccel.
@@ -87,9 +39,7 @@ def compile_vhls(code, kernel):
    platform = os.environ.get("XCL_PLATFORM", os.environ.get("AWS_PLATFORM"))
    if platform is None:
        raise RuntimeError("No Xilinx device specified.")
    # build xo
    args = [xocc, "-c", "-t", target, "--platform", platform, "-o", tmp_xo, "-k", kernel] + \
...
@@ -6,7 +6,7 @@
#include <string>
#include "./codegen_vhls.h"
#include "./build_common.h"
#include "../runtime/opencl/sdaccel/sdaccel_module.h"
namespace tvm {
namespace codegen {
@@ -91,7 +91,7 @@ runtime::Module BuildSDAccel(Array<LoweredFunc> funcs) {
} else {
LOG(FATAL) << "Cannot compile Vivado HLS code.";
}
return SDAccelModuleCreate(xclbin, "xclbin", ExtractFuncInfo(funcs), code);
}
TVM_REGISTER_API("codegen.build_sdaccel")
...
/*!
* Copyright (c) 2018 by Contributors
* Optional module when build sdaccel is switched to off
*/
#include "../codegen_source_base.h"
#include "../../runtime/opencl/opencl_module.h"
namespace tvm {
namespace runtime {
Module SDAccelModuleCreate(
std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source) {
LOG(WARNING) << "OpenCL runtime not enabled, return a source module...";
return codegen::DeviceSourceModuleCreate(data, fmt, fmap, "sdaccel");
}
} // namespace runtime
} // namespace tvm
@@ -36,7 +36,7 @@ class MemoryAccessVerifier final : protected IRVisitor {
/// Interface to perform memory access verification
void Run() {
if (!IsGPUDevice(dev_type_) && !IsFPGADevice(dev_type_)) return;
IRVisitor::Visit(func_->body);
}
@@ -143,6 +143,10 @@ class MemoryAccessVerifier final : protected IRVisitor {
kDLVulkan == dev_type || kDLMetal == dev_type ||
kDLROCM == dev_type || kOpenGL == dev_type;
}
/// Check if a given DLDeviceType/TVMDeviceExtType value denotes FPGA device.
static bool IsFPGADevice(int dev_type) {
return kDLSDAccel == dev_type;
}
private:
/// Status of visitor
...
@@ -31,6 +31,7 @@ inline std::string DeviceName(int type) {
case kDLCPU: return "cpu";
case kDLGPU: return "gpu";
case kDLOpenCL: return "opencl";
case kDLSDAccel: return "sdaccel";
case kDLVulkan: return "vulkan";
case kDLMetal: return "metal";
case kDLVPI: return "vpi";
...
@@ -21,6 +21,10 @@
#include <string>
#include <vector>
#include "../workspace_pool.h"
#include "../pack_args.h"
#include "../thread_storage_scope.h"
#include "../meta_data.h"
#include "../file_util.h"
namespace tvm {
namespace runtime {
@@ -97,17 +101,23 @@ inline const char* CLGetErrorString(cl_int error) {
OPENCL_CHECK_ERROR(e); \
}
class OpenCLThreadEntry;
/*!
* \brief Process global OpenCL workspace.
*/
class OpenCLWorkspace : public DeviceAPI {
public:
// global platform id
cl_platform_id platform_id;
// global platform name
std::string platform_name;
// global context of this process
cl_context context{nullptr};
// whether the workspace is initialized.
bool initialized_{false};
// the device type
std::string device_type;
// the devices
std::vector<cl_device_id> devices;
// the queues
@@ -128,10 +138,17 @@ class OpenCLWorkspace final : public DeviceAPI {
}
}
// Initialize the device.
void Init(const std::string& device_type, const std::string& platform_name = "");
virtual void Init() {
Init("gpu");
}
// Check whether the context is OpenCL or not.
virtual bool IsOpenCLDevice(TVMContext ctx) {
return ctx.device_type == kDLOpenCL;
}
// get the queue of the context
cl_command_queue GetQueue(TVMContext ctx) {
CHECK(IsOpenCLDevice(ctx));
this->Init();
CHECK(ctx.device_id >= 0 && static_cast<size_t>(ctx.device_id) < queues.size())
<< "Invalid OpenCL device_id=" << ctx.device_id;
@@ -157,6 +174,12 @@ class OpenCLWorkspace final : public DeviceAPI {
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
void FreeWorkspace(TVMContext ctx, void* data) final;
/*!
* \brief Get the thread local ThreadEntry
*/
virtual OpenCLThreadEntry* GetThreadEntry();
// get the global workspace
static const std::shared_ptr<OpenCLWorkspace>& Global();
};
@@ -179,15 +202,87 @@ class OpenCLThreadEntry {
/*! \brief workspace pool */
WorkspacePool pool;
// constructor
OpenCLThreadEntry(DLDeviceType device_type, std::shared_ptr<DeviceAPI> device)
: pool(device_type, device) {
context.device_id = 0;
context.device_type = device_type;
}
OpenCLThreadEntry()
: OpenCLThreadEntry(kDLOpenCL, OpenCLWorkspace::Global()) {}
// get the global workspace
static OpenCLThreadEntry* ThreadLocal();
};
} // namespace cl
// Module to support thread-safe multi-device execution.
// OpenCL runtime is a bit tricky because clSetKernelArg is not thread-safe
// To make the call thread-safe, we create a thread-local kernel table
// and lazily install new kernels into the kernel table when the kernel is called.
// The kernels are recycled when the module gets destructed.
class OpenCLModuleNode : public ModuleNode {
public:
// Kernel table reference entry.
struct KTRefEntry {
size_t kernel_id;
size_t version;
};
explicit OpenCLModuleNode(std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source)
: data_(data), fmt_(fmt), fmap_(fmap), source_(source) {}
// destructor
~OpenCLModuleNode();
/*!
* \brief Get the global workspace
*/
virtual const std::shared_ptr<cl::OpenCLWorkspace>& GetGlobalWorkspace();
virtual const char* type_key() const;
PackedFunc GetFunction(
const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) final;
void SaveToFile(const std::string& file_name,
const std::string& format) final;
void SaveToBinary(dmlc::Stream* stream) final;
std::string GetSource(const std::string& format) final;
// Initialize the programs
void Init();
// install a new kernel to thread local entry
cl_kernel InstallKernel(cl::OpenCLWorkspace* w,
cl::OpenCLThreadEntry* t,
const std::string& func_name,
const KTRefEntry& e);
protected:
// The workspace, need to keep reference to use it in destructor.
// In case of static destruction order problem.
std::shared_ptr<cl::OpenCLWorkspace> workspace_;
// the binary data
std::string data_;
private:
// The format
std::string fmt_;
// function information table.
std::unordered_map<std::string, FunctionInfo> fmap_;
// Module local mutex
std::mutex build_lock_;
// The OpenCL source.
std::string source_;
// the binary data
cl_program program_{nullptr};
// build info
std::vector<bool> device_built_flag_;
// kernel id cache
std::unordered_map<std::string, KTRefEntry> kid_map_;
// kernels built so far.
std::vector<cl_kernel> kernels_;
};
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
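The comment above OpenCLModuleNode describes the thread-local kernel table; the lookup itself lives in OpenCLWrappedFunc::operator(), whose body is mostly elided in this diff. A hedged sketch of that lookup (assuming the thread-local entries carry the kernel handle and version that InstallKernel stores), to make the recycling scheme concrete:

// Sketch only: resolve a kernel for the calling thread, installing it on a
// cache miss or when the owning module has been re-created (version
// mismatch). clSetKernelArg then only ever touches thread-local cl_kernel
// handles, which keeps multi-threaded launches safe.
// (Assume this lives in namespace tvm::runtime with opencl_common.h included.)
cl_kernel LookupOrInstallKernel(OpenCLModuleNode* m,
                                cl::OpenCLWorkspace* w,
                                cl::OpenCLThreadEntry* t,
                                const std::string& func_name,
                                const OpenCLModuleNode::KTRefEntry& e) {
  if (e.kernel_id >= t->kernel_table.size()) {
    t->kernel_table.resize(e.kernel_id + 1);
  }
  const auto& entry = t->kernel_table[e.kernel_id];
  if (entry.kernel == nullptr || entry.version != e.version) {
    return m->InstallKernel(w, t, func_name, e);
  }
  return entry.kernel;
}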
@@ -10,13 +10,17 @@ namespace tvm {
namespace runtime {
namespace cl {
OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() {
return OpenCLThreadEntry::ThreadLocal();
}
const std::shared_ptr<OpenCLWorkspace>& OpenCLWorkspace::Global() {
static std::shared_ptr<OpenCLWorkspace> inst = std::make_shared<OpenCLWorkspace>();
return inst;
}
void OpenCLWorkspace::SetDevice(TVMContext ctx) {
GetThreadEntry()->context.device_id = ctx.device_id;
}
void OpenCLWorkspace::GetAttr(
@@ -121,13 +125,13 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from,
TVMStreamHandle stream) {
this->Init();
CHECK(stream == nullptr);
if (IsOpenCLDevice(ctx_from) && IsOpenCLDevice(ctx_to)) {
OPENCL_CALL(clEnqueueCopyBuffer(
this->GetQueue(ctx_to),
static_cast<cl_mem>((void*)from), // NOLINT(*)
static_cast<cl_mem>(to),
from_offset, to_offset, size, 0, nullptr, nullptr));
} else if (IsOpenCLDevice(ctx_from) && ctx_to.device_type == kDLCPU) {
OPENCL_CALL(clEnqueueReadBuffer(
this->GetQueue(ctx_from),
static_cast<cl_mem>((void*)from), // NOLINT(*)
@@ -135,7 +139,7 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from,
static_cast<char*>(to) + to_offset,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(this->GetQueue(ctx_from)));
} else if (ctx_from.device_type == kDLCPU && IsOpenCLDevice(ctx_to)) {
OPENCL_CALL(clEnqueueWriteBuffer(
this->GetQueue(ctx_to),
static_cast<cl_mem>(to),
@@ -156,11 +160,11 @@ void OpenCLWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) {
void* OpenCLWorkspace::AllocWorkspace(TVMContext ctx,
size_t size,
TVMType type_hint) {
return GetThreadEntry()->pool.AllocWorkspace(ctx, size);
}
void OpenCLWorkspace::FreeWorkspace(TVMContext ctx, void* data) {
GetThreadEntry()->pool.FreeWorkspace(ctx, data);
}
typedef dmlc::ThreadLocalStore<OpenCLThreadEntry> OpenCLThreadStore;
@@ -223,38 +227,39 @@ bool MatchPlatformInfo(
return param_value.find(value) != std::string::npos;
}
void OpenCLWorkspace::Init(const std::string& device_type, const std::string& platform_name) {
  if (initialized_) return;
  std::lock_guard<std::mutex> lock(this->mu);
  if (initialized_) return;
  initialized_ = true;
  if (context != nullptr) return;
  // matched platforms
  std::vector<cl_platform_id> platform_ids = cl::GetPlatformIDs();
  if (platform_ids.size() == 0) {
    LOG(WARNING) << "No OpenCL platform matched given existing options ...";
    return;
  }
  this->platform_id = nullptr;
  for (auto platform_id : platform_ids) {
    if (!MatchPlatformInfo(platform_id, CL_PLATFORM_NAME, platform_name)) {
      continue;
    }
    std::vector<cl_device_id> devices_matched = cl::GetDeviceIDs(platform_id, device_type);
    if (devices_matched.size() > 0) {
      this->platform_id = platform_id;
      this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME);
      this->device_type = device_type;
      this->devices = devices_matched;
      LOG(INFO) << "Initialize OpenCL platform \'" << this->platform_name << '\'';
      break;
    }
    LOG(INFO) << "\'" << cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME)
              << "\' platform has no OpenCL device: " << device_type << " mode";
  }
  if (this->platform_id == nullptr) {
    LOG(WARNING) << "No OpenCL device";
    return;
  }
  cl_int err_code;
  this->context = clCreateContext(
      nullptr, this->devices.size(), &(this->devices[0]),
@@ -272,11 +277,6 @@ void OpenCLWorkspace::Init() {
}
}
bool InitOpenCL(TVMArgs args, TVMRetValue* rv) {
cl::OpenCLWorkspace::Global()->Init();
return true;
}
TVM_REGISTER_GLOBAL("device_api.opencl")
.set_body([](TVMArgs args, TVMRetValue* rv) {
DeviceAPI* ptr = OpenCLWorkspace::Global().get();
...
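The reworked Init now scans every available platform instead of taking the first one, filtering by a substring of CL_PLATFORM_NAME and by device type string. A hedged usage sketch (argument values are illustrative; the include path is assumed relative to src/runtime/opencl):

#include "./opencl_common.h"

void InitWorkspaceExamples() {
  using tvm::runtime::cl::OpenCLWorkspace;
  // Default OpenCL path: pick the first platform that exposes a "gpu" device.
  OpenCLWorkspace::Global()->Init("gpu");
  // Platform-constrained path (what SDAccelWorkspace::Init does below): only
  // platforms whose CL_PLATFORM_NAME contains "Xilinx" and that expose
  // "accelerator" devices are considered. Note a workspace initializes only
  // once per process, so a backend effectively commits to one of these.
  OpenCLWorkspace::Global()->Init("accelerator", "Xilinx");
}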
@@ -10,180 +10,9 @@
#include "./opencl_common.h"
#include "./opencl_module.h"
#include "../pack_args.h"
#include "../thread_storage_scope.h"
#include "../meta_data.h"
#include "../file_util.h"
namespace tvm {
namespace runtime {
// Module to support thread-safe multi-device execution.
// OpenCL runtime is a bit tricky because clSetKernelArg is not thread-safe
// To make the call thread-safe, we create a thread-local kernel table
// and lazily install new kernels into the kernel table when the kernel is called.
// The kernels are recycled when the module gets destructed.
class OpenCLModuleNode : public ModuleNode {
public:
// Kernel table reference entry.
struct KTRefEntry {
size_t kernel_id;
size_t version;
};
explicit OpenCLModuleNode(std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source)
: data_(data), fmt_(fmt), fmap_(fmap), source_(source) {}
// destructor
~OpenCLModuleNode() {
{
// free the kernel ids in global table.
std::lock_guard<std::mutex> lock(workspace_->mu);
for (auto& kv : kid_map_) {
workspace_->free_kernel_ids.push_back(kv.second.kernel_id);
}
}
// free the kernels
for (cl_kernel k : kernels_) {
OPENCL_CALL(clReleaseKernel(k));
}
if (program_) {
OPENCL_CALL(clReleaseProgram(program_));
}
}
const char* type_key() const final {
return "opencl";
}
PackedFunc GetFunction(
const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) final;
void SaveToFile(const std::string& file_name,
const std::string& format) final {
std::string fmt = GetFileFormat(file_name, format);
CHECK_EQ(fmt, fmt_)
<< "Can only save to format=" << fmt_;
std::string meta_file = GetMetaFilePath(file_name);
SaveMetaDataToFile(meta_file, fmap_);
SaveBinaryToFile(file_name, data_);
}
void SaveToBinary(dmlc::Stream* stream) final {
stream->Write(fmt_);
stream->Write(fmap_);
stream->Write(data_);
}
std::string GetSource(const std::string& format) final {
if (format == fmt_) return data_;
if (fmt_ == "cl") {
return data_;
} else {
return source_;
}
}
// Initialize the programs
void Init() {
workspace_ = cl::OpenCLWorkspace::Global();
workspace_->Init();
CHECK(workspace_->context != nullptr) << "No OpenCL device";
if (fmt_ == "cl") {
const char* s = data_.c_str();
size_t len = data_.length();
cl_int err;
program_ = clCreateProgramWithSource(
workspace_->context, 1, &s, &len, &err);
OPENCL_CHECK_ERROR(err);
} else if (fmt_ == "xclbin" || fmt_ == "awsxclbin") {
const unsigned char* s = (const unsigned char *)data_.c_str();
size_t len = data_.length();
cl_int err;
program_ = clCreateProgramWithBinary(
workspace_->context, 1, &(workspace_->devices[0]), &len, &s, NULL, &err);
if (err != CL_SUCCESS) {
LOG(ERROR) << "OpenCL Error: " << cl::CLGetErrorString(err);
}
} else {
LOG(FATAL) << "Unknown OpenCL format " << fmt_;
}
device_built_flag_.resize(workspace_->devices.size(), false);
// initialize the kernel id, need to lock global table.
std::lock_guard<std::mutex> lock(workspace_->mu);
for (const auto& kv : fmap_) {
const std::string& key = kv.first;
KTRefEntry e;
if (workspace_->free_kernel_ids.size() != 0) {
e.kernel_id = workspace_->free_kernel_ids.back();
workspace_->free_kernel_ids.pop_back();
} else {
e.kernel_id = workspace_->num_registered_kernels++;
}
e.version = workspace_->timestamp++;
kid_map_[key] = e;
}
}
// install a new kernel to thread local entry
cl_kernel InstallKernel(cl::OpenCLWorkspace* w,
cl::OpenCLThreadEntry* t,
const std::string& func_name,
const KTRefEntry& e) {
std::lock_guard<std::mutex> lock(build_lock_);
int device_id = t->context.device_id;
if (!device_built_flag_[device_id]) {
// build program
cl_int err;
cl_device_id dev = w->devices[device_id];
err = clBuildProgram(program_, 1, &dev, nullptr, nullptr, nullptr);
if (err != CL_SUCCESS) {
size_t len;
std::string log;
clGetProgramBuildInfo(
program_, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &len);
log.resize(len);
clGetProgramBuildInfo(
program_, dev, CL_PROGRAM_BUILD_LOG, len, &log[0], nullptr);
LOG(FATAL) << "OpenCL build error for device=" << dev << log;
}
device_built_flag_[device_id] = true;
}
// build kernel
cl_int err;
cl_kernel kernel = clCreateKernel(program_, func_name.c_str(), &err);
OPENCL_CHECK_ERROR(err);
t->kernel_table[e.kernel_id].kernel = kernel;
t->kernel_table[e.kernel_id].version = e.version;
kernels_.push_back(kernel);
return kernel;
}
private:
// The workspace, need to keep reference to use it in destructor.
// In case of static destruction order problem.
std::shared_ptr<cl::OpenCLWorkspace> workspace_;
// the binary data
std::string data_;
// The format
std::string fmt_;
// function information table.
std::unordered_map<std::string, FunctionInfo> fmap_;
// Module local mutex
std::mutex build_lock_;
// The OpenCL source.
std::string source_;
// the binary data
cl_program program_{nullptr};
// build info
std::vector<bool> device_built_flag_;
// kernel id cache
std::unordered_map<std::string, KTRefEntry> kid_map_;
// kernels build so far.
std::vector<cl_kernel> kernels_;
};
class OpenCLWrappedFunc {
public:
// initialize the OpenCL function.
@@ -193,7 +22,7 @@ class OpenCLWrappedFunc {
std::string func_name,
std::vector<size_t> arg_size,
const std::vector<std::string>& thread_axis_tags) {
w_ = m->GetGlobalWorkspace().get();
m_ = m;
sptr_ = sptr;
entry_ = entry;
@@ -205,7 +34,7 @@ class OpenCLWrappedFunc {
void operator()(TVMArgs args,
TVMRetValue* rv,
void** void_args) const {
cl::OpenCLThreadEntry* t = w_->GetThreadEntry();
// get the kernel from thread local kernel table.
if (entry_.kernel_id >= t->kernel_table.size()) {
t->kernel_table.resize(entry_.kernel_id + 1);
@@ -250,6 +79,31 @@ class OpenCLWrappedFunc {
ThreadAxisConfig thread_axis_cfg_;
};
OpenCLModuleNode::~OpenCLModuleNode() {
{
// free the kernel ids in global table.
std::lock_guard<std::mutex> lock(workspace_->mu);
for (auto& kv : kid_map_) {
workspace_->free_kernel_ids.push_back(kv.second.kernel_id);
}
}
// free the kernels
for (cl_kernel k : kernels_) {
OPENCL_CALL(clReleaseKernel(k));
}
if (program_) {
OPENCL_CALL(clReleaseProgram(program_));
}
}
const std::shared_ptr<cl::OpenCLWorkspace>& OpenCLModuleNode::GetGlobalWorkspace() {
return cl::OpenCLWorkspace::Global();
}
const char* OpenCLModuleNode::type_key() const {
return "opencl";
}
PackedFunc OpenCLModuleNode::GetFunction(
const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) {
@@ -279,6 +133,104 @@ PackedFunc OpenCLModuleNode::GetFunction(
return PackFuncVoidAddr(f, info.arg_types);
}
void OpenCLModuleNode::SaveToFile(const std::string& file_name,
const std::string& format) {
std::string fmt = GetFileFormat(file_name, format);
CHECK_EQ(fmt, fmt_)
<< "Can only save to format=" << fmt_;
std::string meta_file = GetMetaFilePath(file_name);
SaveMetaDataToFile(meta_file, fmap_);
SaveBinaryToFile(file_name, data_);
}
void OpenCLModuleNode::SaveToBinary(dmlc::Stream* stream) {
stream->Write(fmt_);
stream->Write(fmap_);
stream->Write(data_);
}
std::string OpenCLModuleNode::GetSource(const std::string& format) {
if (format == fmt_) return data_;
if (fmt_ == "cl") {
return data_;
} else {
return source_;
}
}
void OpenCLModuleNode::Init() {
workspace_ = GetGlobalWorkspace();
workspace_->Init();
CHECK(workspace_->context != nullptr) << "No OpenCL device";
if (fmt_ == "cl") {
const char* s = data_.c_str();
size_t len = data_.length();
cl_int err;
program_ = clCreateProgramWithSource(
workspace_->context, 1, &s, &len, &err);
OPENCL_CHECK_ERROR(err);
} else if (fmt_ == "xclbin" || fmt_ == "awsxclbin") {
const unsigned char* s = (const unsigned char *)data_.c_str();
size_t len = data_.length();
cl_int err;
program_ = clCreateProgramWithBinary(
workspace_->context, 1, &(workspace_->devices[0]), &len, &s, NULL, &err);
if (err != CL_SUCCESS) {
LOG(ERROR) << "OpenCL Error: " << cl::CLGetErrorString(err);
}
} else {
LOG(FATAL) << "Unknown OpenCL format " << fmt_;
}
device_built_flag_.resize(workspace_->devices.size(), false);
// initialize the kernel id, need to lock global table.
std::lock_guard<std::mutex> lock(workspace_->mu);
for (const auto& kv : fmap_) {
const std::string& key = kv.first;
KTRefEntry e;
if (workspace_->free_kernel_ids.size() != 0) {
e.kernel_id = workspace_->free_kernel_ids.back();
workspace_->free_kernel_ids.pop_back();
} else {
e.kernel_id = workspace_->num_registered_kernels++;
}
e.version = workspace_->timestamp++;
kid_map_[key] = e;
}
}
cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w,
cl::OpenCLThreadEntry* t,
const std::string& func_name,
const KTRefEntry& e) {
std::lock_guard<std::mutex> lock(build_lock_);
int device_id = t->context.device_id;
if (!device_built_flag_[device_id]) {
// build program
cl_int err;
cl_device_id dev = w->devices[device_id];
err = clBuildProgram(program_, 1, &dev, nullptr, nullptr, nullptr);
if (err != CL_SUCCESS) {
size_t len;
std::string log;
clGetProgramBuildInfo(
program_, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &len);
log.resize(len);
clGetProgramBuildInfo(
program_, dev, CL_PROGRAM_BUILD_LOG, len, &log[0], nullptr);
LOG(FATAL) << "OpenCL build error for device=" << dev << log;
}
device_built_flag_[device_id] = true;
}
// build kernel
cl_int err;
cl_kernel kernel = clCreateKernel(program_, func_name.c_str(), &err);
OPENCL_CHECK_ERROR(err);
t->kernel_table[e.kernel_id].kernel = kernel;
t->kernel_table[e.kernel_id].version = e.version;
kernels_.push_back(kernel);
return kernel;
}
Module OpenCLModuleCreate(
std::string data,
std::string fmt,
@@ -323,16 +275,6 @@ TVM_REGISTER_GLOBAL("module.loadfile_clbin")
*rv = OpenCLModuleLoadFile(args[0], args[1]);
});
TVM_REGISTER_GLOBAL("module.loadfile_xclbin")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = OpenCLModuleLoadFile(args[0], args[1]);
});
TVM_REGISTER_GLOBAL("module.loadfile_awsxclbin")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = OpenCLModuleLoadFile(args[0], args[1]);
});
TVM_REGISTER_GLOBAL("module.loadbinary_opencl")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = OpenCLModuleLoadBinary(args[0]);
...
@@ -15,7 +15,7 @@
namespace tvm {
namespace runtime {
/*!
* \brief create an OpenCL module for GPU devices from data.
*
* \param data The module data.
* \param fmt The format of the data, can be "clbin", "cl"
...
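For contrast with the SDAccel-specific factory introduced below, a hedged sketch of building a plain OpenCL module straight from source (the kernel string is illustrative, and the function-info map would normally be filled by codegen via ExtractFuncInfo):

#include <string>
#include <unordered_map>
#include "./opencl_module.h"

tvm::runtime::Module MakeAddOneModule() {
  // A trivial OpenCL kernel; real modules get their source from codegen.
  std::string cl_source =
      "__kernel void add_one(__global float* x) { x[get_global_id(0)] += 1.0f; }";
  // Left empty here, so the module exposes no packed functions.
  std::unordered_map<std::string, tvm::runtime::FunctionInfo> fmap;
  // Note: the module builds its cl_program eagerly, so an OpenCL device must
  // be available when this runs.
  return tvm::runtime::OpenCLModuleCreate(cl_source, "cl", fmap, cl_source);
}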
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_common.h
* \brief SDAccel common header
*/
#ifndef TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
#define TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
#include "../opencl_common.h"
namespace tvm {
namespace runtime {
namespace cl {
/*!
* \brief Process global SDAccel workspace.
*/
class SDAccelWorkspace final : public OpenCLWorkspace {
public:
// override OpenCL device API
void Init() final;
bool IsOpenCLDevice(TVMContext ctx) final;
OpenCLThreadEntry* GetThreadEntry() final;
// get the global workspace
static const std::shared_ptr<OpenCLWorkspace>& Global();
};
/*! \brief Thread local workspace for SDAccel*/
class SDAccelThreadEntry : public OpenCLThreadEntry {
public:
// constructor
SDAccelThreadEntry()
: OpenCLThreadEntry(static_cast<DLDeviceType>(kDLSDAccel), SDAccelWorkspace::Global()) {}
// get the global workspace
static SDAccelThreadEntry* ThreadLocal();
};
} // namespace cl
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
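SDAccel is the first consumer of these virtual hooks, but the same pattern extends to any other OpenCL platform. A purely hypothetical sketch (none of these names exist in this commit; the platform string and the reuse of kExtDev are illustrative only):

// Hypothetical second FPGA backend living next to sdaccel_common.h,
// in namespace tvm::runtime::cl.
class AOCLWorkspace final : public OpenCLWorkspace {
 public:
  void Init() final {
    // Match only "accelerator" devices on platforms whose CL_PLATFORM_NAME
    // contains "Intel(R) FPGA".
    OpenCLWorkspace::Init("accelerator", "Intel(R) FPGA");
  }
  bool IsOpenCLDevice(TVMContext ctx) final {
    // kExtDev stands in for a dedicated device type value.
    return ctx.device_type == static_cast<DLDeviceType>(kExtDev);
  }
  OpenCLThreadEntry* GetThreadEntry() final;
  static const std::shared_ptr<OpenCLWorkspace>& Global();
};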
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_device_api.cc
*/
#include <tvm/runtime/registry.h>
#include <dmlc/thread_local.h>
#include <tvm/container.h>
#include <tvm/ir.h>
#include <tvm/packed_func_ext.h>
#include "./sdaccel_common.h"
namespace tvm {
namespace runtime {
namespace cl {
OpenCLThreadEntry* SDAccelWorkspace::GetThreadEntry() {
return SDAccelThreadEntry::ThreadLocal();
}
const std::shared_ptr<OpenCLWorkspace>& SDAccelWorkspace::Global() {
static std::shared_ptr<OpenCLWorkspace> inst = std::make_shared<SDAccelWorkspace>();
return inst;
}
void SDAccelWorkspace::Init() {
OpenCLWorkspace::Init("accelerator", "Xilinx");
}
bool SDAccelWorkspace::IsOpenCLDevice(TVMContext ctx) {
return ctx.device_type == static_cast<DLDeviceType>(kDLSDAccel);
}
typedef dmlc::ThreadLocalStore<SDAccelThreadEntry> SDAccelThreadStore;
SDAccelThreadEntry* SDAccelThreadEntry::ThreadLocal() {
return SDAccelThreadStore::Get();
}
TVM_REGISTER_GLOBAL("device_api.sdaccel")
.set_body([](TVMArgs args, TVMRetValue* rv) {
DeviceAPI* ptr = SDAccelWorkspace::Global().get();
*rv = static_cast<void*>(ptr);
});
} // namespace cl
} // namespace runtime
} // namespace tvm
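The runtime resolves device APIs by name through the global registry ("device_api." plus the device name), so the function registered above is what a context with device type kDLSDAccel ends up using. A hedged sketch of that lookup (mirroring what the internal device API manager is assumed to do):

#include <dmlc/logging.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/device_api.h>

tvm::runtime::DeviceAPI* GetSDAccelDeviceAPI() {
  // "device_api.sdaccel" is the key registered by TVM_REGISTER_GLOBAL above.
  const tvm::runtime::PackedFunc* f =
      tvm::runtime::Registry::Get("device_api.sdaccel");
  CHECK(f != nullptr) << "SDAccel runtime is not enabled";
  void* ptr = (*f)();
  return static_cast<tvm::runtime::DeviceAPI*>(ptr);
}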
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_module.cc
*/
#include <dmlc/memory_io.h>
#include <tvm/runtime/registry.h>
#include <vector>
#include <string>
#include <unordered_map>
#include "./sdaccel_common.h"
#include "./sdaccel_module.h"
namespace tvm {
namespace runtime {
class SDAccelModuleNode : public OpenCLModuleNode {
public:
explicit SDAccelModuleNode(std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source)
: OpenCLModuleNode(data, fmt, fmap, source) {}
const std::shared_ptr<cl::OpenCLWorkspace>& GetGlobalWorkspace() final;
const char* type_key() const final;
};
const std::shared_ptr<cl::OpenCLWorkspace>& SDAccelModuleNode::GetGlobalWorkspace() {
return cl::SDAccelWorkspace::Global();
}
const char* SDAccelModuleNode::type_key() const {
return "sdaccel";
}
Module SDAccelModuleCreate(
std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source) {
std::shared_ptr<SDAccelModuleNode> n =
std::make_shared<SDAccelModuleNode>(data, fmt, fmap, source);
n->Init();
return Module(n);
}
Module SDAccelModuleLoadFile(const std::string& file_name,
const std::string& format) {
std::string data;
std::unordered_map<std::string, FunctionInfo> fmap;
std::string fmt = GetFileFormat(file_name, format);
std::string meta_file = GetMetaFilePath(file_name);
LoadBinaryFromFile(file_name, &data);
LoadMetaDataFromFile(meta_file, &fmap);
return SDAccelModuleCreate(data, fmt, fmap, std::string());
}
Module SDAccelModuleLoadBinary(void* strm) {
dmlc::Stream* stream = static_cast<dmlc::Stream*>(strm);
std::string data;
std::unordered_map<std::string, FunctionInfo> fmap;
std::string fmt;
stream->Read(&fmt);
stream->Read(&fmap);
stream->Read(&data);
return SDAccelModuleCreate(data, fmt, fmap, std::string());
}
TVM_REGISTER_GLOBAL("module.loadfile_xclbin")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = SDAccelModuleLoadFile(args[0], args[1]);
});
TVM_REGISTER_GLOBAL("module.loadfile_awsxclbin")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = SDAccelModuleLoadFile(args[0], args[1]);
});
} // namespace runtime
} // namespace tvm
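Because module.loadfile_xclbin and module.loadfile_awsxclbin now route to the SDAccel module, a saved kernel container comes back with the "sdaccel" type key. A hedged sketch of loading one (the file name is illustrative, and the matching .tvm_meta.json produced at save time is expected to sit next to it):

#include <dmlc/logging.h>
#include <tvm/runtime/module.h>

tvm::runtime::Module LoadVaddKernel() {
  // Dispatches on the ".xclbin" extension to module.loadfile_xclbin above.
  tvm::runtime::Module m = tvm::runtime::Module::LoadFromFile("vadd.xclbin");
  LOG(INFO) << "Loaded module with type key: " << m->type_key();
  return m;
}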
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_module.h
* \brief Execution handling of OPENCL kernels for SDAccel FPGAs
*/
#ifndef TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_
#define TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_
#include <tvm/runtime/packed_func.h>
#include <memory>
#include <vector>
#include <string>
#include "../../meta_data.h"
namespace tvm {
namespace runtime {
/*!
* \brief create an OpenCL module for SDAccel from data.
*
* \param data The module data.
* \param fmt The format of the data, can be "xclbin", "awsxclbin"
* \param fmap The map function information map of each function.
*/
Module SDAccelModuleCreate(
std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source);
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_