Commit cb68c82c by MORITA Kazutaka, committed by Tianqi Chen

Add support for multiple OpenCL platforms (#1345)

parent f927e1f3
@@ -39,6 +39,9 @@ set(USE_CUDA OFF)
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)
# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)
# Whether enable OpenCL runtime
set(USE_OPENCL OFF)
...
@@ -7,6 +7,18 @@ if(OpenCL_FOUND)
include_directories(${OpenCL_INCLUDE_DIRS})
endif(OpenCL_FOUND)
if(USE_SDACCEL)
message(STATUS "Build with SDAccel support")
file(GLOB RUNTIME_SDACCEL_SRCS src/runtime/opencl/sdaccel/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_SDACCEL_SRCS})
if(NOT USE_OPENCL)
message(STATUS "Enable OpenCL support required for SDAccel")
set(USE_OPENCL ON)
endif()
else()
list(APPEND COMPILER_SRCS src/codegen/opt/build_sdaccel_off.cc)
endif(USE_SDACCEL)
if(USE_OPENCL)
find_package(OpenCL REQUIRED)
message(STATUS "Build with OpenCL support")
...
@@ -60,6 +60,7 @@ typedef int64_t tvm_index_t;
/*! \brief Extension device types in TVM */
typedef enum {
kDLSDAccel = 6,
kDLVulkan = 7,
kOpenGL = 11,
// Extension DRAM type, used for quickly test extension device
...
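With kDLSDAccel added as an extension device type, an SDAccel context is constructed like any other TVM context. A minimal sketch (not part of this commit; the helper name is illustrative), using the same cast that SDAccelThreadEntry applies further down:

#include <tvm/runtime/c_runtime_api.h>

// Build a TVMContext that targets an SDAccel device.
// kDLSDAccel (= 6) is the enum value introduced by this change.
TVMContext MakeSDAccelContext(int device_id) {
  TVMContext ctx;
  ctx.device_type = static_cast<DLDeviceType>(kDLSDAccel);
  ctx.device_id = device_id;
  return ctx;
}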
@@ -95,6 +95,7 @@ class TVMContext(ctypes.Structure):
1 : 'cpu',
2 : 'gpu',
4 : 'opencl',
6 : 'sdaccel',
7 : 'vulkan',
8 : 'metal',
9 : 'vpi',
@@ -111,7 +112,7 @@ class TVMContext(ctypes.Structure):
'nvptx': 2,
'cl': 4,
'opencl': 4,
'sdaccel': 6,
'vulkan': 7,
'metal': 8,
'vpi': 9,
...
"""Utility for Interacting with SDAccel Tools""" """Utility for Interacting with SDAccel Tools"""
import subprocess import subprocess
import os import os
import re
from . import util from . import util
from ..api import register_func from ..api import register_func
def _vhls_to_opencl(code):
    """Convert source code from Vivado HLS to OpenCL."""
    out = ''
    for line in code.split('\n'):
        if re.match(r'#include', line):
            # OpenCL doesn't support include.
            continue
        if re.match(r'#pragma', line):
            # Remove Vivado HLS specific pragmas.
            continue
        if re.match(r'extern "C"', line):
            line = re.sub(r'^extern "C"', "__kernel", line)
            # Add __global to pointer parameters.
            line = re.sub(r'(\w+)\s*\*', r"__global \1*", line)
        out += line + '\n'
    return out


def _fake_compile_vhls(code):
    """Fake compile Vivado HLS code for SDAccel.

    Compile the Vivado HLS code as an OpenCL code, and generate a program
    binary for GPU which can be used instead of xclbin.

    Parameters
    ----------
    code : str
        The Vivado HLS code.

    Return
    ------
    binary : bytearray
        The program binary which can be passed to clCreateProgramWithBinary
    """
    try:
        import pyopencl as cl
    except ImportError:
        raise ImportError('PyOpenCL is required for testing SDAccel backend.')
    ctx = cl.Context(dev_type=cl.device_type.GPU)
    program = cl.Program(ctx, _vhls_to_opencl(code)).build()
    binary = bytearray(program.binaries[0])
    return binary
@register_func("tvm_callback_sdaccel_compile")
def compile_vhls(code, kernel):
    """Compile Vivado HLS code for SDAccel.
@@ -87,9 +39,7 @@ def compile_vhls(code, kernel):
    platform = os.environ.get("XCL_PLATFORM", os.environ.get("AWS_PLATFORM"))
    if platform is None:
        raise RuntimeError("No Xilinx device specified.")
    # build xo
    args = [xocc, "-c", "-t", target, "--platform", platform, "-o", tmp_xo, "-k", kernel] + \
...
@@ -6,7 +6,7 @@
#include <string>
#include "./codegen_vhls.h"
#include "./build_common.h"
#include "../runtime/opencl/sdaccel/sdaccel_module.h"
namespace tvm {
namespace codegen {
@@ -91,7 +91,7 @@ runtime::Module BuildSDAccel(Array<LoweredFunc> funcs) {
} else {
LOG(FATAL) << "Cannot compile Vivado HLS code.";
}
return SDAccelModuleCreate(xclbin, "xclbin", ExtractFuncInfo(funcs), code);
}
TVM_REGISTER_API("codegen.build_sdaccel")
...
/*!
* Copyright (c) 2018 by Contributors
* Optional module when build sdaccel is switched to off
*/
#include "../codegen_source_base.h"
#include "../../runtime/opencl/opencl_module.h"
namespace tvm {
namespace runtime {
Module SDAccelModuleCreate(
std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source) {
LOG(WARNING) << "OpenCL runtime not enabled, return a source module...";
return codegen::DeviceSourceModuleCreate(data, fmt, fmap, "sdaccel");
}
} // namespace runtime
} // namespace tvm
@@ -36,7 +36,7 @@ class MemoryAccessVerifier final : protected IRVisitor {
/// Interface to perform memory access verification
void Run() {
if (!IsGPUDevice(dev_type_) && !IsFPGADevice(dev_type_)) return;
IRVisitor::Visit(func_->body);
}
@@ -143,6 +143,10 @@ class MemoryAccessVerifier final : protected IRVisitor {
kDLVulkan == dev_type || kDLMetal == dev_type ||
kDLROCM == dev_type || kOpenGL == dev_type;
}
/// Check if a given DLDeviceType/TVMDeviceExtType value denotes FPGA device.
static bool IsFPGADevice(int dev_type) {
return kDLSDAccel == dev_type;
}
private:
/// Status of visitor
...
@@ -31,6 +31,7 @@ inline std::string DeviceName(int type) {
case kDLCPU: return "cpu";
case kDLGPU: return "gpu";
case kDLOpenCL: return "opencl";
case kDLSDAccel: return "sdaccel";
case kDLVulkan: return "vulkan";
case kDLMetal: return "metal";
case kDLVPI: return "vpi";
...
@@ -21,6 +21,10 @@
#include <string>
#include <vector>
#include "../workspace_pool.h"
#include "../pack_args.h"
#include "../thread_storage_scope.h"
#include "../meta_data.h"
#include "../file_util.h"
namespace tvm {
namespace runtime {
@@ -97,17 +101,23 @@ inline const char* CLGetErrorString(cl_int error) {
OPENCL_CHECK_ERROR(e); \
}
class OpenCLThreadEntry;
/*!
* \brief Process global OpenCL workspace.
*/
class OpenCLWorkspace : public DeviceAPI {
public:
// global platform id
cl_platform_id platform_id;
// global platform name
std::string platform_name;
// global context of this process
cl_context context{nullptr};
// whether the workspace is initialized.
bool initialized_{false};
// the device type
std::string device_type;
// the devices
std::vector<cl_device_id> devices;
// the queues
@@ -128,10 +138,17 @@ class OpenCLWorkspace final : public DeviceAPI {
}
}
// Initialize the device.
void Init(const std::string& device_type, const std::string& platform_name = "");
virtual void Init() {
Init("gpu");
}
// Check whether the context is OpenCL or not.
virtual bool IsOpenCLDevice(TVMContext ctx) {
return ctx.device_type == kDLOpenCL;
}
// get the queue of the context
cl_command_queue GetQueue(TVMContext ctx) {
CHECK(IsOpenCLDevice(ctx));
this->Init();
CHECK(ctx.device_id >= 0 && static_cast<size_t>(ctx.device_id) < queues.size())
<< "Invalid OpenCL device_id=" << ctx.device_id;
@@ -157,6 +174,12 @@ class OpenCLWorkspace final : public DeviceAPI {
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
void FreeWorkspace(TVMContext ctx, void* data) final;
/*!
* \brief Get the thread local ThreadEntry
*/
virtual OpenCLThreadEntry* GetThreadEntry();
// get the global workspace
static const std::shared_ptr<OpenCLWorkspace>& Global();
};
@@ -179,15 +202,87 @@ class OpenCLThreadEntry {
/*! \brief workspace pool */
WorkspacePool pool;
// constructor
OpenCLThreadEntry(DLDeviceType device_type, std::shared_ptr<DeviceAPI> device)
: pool(device_type, device) {
context.device_id = 0;
context.device_type = device_type;
}
OpenCLThreadEntry()
: OpenCLThreadEntry(kDLOpenCL, OpenCLWorkspace::Global()) {}
// get the global workspace
static OpenCLThreadEntry* ThreadLocal();
};
} // namespace cl
// Module to support thread-safe multi-device execution.
// OpenCL runtime is a bit tricky because clSetKernelArg is not thread-safe
// To make the call thread-safe, we create a thread-local kernel table
// and lazily install new kernels into the kernel table when the kernel is called.
// The kernels are recycled when the module gets destructed.
class OpenCLModuleNode : public ModuleNode {
public:
// Kernel table reference entry.
struct KTRefEntry {
size_t kernel_id;
size_t version;
};
explicit OpenCLModuleNode(std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source)
: data_(data), fmt_(fmt), fmap_(fmap), source_(source) {}
// destructor
~OpenCLModuleNode();
/*!
* \brief Get the global workspace
*/
virtual const std::shared_ptr<cl::OpenCLWorkspace>& GetGlobalWorkspace();
virtual const char* type_key() const;
PackedFunc GetFunction(
const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) final;
void SaveToFile(const std::string& file_name,
const std::string& format) final;
void SaveToBinary(dmlc::Stream* stream) final;
std::string GetSource(const std::string& format) final;
// Initialize the programs
void Init();
// install a new kernel to thread local entry
cl_kernel InstallKernel(cl::OpenCLWorkspace* w,
cl::OpenCLThreadEntry* t,
const std::string& func_name,
const KTRefEntry& e);
protected:
// The workspace, need to keep reference to use it in destructor.
// In case of static destruction order problem.
std::shared_ptr<cl::OpenCLWorkspace> workspace_;
// the binary data
std::string data_;
private:
// The format
std::string fmt_;
// function information table.
std::unordered_map<std::string, FunctionInfo> fmap_;
// Module local mutex
std::mutex build_lock_;
// The OpenCL source.
std::string source_;
// the binary data
cl_program program_{nullptr};
// build info
std::vector<bool> device_built_flag_;
// kernel id cache
std::unordered_map<std::string, KTRefEntry> kid_map_;
// kernels built so far.
std::vector<cl_kernel> kernels_;
};
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
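The comment above OpenCLModuleNode describes the thread-local kernel table; the lookup itself lives in OpenCLWrappedFunc::operator(), whose body is mostly elided in this diff. A hedged sketch of that lookup (assuming the thread-local entries carry the kernel handle and version that InstallKernel stores), to make the recycling scheme concrete:

// Sketch only: resolve a kernel for the calling thread, installing it on a
// cache miss or when the owning module has been re-created (version
// mismatch). clSetKernelArg then only ever touches thread-local cl_kernel
// handles, which keeps multi-threaded launches safe.
// (Assume this lives in namespace tvm::runtime with opencl_common.h included.)
cl_kernel LookupOrInstallKernel(OpenCLModuleNode* m,
                                cl::OpenCLWorkspace* w,
                                cl::OpenCLThreadEntry* t,
                                const std::string& func_name,
                                const OpenCLModuleNode::KTRefEntry& e) {
  if (e.kernel_id >= t->kernel_table.size()) {
    t->kernel_table.resize(e.kernel_id + 1);
  }
  const auto& entry = t->kernel_table[e.kernel_id];
  if (entry.kernel == nullptr || entry.version != e.version) {
    return m->InstallKernel(w, t, func_name, e);
  }
  return entry.kernel;
}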
@@ -10,13 +10,17 @@ namespace tvm {
namespace runtime {
namespace cl {
OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() {
return OpenCLThreadEntry::ThreadLocal();
}
const std::shared_ptr<OpenCLWorkspace>& OpenCLWorkspace::Global() {
static std::shared_ptr<OpenCLWorkspace> inst = std::make_shared<OpenCLWorkspace>();
return inst;
}
void OpenCLWorkspace::SetDevice(TVMContext ctx) {
GetThreadEntry()->context.device_id = ctx.device_id;
}
void OpenCLWorkspace::GetAttr(
@@ -121,13 +125,13 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from,
TVMStreamHandle stream) {
this->Init();
CHECK(stream == nullptr);
if (IsOpenCLDevice(ctx_from) && IsOpenCLDevice(ctx_to)) {
OPENCL_CALL(clEnqueueCopyBuffer(
this->GetQueue(ctx_to),
static_cast<cl_mem>((void*)from), // NOLINT(*)
static_cast<cl_mem>(to),
from_offset, to_offset, size, 0, nullptr, nullptr));
} else if (IsOpenCLDevice(ctx_from) && ctx_to.device_type == kDLCPU) {
OPENCL_CALL(clEnqueueReadBuffer(
this->GetQueue(ctx_from),
static_cast<cl_mem>((void*)from), // NOLINT(*)
@@ -135,7 +139,7 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from,
static_cast<char*>(to) + to_offset,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(this->GetQueue(ctx_from)));
} else if (ctx_from.device_type == kDLCPU && IsOpenCLDevice(ctx_to)) {
OPENCL_CALL(clEnqueueWriteBuffer(
this->GetQueue(ctx_to),
static_cast<cl_mem>(to),
@@ -156,11 +160,11 @@ void OpenCLWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) {
void* OpenCLWorkspace::AllocWorkspace(TVMContext ctx,
size_t size,
TVMType type_hint) {
return GetThreadEntry()->pool.AllocWorkspace(ctx, size);
}
void OpenCLWorkspace::FreeWorkspace(TVMContext ctx, void* data) {
GetThreadEntry()->pool.FreeWorkspace(ctx, data);
}
typedef dmlc::ThreadLocalStore<OpenCLThreadEntry> OpenCLThreadStore;
@@ -223,38 +227,39 @@ bool MatchPlatformInfo(
return param_value.find(value) != std::string::npos;
}
void OpenCLWorkspace::Init(const std::string& device_type, const std::string& platform_name) {
  if (initialized_) return;
  std::lock_guard<std::mutex> lock(this->mu);
  if (initialized_) return;
  initialized_ = true;
  if (context != nullptr) return;
  // matched platforms
  std::vector<cl_platform_id> platform_ids = cl::GetPlatformIDs();
  if (platform_ids.size() == 0) {
    LOG(WARNING) << "No OpenCL platform matched given existing options ...";
    return;
  }
  this->platform_id = nullptr;
  for (auto platform_id : platform_ids) {
    if (!MatchPlatformInfo(platform_id, CL_PLATFORM_NAME, platform_name)) {
      continue;
    }
    std::vector<cl_device_id> devices_matched = cl::GetDeviceIDs(platform_id, device_type);
    if (devices_matched.size() > 0) {
      this->platform_id = platform_id;
      this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME);
      this->device_type = device_type;
      this->devices = devices_matched;
      LOG(INFO) << "Initialize OpenCL platform \'" << this->platform_name << '\'';
      break;
    }
    LOG(INFO) << "\'" << cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME)
              << "\' platform has no OpenCL device: " << device_type << " mode";
  }
  if (this->platform_id == nullptr) {
    LOG(WARNING) << "No OpenCL device";
    return;
  }
  cl_int err_code;
  this->context = clCreateContext(
      nullptr, this->devices.size(), &(this->devices[0]),
@@ -272,11 +277,6 @@ void OpenCLWorkspace::Init() {
}
}
bool InitOpenCL(TVMArgs args, TVMRetValue* rv) {
cl::OpenCLWorkspace::Global()->Init();
return true;
}
TVM_REGISTER_GLOBAL("device_api.opencl")
.set_body([](TVMArgs args, TVMRetValue* rv) {
DeviceAPI* ptr = OpenCLWorkspace::Global().get();
...
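The reworked Init now scans every available platform instead of taking the first one, filtering by a substring of CL_PLATFORM_NAME and by device type string. A hedged usage sketch (argument values are illustrative; the include path is assumed relative to src/runtime/opencl):

#include "./opencl_common.h"

void InitWorkspaceExamples() {
  using tvm::runtime::cl::OpenCLWorkspace;
  // Default OpenCL path: pick the first platform that exposes a "gpu" device.
  OpenCLWorkspace::Global()->Init("gpu");
  // Platform-constrained path (what SDAccelWorkspace::Init does below): only
  // platforms whose CL_PLATFORM_NAME contains "Xilinx" and that expose
  // "accelerator" devices are considered. Note a workspace initializes only
  // once per process, so a backend effectively commits to one of these.
  OpenCLWorkspace::Global()->Init("accelerator", "Xilinx");
}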
@@ -10,180 +10,9 @@
#include "./opencl_common.h"
#include "./opencl_module.h"
#include "../pack_args.h"
#include "../thread_storage_scope.h"
#include "../meta_data.h"
#include "../file_util.h"
namespace tvm {
namespace runtime {
// Module to support thread-safe multi-device execution.
// OpenCL runtime is a bit tricky because clSetKernelArg is not thread-safe
// To make the call thread-safe, we create a thread-local kernel table
// and lazily install new kernels into the kernel table when the kernel is called.
// The kernels are recycled when the module gets destructed.
class OpenCLModuleNode : public ModuleNode {
public:
// Kernel table reference entry.
struct KTRefEntry {
size_t kernel_id;
size_t version;
};
explicit OpenCLModuleNode(std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source)
: data_(data), fmt_(fmt), fmap_(fmap), source_(source) {}
// destructor
~OpenCLModuleNode() {
{
// free the kernel ids in global table.
std::lock_guard<std::mutex> lock(workspace_->mu);
for (auto& kv : kid_map_) {
workspace_->free_kernel_ids.push_back(kv.second.kernel_id);
}
}
// free the kernels
for (cl_kernel k : kernels_) {
OPENCL_CALL(clReleaseKernel(k));
}
if (program_) {
OPENCL_CALL(clReleaseProgram(program_));
}
}
const char* type_key() const final {
return "opencl";
}
PackedFunc GetFunction(
const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) final;
void SaveToFile(const std::string& file_name,
const std::string& format) final {
std::string fmt = GetFileFormat(file_name, format);
CHECK_EQ(fmt, fmt_)
<< "Can only save to format=" << fmt_;
std::string meta_file = GetMetaFilePath(file_name);
SaveMetaDataToFile(meta_file, fmap_);
SaveBinaryToFile(file_name, data_);
}
void SaveToBinary(dmlc::Stream* stream) final {
stream->Write(fmt_);
stream->Write(fmap_);
stream->Write(data_);
}
std::string GetSource(const std::string& format) final {
if (format == fmt_) return data_;
if (fmt_ == "cl") {
return data_;
} else {
return source_;
}
}
// Initialize the programs
void Init() {
workspace_ = cl::OpenCLWorkspace::Global();
workspace_->Init();
CHECK(workspace_->context != nullptr) << "No OpenCL device";
if (fmt_ == "cl") {
const char* s = data_.c_str();
size_t len = data_.length();
cl_int err;
program_ = clCreateProgramWithSource(
workspace_->context, 1, &s, &len, &err);
OPENCL_CHECK_ERROR(err);
} else if (fmt_ == "xclbin" || fmt_ == "awsxclbin") {
const unsigned char* s = (const unsigned char *)data_.c_str();
size_t len = data_.length();
cl_int err;
program_ = clCreateProgramWithBinary(
workspace_->context, 1, &(workspace_->devices[0]), &len, &s, NULL, &err);
if (err != CL_SUCCESS) {
LOG(ERROR) << "OpenCL Error: " << cl::CLGetErrorString(err);
}
} else {
LOG(FATAL) << "Unknown OpenCL format " << fmt_;
}
device_built_flag_.resize(workspace_->devices.size(), false);
// initialize the kernel id, need to lock global table.
std::lock_guard<std::mutex> lock(workspace_->mu);
for (const auto& kv : fmap_) {
const std::string& key = kv.first;
KTRefEntry e;
if (workspace_->free_kernel_ids.size() != 0) {
e.kernel_id = workspace_->free_kernel_ids.back();
workspace_->free_kernel_ids.pop_back();
} else {
e.kernel_id = workspace_->num_registered_kernels++;
}
e.version = workspace_->timestamp++;
kid_map_[key] = e;
}
}
// install a new kernel to thread local entry
cl_kernel InstallKernel(cl::OpenCLWorkspace* w,
cl::OpenCLThreadEntry* t,
const std::string& func_name,
const KTRefEntry& e) {
std::lock_guard<std::mutex> lock(build_lock_);
int device_id = t->context.device_id;
if (!device_built_flag_[device_id]) {
// build program
cl_int err;
cl_device_id dev = w->devices[device_id];
err = clBuildProgram(program_, 1, &dev, nullptr, nullptr, nullptr);
if (err != CL_SUCCESS) {
size_t len;
std::string log;
clGetProgramBuildInfo(
program_, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &len);
log.resize(len);
clGetProgramBuildInfo(
program_, dev, CL_PROGRAM_BUILD_LOG, len, &log[0], nullptr);
LOG(FATAL) << "OpenCL build error for device=" << dev << log;
}
device_built_flag_[device_id] = true;
}
// build kernel
cl_int err;
cl_kernel kernel = clCreateKernel(program_, func_name.c_str(), &err);
OPENCL_CHECK_ERROR(err);
t->kernel_table[e.kernel_id].kernel = kernel;
t->kernel_table[e.kernel_id].version = e.version;
kernels_.push_back(kernel);
return kernel;
}
private:
// The workspace, need to keep reference to use it in destructor.
// In case of static destruction order problem.
std::shared_ptr<cl::OpenCLWorkspace> workspace_;
// the binary data
std::string data_;
// The format
std::string fmt_;
// function information table.
std::unordered_map<std::string, FunctionInfo> fmap_;
// Module local mutex
std::mutex build_lock_;
// The OpenCL source.
std::string source_;
// the binary data
cl_program program_{nullptr};
// build info
std::vector<bool> device_built_flag_;
// kernel id cache
std::unordered_map<std::string, KTRefEntry> kid_map_;
// kernels build so far.
std::vector<cl_kernel> kernels_;
};
class OpenCLWrappedFunc {
public:
// initialize the OpenCL function.
@@ -193,7 +22,7 @@ class OpenCLWrappedFunc {
std::string func_name,
std::vector<size_t> arg_size,
const std::vector<std::string>& thread_axis_tags) {
w_ = m->GetGlobalWorkspace().get();
m_ = m;
sptr_ = sptr;
entry_ = entry;
@@ -205,7 +34,7 @@ class OpenCLWrappedFunc {
void operator()(TVMArgs args,
TVMRetValue* rv,
void** void_args) const {
cl::OpenCLThreadEntry* t = w_->GetThreadEntry();
// get the kernel from thread local kernel table.
if (entry_.kernel_id >= t->kernel_table.size()) {
t->kernel_table.resize(entry_.kernel_id + 1);
@@ -250,6 +79,31 @@ class OpenCLWrappedFunc {
ThreadAxisConfig thread_axis_cfg_;
};
OpenCLModuleNode::~OpenCLModuleNode() {
{
// free the kernel ids in global table.
std::lock_guard<std::mutex> lock(workspace_->mu);
for (auto& kv : kid_map_) {
workspace_->free_kernel_ids.push_back(kv.second.kernel_id);
}
}
// free the kernels
for (cl_kernel k : kernels_) {
OPENCL_CALL(clReleaseKernel(k));
}
if (program_) {
OPENCL_CALL(clReleaseProgram(program_));
}
}
const std::shared_ptr<cl::OpenCLWorkspace>& OpenCLModuleNode::GetGlobalWorkspace() {
return cl::OpenCLWorkspace::Global();
}
const char* OpenCLModuleNode::type_key() const {
return "opencl";
}
PackedFunc OpenCLModuleNode::GetFunction(
const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) {
@@ -279,6 +133,104 @@ PackedFunc OpenCLModuleNode::GetFunction(
return PackFuncVoidAddr(f, info.arg_types);
}
void OpenCLModuleNode::SaveToFile(const std::string& file_name,
const std::string& format) {
std::string fmt = GetFileFormat(file_name, format);
CHECK_EQ(fmt, fmt_)
<< "Can only save to format=" << fmt_;
std::string meta_file = GetMetaFilePath(file_name);
SaveMetaDataToFile(meta_file, fmap_);
SaveBinaryToFile(file_name, data_);
}
void OpenCLModuleNode::SaveToBinary(dmlc::Stream* stream) {
stream->Write(fmt_);
stream->Write(fmap_);
stream->Write(data_);
}
std::string OpenCLModuleNode::GetSource(const std::string& format) {
if (format == fmt_) return data_;
if (fmt_ == "cl") {
return data_;
} else {
return source_;
}
}
void OpenCLModuleNode::Init() {
workspace_ = GetGlobalWorkspace();
workspace_->Init();
CHECK(workspace_->context != nullptr) << "No OpenCL device";
if (fmt_ == "cl") {
const char* s = data_.c_str();
size_t len = data_.length();
cl_int err;
program_ = clCreateProgramWithSource(
workspace_->context, 1, &s, &len, &err);
OPENCL_CHECK_ERROR(err);
} else if (fmt_ == "xclbin" || fmt_ == "awsxclbin") {
const unsigned char* s = (const unsigned char *)data_.c_str();
size_t len = data_.length();
cl_int err;
program_ = clCreateProgramWithBinary(
workspace_->context, 1, &(workspace_->devices[0]), &len, &s, NULL, &err);
if (err != CL_SUCCESS) {
LOG(ERROR) << "OpenCL Error: " << cl::CLGetErrorString(err);
}
} else {
LOG(FATAL) << "Unknown OpenCL format " << fmt_;
}
device_built_flag_.resize(workspace_->devices.size(), false);
// initialize the kernel id, need to lock global table.
std::lock_guard<std::mutex> lock(workspace_->mu);
for (const auto& kv : fmap_) {
const std::string& key = kv.first;
KTRefEntry e;
if (workspace_->free_kernel_ids.size() != 0) {
e.kernel_id = workspace_->free_kernel_ids.back();
workspace_->free_kernel_ids.pop_back();
} else {
e.kernel_id = workspace_->num_registered_kernels++;
}
e.version = workspace_->timestamp++;
kid_map_[key] = e;
}
}
cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w,
cl::OpenCLThreadEntry* t,
const std::string& func_name,
const KTRefEntry& e) {
std::lock_guard<std::mutex> lock(build_lock_);
int device_id = t->context.device_id;
if (!device_built_flag_[device_id]) {
// build program
cl_int err;
cl_device_id dev = w->devices[device_id];
err = clBuildProgram(program_, 1, &dev, nullptr, nullptr, nullptr);
if (err != CL_SUCCESS) {
size_t len;
std::string log;
clGetProgramBuildInfo(
program_, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &len);
log.resize(len);
clGetProgramBuildInfo(
program_, dev, CL_PROGRAM_BUILD_LOG, len, &log[0], nullptr);
LOG(FATAL) << "OpenCL build error for device=" << dev << log;
}
device_built_flag_[device_id] = true;
}
// build kernel
cl_int err;
cl_kernel kernel = clCreateKernel(program_, func_name.c_str(), &err);
OPENCL_CHECK_ERROR(err);
t->kernel_table[e.kernel_id].kernel = kernel;
t->kernel_table[e.kernel_id].version = e.version;
kernels_.push_back(kernel);
return kernel;
}
Module OpenCLModuleCreate(
std::string data,
std::string fmt,
@@ -323,16 +275,6 @@ TVM_REGISTER_GLOBAL("module.loadfile_clbin")
*rv = OpenCLModuleLoadFile(args[0], args[1]);
});
TVM_REGISTER_GLOBAL("module.loadfile_xclbin")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = OpenCLModuleLoadFile(args[0], args[1]);
});
TVM_REGISTER_GLOBAL("module.loadfile_awsxclbin")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = OpenCLModuleLoadFile(args[0], args[1]);
});
TVM_REGISTER_GLOBAL("module.loadbinary_opencl")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = OpenCLModuleLoadBinary(args[0]);
...
@@ -15,7 +15,7 @@
namespace tvm {
namespace runtime {
/*!
* \brief create an OpenCL module for GPU devices from data.
*
* \param data The module data.
* \param fmt The format of the data, can be "clbin", "cl"
...
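For contrast with the SDAccel-specific factory introduced below, a hedged sketch of building a plain OpenCL module straight from source (the kernel string is illustrative, and the function-info map would normally be filled by codegen via ExtractFuncInfo):

#include <string>
#include <unordered_map>
#include "./opencl_module.h"

tvm::runtime::Module MakeAddOneModule() {
  // A trivial OpenCL kernel; real modules get their source from codegen.
  std::string cl_source =
      "__kernel void add_one(__global float* x) { x[get_global_id(0)] += 1.0f; }";
  // Left empty here, so the module exposes no packed functions.
  std::unordered_map<std::string, tvm::runtime::FunctionInfo> fmap;
  // Note: the module builds its cl_program eagerly, so an OpenCL device must
  // be available when this runs.
  return tvm::runtime::OpenCLModuleCreate(cl_source, "cl", fmap, cl_source);
}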
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_common.h
* \brief SDAccel common header
*/
#ifndef TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
#define TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
#include "../opencl_common.h"
namespace tvm {
namespace runtime {
namespace cl {
/*!
* \brief Process global SDAccel workspace.
*/
class SDAccelWorkspace final : public OpenCLWorkspace {
public:
// override OpenCL device API
void Init() final;
bool IsOpenCLDevice(TVMContext ctx) final;
OpenCLThreadEntry* GetThreadEntry() final;
// get the global workspace
static const std::shared_ptr<OpenCLWorkspace>& Global();
};
/*! \brief Thread local workspace for SDAccel*/
class SDAccelThreadEntry : public OpenCLThreadEntry {
public:
// constructor
SDAccelThreadEntry()
: OpenCLThreadEntry(static_cast<DLDeviceType>(kDLSDAccel), SDAccelWorkspace::Global()) {}
// get the global workspace
static SDAccelThreadEntry* ThreadLocal();
};
} // namespace cl
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
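SDAccel is the first consumer of these virtual hooks, but the same pattern extends to any other OpenCL platform. A purely hypothetical sketch (none of these names exist in this commit; the platform string and the reuse of kExtDev are illustrative only):

// Hypothetical second FPGA backend living next to sdaccel_common.h,
// in namespace tvm::runtime::cl.
class AOCLWorkspace final : public OpenCLWorkspace {
 public:
  void Init() final {
    // Match only "accelerator" devices on platforms whose CL_PLATFORM_NAME
    // contains "Intel(R) FPGA".
    OpenCLWorkspace::Init("accelerator", "Intel(R) FPGA");
  }
  bool IsOpenCLDevice(TVMContext ctx) final {
    // kExtDev stands in for a dedicated device type value.
    return ctx.device_type == static_cast<DLDeviceType>(kExtDev);
  }
  OpenCLThreadEntry* GetThreadEntry() final;
  static const std::shared_ptr<OpenCLWorkspace>& Global();
};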
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_device_api.cc
*/
#include <tvm/runtime/registry.h>
#include <dmlc/thread_local.h>
#include <tvm/container.h>
#include <tvm/ir.h>
#include <tvm/packed_func_ext.h>
#include "./sdaccel_common.h"
namespace tvm {
namespace runtime {
namespace cl {
OpenCLThreadEntry* SDAccelWorkspace::GetThreadEntry() {
return SDAccelThreadEntry::ThreadLocal();
}
const std::shared_ptr<OpenCLWorkspace>& SDAccelWorkspace::Global() {
static std::shared_ptr<OpenCLWorkspace> inst = std::make_shared<SDAccelWorkspace>();
return inst;
}
void SDAccelWorkspace::Init() {
OpenCLWorkspace::Init("accelerator", "Xilinx");
}
bool SDAccelWorkspace::IsOpenCLDevice(TVMContext ctx) {
return ctx.device_type == static_cast<DLDeviceType>(kDLSDAccel);
}
typedef dmlc::ThreadLocalStore<SDAccelThreadEntry> SDAccelThreadStore;
SDAccelThreadEntry* SDAccelThreadEntry::ThreadLocal() {
return SDAccelThreadStore::Get();
}
TVM_REGISTER_GLOBAL("device_api.sdaccel")
.set_body([](TVMArgs args, TVMRetValue* rv) {
DeviceAPI* ptr = SDAccelWorkspace::Global().get();
*rv = static_cast<void*>(ptr);
});
} // namespace cl
} // namespace runtime
} // namespace tvm
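The runtime resolves device APIs by name through the global registry ("device_api." plus the device name), so the function registered above is what a context with device type kDLSDAccel ends up using. A hedged sketch of that lookup (mirroring what the internal device API manager is assumed to do):

#include <dmlc/logging.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/device_api.h>

tvm::runtime::DeviceAPI* GetSDAccelDeviceAPI() {
  // "device_api.sdaccel" is the key registered by TVM_REGISTER_GLOBAL above.
  const tvm::runtime::PackedFunc* f =
      tvm::runtime::Registry::Get("device_api.sdaccel");
  CHECK(f != nullptr) << "SDAccel runtime is not enabled";
  void* ptr = (*f)();
  return static_cast<tvm::runtime::DeviceAPI*>(ptr);
}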
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_module.cc
*/
#include <dmlc/memory_io.h>
#include <tvm/runtime/registry.h>
#include <vector>
#include <string>
#include <unordered_map>
#include "./sdaccel_common.h"
#include "./sdaccel_module.h"
namespace tvm {
namespace runtime {
class SDAccelModuleNode : public OpenCLModuleNode {
public:
explicit SDAccelModuleNode(std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source)
: OpenCLModuleNode(data, fmt, fmap, source) {}
const std::shared_ptr<cl::OpenCLWorkspace>& GetGlobalWorkspace() final;
const char* type_key() const final;
};
const std::shared_ptr<cl::OpenCLWorkspace>& SDAccelModuleNode::GetGlobalWorkspace() {
return cl::SDAccelWorkspace::Global();
}
const char* SDAccelModuleNode::type_key() const {
return "sdaccel";
}
Module SDAccelModuleCreate(
std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source) {
std::shared_ptr<SDAccelModuleNode> n =
std::make_shared<SDAccelModuleNode>(data, fmt, fmap, source);
n->Init();
return Module(n);
}
Module SDAccelModuleLoadFile(const std::string& file_name,
const std::string& format) {
std::string data;
std::unordered_map<std::string, FunctionInfo> fmap;
std::string fmt = GetFileFormat(file_name, format);
std::string meta_file = GetMetaFilePath(file_name);
LoadBinaryFromFile(file_name, &data);
LoadMetaDataFromFile(meta_file, &fmap);
return SDAccelModuleCreate(data, fmt, fmap, std::string());
}
Module SDAccelModuleLoadBinary(void* strm) {
dmlc::Stream* stream = static_cast<dmlc::Stream*>(strm);
std::string data;
std::unordered_map<std::string, FunctionInfo> fmap;
std::string fmt;
stream->Read(&fmt);
stream->Read(&fmap);
stream->Read(&data);
return SDAccelModuleCreate(data, fmt, fmap, std::string());
}
TVM_REGISTER_GLOBAL("module.loadfile_xclbin")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = SDAccelModuleLoadFile(args[0], args[1]);
});
TVM_REGISTER_GLOBAL("module.loadfile_awsxclbin")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = SDAccelModuleLoadFile(args[0], args[1]);
});
} // namespace runtime
} // namespace tvm
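Because module.loadfile_xclbin and module.loadfile_awsxclbin now route to the SDAccel module, a saved kernel container comes back with the "sdaccel" type key. A hedged sketch of loading one (the file name is illustrative, and the matching .tvm_meta.json produced at save time is expected to sit next to it):

#include <dmlc/logging.h>
#include <tvm/runtime/module.h>

tvm::runtime::Module LoadVaddKernel() {
  // Dispatches on the ".xclbin" extension to module.loadfile_xclbin above.
  tvm::runtime::Module m = tvm::runtime::Module::LoadFromFile("vadd.xclbin");
  LOG(INFO) << "Loaded module with type key: " << m->type_key();
  return m;
}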
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_module.h
* \brief Execution handling of OPENCL kernels for SDAccel FPGAs
*/
#ifndef TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_
#define TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_
#include <tvm/runtime/packed_func.h>
#include <memory>
#include <vector>
#include <string>
#include "../../meta_data.h"
namespace tvm {
namespace runtime {
/*!
* \brief create an OpenCL module for SDAccel from data.
*
* \param data The module data.
* \param fmt The format of the data, can be "xclbin", "awsxclbin"
* \param fmap The map function information map of each function.
*/
Module SDAccelModuleCreate(
std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source);
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_