Commit cb68c82c by MORITA Kazutaka Committed by Tianqi Chen

Add support for multiple OpenCL platforms (#1345)

parent f927e1f3
......@@ -39,6 +39,9 @@ set(USE_CUDA OFF)
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)
# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)
# Whether enable OpenCL runtime
set(USE_OPENCL OFF)
......
......@@ -7,6 +7,18 @@ if(OpenCL_FOUND)
include_directories(${OpenCL_INCLUDE_DIRS})
endif(OpenCL_FOUND)
if(USE_SDACCEL)
message(STATUS "Build with SDAccel support")
file(GLOB RUNTIME_SDACCEL_SRCS src/runtime/opencl/sdaccel/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_SDACCEL_SRCS})
if(NOT USE_OPENCL)
message(STATUS "Enable OpenCL support required for SDAccel")
set(USE_OPENCL ON)
endif()
else()
list(APPEND COMPILER_SRCS src/codegen/opt/build_sdaccel_off.cc)
endif(USE_SDACCEL)
if(USE_OPENCL)
find_package(OpenCL REQUIRED)
message(STATUS "Build with OpenCL support")
......
......@@ -60,6 +60,7 @@ typedef int64_t tvm_index_t;
/*! \brief Extension device types in TVM */
typedef enum {
kDLSDAccel = 6,
kDLVulkan = 7,
kOpenGL = 11,
// Extension DRAM type, used for quickly test extension device
......
......@@ -95,6 +95,7 @@ class TVMContext(ctypes.Structure):
1 : 'cpu',
2 : 'gpu',
4 : 'opencl',
6 : 'sdaccel',
7 : 'vulkan',
8 : 'metal',
9 : 'vpi',
......@@ -111,7 +112,7 @@ class TVMContext(ctypes.Structure):
'nvptx': 2,
'cl': 4,
'opencl': 4,
'sdaccel': 4,
'sdaccel': 6,
'vulkan': 7,
'metal': 8,
'vpi': 9,
......
"""Utility for Interacting with SDAccel Tools"""
import subprocess
import os
import re
from . import util
from ..api import register_func
def _vhls_to_opencl(code):
"""Convert source code from Vivado HLS to OpenCL."""
out = ''
for line in code.split('\n'):
if re.match(r'#include', line):
# OpenCL doesn't support include.
continue
if re.match(r'#pragma', line):
# Remove Vivado HLS specific pragmas.
continue
if re.match(r'extern "C"', line):
line = re.sub(r'^extern "C"', "__kernel", line)
# Add __global to pointer parameters.
line = re.sub(r'(\w+)\s*\*', r"__global \1*", line)
out += line + '\n'
return out
def _fake_compile_vhls(code):
    """Fake compile Vivado HLS code for SDAccel.

    Compile the Vivado HLS code as an OpenCL code, and generate a program
    binary for GPU which can be used instead of xclbin.

    Parameters
    ----------
    code : str
        The Vivado HLS code.

    Return
    ------
    binary : bytearray
        The program binary which can be passed to clCreateProgramWithBinary
    """
    try:
        import pyopencl as cl
    except ImportError:
        raise ImportError('PyOpenCL is required for testing SDAccel backend.')
    # Build the converted OpenCL kernel on a GPU device and extract the
    # resulting program binary for the first device in the context.
    gpu_ctx = cl.Context(dev_type=cl.device_type.GPU)
    built = cl.Program(gpu_ctx, _vhls_to_opencl(code)).build()
    return bytearray(built.binaries[0])
@register_func("tvm_callback_sdaccel_compile")
def compile_vhls(code, kernel):
"""Compile Vivado HLS code for SDAccel.
......@@ -87,9 +39,7 @@ def compile_vhls(code, kernel):
platform = os.environ.get("XCL_PLATFORM", os.environ.get("AWS_PLATFORM"))
if platform is None:
# If we don't have the Xilinx toolchain, create a program binary for
# GPU and use it for testing.
return _fake_compile_vhls(code)
raise RuntimeError("No Xlinx device specified.")
# build xo
args = [xocc, "-c", "-t", target, "--platform", platform, "-o", tmp_xo, "-k", kernel] + \
......
......@@ -6,7 +6,7 @@
#include <string>
#include "./codegen_vhls.h"
#include "./build_common.h"
#include "../runtime/opencl/opencl_module.h"
#include "../runtime/opencl/sdaccel/sdaccel_module.h"
namespace tvm {
namespace codegen {
......@@ -91,7 +91,7 @@ runtime::Module BuildSDAccel(Array<LoweredFunc> funcs) {
} else {
LOG(FATAL) << "Cannot compile Vivado HLS code.";
}
return OpenCLModuleCreate(xclbin, "xclbin", ExtractFuncInfo(funcs), code);
return SDAccelModuleCreate(xclbin, "xclbin", ExtractFuncInfo(funcs), code);
}
TVM_REGISTER_API("codegen.build_sdaccel")
......
/*!
* Copyright (c) 2018 by Contributors
* Optional module when build opencl is switched to off
*/
#include "../codegen_source_base.h"
#include "../../runtime/opencl/opencl_module.h"
namespace tvm {
namespace runtime {
/*!
 * \brief Fallback SDAccel module factory used when the SDAccel runtime
 *  is disabled at build time (USE_SDACCEL=OFF).
 * \param data The compiled kernel binary.
 * \param fmt The format of the binary data.
 * \param fmap The function information map of each function.
 * \param source The generated device source code.
 * \return A source-only module that can be inspected but not executed.
 */
Module SDAccelModuleCreate(
    std::string data,
    std::string fmt,
    std::unordered_map<std::string, FunctionInfo> fmap,
    std::string source) {
  // This stub is compiled in place of the real runtime when USE_SDACCEL is
  // off, so report SDAccel (not OpenCL) as the disabled component.
  LOG(WARNING) << "SDAccel runtime not enabled, return a source module...";
  return codegen::DeviceSourceModuleCreate(data, fmt, fmap, "sdaccel");
}
} // namespace runtime
} // namespace tvm
......@@ -36,7 +36,7 @@ class MemoryAccessVerifier final : protected IRVisitor {
/// Interface to perform memory access verification
void Run() {
if (!IsGPUDevice(dev_type_)) return;
if (!IsGPUDevice(dev_type_) && !IsFPGADevice(dev_type_)) return;
IRVisitor::Visit(func_->body);
}
......@@ -143,6 +143,10 @@ class MemoryAccessVerifier final : protected IRVisitor {
kDLVulkan == dev_type || kDLMetal == dev_type ||
kDLROCM == dev_type || kOpenGL == dev_type;
}
/// Check if a given DLDeviceType/TVMDeviceExtType value denotes FPGA device.
static bool IsFPGADevice(int dev_type) {
return kDLSDAccel == dev_type;
}
private:
/// Status of visitor
......
......@@ -31,6 +31,7 @@ inline std::string DeviceName(int type) {
case kDLCPU: return "cpu";
case kDLGPU: return "gpu";
case kDLOpenCL: return "opencl";
case kDLSDAccel: return "sdaccel";
case kDLVulkan: return "vulkan";
case kDLMetal: return "metal";
case kDLVPI: return "vpi";
......
......@@ -21,6 +21,10 @@
#include <string>
#include <vector>
#include "../workspace_pool.h"
#include "../pack_args.h"
#include "../thread_storage_scope.h"
#include "../meta_data.h"
#include "../file_util.h"
namespace tvm {
namespace runtime {
......@@ -97,17 +101,23 @@ inline const char* CLGetErrorString(cl_int error) {
OPENCL_CHECK_ERROR(e); \
}
class OpenCLThreadEntry;
/*!
* \brief Process global OpenCL workspace.
*/
class OpenCLWorkspace final : public DeviceAPI {
class OpenCLWorkspace : public DeviceAPI {
public:
// global platform id
cl_platform_id platform_id;
// global platform name
std::string platform_name;
// global context of this process
cl_context context{nullptr};
// whether the workspace it initialized.
bool initialized_{false};
// the device type
std::string device_type;
// the devices
std::vector<cl_device_id> devices;
// the queues
......@@ -128,10 +138,17 @@ class OpenCLWorkspace final : public DeviceAPI {
}
}
// Initialzie the device.
void Init();
void Init(const std::string& device_type, const std::string& platform_name = "");
virtual void Init() {
Init("gpu");
}
// Check whether the context is OpenCL or not.
virtual bool IsOpenCLDevice(TVMContext ctx) {
return ctx.device_type == kDLOpenCL;
}
// get the queue of the context
cl_command_queue GetQueue(TVMContext ctx) {
CHECK_EQ(ctx.device_type, kDLOpenCL);
CHECK(IsOpenCLDevice(ctx));
this->Init();
CHECK(ctx.device_id >= 0 && static_cast<size_t>(ctx.device_id) < queues.size())
<< "Invalid OpenCL device_id=" << ctx.device_id;
......@@ -157,6 +174,12 @@ class OpenCLWorkspace final : public DeviceAPI {
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
void FreeWorkspace(TVMContext ctx, void* data) final;
/*!
* \brief Get the thread local ThreadEntry
*/
virtual OpenCLThreadEntry* GetThreadEntry();
// get the global workspace
static const std::shared_ptr<OpenCLWorkspace>& Global();
};
......@@ -179,15 +202,87 @@ class OpenCLThreadEntry {
/*! \brief workspace pool */
WorkspacePool pool;
// constructor
OpenCLThreadEntry()
: pool(kDLOpenCL, OpenCLWorkspace::Global()) {
OpenCLThreadEntry(DLDeviceType device_type, std::shared_ptr<DeviceAPI> device)
: pool(device_type, device) {
context.device_id = 0;
context.device_type = kDLOpenCL;
context.device_type = device_type;
}
OpenCLThreadEntry()
: OpenCLThreadEntry(kDLOpenCL, OpenCLWorkspace::Global()) {}
// get the global workspace
static OpenCLThreadEntry* ThreadLocal();
};
} // namespace cl
// Module to support thread-safe multi-device execution.
// OpenCL runtime is a bit tricky because clSetKernelArg is not thread-safe
// To make the call thread-safe, we create a thread-local kernel table
// and lazily install new kernels into the kernel table when the kernel is called.
// The kernels are recycled when the module get destructed.
class OpenCLModuleNode : public ModuleNode {
public:
// Kernel table reference entry.
struct KTRefEntry {
size_t kernel_id;
size_t version;
};
explicit OpenCLModuleNode(std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source)
: data_(data), fmt_(fmt), fmap_(fmap), source_(source) {}
// destructor
~OpenCLModuleNode();
/*!
* \brief Get the global workspace
*/
virtual const std::shared_ptr<cl::OpenCLWorkspace>& GetGlobalWorkspace();
virtual const char* type_key() const;
PackedFunc GetFunction(
const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) final;
void SaveToFile(const std::string& file_name,
const std::string& format) final;
void SaveToBinary(dmlc::Stream* stream) final;
std::string GetSource(const std::string& format) final;
// Initialize the programs
void Init();
// install a new kernel to thread local entry
cl_kernel InstallKernel(cl::OpenCLWorkspace* w,
cl::OpenCLThreadEntry* t,
const std::string& func_name,
const KTRefEntry& e);
protected:
// The workspace, need to keep reference to use it in destructor.
// In case of static destruction order problem.
std::shared_ptr<cl::OpenCLWorkspace> workspace_;
// the binary data
std::string data_;
private:
// The format
std::string fmt_;
// function information table.
std::unordered_map<std::string, FunctionInfo> fmap_;
// Module local mutex
std::mutex build_lock_;
// The OpenCL source.
std::string source_;
// the binary data
cl_program program_{nullptr};
// build info
std::vector<bool> device_built_flag_;
// kernel id cache
std::unordered_map<std::string, KTRefEntry> kid_map_;
// kernels build so far.
std::vector<cl_kernel> kernels_;
};
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
......@@ -10,13 +10,17 @@ namespace tvm {
namespace runtime {
namespace cl {
OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() {
return OpenCLThreadEntry::ThreadLocal();
}
const std::shared_ptr<OpenCLWorkspace>& OpenCLWorkspace::Global() {
static std::shared_ptr<OpenCLWorkspace> inst = std::make_shared<OpenCLWorkspace>();
return inst;
}
void OpenCLWorkspace::SetDevice(TVMContext ctx) {
OpenCLThreadEntry::ThreadLocal()->context.device_id = ctx.device_id;
GetThreadEntry()->context.device_id = ctx.device_id;
}
void OpenCLWorkspace::GetAttr(
......@@ -121,13 +125,13 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from,
TVMStreamHandle stream) {
this->Init();
CHECK(stream == nullptr);
if (ctx_from.device_type == kDLOpenCL && ctx_to.device_type == kDLOpenCL) {
if (IsOpenCLDevice(ctx_from) && IsOpenCLDevice(ctx_to)) {
OPENCL_CALL(clEnqueueCopyBuffer(
this->GetQueue(ctx_to),
static_cast<cl_mem>((void*)from), // NOLINT(*)
static_cast<cl_mem>(to),
from_offset, to_offset, size, 0, nullptr, nullptr));
} else if (ctx_from.device_type == kDLOpenCL && ctx_to.device_type == kDLCPU) {
} else if (IsOpenCLDevice(ctx_from) && ctx_to.device_type == kDLCPU) {
OPENCL_CALL(clEnqueueReadBuffer(
this->GetQueue(ctx_from),
static_cast<cl_mem>((void*)from), // NOLINT(*)
......@@ -135,7 +139,7 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from,
static_cast<char*>(to) + to_offset,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(this->GetQueue(ctx_from)));
} else if (ctx_from.device_type == kDLCPU && ctx_to.device_type == kDLOpenCL) {
} else if (ctx_from.device_type == kDLCPU && IsOpenCLDevice(ctx_to)) {
OPENCL_CALL(clEnqueueWriteBuffer(
this->GetQueue(ctx_to),
static_cast<cl_mem>(to),
......@@ -156,11 +160,11 @@ void OpenCLWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) {
void* OpenCLWorkspace::AllocWorkspace(TVMContext ctx,
size_t size,
TVMType type_hint) {
return OpenCLThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
return GetThreadEntry()->pool.AllocWorkspace(ctx, size);
}
void OpenCLWorkspace::FreeWorkspace(TVMContext ctx, void* data) {
OpenCLThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data);
GetThreadEntry()->pool.FreeWorkspace(ctx, data);
}
typedef dmlc::ThreadLocalStore<OpenCLThreadEntry> OpenCLThreadStore;
......@@ -223,38 +227,39 @@ bool MatchPlatformInfo(
return param_value.find(value) != std::string::npos;
}
void OpenCLWorkspace::Init() {
void OpenCLWorkspace::Init(const std::string& device_type, const std::string& platform_name) {
if (initialized_) return;
std::lock_guard<std::mutex> lock(this->mu);
if (initialized_) return;
initialized_ = true;
if (context != nullptr) return;
// matched platforms
std::vector<cl_platform_id> platform_matched = cl::GetPlatformIDs();
if (platform_matched.size() == 0) {
std::vector<cl_platform_id> platform_ids = cl::GetPlatformIDs();
if (platform_ids.size() == 0) {
LOG(WARNING) << "No OpenCL platform matched given existing options ...";
return;
}
if (platform_matched.size() > 1) {
LOG(WARNING) << "Multiple OpenCL platforms matched, use the first one ... ";
}
this->platform_id = platform_matched[0];
LOG(INFO) << "Initialize OpenCL platform \'"
<< cl::GetPlatformInfo(this->platform_id, CL_PLATFORM_NAME) << '\'';
std::string device_types[] = {"accelerator", "gpu", "cpu"};
std::vector<cl_device_id> devices_matched;
for (auto type : device_types) {
devices_matched = cl::GetDeviceIDs(this->platform_id, type);
this->platform_id = nullptr;
for (auto platform_id : platform_ids) {
if (!MatchPlatformInfo(platform_id, CL_PLATFORM_NAME, platform_name)) {
continue;
}
std::vector<cl_device_id> devices_matched = cl::GetDeviceIDs(platform_id, device_type);
if (devices_matched.size() > 0) {
this->platform_id = platform_id;
this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME);
this->device_type = device_type;
this->devices = devices_matched;
LOG(INFO) << "Initialize OpenCL platform \'" << this->platform_name << '\'';
break;
}
LOG(INFO) << "No OpenCL device any device matched given the options: " << type << " mode";
LOG(INFO) << "\'" << cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME)
<< "\' platform has no OpenCL device: " << device_type << " mode";
}
if (devices_matched.size() == 0) {
if (this->platform_id == nullptr) {
LOG(WARNING) << "No OpenCL device";
return;
}
this->devices = devices_matched;
cl_int err_code;
this->context = clCreateContext(
nullptr, this->devices.size(), &(this->devices[0]),
......@@ -272,11 +277,6 @@ void OpenCLWorkspace::Init() {
}
}
bool InitOpenCL(TVMArgs args, TVMRetValue* rv) {
cl::OpenCLWorkspace::Global()->Init();
return true;
}
TVM_REGISTER_GLOBAL("device_api.opencl")
.set_body([](TVMArgs args, TVMRetValue* rv) {
DeviceAPI* ptr = OpenCLWorkspace::Global().get();
......
......@@ -15,7 +15,7 @@
namespace tvm {
namespace runtime {
/*!
* \brief create a opencl module from data.
* \brief create a opencl module for GPU devices from data.
*
* \param data The module data.
* \param fmt The format of the data, can be "clbin", "cl"
......
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_common.h
* \brief SDAccel common header
*/
#ifndef TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
#define TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
#include "../opencl_common.h"
namespace tvm {
namespace runtime {
namespace cl {
/*!
 * \brief Process global SDAccel workspace.
 *
 *  Specializes OpenCLWorkspace so that initialization targets
 *  accelerator-class devices on the Xilinx platform and device checks
 *  accept kDLSDAccel contexts instead of kDLOpenCL ones.
 */
class SDAccelWorkspace final : public OpenCLWorkspace {
 public:
  // override OpenCL device API
  void Init() final;
  bool IsOpenCLDevice(TVMContext ctx) final;
  OpenCLThreadEntry* GetThreadEntry() final;
  // get the global workspace
  static const std::shared_ptr<OpenCLWorkspace>& Global();
};
/*! \brief Thread local workspace for SDAccel. */
class SDAccelThreadEntry : public OpenCLThreadEntry {
 public:
  // constructor: bind the per-thread pool and default context to the
  // SDAccel device type and the SDAccel-specific global workspace.
  SDAccelThreadEntry()
      : OpenCLThreadEntry(static_cast<DLDeviceType>(kDLSDAccel), SDAccelWorkspace::Global()) {}

  // Get the calling thread's SDAccel entry.
  static SDAccelThreadEntry* ThreadLocal();
};
} // namespace cl
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_device_api.cc
*/
#include <tvm/runtime/registry.h>
#include <dmlc/thread_local.h>
#include <tvm/container.h>
#include <tvm/ir.h>
#include <tvm/packed_func_ext.h>
#include "./sdaccel_common.h"
namespace tvm {
namespace runtime {
namespace cl {
// Route thread-entry lookups to the SDAccel-specific thread-local store so
// SDAccel contexts do not share per-thread state with plain OpenCL ones.
OpenCLThreadEntry* SDAccelWorkspace::GetThreadEntry() {
  return SDAccelThreadEntry::ThreadLocal();
}
// Process-wide singleton workspace. Exposed through the base-class pointer
// type so generic OpenCL runtime code can use it interchangeably.
const std::shared_ptr<OpenCLWorkspace>& SDAccelWorkspace::Global() {
  static std::shared_ptr<OpenCLWorkspace> inst = std::make_shared<SDAccelWorkspace>();
  return inst;
}
// Initialize against accelerator-class devices on the Xilinx platform.
void SDAccelWorkspace::Init() {
  OpenCLWorkspace::Init("accelerator", "Xilinx");
}
// An SDAccel workspace only accepts contexts tagged with kDLSDAccel
// (overrides the base class, which checks for kDLOpenCL).
bool SDAccelWorkspace::IsOpenCLDevice(TVMContext ctx) {
  return ctx.device_type == static_cast<DLDeviceType>(kDLSDAccel);
}
// Per-thread storage for SDAccel state.
typedef dmlc::ThreadLocalStore<SDAccelThreadEntry> SDAccelThreadStore;

// Return the calling thread's SDAccel entry, creating it on first use.
SDAccelThreadEntry* SDAccelThreadEntry::ThreadLocal() {
  return SDAccelThreadStore::Get();
}
// Expose the SDAccel DeviceAPI through the global registry so the runtime
// can resolve it by the "device_api.sdaccel" key.
TVM_REGISTER_GLOBAL("device_api.sdaccel")
.set_body([](TVMArgs args, TVMRetValue* rv) {
    DeviceAPI* ptr = SDAccelWorkspace::Global().get();
    *rv = static_cast<void*>(ptr);
  });
} // namespace cl
} // namespace runtime
} // namespace tvm
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_module.cc
*/
#include <dmlc/memory_io.h>
#include <tvm/runtime/registry.h>
#include <vector>
#include <string>
#include <unordered_map>
#include "./sdaccel_common.h"
#include "./sdaccel_module.h"
namespace tvm {
namespace runtime {
/*!
 * \brief OpenCL module specialized for SDAccel devices.
 *
 *  Reuses OpenCLModuleNode's kernel management wholesale; only the global
 *  workspace lookup and the type key differ.
 */
class SDAccelModuleNode : public OpenCLModuleNode {
 public:
  explicit SDAccelModuleNode(std::string data,
                             std::string fmt,
                             std::unordered_map<std::string, FunctionInfo> fmap,
                             std::string source)
      : OpenCLModuleNode(data, fmt, fmap, source) {}

  const std::shared_ptr<cl::OpenCLWorkspace>& GetGlobalWorkspace() final;
  const char* type_key() const final;
};
// SDAccel modules run against the SDAccel workspace singleton rather than
// the generic OpenCL one.
const std::shared_ptr<cl::OpenCLWorkspace>& SDAccelModuleNode::GetGlobalWorkspace() {
  return cl::SDAccelWorkspace::Global();
}
// Type key identifying this module kind to the TVM module system.
const char* SDAccelModuleNode::type_key() const {
  return "sdaccel";
}
/*!
 * \brief Create an SDAccel module from compiled binary data.
 * \param data The module binary data.
 * \param fmt The format of the data.
 * \param fmap The function information map of each function.
 * \param source The generated device source code.
 * \return The initialized SDAccel module.
 */
Module SDAccelModuleCreate(
    std::string data,
    std::string fmt,
    std::unordered_map<std::string, FunctionInfo> fmap,
    std::string source) {
  auto node = std::make_shared<SDAccelModuleNode>(data, fmt, fmap, source);
  node->Init();
  return Module(node);
}
// Load an SDAccel module from an on-disk binary (xclbin/awsxclbin) together
// with its accompanying meta-data file.
Module SDAccelModuleLoadFile(const std::string& file_name,
                             const std::string& format) {
  const std::string fmt = GetFileFormat(file_name, format);
  std::string blob;
  LoadBinaryFromFile(file_name, &blob);
  std::unordered_map<std::string, FunctionInfo> finfo;
  LoadMetaDataFromFile(GetMetaFilePath(file_name), &finfo);
  return SDAccelModuleCreate(blob, fmt, finfo, std::string());
}
// Deserialize an SDAccel module from a dmlc::Stream. The stream layout is
// format string, function-info map, then the binary blob.
Module SDAccelModuleLoadBinary(void* strm) {
  auto* stream = static_cast<dmlc::Stream*>(strm);
  std::string fmt;
  std::unordered_map<std::string, FunctionInfo> fmap;
  std::string blob;
  stream->Read(&fmt);
  stream->Read(&fmap);
  stream->Read(&blob);
  return SDAccelModuleCreate(blob, fmt, fmap, std::string());
}
// Register the file loader so Module load-by-extension can handle .xclbin.
TVM_REGISTER_GLOBAL("module.loadfile_xclbin")
.set_body([](TVMArgs args, TVMRetValue* rv) {
    *rv = SDAccelModuleLoadFile(args[0], args[1]);
  });

// AWS-packaged binaries (.awsxclbin) load through the same path.
TVM_REGISTER_GLOBAL("module.loadfile_awsxclbin")
.set_body([](TVMArgs args, TVMRetValue* rv) {
    *rv = SDAccelModuleLoadFile(args[0], args[1]);
  });
} // namespace runtime
} // namespace tvm
/*!
* Copyright (c) 2018 by Contributors
* \file sdaccel_module.h
* \brief Execution handling of OPENCL kernels for SDAccel FPGAs
*/
#ifndef TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_
#define TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_
#include <tvm/runtime/packed_func.h>
#include <memory>
#include <vector>
#include <string>
#include "../../meta_data.h"
namespace tvm {
namespace runtime {
/*!
 * \brief create a opencl module for SDAccel from data.
 *
 * \param data The module data.
 * \param fmt The format of the data, can be "xclbin", "awsxclbin"
 * \param fmap The map function information map of each function.
 * \param source The generated device source code.
 * \return The created SDAccel module.
 */
Module SDAccelModuleCreate(
    std::string data,
    std::string fmt,
    std::unordered_map<std::string, FunctionInfo> fmap,
    std::string source);
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment