Commit 891630ed by Tianqi Chen Committed by GitHub

[CODEGEN/EXEC] CUDA, NVRTC pipeline complete (#27)

* [CODEGEN] CUDA/OPENCL pipeline complete

* Hide TVMType by str in frontend
parent ff06917c
......@@ -4,7 +4,7 @@ language: cpp
os:
- linux
# - osx
- osx
env:
# code analysis
......
Subproject commit 30bf0f043e6388418958fd1f29259ee43c42b600
Subproject commit 642ae50ac749c91c04483db04500163304d4334e
......@@ -15,7 +15,7 @@ all: lib/libtvm.a lib/libtvm.so
LIB_HALIDE_IR = HalideIR/lib/libHalideIR.a
SRC = $(wildcard src/*.cc src/*/*.cc)
SRC = $(wildcard src/*.cc src/*/*.cc src/*/*/*.cc)
ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC))
ALL_DEP = $(ALL_OBJ) $(LIB_HALIDE_IR)
......@@ -39,7 +39,7 @@ endif
ifeq ($(USE_CUDA), 1)
CFLAGS += -DTVM_CUDA_RUNTIME=1
LDFLAGS += -lcuda -lcudart
LDFLAGS += -lcuda -lcudart -lnvrtc
else
CFLAGS += -DTVM_CUDA_RUNTIME=0
endif
......@@ -92,3 +92,4 @@ clean:
-include build/*.d
-include build/*/*.d
-include build/*/*/*.d
......@@ -12,25 +12,9 @@
#include <string>
#include <memory>
#include <functional>
#include <typeinfo>
#include <type_traits>
namespace tvm {
/*!
*\brief whether to use CUDA runtime
*/
#ifndef TVM_CUDA_RUNTIME
#define TVM_CUDA_RUNTIME 1
#endif
/*!
*\brief whether to use opencl runtime
*/
#ifndef TVM_OPENCL_RUNTIME
#define TVM_OPENCL_RUNTIME 0
#endif
using ::tvm::Node;
using ::tvm::NodeRef;
using ::tvm::AttrVisitor;
......
......@@ -9,13 +9,18 @@
#include <string>
#include "./base.h"
#include "./expr.h"
#include "./module.h"
#include "./lowered_func.h"
#include "./runtime/packed_func.h"
namespace tvm {
/*! \brief namespace for lowlevel IR pass and codegen */
namespace codegen {
// use packed function from runtime.
using runtime::PackedFunc;
using runtime::TVMArgs;
using runtime::TVMRetValue;
/*!
* \brief Make an user callable API LoweredFunc.
*
......@@ -64,8 +69,35 @@ Array<Var> UndefinedVars(const LoweredFunc& f);
*/
Array<LoweredFunc> SplitHostDevice(LoweredFunc func);
/*!
* \brief Build a stack VM function.
* \param func The LoweredFunc to be build
* \param device_funcs The additional device functions
* \return A packed function representing the func.
*/
PackedFunc BuildStackVM(
LoweredFunc func,
const std::unordered_map<LoweredFunc, PackedFunc>& device_funcs);
/*!
* \brief Build a CUDA function with NVRTC
*
* \param fsplits The LoweredFuncs to be build (after SplitHostDevice)
* The first element is the host function, followed by device functions.
* \param host_mode The host side compilation mode:
* - "stackvm": use stack vm to interpret host side code.
*/
PackedFunc BuildNVRTC(Array<LoweredFunc> fsplits, std::string host_mode);
runtime::PackedFunc BuildStackVM(LoweredFunc func);
/*!
* \brief Build a OpenCL function.
*
* \param fsplits The LoweredFuncs to be build (after SplitHostDevice)
* The first element is the host function, followed by device functions.
* \param host_mode The host side compilation mode:
* - "stackvm": use stack vm to interpret host side code.
*/
PackedFunc BuildOpenCL(Array<LoweredFunc> fsplits, std::string host_mode);
} // namespace codegen
} // namespace tvm
......
......@@ -12,7 +12,7 @@
#include <string>
#include <algorithm>
#include "./base.h"
#include "./runtime/packed_func.h"
#include "./runtime/c_runtime_api.h"
namespace tvm {
......@@ -33,6 +33,19 @@ using Halide::Internal::Variable;
using Halide::Internal::make_const;
inline Type TVMType2Type(TVMType t) {
return Type(static_cast<halide_type_code_t>(t.code), t.bits, t.lanes);
}
inline TVMType Type2TVMType(Type t) {
TVMType ret;
ret.code = static_cast<uint8_t>(t.code());
ret.bits = static_cast<uint8_t>(t.bits());
ret.lanes = static_cast<uint16_t>(t.lanes());
return ret;
}
/*! \brief a named variable in TVM */
class Var : public Halide::VarExpr {
public:
......
/*!
* Copyright (c) 2016 by Contributors
* \file module.h
* \brief Low level IR module,
* Contains lowered function information.
* Copyright (c) 2017 by Contributors
* \file lowered_func.h
* \brief Information about a lowered TVM function.
* This data structure is final step toward codegen.
*/
#ifndef TVM_MODULE_H_
#define TVM_MODULE_H_
#ifndef TVM_LOWERED_FUNC_H_
#define TVM_LOWERED_FUNC_H_
#include <tvm/container.h>
#include <ir/FunctionBase.h>
......@@ -102,4 +102,13 @@ inline const LoweredFuncNode* LoweredFunc::operator->() const {
} // namespace tvm
#endif // TVM_MODULE_H_
namespace std {
template <>
struct hash<::tvm::LoweredFunc> {
std::size_t operator()(const ::tvm::LoweredFunc& k) const {
return k.hash();
}
};
}
#endif // TVM_LOWERED_FUNC_H_
......@@ -14,6 +14,7 @@
#include "./base.h"
#include "./expr.h"
#include "./runtime/packed_func.h"
namespace tvm {
using runtime::TVMArgs;
......@@ -162,19 +163,7 @@ inline void TVMArgsSetter::operator()(size_t i, NodeRef& other) const { // NOLI
type_codes_[i] = kNodeHandle;
}
// Type related stuffs
inline Type TVMType2Type(TVMType t) {
return Type(static_cast<halide_type_code_t>(t.code), t.bits, t.lanes);
}
inline TVMType Type2TVMType(Type t) {
TVMType ret;
ret.code = static_cast<uint8_t>(t.code());
ret.bits = static_cast<uint8_t>(t.bits());
ret.lanes = static_cast<uint16_t>(t.lanes());
return ret;
}
// type related stuffs
inline TVMRetValue& TVMRetValue::operator=(const Halide::Type& t) {
return this->operator=(Type2TVMType(t));
}
......
/*!
* Copyright (c) 2017 by Contributors
* \file config.h
* \brief Runtime library related configurations.
*/
#ifndef TVM_RUNTIME_CONFIG_H_
#define TVM_RUNTIME_CONFIG_H_
/*!
*\brief whether to use CUDA runtime
*/
#ifndef TVM_CUDA_RUNTIME
#define TVM_CUDA_RUNTIME 1
#endif
/*!
*\brief whether to use opencl runtime
*/
#ifndef TVM_OPENCL_RUNTIME
#define TVM_OPENCL_RUNTIME 0
#endif
#endif // TVM_RUNTIME_CONFIG_H_
......@@ -163,6 +163,13 @@ inline const char* TypeCode2Str(int type_code);
*/
inline TVMType String2TVMType(std::string s);
/*!
* \brief convert a TVM type to string.
* \param t The type to be converted.
* \return The corresponding tvm type in string.
*/
inline std::string TVMType2String(TVMType t);
// macro to check type code.
#define TVM_CHECK_TYPE_CODE(CODE, T) \
CHECK_EQ(CODE, T) << " expected " \
......@@ -258,6 +265,9 @@ class TVMArgValue : public TVMPODValue_ {
using TVMPODValue_::operator TVMArray*;
// conversion operator.
operator std::string() const {
if (type_code_ == kTVMType) {
return TVMType2String(operator TVMType());
}
TVM_CHECK_TYPE_CODE(type_code_, kStr);
return std::string(value_.v_str);
}
......@@ -308,7 +318,6 @@ class TVMRetValue : public TVMPODValue_ {
*/
TVMRetValue(TVMRetValue&& other)
: TVMPODValue_(other.value_, other.type_code_) {
other.type_code_ = kNull;
}
/*! \brief destructor */
~TVMRetValue() {
......@@ -328,6 +337,9 @@ class TVMRetValue : public TVMPODValue_ {
}
// conversion operators
operator std::string() const {
if (type_code_ == kTVMType) {
return TVMType2String(operator TVMType());
}
TVM_CHECK_TYPE_CODE(type_code_, kStr);
return *ptr<std::string>();
}
......@@ -418,6 +430,13 @@ class TVMRetValue : public TVMPODValue_ {
*ret_type_code = type_code_;
type_code_ = kNull;
}
/*! \return The value field, if the data is POD */
const TVMValue& value() const {
CHECK(type_code_ != kNodeHandle &&
type_code_ != kFuncHandle &&
type_code_ != kStr) << "TVMRetValue.value can only be used for POD data";
return value_;
}
// NodeRef related extenstions: in tvm/packed_func_ext.h
inline TVMRetValue& operator=(const NodeRef& other);
inline TVMRetValue& operator=(const std::shared_ptr<Node>& other);
......@@ -488,7 +507,7 @@ inline const char* TypeCode2Str(int type_code) {
case kInt: return "int";
case kFloat: return "float";
case kStr: return "str";
case kHandle: return "Handle";
case kHandle: return "handle";
case kNull: return "NULL";
case kNodeHandle: return "NodeHandle";
case kArrayHandle: return "ArrayHandle";
......@@ -499,6 +518,21 @@ inline const char* TypeCode2Str(int type_code) {
}
}
inline std::ostream& operator<<(std::ostream& os, TVMType t) { // NOLINT(*)
os << TypeCode2Str(t.code)
<< static_cast<int>(t.bits);
if (t.lanes != 1) {
os << 'x' << static_cast<int>(t.lanes);
}
return os;
}
inline std::string TVMType2String(TVMType t) {
std::ostringstream os;
os << t;
return os.str();
}
inline TVMType String2TVMType(std::string s) {
TVMType t;
t.bits = 32; t.lanes = 1;
......
......@@ -13,7 +13,7 @@ from . import collections
from . import schedule
from . import ndarray as nd
from .ndarray import cpu, gpu, opencl, init_opencl
from .ndarray import cpu, gpu, opencl, init_opencl, cl
from ._base import TVMError
from .api import *
......@@ -90,8 +90,8 @@ def _make_tvm_args(args, temp_args):
values[i].v_float64 = arg
type_codes[i] = TypeCode.FLOAT
elif isinstance(arg, TVMType):
values[i].v_type = arg
type_codes[i] = TypeCode.TVM_TYPE
values[i].v_str = c_str(str(arg))
type_codes[i] = TypeCode.STR
elif isinstance(arg, string_types):
values[i].v_str = c_str(arg)
type_codes[i] = TypeCode.STR
......
......@@ -86,8 +86,7 @@ class TVMValue(ctypes.Union):
_fields_ = [("v_int64", ctypes.c_int64),
("v_float64", ctypes.c_double),
("v_handle", ctypes.c_void_p),
("v_str", ctypes.c_char_p),
("v_type", TVMType)]
("v_str", ctypes.c_char_p)]
TVMPackedCFunc = ctypes.CFUNCTYPE(
......@@ -117,7 +116,6 @@ RETURN_SWITCH = {
TypeCode.FLOAT: lambda x: x.v_float64,
TypeCode.HANDLE: _return_handle,
TypeCode.NULL: lambda x: None,
TypeCode.TVM_TYPE: lambda x: x.v_type,
TypeCode.STR: lambda x: py_str(x.v_str)
}
......@@ -127,6 +125,5 @@ C_TO_PY_ARG_SWITCH = {
TypeCode.FLOAT: lambda x: x.v_float64,
TypeCode.HANDLE: _return_handle,
TypeCode.NULL: lambda x: None,
TypeCode.TVM_TYPE: lambda x: x.v_type,
TypeCode.STR: lambda x: py_str(x.v_str)
}
......@@ -16,9 +16,9 @@ from . import make as _make
from . import expr as _expr
from . import collections as _collections
int32 = TVMType("int32")
float32 = TVMType("float32")
handle = TVMType("handle")
int32 = "int32"
float32 = "float32"
handle = "handle"
def const(value, dtype=None):
"""construct a constant"""
......
......@@ -9,6 +9,12 @@ from . import expr as _expr
class Array(NodeBase):
"""Array container of TVM"""
def __getitem__(self, i):
if isinstance(i, slice):
start = i.start if i.start is not None else 0
stop = i.stop if i.stop is not None else len(self)
step = i.step if i.step is not None else 1
return [self[idx] for idx in range(start, stop, step)]
if i >= len(self):
raise IndexError("array index out ot range")
return _api_internal._ArrayGetItem(self, i)
......
......@@ -2,7 +2,7 @@
This is a simplified runtime API for quick testing and proptyping.
"""
# pylint: disable=unused-import
# pylint: disable=unused-import, invalid-name
from __future__ import absolute_import as _abs
import numpy as _np
......@@ -12,6 +12,8 @@ from ._ctypes._ndarray import _init_ndarray_module
from ._ctypes._ndarray import init_opencl
from ._ctypes._function import Function
cl = opencl
class NDArray(NDArrayBase):
"""Lightweight NDArray class of TVM runtime.
......
# Code organization
- c_api C API related functions
- api API functionr registration
- lang The definition of DSL related data structure
- schedule The operations on the schedule graph before converting to IR.
- pass The optimization pass on the IR structure
- runtime Minimum runtime related codes.
- jit JIT runtime related code.
......@@ -8,15 +8,30 @@
#include <tvm/codegen.h>
#include <tvm/api_registry.h>
#include "../codegen/codegen_c.h"
#include "../codegen/codegen_cuda.h"
#include "../codegen/codegen_opencl.h"
namespace tvm {
namespace codegen {
TVM_REGISTER_API(_codegen_CompileToC)
.set_body([](TVMArgs args, TVMRetValue *ret) {
std::string mode = "c";
if (args.size() > 2) {
mode = args[2].operator std::string();
}
if (mode == "c") {
*ret = CodeGenC().Compile(args[0], args[1]);
} else if (mode == "cuda") {
*ret = CodeGenCUDA().Compile(args[0], args[1]);
} else if (mode == "opencl") {
*ret = CodeGenOpenCL().Compile(args[0], args[1]);
} else {
LOG(FATAL) << "cannot recognize mode";
}
});
TVM_REGISTER_API(_codegen_MakeAPI)
.set_body([](TVMArgs args, TVMRetValue *ret) {
*ret = MakeAPI(
......@@ -28,29 +43,20 @@ TVM_REGISTER_API(_codegen_SplitHostDevice)
*ret = SplitHostDevice(args[0]);
});
// generate a dummy packed function for testing
void DummyHelloFunction(TVMArgs args, TVMRetValue* rv) {
LOG(INFO) << args.size() << " arguments";
for (int i = 0; i < args.size(); ++i) {
switch (args.type_codes[i]) {
case kNull: LOG(INFO) << i << ":nullptr"; break;
case kFloat: LOG(INFO) << i << ": double=" << args.values[i].v_float64; break;
case kInt: LOG(INFO) << i << ": long=" << args.values[i].v_int64; break;
case kHandle: LOG(INFO) << i << ": handle=" << args.values[i].v_handle; break;
case kArrayHandle: LOG(INFO) << i << ": array_handle=" << args.values[i].v_handle; break;
default: LOG(FATAL) << "unhandled type " << runtime::TypeCode2Str(args.type_codes[i]);
}
}
}
TVM_REGISTER_API(_codegen_BuildStackVM)
.set_body([](TVMArgs args, TVMRetValue *ret) {
*ret = BuildStackVM(args[0],
std::unordered_map<LoweredFunc, PackedFunc>());
});
TVM_REGISTER_API(_codegen_DummyHelloFunction)
TVM_REGISTER_API(_codegen_BuildNVRTC)
.set_body([](TVMArgs args, TVMRetValue *ret) {
*ret = runtime::PackedFunc(DummyHelloFunction);
*ret = BuildNVRTC(args[0], args[1]);
});
TVM_REGISTER_API(_codegen_BuildStackVM)
TVM_REGISTER_API(_codegen_BuildOpenCL)
.set_body([](TVMArgs args, TVMRetValue *ret) {
*ret = BuildStackVM(args[0]);
*ret = BuildOpenCL(args[0], args[1]);
});
} // namespace codegen
......
......@@ -121,7 +121,8 @@ int TVMNodeGetAttr(NodeHandle handle,
} else {
(*tnode)->VisitAttrs(&getter);
*ret_success = getter.found_node_ref || rv.type_code() != kNull;
if (rv.type_code() == kStr) {
if (rv.type_code() == kStr ||
rv.type_code() == kTVMType) {
TVMAPIThreadLocalEntry *e = TVMAPIThreadLocalStore::Get();
e->ret_str = rv.operator std::string();
*ret_type_code = kStr;
......
/*!
* Copyright (c) 2016 by Contributors
* \file common.h
* \brief Common utilities
*/
#ifndef TVM_BASE_COMMON_H_
#define TVM_BASE_COMMON_H_
#include <tvm/base.h>
#include <tvm/expr.h>
#include <string>
namespace tvm {
inline std::string Type2String(const Type& t) {
if (t.code() ==Type::Handle) return "handle";
std::ostringstream os;
os << t;
return os.str();
}
inline Type String2Type(std::string s) {
std::istringstream is(s);
halide_type_code_t code = Type::Int;
if (s.substr(0, 3) == "int") {
code = Type::Int; s = s.substr(3);
} else if (s.substr(0, 4) == "uint") {
code = Type::UInt; s = s.substr(4);
} else if (s.substr(0, 5) == "float") {
code = Type::Float; s = s.substr(5);
} else if (s.substr(0, 5) == "float") {
code = Type::Float; s = s.substr(5);
} else if (s == "handle") {
return Handle();
} else {
LOG(FATAL) << "unknown type " << s;
}
int bits = 32, lanes = 1;
if (sscanf(s.c_str(), "%dx%d", &bits, &lanes) == 0) {
LOG(FATAL) << "unknown type " << s;
}
return Type(code, bits, lanes);
}
inline const char* TVMTypeCode2Str(int type_code) {
switch (type_code) {
case kInt: return "int";
case kFloat: return "float";
case kStr: return "str";
case kHandle: return "Handle";
case kNull: return "NULL";
case kNodeHandle: return "NodeHandle";
default: LOG(FATAL) << "unknown type_code="
<< static_cast<int>(type_code); return "";
}
}
template<typename T>
struct NodeTypeChecker {
static inline bool Check(Node* sptr) {
// This is the only place in the project where RTTI is used
// It can be turned off, but will make non strict checking.
// TODO(tqchen) possibly find alternative to turn of RTTI
using ContainerType = typename T::ContainerType;
return (dynamic_cast<ContainerType*>(sptr) != nullptr);
}
static inline void PrintName(std::ostringstream& os) { // NOLINT(*)
using ContainerType = typename T::ContainerType;
os << ContainerType::_type_key;
}
};
template<typename T>
struct NodeTypeChecker<Array<T> > {
static inline bool Check(Node* sptr) {
if (sptr == nullptr) return false;
if (!sptr->is_type<ArrayNode>()) return false;
ArrayNode* n = static_cast<ArrayNode*>(sptr);
for (const auto& p : n->data) {
if (!NodeTypeChecker<T>::Check(p.get())) return false;
}
return true;
}
static inline void PrintName(std::ostringstream& os) { // NOLINT(*)
os << "array<";
NodeTypeChecker<T>::PrintName(os);
os << ">";
}
};
template<typename K, typename V>
struct NodeTypeChecker<Map<K, V> > {
static inline bool Check(Node* sptr) {
if (sptr == nullptr) return false;
if (!sptr->is_type<MapNode>()) return false;
MapNode* n = static_cast<MapNode*>(sptr);
for (const auto& kv : n->data) {
if (!NodeTypeChecker<K>::Check(kv.first.get())) return false;
if (!NodeTypeChecker<V>::Check(kv.second.get())) return false;
}
return true;
}
static inline void PrintName(std::ostringstream& os) { // NOLINT(*)
os << "map<";
NodeTypeChecker<K>::PrintName(os);
os << ',';
NodeTypeChecker<V>::PrintName(os);
os << '>';
}
};
template<typename T>
inline std::string NodeTypeName() {
std::ostringstream os;
NodeTypeChecker<T>::PrintName(os);
return os.str();
}
} // namespace tvm
#endif // TVM_BASE_COMMON_H_
......@@ -25,7 +25,15 @@ std::string CodeGenC::Compile(LoweredFunc f,
Var v = f->args[i];
std::string vid = AllocVarID(v.get());
if (i != 0) stream << ", ";
if (v.type().is_handle()) {
stream << arg_addr_space_;
}
if (handle_data_type_.count(v.get())) {
PrintType(handle_data_type_.at(v.get()), stream);
stream << "*";
} else {
PrintType(v.type(), stream);
}
stream << ' ' << vid;
}
stream << ") {\n";
......@@ -510,6 +518,10 @@ TVM_STATIC_IR_FUNCTOR(CodeGenC, vtable_print_stmt)
.set_dispatch<AttrStmt>([](const AttrStmt *op, CodeGenC* p) { p->PrintStmt(op); })
.set_dispatch<AssertStmt>([](const AssertStmt *op, CodeGenC* p) { p->PrintStmt(op); });
void CodeGenC::PrintThreadTagExpr(
std::string thread_tag, std::ostream& os) const { // NOLINT(*)
os << thread_tag;
}
void CodeGenC::PrintStmt(const LetStmt* op) {
std::string value = PrintExpr(op->value);
......@@ -585,7 +597,9 @@ void CodeGenC::PrintStmt(const AttrStmt* op) {
PrintType(iv->var.type(), stream);
stream << ' '
<< AllocVarID(iv->var.get())
<< " = " << iv->thread_tag << ";\n";
<< " = ";
PrintThreadTagExpr(iv->thread_tag, stream);
stream << ";\n";
}
}
}
......
......@@ -7,7 +7,8 @@
#define TVM_CODEGEN_CODEGEN_C_H_
#include <tvm/ir.h>
#include <tvm/module.h>
#include <tvm/codegen.h>
#include <tvm/lowered_func.h>
#include <string>
#include <unordered_map>
......@@ -70,13 +71,20 @@ class CodeGenC {
* \return the variable name.
*/
std::string GetVarID(const Variable* v) const;
// The following parts are overloadable print operations.
/*!
* Print Type represetnation of type t.
* \param t The type representation.
* \return os The stream to print the ctype into
* \param os The stream to print the ctype into
*/
virtual void PrintType(Type t, std::ostream& os) const; // NOLINT(*)
// The following parts are overloadable print operations.
/*!
* \brief Print expr representing the thread tag
* \param thread_tag The tag in the thread.
* \param os The strean to output to
*/
virtual void PrintThreadTagExpr(
std::string thread_tag, std::ostream& os) const; // NOLINT(*)
virtual void PrintStmt(const ir::LetStmt* op);
virtual void PrintStmt(const ir::Store* op);
virtual void PrintStmt(const ir::Allocate* op);
......@@ -101,6 +109,10 @@ class CodeGenC {
/*! \brief the stream to be printed */
std::ostringstream stream;
protected:
// additional string for arg addr_space.
std::string arg_addr_space_;
private:
/*!
* \brief Get the SSA ID corresponds to src
......
/*!
* Copyright (c) 2017 by Contributors
* \file codegen_cuda.cc
*/
#include <tvm/base.h>
#include <tvm/runtime/config.h>
#include <tvm/packed_func_ext.h>
#include <vector>
#include <string>
#include "./codegen_cuda.h"
#include "./codegen_stack_vm.h"
#include "../runtime/cuda/cuda_common.h"
#include "../runtime/cuda/cuda_module.h"
namespace tvm {
namespace codegen {
std::string CodeGenCUDA::Compile(
LoweredFunc f,
bool output_ssa) {
this->stream << "extern \"C\" __global__ ";
return CodeGenC::Compile(f, output_ssa);
}
#if TVM_CUDA_RUNTIME
std::unordered_map<LoweredFunc, PackedFunc>
MakeNVRTC(Array<LoweredFunc> funcs) {
std::ostringstream os;
os << "typedef int int32_t;\n"
<< "typedef unsigned unt32_t;\n";
bool output_ssa = true;
for (LoweredFunc f : funcs) {
os << CodeGenCUDA().Compile(f, output_ssa);
os << '\n';
}
std::string ptx = runtime::NVRTCCompile(os.str());
std::unordered_map<LoweredFunc, PackedFunc> ret;
runtime::CUDAModule m = runtime::CUDAModule::Create(ptx);
for (LoweredFunc f : funcs) {
std::vector<TVMType> arg_types(f->args.size());
std::vector<std::string> thread_axis_tags(f->thread_axis.size());
for (size_t i = 0; i < f->args.size(); ++i) {
arg_types[i] = Type2TVMType(f->args[i].type());
}
for (size_t i = 0; i < f->thread_axis.size(); ++i) {
thread_axis_tags[i] = f->thread_axis[i]->thread_tag;
}
ret[f] = m.GetPackedFunc(f->name, arg_types, thread_axis_tags);
}
return ret;
}
PackedFunc BuildNVRTC(Array<LoweredFunc> fsplits, std::string host_mode) {
Array<LoweredFunc> device_list(fsplits.begin() + 1, fsplits.end());
std::unordered_map<LoweredFunc, PackedFunc> device_funcs = MakeNVRTC(device_list);
if (host_mode == "stackvm") {
StackVM vm = codegen::CodeGenStackVM().Compile(fsplits[0], device_funcs);
auto f = [vm](TVMArgs args, TVMRetValue* rv) {
runtime::AutoSetCUDADevice(args);
vm(args);
};
return PackedFunc(f);
} else {
LOG(FATAL) << "unknown host mode " << host_mode;
return PackedFunc();
}
}
#else
// dummy function when cuda is not available
PackedFunc BuildNVRTC(Array<LoweredFunc> func, std::string host_mode) {
LOG(FATAL) << "CUDA is not enabled";
return PackedFunc();
}
#endif // TVM_CUDA_RUNTIME
} // namespace codegen
} // namespace tvm
/*!
* Copyright (c) 2017 by Contributors
* \file codegen_cuda.h
* \brief Utility to generate cuda code
*/
#ifndef TVM_CODEGEN_CODEGEN_CUDA_H_
#define TVM_CODEGEN_CODEGEN_CUDA_H_
#include <tvm/codegen.h>
#include <tvm/packed_func_ext.h>
#include <string>
#include "./codegen_c.h"
namespace tvm {
namespace codegen {
class CodeGenCUDA : public CodeGenC {
public:
/*!
* \brief Generate the C code of statement
* \param f The function to be compiled
* \param output_ssa Whether output ssa form.
* \note Only call compile once,
* create a new codegen object each time.
*/
std::string Compile(LoweredFunc f,
bool output_ssa);
};
} // namespace codegen
} // namespace tvm
#endif // TVM_CODEGEN_CODEGEN_CUDA_H_
/*!
* Copyright (c) 2017 by Contributors
* \file codegen_cuda.cc
*/
#include <tvm/runtime/config.h>
#include <tvm/packed_func_ext.h>
#include <vector>
#include <string>
#include "./codegen_opencl.h"
#include "./codegen_stack_vm.h"
#include "../runtime/opencl/opencl_common.h"
#include "../runtime/opencl/opencl_module.h"
namespace tvm {
namespace codegen {
std::string CodeGenOpenCL::Compile(
LoweredFunc f,
bool output_ssa) {
this->stream << " __kernel ";
this->arg_addr_space_ = "__global ";
return CodeGenC::Compile(f, output_ssa);
}
void CodeGenOpenCL::PrintThreadTagExpr(
std::string thread_tag, std::ostream& os) const { // NOLINT(*)
if (thread_tag == "threadIdx.x") {
os << "get_local_id(0)";
} else if (thread_tag == "threadIdx.y") {
os << "get_local_id(1)";
} else if (thread_tag == "threadIdx.z") {
os << "get_local_id(2)";
} else if (thread_tag == "blockIdx.x") {
os << "get_global_id(0) / get_local_size(0)";
} else if (thread_tag == "blockIdx.y") {
os << "get_global_id(1) / get_local_size(1)";
} else if (thread_tag == "blockIdx.z") {
os << "get_global_id(2) / get_local_size(2)";
} else {
LOG(FATAL) << "unknown thread tag";
}
}
#if TVM_OPENCL_RUNTIME
std::unordered_map<LoweredFunc, PackedFunc>
MakeOpenCL(Array<LoweredFunc> funcs) {
std::ostringstream os;
os << "typedef int int32_t;\n"
<< "typedef unsigned unt32_t;\n";
bool output_ssa = true;
for (LoweredFunc f : funcs) {
os << CodeGenOpenCL().Compile(f, output_ssa);
os << '\n';
}
std::unordered_map<LoweredFunc, PackedFunc> ret;
runtime::OpenCLModule m =
runtime::OpenCLModule::CreateWithSource(os.str());
for (LoweredFunc f : funcs) {
std::vector<TVMType> arg_types(f->args.size());
std::vector<std::string> thread_axis_tags(f->thread_axis.size());
for (size_t i = 0; i < f->args.size(); ++i) {
arg_types[i] = Type2TVMType(f->args[i].type());
}
for (size_t i = 0; i < f->thread_axis.size(); ++i) {
thread_axis_tags[i] = f->thread_axis[i]->thread_tag;
}
ret[f] = m.GetPackedFunc(f->name, arg_types, thread_axis_tags);
}
return ret;
}
PackedFunc BuildOpenCL(Array<LoweredFunc> fsplits, std::string host_mode) {
Array<LoweredFunc> device_list(fsplits.begin() + 1, fsplits.end());
std::unordered_map<LoweredFunc, PackedFunc> device_funcs = MakeOpenCL(device_list);
if (host_mode == "stackvm") {
StackVM vm = codegen::CodeGenStackVM().Compile(fsplits[0], device_funcs);
auto f = [vm](TVMArgs args, TVMRetValue* rv) {
runtime::AutoSetOpenCLContext(args);
vm(args);
};
return PackedFunc(f);
} else {
LOG(FATAL) << "unknown host mode " << host_mode;
return PackedFunc();
}
}
#else
// dummy function when opencl is not available
PackedFunc BuildOpenCL(Array<LoweredFunc> func, std::string host_mode) {
LOG(FATAL) << "OpenCL is not enabled";
return PackedFunc();
}
#endif // TVM_OPENCL_RUNTIME
} // namespace codegen
} // namespace tvm
/*!
* Copyright (c) 2017 by Contributors
* \file codegen_opencl.h
* \brief Utility to generate opencl code
*/
#ifndef TVM_CODEGEN_CODEGEN_OPENCL_H_
#define TVM_CODEGEN_CODEGEN_OPENCL_H_
#include <tvm/codegen.h>
#include <tvm/packed_func_ext.h>
#include <string>
#include "./codegen_c.h"
namespace tvm {
namespace codegen {
class CodeGenOpenCL : public CodeGenC {
public:
/*!
* \brief Generate the OpenCL code of statement
* \param f The function to be compiled
* \param output_ssa Whether output ssa form.
* \note Only call compile once,
* create a new codegen object each time.
*/
std::string Compile(LoweredFunc f,
bool output_ssa);
// override print thread tag.
void PrintThreadTagExpr(
std::string thread_tag, std::ostream& os) const final; // NOLINT(*)
};
} // namespace codegen
} // namespace tvm
#endif // TVM_CODEGEN_CODEGEN_OPENCL_H_
......@@ -2,6 +2,7 @@
* Copyright (c) 2017 by Contributors
* \file codegen_stack_vm.cc
*/
#include <tvm/packed_func_ext.h>
#include <limits>
#include "./codegen_stack_vm.h"
......@@ -10,55 +11,34 @@ namespace codegen {
using namespace ir;
runtime::PackedFunc BuildStackVM(LoweredFunc func) {
StackVM vm = codegen::CodeGenStackVM().Compile(func);
using runtime::TVMArgs;
using runtime::TVMRetValue;
PackedFunc BuildStackVM(
LoweredFunc func,
const std::unordered_map<LoweredFunc, PackedFunc>& device_funcs) {
StackVM vm = codegen::CodeGenStackVM().Compile(func, device_funcs);
auto f = [vm](TVMArgs args, TVMRetValue* rv) {
StackVM::State* s = StackVM::ThreadLocalState();
s->sp = 0;
s->pc = 0;
if (s->heap.size() < vm.heap_size) {
s->heap.resize(vm.heap_size);
}
s->heap[0].v_handle = (void*)args.values; // NOLINT(*)
s->heap[1].v_handle = (void*)args.type_codes; // NOLINT(*)
s->heap[2].v_int64 = args.num_args;
vm.Run(s);
vm(args);
};
return runtime::PackedFunc(f);
}
TVMValue TVMPrint(const TVMValue* args, int num_args) {
CHECK_EQ(num_args, 2);
int tcode = static_cast<int>(args[1].v_int64);
int code = (tcode >> (8 * 3)) & 255;
int bits = (tcode >> (8 * 2)) & 255;
int lanes = tcode & ((1 << 16) - 1);
Type t((halide_type_code_t)code, bits, lanes);
if (t.is_handle()) {
LOG(INFO) << t << ": " << args[0].v_handle;
} else if (t.is_float()) {
LOG(INFO) << t << ": " << args[0].v_float64;
} else {
LOG(INFO) << t << ": " << args[0].v_int64;
}
TVMValue r; r.v_int64 = 0;
return r;
return PackedFunc(f);
}
CodeGenStackVM::FType& CodeGenStackVM::vtable() { // NOLINT(*)
static FType inst; return inst;
}
StackVM CodeGenStackVM::Compile(LoweredFunc f) {
StackVM CodeGenStackVM::Compile(
LoweredFunc f,
const std::unordered_map<LoweredFunc, PackedFunc>& device_funcs) {
for (size_t i = 0; i < f->args.size(); ++i) {
Var v = f->args[i];
int vid = AllocVarID(v.get());
CHECK_EQ(static_cast<size_t>(vid), i);
}
// setup device function map
for (const auto& kv : device_funcs) {
int fid = static_cast<int>(vm_.packed_func.size());
vm_.packed_func.push_back(kv.second);
device_fun_idmap_[kv.first] = fid;
}
this->Push(f->body);
return std::move(vm_);
}
......@@ -117,33 +97,23 @@ int CodeGenStackVM::AllocVarID(const Variable* v) {
return vid;
}
int CodeGenStackVM::GetGlobalFuncID(std::string name) {
auto it = fun_idmap_.find(name);
if (it != fun_idmap_.end()) return it->second;
using runtime::PackedFunc;
using runtime::TVMArgs;
using runtime::TVMRetValue;
PackedFunc f = PackedFunc::GetGlobal(name);
auto extern_f = [f](const TVMValue* args, int num_args) {
CHECK_EQ(num_args % 2, 0);
num_args = num_args / 2;
std::vector<int> type_codes(std::max(num_args, 1));
for (int i = 0; i < num_args; ++i) {
int tcode = static_cast<int>(args[num_args + i].v_int64);
int code = (tcode >> (8 * 3)) & 255;
type_codes[i] = code;
void CodeGenStackVM::PushCallPacked(
int fid, const std::vector<int>& arg_type_codes) {
StackVM::Code code;
// CALL_PACKED_FUNC
code.op_code = StackVM::CALL_PACKED_FUNC;
vm_.code.push_back(code);
// num_args
code.v_int = static_cast<int>(arg_type_codes.size());
vm_.code.push_back(code);
// fid
code.v_int = fid;
vm_.code.push_back(code);
// type codes.
for (int tcode : arg_type_codes) {
code.v_int = tcode;
vm_.code.push_back(code);
}
TVMRetValue rv;
f.CallPacked(TVMArgs(args, &type_codes[0], num_args), &rv);
TVMValue r; r.v_int64 = 0;
return r;
};
int fid = static_cast<int>(vm_.extern_func.size());
vm_.extern_func.push_back(extern_f);
fun_idmap_[name] = fid;
return fid;
}
int CodeGenStackVM::GetVarID(const Variable* v) const {
......@@ -162,7 +132,7 @@ void CodeGenStackVM::Push_(const ir::Load* op) {
this->PushOp(StackVM::PUSH_I64, op->type.element_of().bytes());
this->PushOp(StackVM::MUL_I64);
this->PushOp(StackVM::ADDR_ADD);
this->PushOp(StackVM::GetLoad(op->type));
this->PushOp(StackVM::GetLoad(Type2TVMType(op->type)));
}
}
void CodeGenStackVM::Push_(const ir::Store* op) {
......@@ -172,7 +142,7 @@ void CodeGenStackVM::Push_(const ir::Store* op) {
this->PushOp(StackVM::MUL_I64);
this->PushOp(StackVM::ADDR_ADD);
this->Push(op->value);
this->PushOp(StackVM::GetStore(op->value.type()));
this->PushOp(StackVM::GetStore(Type2TVMType(op->value.type())));
}
void CodeGenStackVM::Push_(const ir::Allocate* op) {
......@@ -231,22 +201,49 @@ void CodeGenStackVM::Push_(const ir::Call* op) {
for (size_t i = 1; i < op->args.size(); ++i) {
this->Push(op->args[i]);
}
// find the fuction id.
const std::string& func_name = s->value;
auto it = global_fun_idmap_.find(func_name);
int fid;
if (it != global_fun_idmap_.end()) {
fid = it->second;
} else {
fid = static_cast<int>(vm_.packed_func.size());
PackedFunc f = PackedFunc::GetGlobal(func_name);
vm_.packed_func.push_back(f);
global_fun_idmap_[func_name] = fid;
}
// get the argument type code.
std::vector<int> arg_type_codes;
for (size_t i = 1; i < op->args.size(); ++i) {
Type t = op->args[i].type();
int code = t.code();
int bits = t.bits();
int lanes = t.lanes();
int tcode = (code << (8 * 3)) | (bits << 16) | lanes;
this->PushOp(StackVM::PUSH_I64, tcode);
CHECK_EQ(lanes, 1);
arg_type_codes.push_back(code);
}
int num_args = static_cast<int>((op->args.size() - 1) * 2);
this->PushOp(StackVM::PUSH_I64, num_args);
this->PushOp(StackVM::CALL_EXTERN, GetGlobalFuncID(s->value));
this->PushCallPacked(fid, arg_type_codes);
} else if (op->is_intrinsic(intrinsic::tvm_handle_is_null)) {
CHECK_EQ(op->args.size(), 1U);
this->Push(op->args[0]);
this->PushOp(StackVM::PUSH_I64, 0);
this->PushOp(StackVM::EQ_I64);
} else if (op->call_type == Call::Extern && op->func.defined()) {
CHECK(op->func->is_type<LoweredFuncNode>());
LoweredFunc f(op->func.node_);
auto it = device_fun_idmap_.find(f);
CHECK(it != device_fun_idmap_.end())
<< "Cannot find device function " << f->name;
const int fid = it->second;
std::vector<int> arg_type_codes(op->args.size());
for (size_t i = 0; i < op->args.size(); ++i) {
this->Push(op->args[i]);
Type t = op->args[i].type();
int lanes = t.lanes();
CHECK_EQ(lanes, 1);
arg_type_codes[i] = t.code();
}
this->PushCallPacked(fid, arg_type_codes);
} else {
this->HandleUnknownCall(op);
}
......@@ -277,6 +274,8 @@ inline void PushBinary(StackVM::OpCode op_int64,
}
inline void PushCast(Type dst,
Type src,
CodeGenStackVM* p) {
......
......@@ -7,17 +7,18 @@
#define TVM_CODEGEN_CODEGEN_STACK_VM_H_
#include <tvm/ir.h>
#include <tvm/module.h>
#include <tvm/lowered_func.h>
#include <tvm/codegen.h>
#include <string>
#include <vector>
#include <unordered_map>
#include "../jit/stack_vm.h"
#include "../runtime/stack_vm/stack_vm.h"
namespace tvm {
namespace codegen {
using jit::StackVM;
using runtime::StackVM;
/*!
* \brief A base class to generate a stack VM.
......@@ -29,10 +30,13 @@ class CodeGenStackVM {
/*!
* \brief Generate a stack VM representing
* \param f The function to be compiled
* \param device_funcs The extern device functions to be linked.
* \note Only call compile once,
* create a new codegen object each time.
*/
StackVM Compile(LoweredFunc f);
StackVM Compile(
LoweredFunc f,
const std::unordered_map<LoweredFunc, PackedFunc>& device_funcs);
/*! \brief Push stmt to generate new code */
void Push(const Stmt& n);
/*! \brief Push expr to generate new code */
......@@ -50,6 +54,13 @@ class CodeGenStackVM {
*/
int64_t PushOp(StackVM::OpCode opcode, int operand);
/*!
* \brief Push a call packed function.
* \param fid The function id.
* \param arg_type_codes The type codes of arguments.
*/
void PushCallPacked(int fid,
const std::vector<int>& arg_type_codes);
/*!
* \brief Set the relative jump offset to be offset.
* \param operand_index The indexed returned by PushOp.
* \param operand The operand to be set.
......@@ -66,11 +77,6 @@ class CodeGenStackVM {
*/
int GetStrID(const std::string& key);
/*!
* \brief Push the function to the VM and get a id.
* \param f The function to be pushed.
*/
int GetGlobalFuncID(std::string name);
/*!
* \brief Allocate a variable name for a newly defined var.
* \param v The variable.
* \return the heap index of the var.
......@@ -101,8 +107,10 @@ class CodeGenStackVM {
std::unordered_map<const Variable*, int> var_idmap_;
/*! \brief id of each string */
std::unordered_map<std::string, int> str_idmap_;
/*! \brief id of each function */
std::unordered_map<std::string, int> fun_idmap_;
/*! \brief id of each global function */
std::unordered_map<std::string, int> global_fun_idmap_;
/*! \brief id of device function */
std::unordered_map<LoweredFunc, int> device_fun_idmap_;
};
} // namespace codegen
......
......@@ -5,7 +5,7 @@
*/
#include <tvm/codegen.h>
#include <tvm/ir.h>
#include <tvm/module.h>
#include <tvm/lowered_func.h>
#include <tvm/ir_pass.h>
#include <tvm/ir_mutator.h>
#include <unordered_map>
......@@ -169,6 +169,7 @@ class HostDeviceSplitter : public IRMutator {
n->body = m.Mutate(body);
n->name = os.str();
n->args = m.undefined_;
n->thread_axis = m.thread_axis_;
CHECK_NE(m.thread_extent_.size(), 0U);
// improve the handle data type
......
......@@ -4,13 +4,44 @@
* \brief Utilities to save/load TVM objects.
*/
#include <tvm/base.h>
#include <tvm/expr.h>
#include <tvm/container.h>
#include <dmlc/json.h>
#include <string>
#include "./common.h"
namespace tvm {
inline std::string Type2String(const Type& t) {
if (t.code() ==Type::Handle) return "handle";
std::ostringstream os;
os << t;
return os.str();
}
inline Type String2Type(std::string s) {
std::istringstream is(s);
halide_type_code_t code = Type::Int;
if (s.substr(0, 3) == "int") {
code = Type::Int; s = s.substr(3);
} else if (s.substr(0, 4) == "uint") {
code = Type::UInt; s = s.substr(4);
} else if (s.substr(0, 5) == "float") {
code = Type::Float; s = s.substr(5);
} else if (s.substr(0, 5) == "float") {
code = Type::Float; s = s.substr(5);
} else if (s == "handle") {
return Handle();
} else {
LOG(FATAL) << "unknown type " << s;
}
int bits = 32, lanes = 1;
if (sscanf(s.c_str(), "%dx%d", &bits, &lanes) == 0) {
LOG(FATAL) << "unknown type " << s;
}
return Type(code, bits, lanes);
}
// indexer to index all the ndoes
class NodeIndexer : public AttrVisitor {
public:
......
......@@ -193,7 +193,8 @@ int TVMFuncCall(TVMFunctionHandle func,
(*static_cast<const PackedFunc*>(func)).CallPacked(
TVMArgs(args, arg_type_codes, num_args), &rv);
// handle return string.
if (rv.type_code() == kStr) {
if (rv.type_code() == kStr ||
rv.type_code() == kTVMType) {
TVMRuntimeEntry* e = TVMAPIRuntimeStore::Get();
e->ret_str = rv.operator std::string();
*ret_type_code = kStr;
......
/*!
* Copyright (c) 2017 by Contributors
* \file cuda_common.h
* \brief Common utilities for CUDA
*/
#ifndef TVM_RUNTIME_CUDA_CUDA_COMMON_H_
#define TVM_RUNTIME_CUDA_CUDA_COMMON_H_
#include <tvm/runtime/config.h>
#include <tvm/runtime/packed_func.h>
#include <string>
#if TVM_CUDA_RUNTIME
#include <cuda_runtime.h>
namespace tvm {
namespace runtime {
#define CUDA_DRIVER_CALL(x) \
{ \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
LOG(FATAL) \
<< "CUDAError: " #x " failed with error: " << msg; \
} \
}
#define CUDA_CALL(func) \
{ \
cudaError_t e = (func); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA: " << cudaGetErrorString(e); \
}
/*!
* \brief Compile code into ptx using NVRTC
* \param code The cuda code.
* \return The PTX code.
*/
std::string NVRTCCompile(const std::string& code);
/*!
* \brief Automatically detect and set cuda device.
* \param args The arguments.
*/
inline void AutoSetCUDADevice(const TVMArgs& args) {
int dev_id = -1;
for (int i = 0; i < args.size(); ++i) {
if (args.type_codes[i] == kArrayHandle) {
TVMContext ctx = static_cast<TVMArray*>(
args.values[i].v_handle)->ctx;
CHECK_EQ(ctx.dev_mask, kGPU)
<< "All operands need to be GPU";
if (dev_id == -1) {
dev_id = ctx.dev_id;
} else {
CHECK_EQ(dev_id, ctx.dev_id)
<< "Operands comes from different devices ";
}
}
}
CUDA_CALL(cudaSetDevice(dev_id));
}
} // namespace runtime
} // namespace tvm
#endif // TVM_CUDA_RUNTIME
#endif // TVM_RUNTIME_CUDA_CUDA_COMMON_H_
/*!
* Copyright (c) 2017 by Contributors
* \file cuda_module.cc
*/
#include "./cuda_module.h"
#if TVM_CUDA_RUNTIME
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <array>
#include <string>
#include <mutex>
#include "./cuda_common.h"
#include "../void_addr_args.h"
#include "../thread_axis_args.h"
namespace tvm {
namespace runtime {
/*!
* \brief Internal data structure to support multi-gpu execution.
* Try to use CUDA runtime's primary context.
*/
class CUDAModule::Internal {
public:
explicit Internal(std::string data)
: data_(data) {
std::fill(module_.begin(), module_.end(), nullptr);
}
// get a CUfunction from primary context in dev_id
CUfunction GetFunc(int dev_id, const std::string& func_name) {
std::lock_guard<std::mutex> lock(mutex_);
// must recheck under the lock scope
if (module_[dev_id] == nullptr) {
CUDA_DRIVER_CALL(cuModuleLoadData(&(module_[dev_id]), data_.c_str()));
}
CUfunction func;
CUresult result = cuModuleGetFunction(&func, module_[dev_id], func_name.c_str());
if (result != CUDA_SUCCESS) {
const char *msg;
cuGetErrorName(result, &msg);
LOG(FATAL)
<< "CUDAError: cuModuleGetFunction " << func_name
<< " failed with error: " << msg;
}
return func;
}
// destructor
~Internal() {
for (size_t i = 0; i < module_.size(); ++i) {
if (module_[i] != nullptr) {
CUDA_CALL(cudaSetDevice(i));
CUDA_DRIVER_CALL(cuModuleUnload(module_[i]));
}
}
}
private:
// the binary data
std::string data_;
// the internal modules per GPU, to be lazily initialized.
std::array<CUmodule, CUDAModule::kMaxNumGPUs> module_;
// internal mutex when updating the module
std::mutex mutex_;
};
// a wrapped function class to get packed fucn.
class CUDAWrappedFunc {
public:
// initialize the CUDA function.
void Init(std::shared_ptr<CUDAModule::Internal> m,
const std::string& func_name,
size_t num_void_args,
const std::vector<std::string>& thread_axis_tags) {
m_ = m;
func_name_ = func_name;
std::fill(fcache_.begin(), fcache_.end(), nullptr);
thread_axis_cfg_.Init(num_void_args, thread_axis_tags);
}
// invoke the function with void arguments
void operator()(TVMArgs args,
TVMRetValue* rv,
void** void_args) const {
int dev_id;
CUDA_CALL(cudaGetDevice(&dev_id));
if (fcache_[dev_id] == nullptr) {
fcache_[dev_id] = m_->GetFunc(dev_id, func_name_);
}
ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
CUDA_DRIVER_CALL(cuLaunchKernel(
fcache_[dev_id],
wl.grid_dim(0),
wl.grid_dim(1),
wl.grid_dim(2),
wl.block_dim(0),
wl.block_dim(1),
wl.block_dim(2),
0, nullptr, void_args, 0));
}
private:
// internal module
std::shared_ptr<CUDAModule::Internal> m_;
// The name of the function.
std::string func_name_;
// Device function cache per device.
// mark as mutable, to enable lazy initialization
mutable std::array<CUfunction, CUDAModule::kMaxNumGPUs> fcache_;
// thread axis configuration
ThreadAxisConfig thread_axis_cfg_;
};
PackedFunc CUDAModule::GetPackedFunc(
const std::string& func_name,
const std::vector<TVMType> arg_types,
const std::vector<std::string> thread_axis_tags) const {
CUDAWrappedFunc f;
f.Init(ptr_, func_name, arg_types.size(), thread_axis_tags);
return PackFromVoidAddrArgs(f, arg_types);
}
CUDAModule CUDAModule::Create(std::string ptx) {
// call a runtime API to make sure the context is created.
CUDAModule m;
m.ptr_ = std::make_shared<Internal>(ptx);
return m;
}
} // namespace runtime
} // namespace tvm
#endif // TVM_CUDA_RUNTIME
/*!
* Copyright (c) 2017 by Contributors
* \file cuda_module.h
* \brief Execution handling of CUDA kernels
*/
#ifndef TVM_RUNTIME_CUDA_CUDA_MODULE_H_
#define TVM_RUNTIME_CUDA_CUDA_MODULE_H_
#include <tvm/runtime/config.h>
#include <tvm/runtime/packed_func.h>
#include <memory>
#include <vector>
#include <string>
namespace tvm {
namespace runtime {
/*!
* \brief Handle execution of CUDA kernels as PackedFunc.
* It wraps around driver API to work with CUDA runtime API.
*/
class CUDAModule {
public:
/*!
* \brief Get CUDA Kernel launch wrapped as PackedFunc
* \param func_name The name of the function.
* \param arg_types The type of each argument in the function.
* \param thread_axis_tags The tag sequence of the thread axis.
*/
PackedFunc GetPackedFunc(
const std::string& func_name,
const std::vector<TVMType> arg_types,
const std::vector<std::string> thread_axis_tags) const;
/*!
* \brief create a cuda module from data.
* \param data The module data.
*/
static CUDAModule Create(std::string data);
/*! \brief hidden internal data structure. */
class Internal;
/*! \brief Maximum number of GPU supported in CUDAModule */
static constexpr const int kMaxNumGPUs = 32;
private:
std::shared_ptr<Internal> ptr_;
};
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_CUDA_CUDA_MODULE_H_
/*!
* Copyright (c) 2016 by Contributors
* \file device_api_gpu.h
* Copyright (c) 2017 by Contributors
* \file device_api_cuda.h
* \brief GPU specific API
*/
#ifndef TVM_RUNTIME_DEVICE_API_GPU_H_
#define TVM_RUNTIME_DEVICE_API_GPU_H_
#ifndef TVM_RUNTIME_CUDA_DEVICE_API_CUDA_H_
#define TVM_RUNTIME_CUDA_DEVICE_API_CUDA_H_
#include <dmlc/logging.h>
#include "./device_api.h"
#include "./cuda_common.h"
#if TVM_CUDA_RUNTIME
#include <dmlc/logging.h>
#include <cuda_runtime.h>
namespace tvm {
namespace runtime {
/*!
* \brief Protected CUDA call.
* \param func Expression to call.
*
* It checks for CUDA errors after invocation of the expression.
*/
#define CUDA_CALL(func) \
{ \
cudaError_t e = (func); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA: " << cudaGetErrorString(e); \
}
template<>
inline void* AllocDataSpace<kGPU>(TVMContext ctx, size_t size, size_t alignment) {
CUDA_CALL(cudaSetDevice(ctx.dev_id));
......@@ -94,4 +82,4 @@ inline void StreamSync<kGPU>(TVMContext ctx, TVMStreamHandle stream) {
} // namespace runtime
} // namespace tvm
#endif // TVM_CUDA_RUNTIME
#endif // TVM_RUNTIME_DEVICE_API_GPU_H_
#endif // TVM_RUNTIME_CUDA_DEVICE_API_CUDA_H_
/*!
* Copyright (c) 2017 by Contributors
* \file nvrtc.cc
*/
#include "./cuda_common.h"
#if TVM_CUDA_RUNTIME
#include <nvrtc.h>
namespace tvm {
namespace runtime {
#define NVRTC_CALL(x) \
{ \
nvrtcResult result = x; \
if (result != NVRTC_SUCCESS) { \
LOG(FATAL) \
<< "NvrtcError: " #x " failed with error: " \
<< nvrtcGetErrorString(result); \
} \
}
std::string NVRTCCompile(const std::string& code) {
nvrtcProgram prog;
NVRTC_CALL(nvrtcCreateProgram(
&prog, code.c_str(), nullptr, 0, nullptr, nullptr));
nvrtcResult compile_res = nvrtcCompileProgram(prog, 0, nullptr);
size_t log_size;
NVRTC_CALL(nvrtcGetProgramLogSize(prog, &log_size));
std::string log; log.resize(log_size);
NVRTC_CALL(nvrtcGetProgramLog(prog, &log[0]));
CHECK_EQ(compile_res, NVRTC_SUCCESS) << log;
size_t ptx_size;
NVRTC_CALL(nvrtcGetPTXSize(prog, &ptx_size));
std::string ptx;
ptx.resize(ptx_size);
NVRTC_CALL(nvrtcGetPTX(prog, &ptx[0]));
NVRTC_CALL(nvrtcDestroyProgram(&prog));
return ptx;
}
} // namespace runtime
} // namespace tvm
#endif // TVM_CUDA_RUNTIME
......@@ -109,7 +109,7 @@ inline void StreamSync(TVMContext ctx, TVMStreamHandle stream);
} // namespace tvm
#include "./device_api_cpu.h"
#include "./device_api_gpu.h"
#include "./device_api_opencl.h"
#include "./cuda/device_api_cuda.h"
#include "./opencl/device_api_opencl.h"
#endif // TVM_RUNTIME_DEVICE_API_H_
/*!
* Copyright (c) 2017 by Contributors
* \file device_api_opencl.h
* \brief OpenCL specific API
*/
#ifndef TVM_RUNTIME_OPENCL_DEVICE_API_OPENCL_H_
#define TVM_RUNTIME_OPENCL_DEVICE_API_OPENCL_H_
#include <tvm/runtime/config.h>
#if TVM_OPENCL_RUNTIME
#include <string>
#include <vector>
#include "./opencl_common.h"
namespace tvm {
namespace runtime {
namespace cl {
inline std::string GetPlatformInfo(
cl_platform_id pid, cl_platform_info param_name) {
size_t ret_size;
OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &ret_size));
std::string ret;
ret.resize(ret_size);
OPENCL_CALL(clGetPlatformInfo(pid, param_name, ret_size, &ret[0], nullptr));
return ret;
}
inline std::string GetDeviceInfo(
cl_device_id pid, cl_device_info param_name) {
size_t ret_size;
OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size));
std::string ret;
ret.resize(ret_size);
OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, &ret[0], nullptr));
return ret;
}
inline std::vector<cl_platform_id> GetPlatformIDs() {
cl_uint ret_size;
OPENCL_CALL(clGetPlatformIDs(0, nullptr, &ret_size));
std::vector<cl_platform_id> ret;
ret.resize(ret_size);
OPENCL_CALL(clGetPlatformIDs(ret_size, &ret[0], nullptr));
return ret;
}
inline std::vector<cl_device_id> GetDeviceIDs(
cl_platform_id pid, std::string device_type) {
cl_device_type dtype = CL_DEVICE_TYPE_ALL;
if (device_type == "cpu") dtype = CL_DEVICE_TYPE_CPU;
if (device_type == "gpu") dtype = CL_DEVICE_TYPE_CPU;
if (device_type == "accelerator") dtype = CL_DEVICE_TYPE_ACCELERATOR;
cl_uint ret_size;
OPENCL_CALL(clGetDeviceIDs(pid, dtype, 0, nullptr, &ret_size));
std::vector<cl_device_id> ret;
ret.resize(ret_size);
OPENCL_CALL(clGetDeviceIDs(pid, dtype, ret_size, &ret[0], nullptr));
return ret;
}
inline bool MatchPlatformInfo(
cl_platform_id pid,
cl_platform_info param_name,
std::string value) {
if (value.length() == 0) return true;
std::string param_value = GetPlatformInfo(pid, param_name);
return param_value.find(value) != std::string::npos;
}
} // namespace cl
template<>
inline bool DeviceInit<kOpenCL>(const char** option_keys,
const char** option_vals,
int num_options) {
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
std::lock_guard<std::mutex>(w->mu);
if (w->initialized()) return false;
// matching conditions
std::string platform_name, device_type;
for (int i = 0; i < num_options; ++i) {
std::string key = option_keys[i];
std::string val = option_vals[i];
if (key == "platform_name") {
platform_name = val;
} else if (key == "device_type") {
device_type = val;
} else {
LOG(FATAL) << "unknown DeviceInit option " << key;
}
}
// matched platforms
std::vector<cl_platform_id> platform_matched;
for (cl_platform_id pid : cl::GetPlatformIDs()) {
bool matched = true;
if (!cl::MatchPlatformInfo(pid, CL_PLATFORM_NAME, platform_name)) matched = false;
if (matched) platform_matched.push_back(pid);
}
if (platform_matched.size() == 0) {
LOG(FATAL) << "No OpenCL platform matched given existing options ...";
}
if (platform_matched.size() > 1) {
LOG(WARNING) << "Multiple OpenCL platforms matched, use the first one ... ";
}
w->platform_id = platform_matched[0];
LOG(INFO) << "Initialize OpenCL platform \'"
<< cl::GetPlatformInfo(w->platform_id, CL_PLATFORM_NAME) << '\'';
std::vector<cl_device_id> devices_matched =
cl::GetDeviceIDs(w->platform_id, device_type);
CHECK_GT(devices_matched.size(), 0U)
<< "No OpenCL device any device matched given the options";
w->devices = devices_matched;
cl_int err_code;
w->context = clCreateContext(
nullptr, w->devices.size(), &(w->devices[0]),
nullptr, nullptr, &err_code);
OPENCL_CHECK_ERROR(err_code);
CHECK_EQ(w->queues.size(), 0U);
for (size_t i = 0; i < w->devices.size(); ++i) {
cl_device_id did = w->devices[i];
w->queues.push_back(
clCreateCommandQueue(w->context, did, 0, &err_code));
OPENCL_CHECK_ERROR(err_code);
LOG(INFO) << "opencl(" << i
<< ")=\'" << cl::GetDeviceInfo(did, CL_DEVICE_NAME)
<< "\' cl_device_id=" << did;
}
return true;
}
template<>
inline void* AllocDataSpace<kOpenCL>(TVMContext ctx, size_t size, size_t alignment) {
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
cl_int err_code;
cl_mem mptr = clCreateBuffer(
w->context, CL_MEM_READ_WRITE, size, nullptr, &err_code);
OPENCL_CHECK_ERROR(err_code);
return mptr;
}
template<>
inline void FreeDataSpace<kOpenCL>(TVMContext ctx, void* ptr) {
cl_mem mptr = static_cast<cl_mem>(ptr);
OPENCL_CALL(clReleaseMemObject(mptr));
}
template<>
inline void CopyDataFromTo<kOpenCL>(const void* from,
void* to,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream) {
CHECK(stream == nullptr);
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
if (ctx_from.dev_mask == kOpenCL && ctx_to.dev_mask == kOpenCL) {
OPENCL_CALL(clEnqueueCopyBuffer(
w->GetQueue(ctx_to),
static_cast<cl_mem>((void*)from), // NOLINT(*)
static_cast<cl_mem>(to),
0, 0, size, 0, nullptr, nullptr));
} else if (ctx_from.dev_mask == kOpenCL && ctx_to.dev_mask == kCPU) {
OPENCL_CALL(clEnqueueReadBuffer(
w->GetQueue(ctx_from),
static_cast<cl_mem>((void*)from), // NOLINT(*)
CL_FALSE, 0, size, to,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(w->GetQueue(ctx_from)));
} else if (ctx_from.dev_mask == kCPU && ctx_to.dev_mask == kOpenCL) {
OPENCL_CALL(clEnqueueWriteBuffer(
w->GetQueue(ctx_to),
static_cast<cl_mem>(to),
CL_FALSE, 0, size, from,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(w->GetQueue(ctx_to)));
} else {
LOG(FATAL) << "Expect copy from/to GPU or between GPU";
}
}
template<>
inline void StreamSync<kOpenCL>(TVMContext ctx, TVMStreamHandle stream) {
CHECK(stream == nullptr);
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
OPENCL_CALL(clFinish(w->GetQueue(ctx)));
}
} // namespace runtime
} // namespace tvm
#endif // TVM_OPENCL_RUNTIME
#endif // TVM_RUNTIME_OPENCL_DEVICE_API_OPENCL_H_
/*!
* Copyright (c) 2016 by Contributors
* \file device_api_opencl.h
* \brief OpenCL specific API
* Copyright (c) 2017 by Contributors
* \file opencl_common.h
* \brief OpenCL common header
*/
#ifndef TVM_RUNTIME_DEVICE_API_OPENCL_H_
#define TVM_RUNTIME_DEVICE_API_OPENCL_H_
#ifndef TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
#define TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
#include <tvm/runtime/config.h>
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/packed_func.h>
#include <dmlc/logging.h>
#if TVM_OPENCL_RUNTIME
......@@ -18,7 +23,6 @@
#include <string>
#include <vector>
namespace tvm {
namespace runtime {
namespace cl {
......@@ -94,7 +98,9 @@ inline const char* CLGetErrorString(cl_int error) {
OPENCL_CHECK_ERROR(e); \
}
// Process local opencl workspace
/*!
* \brief Process global OpenCL workspace.
*/
class OpenCLWorkspace {
public:
// global platform id
......@@ -105,6 +111,9 @@ class OpenCLWorkspace {
std::vector<cl_device_id> devices;
// the queues
std::vector<cl_command_queue> queues;
// Number of registered kernels
// Used to register kernel into the workspace.
size_t num_registered_kernels{0};
// the mutex for initialization
std::mutex mu;
// destructor
......@@ -127,184 +136,49 @@ class OpenCLWorkspace {
return queues[ctx.dev_id];
}
// get the global workspace
static OpenCLWorkspace* Global() {
static OpenCLWorkspace inst;
return &inst;
}
static OpenCLWorkspace* Global();
};
inline std::string GetPlatformInfo(
cl_platform_id pid, cl_platform_info param_name) {
size_t ret_size;
OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &ret_size));
std::string ret;
ret.resize(ret_size);
OPENCL_CALL(clGetPlatformInfo(pid, param_name, ret_size, &ret[0], nullptr));
return ret;
}
inline std::string GetDeviceInfo(
cl_device_id pid, cl_device_info param_name) {
size_t ret_size;
OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size));
std::string ret;
ret.resize(ret_size);
OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, &ret[0], nullptr));
return ret;
}
inline std::vector<cl_platform_id> GetPlatformIDs() {
cl_uint ret_size;
OPENCL_CALL(clGetPlatformIDs(0, nullptr, &ret_size));
std::vector<cl_platform_id> ret;
ret.resize(ret_size);
OPENCL_CALL(clGetPlatformIDs(ret_size, &ret[0], nullptr));
return ret;
}
inline std::vector<cl_device_id> GetDeviceIDs(
cl_platform_id pid, std::string device_type) {
cl_device_type dtype = CL_DEVICE_TYPE_ALL;
if (device_type == "cpu") dtype = CL_DEVICE_TYPE_CPU;
if (device_type == "gpu") dtype = CL_DEVICE_TYPE_CPU;
if (device_type == "accelerator") dtype = CL_DEVICE_TYPE_ACCELERATOR;
cl_uint ret_size;
OPENCL_CALL(clGetDeviceIDs(pid, dtype, 0, nullptr, &ret_size));
std::vector<cl_device_id> ret;
ret.resize(ret_size);
OPENCL_CALL(clGetDeviceIDs(pid, dtype, ret_size, &ret[0], nullptr));
return ret;
}
inline bool MatchPlatformInfo(
cl_platform_id pid,
cl_platform_info param_name,
std::string value) {
if (value.length() == 0) return true;
std::string param_value = GetPlatformInfo(pid, param_name);
return param_value.find(value) != std::string::npos;
}
/*! \brief Thread local workspace */
class OpenCLThreadEntry {
public:
/*! \brief The current context */
TVMContext context;
/*! \brief The thread-local kernel table */
std::vector<cl_kernel> kernel_table;
OpenCLThreadEntry() {
context.dev_id = 0;
context.dev_mask = kOpenCL;
}
// get the global workspace
static OpenCLThreadEntry* ThreadLocal();
};
} // namespace cl
template<>
inline bool DeviceInit<kOpenCL>(const char** option_keys,
const char** option_vals,
int num_options) {
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
std::lock_guard<std::mutex>(w->mu);
if (w->initialized()) return false;
// matching conditions
std::string platform_name, device_type;
for (int i = 0; i < num_options; ++i) {
std::string key = option_keys[i];
std::string val = option_vals[i];
if (key == "platform_name") {
platform_name = val;
} else if (key == "device_type") {
device_type = val;
/*!
* \brief Automatically detect and set cuda device.
* \param args The arguments.
*/
inline void AutoSetOpenCLContext(const TVMArgs& args) {
// TODO(tqchen): merge this with CUDA logic.
int dev_id = -1;
for (int i = 0; i < args.size(); ++i) {
if (args.type_codes[i] == kArrayHandle) {
TVMContext ctx = static_cast<TVMArray*>(
args.values[i].v_handle)->ctx;
CHECK_EQ(ctx.dev_mask, kOpenCL)
<< "All operands need to be GPU";
if (dev_id == -1) {
dev_id = ctx.dev_id;
} else {
LOG(FATAL) << "unknown DeviceInit option " << key;
}
}
// matched platforms
std::vector<cl_platform_id> platform_matched;
for (cl_platform_id pid : cl::GetPlatformIDs()) {
bool matched = true;
if (!cl::MatchPlatformInfo(pid, CL_PLATFORM_NAME, platform_name)) matched = false;
if (matched) platform_matched.push_back(pid);
CHECK_EQ(dev_id, ctx.dev_id)
<< "Operands comes from different devices ";
}
if (platform_matched.size() == 0) {
LOG(FATAL) << "No OpenCL platform matched given existing options ...";
}
if (platform_matched.size() > 1) {
LOG(WARNING) << "Multiple OpenCL platforms matched, use the first one ... ";
}
w->platform_id = platform_matched[0];
LOG(INFO) << "Initialize OpenCL platform \'"
<< cl::GetPlatformInfo(w->platform_id, CL_PLATFORM_NAME) << '\'';
std::vector<cl_device_id> devices_matched =
cl::GetDeviceIDs(w->platform_id, device_type);
CHECK_GT(devices_matched.size(), 0U)
<< "No OpenCL device any device matched given the options";
w->devices = devices_matched;
cl_int err_code;
w->context = clCreateContext(
nullptr, w->devices.size(), &(w->devices[0]),
nullptr, nullptr, &err_code);
OPENCL_CHECK_ERROR(err_code);
CHECK_EQ(w->queues.size(), 0U);
for (size_t i = 0; i < w->devices.size(); ++i) {
cl_device_id did = w->devices[i];
w->queues.push_back(
clCreateCommandQueue(w->context, did, 0, &err_code));
OPENCL_CHECK_ERROR(err_code);
LOG(INFO) << "opencl(" << i
<< ")=\'" << cl::GetDeviceInfo(did, CL_DEVICE_NAME)
<< "\' cl_device_id=" << did;
}
return true;
}
template<>
inline void* AllocDataSpace<kOpenCL>(TVMContext ctx, size_t size, size_t alignment) {
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
cl_int err_code;
cl_mem mptr = clCreateBuffer(
w->context, CL_MEM_READ_WRITE, size, nullptr, &err_code);
OPENCL_CHECK_ERROR(err_code);
return mptr;
}
template<>
inline void FreeDataSpace<kOpenCL>(TVMContext ctx, void* ptr) {
cl_mem mptr = static_cast<cl_mem>(ptr);
OPENCL_CALL(clReleaseMemObject(mptr));
}
template<>
inline void CopyDataFromTo<kOpenCL>(const void* from,
void* to,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream) {
CHECK(stream == nullptr);
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
if (ctx_from.dev_mask == kOpenCL && ctx_to.dev_mask == kOpenCL) {
OPENCL_CALL(clEnqueueCopyBuffer(
w->GetQueue(ctx_to),
static_cast<cl_mem>((void*)from), // NOLINT(*)
static_cast<cl_mem>(to),
0, 0, size, 0, nullptr, nullptr));
} else if (ctx_from.dev_mask == kOpenCL && ctx_to.dev_mask == kCPU) {
OPENCL_CALL(clEnqueueReadBuffer(
w->GetQueue(ctx_from),
static_cast<cl_mem>((void*)from), // NOLINT(*)
CL_FALSE, 0, size, to,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(w->GetQueue(ctx_from)));
} else if (ctx_from.dev_mask == kCPU && ctx_to.dev_mask == kOpenCL) {
OPENCL_CALL(clEnqueueWriteBuffer(
w->GetQueue(ctx_to),
static_cast<cl_mem>(to),
CL_FALSE, 0, size, from,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(w->GetQueue(ctx_to)));
} else {
LOG(FATAL) << "Expect copy from/to GPU or between GPU";
}
cl::OpenCLThreadEntry::ThreadLocal()->context.dev_id = dev_id;
}
template<>
inline void StreamSync<kOpenCL>(TVMContext ctx, TVMStreamHandle stream) {
CHECK(stream == nullptr);
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
OPENCL_CALL(clFinish(w->GetQueue(ctx)));
}
} // namespace runtime
} // namespace tvm
#endif // TVM_OPENCL_RUNTIME
#endif // TVM_RUNTIME_DEVICE_API_OPENCL_H_
#endif // TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
/*!
* Copyright (c) 2017 by Contributors
* \file opencl_module.cc
*/
#include "./opencl_common.h"
#include "./opencl_module.h"
#if TVM_OPENCL_RUNTIME
#include <vector>
#include <string>
#include <unordered_map>
#include "../void_addr_args.h"
#include "../thread_axis_args.h"
namespace tvm {
namespace runtime {
using namespace detail;
/*!
* \brief Internal data structure to support multi-gpu execution.
* Try to use OpenCL runtime's primary context.
*/
class OpenCLModule::Internal {
public:
// the binary data
cl_program program;
// kernel id cache
std::unordered_map<std::string, size_t> kid_map;
explicit Internal(cl_program program)
: program(program) {
}
// destructor
~Internal() {
OPENCL_CALL(clReleaseProgram(program));
}
// get kernel id given key(function name.
size_t GetKernelID(const std::string& key) {
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
std::lock_guard<std::mutex> lock(w->mu);
if (kid_map.count(key)) return kid_map.at(key);
size_t kid = w->num_registered_kernels++;
kid_map[key] = kid;
return kid;
}
};
class OpenCLWrappedFunc {
public:
// initialize the CUDA function.
void Init(std::shared_ptr<OpenCLModule::Internal> m,
size_t kernel_id,
std::string func_name,
std::vector<size_t> arg_size,
const std::vector<std::string>& thread_axis_tags) {
m_ = m;
kernel_id_ = kernel_id;
func_name_ = func_name;
arg_size_ = arg_size;
thread_axis_cfg_.Init(arg_size.size(), thread_axis_tags);
}
// invoke the function with void arguments
void operator()(TVMArgs args,
TVMRetValue* rv,
void** void_args) const {
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
cl::OpenCLThreadEntry* t = cl::OpenCLThreadEntry::ThreadLocal();
CHECK(w->initialized());
// get the kernel from thread local kernel table.
if (kernel_id_ >= t->kernel_table.size()) {
t->kernel_table.resize(kernel_id_ + 1, nullptr);
}
cl_kernel kernel = t->kernel_table[kernel_id_];
if (kernel == nullptr) {
cl_int err;
kernel = clCreateKernel(m_->program, func_name_.c_str(), &err);
OPENCL_CHECK_ERROR(err);
t->kernel_table[kernel_id_] = kernel;
}
// setup arguments.
for (cl_uint i = 0; i < arg_size_.size(); ++i) {
OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], void_args[i]));
}
cl_command_queue queue = w->GetQueue(t->context);
ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
cl_uint work_dim = static_cast<cl_uint>(thread_axis_cfg_.work_dim());
for (cl_uint i = 0; i < work_dim; ++i) {
wl.work_size[i + 3] *= wl.work_size[i];
}
// launch kernel
OPENCL_CALL(clEnqueueNDRangeKernel(
queue, kernel, work_dim, nullptr,
wl.work_size + 3,
wl.work_size,
0, nullptr, nullptr));
}
private:
// modulex
std::shared_ptr<OpenCLModule::Internal> m_;
// global kernel id in the kernel table.
size_t kernel_id_;
// The name of the function.
std::string func_name_;
// convert code for void argument
std::vector<size_t> arg_size_;
// thread axis config
ThreadAxisConfig thread_axis_cfg_;
};
PackedFunc OpenCLModule::GetPackedFunc(
const std::string& func_name,
const std::vector<TVMType> arg_types,
const std::vector<std::string> thread_axis_tags) const {
OpenCLWrappedFunc f;
// get the kernel id.
size_t kid = ptr_->GetKernelID(func_name);
std::vector<size_t> arg_size(arg_types.size());
for (size_t i = 0; i < arg_types.size(); ++i) {
TVMType t = arg_types[i];
CHECK_EQ(t.lanes, 1U);
uint32_t bits = t.bits;
CHECK_EQ(bits % 8, 0U);
arg_size[i] = bits / 8;
}
// initialize the wrapped func.
f.Init(ptr_, kid, func_name, arg_size, thread_axis_tags);
return PackFromVoidAddrArgs(f, arg_types);
}
OpenCLModule OpenCLModule::CreateWithSource(std::string source) {
cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
CHECK(w->initialized());
const char* s = source.c_str();
size_t len = source.length();
cl_int err;
cl_program prog = clCreateProgramWithSource(
w->context, 1, &s, &len, &err);
OPENCL_CHECK_ERROR(err);
for (cl_device_id dev_id : w->devices) {
err = clBuildProgram(prog, 1, &dev_id, nullptr, nullptr, nullptr);
if (err != CL_SUCCESS) {
size_t len;
std::string log;
clGetProgramBuildInfo(
prog, dev_id, CL_PROGRAM_BUILD_LOG, 0, nullptr, &len);
log.resize(len);
clGetProgramBuildInfo(
prog, dev_id, CL_PROGRAM_BUILD_LOG, len, &log[0], nullptr);
LOG(FATAL) << "OpenCL build error for device=" << dev_id << log;
}
}
OpenCLModule m;
m.ptr_ = std::make_shared<Internal>(prog);
return m;
}
} // namespace runtime
} // namespace tvm
#endif // TVM_OPENCL_RUNTIME
/*!
* Copyright (c) 2017 by Contributors
* \file opencl_module.h
* \brief Execution handling of OPENCL kernels
*/
#ifndef TVM_RUNTIME_OPENCL_OPENCL_MODULE_H_
#define TVM_RUNTIME_OPENCL_OPENCL_MODULE_H_
#include <tvm/runtime/config.h>
#include <tvm/runtime/packed_func.h>
#include <memory>
#include <vector>
#include <string>
namespace tvm {
namespace runtime {
/*!
* \brief Handle execution of OPENCL kernels as PackedFunc.
* It wraps around driver API to work with OPENCL runtime API.
*/
class OpenCLModule {
public:
/*!
* \brief Get OpenCL Kernel launch wrapped as PackedFunc
* \param func_name The name of the function.
* \param arg_types The type of each argument in the function.
* \param thread_axis_tags The tag sequence of the thread axis.
*/
PackedFunc GetPackedFunc(
const std::string& func_name,
const std::vector<TVMType> arg_types,
const std::vector<std::string> thread_axis_tags) const;
/*!
* \brief create a OpenCL module from data.
* \param source The module data.
*/
static OpenCLModule CreateWithSource(std::string source);
/*! \brief hidden internal data structure. */
class Internal;
private:
std::shared_ptr<Internal> ptr_;
};
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_OPENCL_MODULE_H_
/*!
* Copyright (c) 2017 by Contributors
* \file opencl_workspace.cc
*/
#include "./opencl_common.h"
#if TVM_OPENCL_RUNTIME
#include <dmlc/thread_local.h>
namespace tvm {
namespace runtime {
namespace cl {
OpenCLWorkspace* OpenCLWorkspace::Global() {
static OpenCLWorkspace inst;
return &inst;
}
typedef dmlc::ThreadLocalStore<OpenCLThreadEntry> OpenCLThreadStore;
OpenCLThreadEntry* OpenCLThreadEntry::ThreadLocal() {
return OpenCLThreadStore::Get();
}
} // namespace cl
} // namespace runtime
} // namespace tvm
#endif // TVM_OPENCL_RUNTIME
......@@ -7,7 +7,7 @@
#include "./stack_vm.h"
namespace tvm {
namespace jit {
namespace runtime {
typedef dmlc::ThreadLocalStore<StackVM::State> StackVMStateStore;
......@@ -126,7 +126,6 @@ int64_t StackVM::PrintCode(std::ostream& os, int64_t pc) const {
STACK_VM_PRINT_CODE0(SELECT);
STACK_VM_PRINT_HEAP_ACCESS(STORE_HEAP);
STACK_VM_PRINT_HEAP_ACCESS(LOAD_HEAP);
STACK_VM_PRINT_CODE1(CALL_EXTERN);
STACK_VM_PRINT_CODE1(ASSERT);
STACK_VM_PRINT_JUMP(RJUMP_IF_TRUE);
STACK_VM_PRINT_JUMP(RJUMP_IF_FALSE);
......@@ -143,6 +142,22 @@ int64_t StackVM::PrintCode(std::ostream& os, int64_t pc) const {
STACK_VM_PRINT_CODE0(TVM_ARRAY_GET_TYPE_CODE);
STACK_VM_PRINT_CODE0(TVM_ARRAY_GET_TYPE_BITS);
STACK_VM_PRINT_CODE0(TVM_ARRAY_GET_TYPE_LANES);
// packed function.
case CALL_PACKED_FUNC: {
int num_args = code[pc + 1].v_int;
os << "[" << pc << "]\tCALL_PACKED_FUNC "
<< " num_args=" << num_args
<< " fid=" << code[pc + 2].v_int;
os << " type_codes:";
for (int i = 0; i < num_args; ++i) {
os << ' ' << code[pc + 3 + i].v_int;
}
os << '\n';
for (int i = 0; i < num_args + 2; ++i) {
os << "[" << pc + 1 << "]" << std::endl;
}
return pc + 3 + num_args;
}
}
LOG(FATAL) << "unknown op code " << code[pc].op_code;
return 0;
......@@ -160,6 +175,19 @@ std::ostream& operator<<(std::ostream& os, const StackVM& vm) { // NOLINT(*)
return os;
}
void StackVM::operator()(const runtime::TVMArgs& args) const {
StackVM::State* s = StackVM::ThreadLocalState();
s->sp = 0;
s->pc = 0;
if (s->heap.size() < this->heap_size) {
s->heap.resize(this->heap_size);
}
s->heap[0].v_handle = (void*)args.values; // NOLINT(*)
s->heap[1].v_handle = (void*)args.type_codes; // NOLINT(*)
s->heap[2].v_int64 = args.num_args;
this->Run(s);
}
void StackVM::Run(State* s) const {
int64_t sp = s->sp;
int64_t pc = s->pc;
......@@ -174,7 +202,6 @@ void StackVM::Run(State* s) const {
heap.resize(heap_size);
}
const int64_t code_size = static_cast<int64_t>(code.size());
while (pc < code_size) {
switch (code[pc].op_code) {
case ADD_I64: STACK_VM_BINOP(+, v_int64); break;
......@@ -252,13 +279,19 @@ void StackVM::Run(State* s) const {
pc += 2;
break;
}
case CALL_EXTERN: {
int num_args = static_cast<int>(stack[sp].v_int64);
int call_fid = code[pc + 1].v_int;
stack[sp - num_args] = extern_func[call_fid](
&stack[sp - num_args], num_args);
sp = sp - num_args;
pc += 2;
case CALL_PACKED_FUNC: {
// call packed function.
int num_args = code[pc + 1].v_int;
int call_fid = code[pc + 2].v_int;
static_assert(sizeof(Code) == sizeof(int) &&
alignof(Code) == alignof(int), "asusmption");
const int* type_codes = &(code[pc].v_int) + 3;
runtime::TVMRetValue rv;
packed_func[call_fid].CallPacked(
runtime::TVMArgs(&stack[sp + 1 - num_args], type_codes, num_args), &rv);
sp = sp + 1 - num_args;
stack[sp] = rv.value();
pc += 3 + num_args;
break;
}
case ASSERT: {
......@@ -331,5 +364,5 @@ void StackVM::Run(State* s) const {
}
}
} // namespace jit
} // namespace runtime
} // namespace tvm
......@@ -5,18 +5,18 @@
*
* This can be used to interepret host side code
* to setup calls into device functions
* when only JIT for device is available(via NVRTC or OpenCL).
* when only Runtime compilation for device is available(via NVRTC or OpenCL).
*/
#ifndef TVM_JIT_STACK_VM_H_
#define TVM_JIT_STACK_VM_H_
#ifndef TVM_RUNTIME_STACK_VM_STACK_VM_H_
#define TVM_RUNTIME_STACK_VM_STACK_VM_H_
#include <tvm/base.h>
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/packed_func.h>
#include <string>
#include <vector>
namespace tvm {
namespace jit {
namespace runtime {
/*!
* \brief A simple stack-based virtual machine.
......@@ -24,6 +24,11 @@ namespace jit {
class StackVM {
public:
/*!
* \brief Invoke the StackVM as PackedFunc
* \param args The arguments to the StackVM.
*/
void operator()(const TVMArgs& args) const;
/*!
* \brief The opcode of stack vm
* \note Notation
* - sp Stack pointer
......@@ -121,16 +126,19 @@ class StackVM {
*/
SELECT,
/*!
* \brief call an extern function
* \brief call an extern packed function
* \code
* num_args = stack[sp].v_int64;
* call_fid = code[pc + 1].v_int;
* f = extern_func[call_fid];
* stack[sp - num_args] = f(&stack[sp - num_args], num_args);
* int* type_codes = &(code[pc + 2].v_int)
* stack[sp - num_args] = f(&stack[sp - num_args], type_codes, num_args);
* sp = sp - num_args;
* // The type codes are hidden in the code space.
* pc = pc + 2 + num_args
* \endcode
*/
CALL_EXTERN,
CALL_PACKED_FUNC,
/*!
* \brief Assert condition is true.
* \code
......@@ -217,14 +225,12 @@ class StackVM {
int64_t PrintCode(std::ostream&os, int64_t pc) const; // NOLINT(*)
/*! \brief Get thread local state of the stack VM */
static State* ThreadLocalState();
/*! \brief extern function that will mutate the state */
using ExternFunc = std::function<TVMValue (const TVMValue* args, int num_args)>;
/*! \brief The instructions */
std::vector<Code> code;
/*! \brief constant error messages */
std::vector<std::string> str_data;
/*! \brief Extern functions */
std::vector<ExternFunc> extern_func;
/*! \brief Extern functions in packed func format */
std::vector<runtime::PackedFunc> packed_func;
/*! \brief name of each heap id*/
std::vector<std::string> heap_id_name;
/*! \brief The memory size needed */
......@@ -254,20 +260,20 @@ class StackVM {
* \param t the type code.
* \return The load opcode
*/
static OpCode GetLoad(Type t) {
CHECK_EQ(t.lanes(), 1);
if (t.is_handle()) return ADDR_LOAD_HANDLE;
if (t.is_int()) {
switch (t.bits()) {
static OpCode GetLoad(TVMType t) {
CHECK_EQ(t.lanes, 1U);
if (t.code == kHandle) return ADDR_LOAD_HANDLE;
if (t.code == kInt) {
switch (t.bits) {
case 32 : return ADDR_LOAD_INT32;
case 64 : return ADDR_LOAD_INT64;
}
} else if (t.is_uint()) {
switch (t.bits()) {
} else if (t.code == kUInt) {
switch (t.bits) {
case 32 : return ADDR_LOAD_UINT32;
}
} else if (t.is_float()) {
switch (t.bits()) {
} else if (t.code == kFloat) {
switch (t.bits) {
case 64 : return ADDR_LOAD_FP64;
}
}
......@@ -279,20 +285,19 @@ class StackVM {
* \param t the type code.
* \return The load opcode
*/
static OpCode GetStore(Type t) {
CHECK_EQ(t.lanes(), 1);
if (t.is_int()) {
switch (t.bits()) {
static OpCode GetStore(TVMType t) {
CHECK_EQ(t.lanes, 1U);
if (t.code == kInt) {
switch (t.bits) {
case 64 : return ADDR_STORE_INT64;
}
}
LOG(FATAL) << "Cannot store type " << t;
return ADDR_LOAD_FP64;
}
friend std::ostream& operator<<(std::ostream& os, const StackVM& vm); // NOLINT(*)
};
} // namespace jit
} // namespace runtime
} // namespace tvm
#endif // TVM_JIT_STACK_VM_H_
#endif // TVM_RUNTIME_STACK_VM_STACK_VM_H_
/*!
* Copyright (c) 2017 by Contributors
* \file thread_axis_args.h
* \brief Extract thread axis configuration from TVMArgs.
*/
#ifndef TVM_RUNTIME_THREAD_AXIS_ARGS_H_
#define TVM_RUNTIME_THREAD_AXIS_ARGS_H_
#include <string>
#include <vector>
namespace tvm {
namespace runtime {
/*! \brief workload speccification */
struct ThreadWorkLoad {
// array, first three are thread configuration.
size_t work_size[6];
/*!
* \param i The block dimension.
* \return i-th block dim
*/
inline size_t block_dim(size_t i) const {
return work_size[i];
}
/*!
* \param i The grid dimension.
* \return i-th grid dim
*/
inline size_t grid_dim(size_t i) const {
return work_size[i + 3];
}
};
/*! \brief Thread axis configuration */
class ThreadAxisConfig {
public:
void Init(size_t base,
const std::vector<std::string>& thread_axis_tags) {
base_ = base;
std::vector<bool> filled(6, false);
for (size_t i = 0; i < thread_axis_tags.size(); ++i) {
const std::string& tag = thread_axis_tags[i];
if (tag == "threadIdx.x") {
arg_index_map_.push_back(0);
filled[0] = true;
} else if (tag == "threadIdx.y") {
arg_index_map_.push_back(1);
filled[1] = true;
} else if (tag == "threadIdx.z") {
arg_index_map_.push_back(2);
filled[2] = true;
} else if (tag == "blockIdx.x") {
arg_index_map_.push_back(3 + 0);
filled[3] = true;
} else if (tag == "blockIdx.y") {
arg_index_map_.push_back(3 + 1);
filled[3 + 1] = true;
} else if (tag == "blockIdx.z") {
arg_index_map_.push_back(3 + 2);
filled[3 + 2] = true;
} else {
LOG(FATAL) << "do not known thread_tag=" << tag;
}
}
work_dim_ = 3;
for (int i = 0; i < 3; ++i) {
if (!filled[i]) {
for (int j = i; j < 3; ++j) {
CHECK(!filled[j] && !filled[j + 3])
<< "Invalid thread group configuration";
}
work_dim_ = i;
break;
} else {
CHECK(filled[i])
<< "Must have both threadIdx and blockIdx";
}
}
}
// extract workload from arguments.
ThreadWorkLoad Extract(TVMArgs x) const {
ThreadWorkLoad w;
std::fill(w.work_size, w.work_size + 6, 1);
for (size_t i = 0; i < arg_index_map_.size(); ++i) {
w.work_size[arg_index_map_[i]] =
static_cast<size_t>(x.values[base_ + i].v_int64);
}
return w;
}
// return the work dim
size_t work_dim() const {
return work_dim_;
}
private:
/*! \brief base axis */
size_t base_;
/*! \brief The worker dimension */
size_t work_dim_;
/*! \brief The index mapping. */
std::vector<uint32_t> arg_index_map_;
};
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_THREAD_AXIS_ARGS_H_
/*!
* Copyright (c) 2017 by Contributors
* \file void_addr_args.h
* \brief Utility to convert TVMArgs to void* array type-erasure function call.
*
* Array of argument address is a typical way of type-erasure for functions.
* The function signiture looks like function(void** args, int num_args);
* Where args takes the address of each input.
*/
#ifndef TVM_RUNTIME_VOID_ADDR_ARGS_H_
#define TVM_RUNTIME_VOID_ADDR_ARGS_H_
#include <tvm/runtime/c_runtime_api.h>
#include <vector>
namespace tvm {
namespace runtime {
/*!
* \brief Create a packed function from void addr types
* \param f with signiture (TVMArgs args, TVMRetValue* rv, void* void_args)
* \param arg_types The arguments that wish to get from
* \tparam T the function type
*
* \return The wrapped packed function.
*/
template<typename F>
inline PackedFunc PackFromVoidAddrArgs(
F f, const std::vector<TVMType>& arg_types);
// implementations details
namespace detail {
/*!
* \brief void addr argument data content
* holder in case conversion is needed.
*/
union VoidArgHolder {
int32_t v_int32;
uint32_t v_uint32;
float v_float32;
};
template<int MAX_NARG>
class VoidAddrArray {
public:
explicit VoidAddrArray(int num_args) {
}
void** addr() {
return addr_;
}
VoidArgHolder* holder() {
return holder_;
}
private:
void* addr_[MAX_NARG];
VoidArgHolder holder_[MAX_NARG];
};
template<>
class VoidAddrArray<0> {
public:
explicit VoidAddrArray(int num_args)
: addr_(num_args), holder_(num_args) {
}
void** addr() {
return addr_.data();
}
VoidArgHolder* holder() {
return holder_.data();
}
private:
std::vector<void*> addr_;
std::vector<VoidArgHolder> holder_;
};
/*! \brief conversion code used in void arg. */
enum VoidArgConvertCode {
INT64_TO_INT64,
INT64_TO_INT32,
INT64_TO_UINT32,
FLOAT64_TO_FLOAT32,
FLOAT64_TO_FLOAT64,
HANDLE_TO_HANDLE
};
template<int N, typename F>
inline PackedFunc PackFromVoidAddrArgs_(
F f, const std::vector<VoidArgConvertCode>& codes) {
int num_args = static_cast<int>(codes.size());
auto ret = [f, codes, num_args](TVMArgs args, TVMRetValue* ret) {
VoidAddrArray<N> temp(num_args);
void** addr = temp.addr();
VoidArgHolder* holder = temp.holder();
for (int i = 0; i < num_args; ++i) {
switch (codes[i]) {
case INT64_TO_INT64:
case FLOAT64_TO_FLOAT64:
case HANDLE_TO_HANDLE: {
addr[i] = (void*)&(args.values[i]); // NOLINT(*)
break;
}
case INT64_TO_INT32: {
holder[i].v_int32 = static_cast<int32_t>(args.values[i].v_int64);
addr[i] = &(holder[i]);
break;
}
case INT64_TO_UINT32 : {
holder[i].v_uint32 = static_cast<uint32_t>(args.values[i].v_int64);
addr[i] = &(holder[i]);
break;
}
case FLOAT64_TO_FLOAT32: {
holder[i].v_float32 = static_cast<float>(args.values[i].v_float64);
addr[i] = &(holder[i]);
break;
}
}
}
f(args, ret, addr);
};
return PackedFunc(ret);
}
inline VoidArgConvertCode GetVoidArgConvertCode(TVMType t) {
CHECK_EQ(t.lanes, 1U);
if (t.code == kInt) {
if (t.bits == 64U) return INT64_TO_INT64;
if (t.bits == 32U) return INT64_TO_INT32;
} else if (t.code == kUInt) {
if (t.bits == 32U) return INT64_TO_UINT32;
} else if (t.code == kFloat) {
if (t.bits == 64U) return FLOAT64_TO_FLOAT64;
if (t.bits == 32U) return FLOAT64_TO_FLOAT32;
} else if (t.code == kHandle) {
return HANDLE_TO_HANDLE;
}
LOG(FATAL) << "Cannot handle " << t;
return HANDLE_TO_HANDLE;
}
} // namespace detail
template<typename F>
inline PackedFunc PackFromVoidAddrArgs(
F f, const std::vector<TVMType>& arg_types) {
std::vector<detail::VoidArgConvertCode> codes(arg_types.size());
for (size_t i = 0; i < arg_types.size(); ++i) {
codes[i] = detail::GetVoidArgConvertCode(arg_types[i]);
}
size_t num_void_args = arg_types.size();
// specialization
if (num_void_args <= 4) {
return detail::PackFromVoidAddrArgs_<4>(f, codes);
} else if (num_void_args <= 8) {
return detail::PackFromVoidAddrArgs_<8>(f, codes);
} else {
return detail::PackFromVoidAddrArgs_<0>(f, codes);
}
}
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_VOID_ADDR_ARGS_H_
import tvm
import numpy
def mock_test_add():
"""Not yet working, mock design"""
n = tvm.Var('n')
A = tvm.placeholder((n,), name='A')
B = tvm.placeholder((n,), name='B')
C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
s = tvm.Schedule(C.op)
# GPU schedule have to split by gridIdx and threadIdx
num_thread = 256
grid_x = tvm.IterVar(thread_tag="gridIdx.x")
thread_x = tvm.IterVar((0, num_thread), thread_tag="threadIdx.x")
_, x = s[C].split(C.op.axis[0], factor=num_thread, outer=grid_x)
_, x = s[C].split(x, outer=thread_x)
# compile to IR
bounds = tvm.schedule.InferBound(s)
stmt = tvm.ir_pass.ScheduleOps(s, bounds)
Ab = tvm.Buffer(A.shape, A.dtype, name='A')
Bb = tvm.Buffer(B.shape, B.dtype, name='B')
Cb = tvm.Buffer(C.shape, C.dtype, name='C')
stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B:Bb, C:Cb})
stmt = tvm.ir_pass.Simplify(stmt)
print(stmt)
output_ssa = False
f = tvm.codegen.MakeAPI(stmt, "myadd", [Ab, Bb, Cb], 1)
f_list = tvm.codegen.SplitHostDevice(f)
for x in f_list:
code = tvm.codegen.CompileToC(x, output_ssa)
print(code)
if __name__ == "__main__":
mock_test_add()
import tvm
import numpy as np
def test_add_pipeline():
"""Not yet working, mock design"""
n = tvm.Var('n')
A = tvm.placeholder((n,), name='A')
B = tvm.placeholder((n,), name='B')
C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
s = tvm.Schedule(C.op)
# GPU schedule have to split by gridIdx and threadIdx
num_thread = 256
grid_x = tvm.IterVar(thread_tag="blockIdx.x")
thread_x = tvm.IterVar((0, num_thread), thread_tag="threadIdx.x")
_, x = s[C].split(C.op.axis[0], factor=num_thread, outer=grid_x)
_, x = s[C].split(x, outer=thread_x)
# compile to IR
bounds = tvm.schedule.InferBound(s)
stmt = tvm.ir_pass.ScheduleOps(s, bounds)
Ab = tvm.Buffer(A.shape, A.dtype, name='A')
Bb = tvm.Buffer(B.shape, B.dtype, name='B')
Cb = tvm.Buffer(C.shape, C.dtype, name='C')
stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B:Bb, C:Cb})
stmt = tvm.ir_pass.Simplify(stmt)
fapi = tvm.codegen.MakeAPI(stmt, "myadd", [Ab, Bb, Cb], 3)
fsplits = tvm.codegen.SplitHostDevice(fapi)
def check_cuda():
output_ssa = False
for f in fsplits[1:]:
print(tvm.codegen.CompileToC(f, output_ssa, "cuda"))
# build and invoke the kernel.
fcuda = tvm.codegen.BuildNVRTC(fsplits, "stackvm")
num_device = 1
for i in range(num_device):
ctx = tvm.gpu(i)
if not ctx.enabled:
continue
# launch the kernel.
n = 1027
a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
fcuda(a, b, c)
np.testing.assert_allclose(
c.asnumpy(), a.asnumpy() + b.asnumpy())
def check_opencl():
output_ssa = False
for f in fsplits[1:]:
print(tvm.codegen.CompileToC(f, output_ssa, "opencl"))
# build and invoke the kernel.
fcl = tvm.codegen.BuildOpenCL(fsplits, "stackvm")
# Disable OpenCL runtime test for now,
# since the local worksize on CPU might be too large.
num_device = 0
for i in range(num_device):
ctx = tvm.cl(i)
if not ctx.enabled:
continue
# launch the kernel.
n = 1027
a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
fcl(a, b, c)
np.testing.assert_allclose(
c.asnumpy(), a.asnumpy() + b.asnumpy())
tvm.init_opencl()
if tvm.cl(0).enabled:
check_opencl()
if tvm.gpu(0).enabled:
check_cuda()
if __name__ == "__main__":
test_add_pipeline()
import tvm
import numpy as np
def test_function():
ctx = tvm.cpu(0)
x = np.random.randint(0, 10, size=(3, 4))
x = np.array(x)
y = tvm.nd.array(x, ctx=ctx)
f = tvm.codegen.DummyHelloFunction()
f(y, 10)
def test_get_global():
targs = (10, 10.0, "hello")
# register into global function table
......
......@@ -10,6 +10,7 @@ def test_stack_vm_basic():
a = tvm.nd.array(np.zeros(10, dtype='float32'))
@tvm.register_func
def tvm_call_back_get_shape(shape0):
print(shape0)
assert shape0 == a.shape[0]
n = tvm.Var('n')
......@@ -74,3 +75,5 @@ def test_stack_vm_cond():
if __name__ == "__main__":
test_stack_vm_cond()
test_stack_vm_loop()
test_stack_vm_basic()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment