Commit 95a323aa by Zhi Committed by Leyuan Wang

[codegen] heterogeneous build for c++ (#3144)

* heterogeneous build for c++

* merge relay buildmodule to codegen build

* use module split

* use target_host

* remove sse3

* retrigger ci
parent 4ac93a53
......@@ -371,6 +371,35 @@ TVM_DLL runtime::Module build(const Array<LoweredFunc>& funcs,
const Target& target_host,
const BuildConfig& config);
/*!
 * \brief Build a single runtime module from lowered functions that are grouped
 *  by target. This overload is the entry point for heterogeneous builds, where
 *  different functions are compiled for different devices.
 * \param input Map from each target to the list of lowered functions that
 *  should be compiled for that target.
 * \param target_host The target used to build the host-side code. Pass
 *  Target() to let the builder pick a default host target.
 * \param config The build configuration.
 * \return The host module, with the module generated for each device target
 *  imported into it.
 */
TVM_DLL runtime::Module build(const Map<Target, Array<LoweredFunc>>& input,
const Target& target_host,
const BuildConfig& config);
/*!
 * \brief Build a single runtime module from lowered functions that are grouped
 *  by target string. Same as the Target-keyed heterogeneous overload, except
 *  that every key is given as a target string instead of a Target object.
 * \param input Map from each target string to the list of lowered functions
 *  that should be compiled for that target.
 * \param target_host The target used to build the host-side code. Pass
 *  Target() to let the builder pick a default host target.
 * \param config The build configuration.
 * \return The host module, with the module generated for each device target
 *  imported into it.
 */
TVM_DLL runtime::Module build(const Map<std::string, Array<LoweredFunc>>& input,
const Target& target_host,
const BuildConfig& config);
class GenericFuncNode;
/*!
......
......@@ -428,20 +428,19 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
const Target& target_host,
const BuildConfig& config) {
std::unordered_set<std::string> all_names;
for (const auto &x : funcs) {
CHECK(all_names.count(x->name) == 0) << "Duplicate function name " << x->name;
for (const auto& x : funcs) {
CHECK(all_names.count(x->name) == 0)
<< "Duplicate function name " << x->name;
all_names.insert(x->name);
}
auto target_host_val = target_host.defined() ? target_host : DefaultTargetHost(target);
Array<LoweredFunc> fhost;
Array<LoweredFunc> fdevice;
for (const auto& x : funcs) {
CHECK(ir::VerifyMemory(x, target->device_type))
<< "Direct host side access to device memory is detected in " << x->func_name()
<< ". Did you forget to bind?";
<< "Direct host side access to device memory is detected in "
<< x->func_name() << ". Did you forget to bind?";
if (x->func_type == kMixedFunc) {
auto func = x;
......@@ -450,6 +449,7 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
}
func = ir::ThreadSync(func, "shared");
func = ir::ThreadSync(func, "warp");
func = ir::LowerThreadAllreduce(func, target->thread_warp_size);
auto fsplits = ir::SplitHostDevice(func);
fhost.push_back(fsplits[0]);
......@@ -465,12 +465,32 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
}
}
for (size_t i = 0; i < fdevice.size(); i++) {
auto warp_size = target->thread_warp_size;
auto func = fdevice[i];
func = ir::LowerWarpMemory(fdevice[i], warp_size);
fdevice.Set(i, func);
}
auto keys = target->keys();
bool target_is_gpu =
std::find(keys.begin(), keys.end(), "gpu") != keys.end();
bool target_is_gpu = std::find(keys.begin(), keys.end(), "gpu") != keys.end();
if (target_is_gpu && fdevice.size() == 0) {
LOG(WARNING) << "Specified target " + target->str() +
" but cannot find device code. Did you forget to bind?";
LOG(WARNING) << "Specified target "
<< target->str()
<< " but cannot find device code. Did you forget to bind?";
}
for (size_t i = 0; i < fdevice.size(); ++i) {
auto func = fdevice[i];
func = ir::LowerIntrin(func, target->target_name);
fdevice.Set(i, func);
}
if (target->device_type == target::llvm()->device_type &&
target_host == target) {
CHECK(fdevice.empty()) << "No device code should be generated when target "
<< "and host_target are both llvm target."
<< "\n";
}
for (size_t i = 0; i < fhost.size(); ++i) {
......@@ -480,39 +500,89 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
fhost.Set(i, func);
}
for (size_t i = 0; i < fdevice.size(); ++i) {
auto func = fdevice[i];
func = ir::LowerIntrin(func, target->target_name);
fdevice.Set(i, func);
}
for (size_t i = 0; i < fhost.size(); ++i) {
auto func = fhost[i];
func = ir::LowerIntrin(func, target_host_val->target_name);
func = ir::LowerIntrin(func, target_host->target_name);
func = ir::CombineContextCall(func);
fhost.Set(i, func);
}
return {fhost, fdevice};
}
runtime::Module build(const Array<LoweredFunc>& funcs,
const Target& target,
// Generate the runtime module for the device-side functions of one target.
// When there are no device functions, a null module
// (runtime::Module(nullptr)) is returned instead of invoking codegen.
runtime::Module DeviceBuild(const Array<LoweredFunc>& fdevice,
                            const Target& target) {
  if (fdevice.empty()) {
    return runtime::Module(nullptr);
  }
  return codegen::Build(fdevice, target->str());
}
// Build for heterogeneous execution.
//
// inputs      maps every target to the lowered functions compiled for it.
// target_host is the target for host code. When it is undefined, the first
//             target in `inputs` whose device type is kDLCPU is used; if there
//             is none, DefaultTargetHost() decides.
// config      is forwarded to split_dev_host_funcs for every target.
//
// Returns the host module built from the host functions of all targets, with
// every non-null device module imported into it.
runtime::Module build(const Map<Target, Array<LoweredFunc>>& inputs,
                      const Target& target_host,
                      const BuildConfig& config) {
  Array<LoweredFunc> fhost_all;
  std::vector<runtime::Module> device_modules;

  // Resolve the host target: an explicit one wins, otherwise prefer a CPU
  // target that is already part of the build.
  Target target_host_val = target_host;
  if (!target_host.defined()) {
    for (const auto& it : inputs) {
      if (it.first->device_type == kDLCPU) {
        target_host_val = it.first;
        break;
      }
    }
  }

  if (!target_host_val.defined()) {
    // NOTE(review): DefaultTargetHost receives an undefined Target here. Its
    // body is not visible from this function -- confirm that it does not
    // dereference the argument before checking that it is defined.
    target_host_val = DefaultTargetHost(target_host_val);
  }

  for (const auto& it : inputs) {
    auto host_dev_funcs =
        split_dev_host_funcs(it.second, it.first, target_host_val, config);
    const auto fhost = host_dev_funcs[0];
    const auto fdevice = host_dev_funcs[1];

    // Host functions of all targets are compiled together into one module.
    for (const auto& func : fhost) {
      fhost_all.push_back(func);
    }

    // DeviceBuild yields a null module when the target has no device code;
    // only real modules are kept for import.
    runtime::Module mdev = DeviceBuild(fdevice, it.first);
    if (mdev.operator->() != nullptr) {
      device_modules.push_back(mdev);
    }
  }

  runtime::Module mhost = codegen::Build(fhost_all, target_host_val->str());
  // Import all device modules into the host module.
  for (const auto& mdev : device_modules) {
    mhost.Import(mdev);
  }
  return mhost;
}
if (fdevice.size() > 0) {
auto mdev = codegen::Build(fdevice, target->str());
mhost.Import(mdev);
// Heterogeneous build keyed by target strings: every key is turned into a
// Target with Target::create, then the Target-keyed overload does the work.
runtime::Module build(const Map<std::string, Array<LoweredFunc>>& inputs,
                      const Target& target_host,
                      const BuildConfig& config) {
  Map<Target, Array<LoweredFunc>> funcs_by_target;
  for (const auto& entry : inputs) {
    funcs_by_target.Set(Target::create(entry.first), entry.second);
  }
  return build(funcs_by_target, target_host, config);
}
return mhost;
// Homogeneous build: all functions are compiled for one target. This is
// expressed as a heterogeneous build whose map has a single entry.
runtime::Module build(const Array<LoweredFunc>& funcs,
                      const Target& target,
                      const Target& target_host,
                      const BuildConfig& config) {
  Map<Target, Array<LoweredFunc>> single_target;
  single_target.Set(target, funcs);
  return build(single_target, target_host, config);
}
BuildConfig build_config() {
......
......@@ -601,52 +601,6 @@ class RelayBuildModule : public runtime::ModuleNode {
}
return func;
}
/*!
* \brief Build module given lowered functions for each target
*
* \param lowered_funcs target_str -> Array<LoweredFunc> map
* \param targets Targets map
* \param cfg Building configuration
*/
void BuildModule(const Map<std::string, Array<LoweredFunc> >& lowered_funcs,
const Map<HalideIR::Expr, HalideIR::Expr>& targets,
const BuildConfig& cfg) {
auto target_host = Target::create(cfg_.fallback_device);
for (const auto& kv : lowered_funcs) {
std::unordered_set<std::string> fname_set;
for (auto f : kv.second) {
if (fname_set.count(f->name)) {
LOG(FATAL) << "Duplicate function name "
<< f->name;
}
fname_set.insert(f->name);
}
}
std::unordered_map<std::string, Target> target_map;
for (const auto& kv : lowered_funcs) {
target_map[kv.first] = Target::create(kv.first);
}
Array<LoweredFunc> fhost_all;
std::vector<runtime::Module> device_module;
for (const auto& kv : lowered_funcs) {
auto target = target_map[kv.first];
auto host_dev_funcs = split_dev_host_funcs(kv.second, target, target_host, cfg);
for (auto f : host_dev_funcs[0]) {
fhost_all.push_back(f);
}
if (host_dev_funcs[1].size()) {
auto mdev = codegen::Build(host_dev_funcs[1], target->str());
device_module.push_back(mdev);
}
}
auto mhost = codegen::Build(fhost_all, target_host->str());
for (auto mdev : device_module) {
mhost.Import(mdev);
}
ret_.mod = mhost;
}
/*!
* \brief Build relay function to runtime module
......@@ -686,9 +640,8 @@ class RelayBuildModule : public runtime::ModuleNode {
ret_.graph_json = graph_codegen_->GetJSON();
ret_.params = graph_codegen_->GetParams();
BuildModule(graph_codegen_->GetLoweredFunc(),
device_target,
tvm_cfg_);
auto target_host = Target::create(target_host_);
ret_.mod = tvm::build(graph_codegen_->GetLoweredFunc(), target_host, tvm_cfg_);
}
protected:
......
......@@ -19,10 +19,14 @@
#include <dmlc/logging.h>
#include <gtest/gtest.h>
#include <topi/cuda/injective.h>
#include <tvm/tvm.h>
#include <tvm/operation.h>
#include <tvm/build_module.h>
#include <string>
#include <cmath>
TEST(BuildModule, Basic) {
using namespace tvm;
auto n = var("n");
......@@ -56,6 +60,134 @@ TEST(BuildModule, Basic) {
CHECK_EQ(mali_target->str(), "opencl -model=Mali-T860MP4@800Mhz -device=mali");
}
TEST(BuildModule, Heterogeneous) {
  /* The testing network is like following, where the element-wise add and sub
   * ops are allocated to GPU and CPU, respectively:
   *
   *          A    B
   *           \  /
   *      elemwise_add  (gpu)
   *             \
   *             copy    C
   *                \   /
   *           elemwise_sub  (cpu)
   */

  using namespace tvm;
  const runtime::PackedFunc* pf = runtime::Registry::Get("module._Enabled");
  // Registry::Get yields a pointer; fail with a message instead of
  // dereferencing null when the function is not registered.
  CHECK(pf != nullptr) << "module._Enabled is not registered";
  bool enabled = (*pf)("cuda");
  if (!enabled) {
    LOG(INFO) << "Skip heterogeneous test because cuda is not enabled."
              << "\n";
    return;
  }

  auto target_llvm = target::llvm();
  auto target_cuda = target::cuda();

  // The shape of input tensors.
  const int n = 4;
  Array<Expr> shape{n};

  auto A = placeholder(shape, Float(32), "A");
  auto B = placeholder(shape, Float(32), "B");
  auto C = placeholder(shape, Float(32), "C");

  auto elemwise_add = compute(A->shape, [&A, &B](Expr i) {
    return A[i] + B[i];
  }, "elemwise_add");

  // Stand-in for the result of elemwise_add after it has been copied over;
  // the graph json below wires the actual copy between the two functions.
  auto copy = placeholder(shape, Float(32), "__copy");
  auto elemwise_sub = compute(C->shape, [&copy, &C](Expr i) {
    return copy[i] - C[i];
  }, "elemwise_sub");

  auto s1 = topi::cuda::schedule_injective(target_cuda, {elemwise_add});
  auto s2 = create_schedule({elemwise_sub->op});

  auto config = build_config();
  auto args1 = Array<Tensor>({A, B, elemwise_add});
  auto args2 = Array<Tensor>({copy, C, elemwise_sub});

  std::unordered_map<Tensor, Buffer> binds;
  auto lowered_s1 = lower(s1, args1, "elemwise_add", binds, config);
  auto lowered_s2 = lower(s2, args2, "elemwise_sub", binds, config);
  Map<tvm::Target, Array<LoweredFunc>> inputs = {{target_cuda, lowered_s1},
                                                 {target_llvm, lowered_s2}};
  // No explicit host target: build() is expected to pick one itself.
  auto module = build(inputs, Target(), config);

  // Assertion for build: exactly one device module is imported into the host.
  CHECK_EQ(module->imports().size(), 1);

  // Execute the graph and check the correctness.
  // Setup graph json.
  std::string json =
      "{\"nodes\": [{\"op\": \"null\", \"name\": \"A\", \"inputs\": []}, "
      "{\"op\": \"null\", \"name\": \"B\", \"inputs\": []}, {\"op\": "
      "\"tvm_op\", \"name\": \"elemwise_add\", \"attrs\": {\"flatten_data\": "
      "\"1\", \"func_name\": \"elemwise_add\", \"num_inputs\": \"2\", "
      "\"num_outputs\": \"1\"}, \"inputs\": [[0, 0, 0], [1, 0, 0]]}, {\"op\": "
      "\"tvm_op\", \"name\": \"__copy_add_to_sub\", \"attrs\": "
      "{\"flatten_data\": \"0\", \"func_name\": \"__copy\", \"num_inputs\": "
      "\"1\", \"num_outputs\": \"1\"}, \"inputs\": [[2, 0, 0]]}, {\"op\": "
      "\"null\", \"name\": \"C\", \"inputs\": []}, {\"op\": \"tvm_op\", "
      "\"name\": \"elemwise_sub\", \"attrs\": {\"flatten_data\": \"0\", "
      "\"func_name\": \"elemwise_sub\", \"num_inputs\": \"2\", "
      "\"num_outputs\": \"1\"}, \"inputs\": [[3, 0, 0], [4, 0, 0]]}], "
      "\"arg_nodes\": [0, 1, 4], \"node_row_ptr\": [0, 1, 2, 3, 4, 5, 6], "
      "\"heads\": [[5, 0, 0]], \"attrs\": {\"storage_id\": [\"list_int\", [3, "
      "4, 0, 1, 5, 2]], \"shape\": [\"list_shape\", [[4], [4], [4], [4], [4], "
      "[4]]], \"device_index\": [\"list_int\", [2, 2, 2, 1, 1, 1]], \"dtype\": "
      "[\"list_int\", [0, 0, 0, 0, 0, 0]], \"dltype\": [\"list_str\", "
      "[\"float32\", \"float32\", \"float32\", \"float32\", \"float32\", "
      "\"float32\"]]}}";

  // Setup inputs.
  auto a_val =
      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
  auto b_val =
      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
  auto c_val =
      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});

  // Read the data pointers through the NDArray's DLTensor view. Going through
  // ToDLPack() would allocate a DLManagedTensor per call that this test never
  // releases.
  auto pa = static_cast<float*>(a_val->data);
  auto pb = static_cast<float*>(b_val->data);
  auto pc = static_cast<float*>(c_val->data);

  // Assign values.
  for (int i = 0; i < n; i++) {
    pa[i] = i;
    pb[i] = i + 1.0;
    pc[i] = i - 1.0;
  }

  // Initialize graph runtime.
  int cpu_dev_ty = static_cast<int>(kDLCPU);
  int cpu_dev_id = 0;
  int gpu_dev_ty = static_cast<int>(kDLGPU);
  int gpu_dev_id = 0;

  const runtime::PackedFunc* graph_runtime =
      tvm::runtime::Registry::Get("tvm.graph_runtime.create");
  CHECK(graph_runtime != nullptr)
      << "tvm.graph_runtime.create is not registered";
  runtime::Module mod = (*graph_runtime)(
      json, module, cpu_dev_ty, cpu_dev_id, gpu_dev_ty, gpu_dev_id);

  PackedFunc set_input = mod.GetFunction("set_input", false);
  PackedFunc run = mod.GetFunction("run", false);
  PackedFunc get_output = mod.GetFunction("get_output", false);
  set_input("A", a_val);
  set_input("B", b_val);
  set_input("C", c_val);

  run();
  tvm::runtime::NDArray out = get_output(0);
  const float* p_out = static_cast<const float*>(out->data);

  // Check correctness: out[i] == (A[i] + B[i]) - C[i].
  for (int i = 0; i < n; ++i) {
    CHECK_LT(std::fabs(p_out[i] - (i + (i + 1.0) - (i - 1.0))), 1e-5);
  }
}
int main(int argc, char ** argv) {
testing::InitGoogleTest(&argc, argv);
......
......@@ -89,7 +89,7 @@ def test_build():
tgt: tgt
}
m_bld.set_opt_level(3)
m_bld.build(func, targets, "llvm -mcpu=sse3", params=params)
m_bld.build(func, targets, "llvm", params=params)
g_json = m_bld.get_json()
mmod = m_bld.get_module()
params = m_bld.get_params()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment