Commit 95f12e31 by Wei Chen, committed by Jared Roesch

[Relay][VM] VM Profiler (#3727)

* [Relay][VM] VM debugger

* Report mean/min/max for op duration

* Typos

* Lint

* Lint

* Lint

* Support build debug VM in CMake

* Lint

* Enable VM debug in unit test

* Disable debug vm test until new docker image is built

* Add device sync code

* Fix qnn unit test

* Disable vm debug by default

* Rename files

* Rename classes

* Fix comment

* Fix comment
parent c87ace7e
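For context, the whole feature is driven from Python. A minimal usage sketch, mirroring the unit test added at the end of this diff (nothing here goes beyond what that test exercises; it requires a build with USE_VM_PROFILER=ON):

import numpy as np
import tvm
from tvm import relay
from tvm.relay.testing import resnet

# Compile with the profiling VM compiler instead of the plain VM compiler.
mod, params = resnet.get_workload()
compiler = relay.profiler_vm.VMCompilerProfiler()
vm = compiler.compile(mod, target='llvm')
vm.init(tvm.cpu())
vm.load_params(params)

# Run once, then dump the per-op timing report
# (invoke count plus sum/mean/min/max duration in microseconds).
data = np.random.rand(1, 3, 224, 224).astype('float32')
vm.invoke("main", [data])
print(vm.get_stat())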
...@@ -131,10 +131,33 @@ file(GLOB COMPILER_SRCS
    src/schedule/*.cc
    )
file(GLOB_RECURSE RELAY_OP_SRCS
    src/relay/op/*.cc
    )
file(GLOB_RECURSE RELAY_PASS_SRCS
    src/relay/pass/*.cc
    )
file(GLOB RELAY_BACKEND_SRCS
    src/relay/backend/*.cc
    src/relay/backend/vm/*.cc
    )
file(GLOB_RECURSE RELAY_IR_SRCS
    src/relay/ir/*.cc
    )
file(GLOB_RECURSE RELAY_QNN_SRCS
    src/relay/qnn/*.cc
    )
list(APPEND COMPILER_SRCS ${RELAY_OP_SRCS})
list(APPEND COMPILER_SRCS ${RELAY_PASS_SRCS})
list(APPEND COMPILER_SRCS ${RELAY_BACKEND_SRCS})
list(APPEND COMPILER_SRCS ${RELAY_IR_SRCS})
list(APPEND COMPILER_SRCS ${RELAY_QNN_SRCS})
if(USE_VM_PROFILER)
  message(STATUS "Build compiler with Relay VM profiler support...")
  file(GLOB BACKEND_VM_PROFILER_SRCS src/relay/backend/vm/profiler/*.cc)
  list(APPEND COMPILER_SRCS ${BACKEND_VM_PROFILER_SRCS})
endif(USE_VM_PROFILER)
file(GLOB DATATYPE_SRCS src/codegen/datatype/*.cc)
list(APPEND COMPILER_SRCS ${DATATYPE_SRCS})
...@@ -198,6 +221,12 @@ if(USE_GRAPH_RUNTIME)
  endif(USE_GRAPH_RUNTIME_DEBUG)
endif(USE_GRAPH_RUNTIME)

if(USE_VM_PROFILER)
  message(STATUS "Build with Relay VM profiler support...")
  file(GLOB RUNTIME_VM_PROFILER_SRCS src/runtime/vm/profiler/*.cc)
  list(APPEND RUNTIME_SRCS ${RUNTIME_VM_PROFILER_SRCS})
endif(USE_VM_PROFILER)

# Module rules
include(cmake/modules/VTA.cmake)
include(cmake/modules/CUDA.cmake)
...
...@@ -147,6 +147,7 @@ stage('Build') {
echo set\\(USE_GRAPH_RUNTIME ON\\) >> config.cmake
echo set\\(USE_STACKVM_RUNTIME ON\\) >> config.cmake
echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
echo set\\(USE_VM_PROFILER ON\\) >> config.cmake
echo set\\(USE_ANTLR ON\\) >> config.cmake
echo set\\(USE_BLAS openblas\\) >> config.cmake
echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
...@@ -164,6 +165,7 @@ stage('Build') {
echo set\\(USE_VULKAN ON\\) >> config.cmake
echo set\\(USE_MICRO ON\\) >> config.cmake
echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
echo set\\(USE_VM_PROFILER ON\\) >> config.cmake
echo set\\(CMAKE_CXX_COMPILER clang-7\\) >> config.cmake
echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
"""
...@@ -182,6 +184,7 @@ stage('Build') {
echo set\\(USE_SORT ON\\) >> config.cmake
echo set\\(USE_MICRO ON\\) >> config.cmake
echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
echo set\\(USE_VM_PROFILER ON\\) >> config.cmake
echo set\\(USE_LLVM llvm-config-8\\) >> config.cmake
echo set\\(USE_NNPACK ON\\) >> config.cmake
echo set\\(NNPACK_PATH /NNPACK/build/\\) >> config.cmake
...@@ -212,6 +215,7 @@ stage('Build') {
echo set\\(USE_SORT ON\\) >> config.cmake
echo set\\(USE_RPC ON\\) >> config.cmake
echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
echo set\\(USE_VM_PROFILER ON\\) >> config.cmake
echo set\\(USE_LLVM llvm-config-4.0\\) >> config.cmake
echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
...
...@@ -88,6 +88,9 @@ set(USE_GRAPH_RUNTIME ON)
# Whether enable additional graph debug functions
set(USE_GRAPH_RUNTIME_DEBUG OFF)

# Whether enable additional vm profiler functions
set(USE_VM_PROFILER OFF)

# Whether build with LLVM support
# Requires LLVM version >= 4.0
#
...
...@@ -375,8 +375,41 @@ struct VMFrame {
 */
class VirtualMachine : public runtime::ModuleNode {
 public:
/*!
* \brief Get a PackedFunc from module.
*
* The PackedFunc may not be fully initialized,
* there might still be first time running overhead when
* executing the function on certain devices.
* For benchmarking, use prepare to eliminate this overhead.
*
* \param name the name of the function.
* \param sptr_to_self The shared_ptr that points to this module node.
*
* \return PackedFunc(nullptr) when it is not available.
*
* \note The function will always remain valid.
* If the function needs resource from the module(e.g. late linking),
* it should capture sptr_to_self.
*/
virtual PackedFunc GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self);
/*!
* \brief Invoke a PackedFunction
*
* \param packed_index The offset of the PackedFunction in all functions.
* \param func The PackedFunction to be invoked.
* \param arg_count The number of arguments to the PackedFunction.
* \param output_size The number of outputs of the PackedFunction.
* \param args Arguments to the PackedFunction.
*
* \note The return value will be stored in the last output_size slots of args.
*/
virtual void InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count,
Index output_size, const std::vector<Object>& args);
virtual ~VirtualMachine() {}
const char* type_key() const final {
return "VirtualMachine";
...@@ -456,6 +489,10 @@ class VirtualMachine : public runtime::ModuleNode {
 */
void RunLoop();
/*! \brief Get device context for params.
*/
TVMContext GetParamsContext() const;
/*!
* \brief Load parameters from the parameter bytearray.
* \param params The binary file that contains parameters.
...@@ -478,9 +515,6 @@ class VirtualMachine : public runtime::ModuleNode {
 */
void InvokeGlobal(const VMFunction& func, const std::vector<Object>& args);
/*! \brief The parameter name to data mapping. */
std::unordered_map<std::string, Object> params_;
...
...@@ -34,6 +34,7 @@ from . import debug
from . import param_dict
from . import feature
from .backend import vm
from .backend import profiler_vm
from .backend import serializer
from .backend import deserializer
from .backend import vmobj
...
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable, invalid-name
"""
The Relay Virtual Machine profiler.
Provides extra APIs for profiling vm execution.
"""
import tvm
from . import vm, _vm
def _update_target(target):
target = target if target else tvm.target.current_target()
if target is None:
raise ValueError("Target is not set in env or passed as argument.")
tgts = {}
if isinstance(target, (str, tvm.target.Target)):
dev_type = tvm.expr.IntImm("int32", tvm.nd.context(str(target)).device_type)
tgts[dev_type] = tvm.target.create(target)
elif isinstance(target, dict):
for dev, tgt in target.items():
dev_type = tvm.expr.IntImm("int32", tvm.nd.context(dev).device_type)
tgts[dev_type] = tvm.target.create(tgt)
else:
raise TypeError("target is expected to be str, tvm.target.Target, " +
"or dict of str to str/tvm.target.Target, but received " +
"{}".format(type(target)))
return tgts
class VMCompilerProfiler(vm.VMCompiler):
"""Build Relay module to run on VM runtime."""
def __init__(self):
super().__init__()
self.mod = _vm._VMCompilerProfiler()
self._compile = self.mod["compile"]
self._get_vm = self.mod["get_vm"]
def compile(self, mod, target=None, target_host=None):
"""
Parameters
----------
mod : relay.Module
The Relay module to build.
target : str, :any:`tvm.target.Target`, or dict of str (i.e.
device/context name) to str/tvm.target.Target, optional
For heterogeneous compilation, it is a dictionary indicating context
to target mapping. For homogeneous compilation, it is a build target.
target_host : str or :any:`tvm.target.Target`, optional
Host compilation target, if target is device.
When TVM compiles device-specific programs such as CUDA,
we also need host (CPU) side code to interact with the driver
to set up the dimensions and parameters correctly.
target_host is used to specify the host-side codegen target.
By default, llvm is used if it is enabled;
otherwise a stackvm interpreter is used.
Returns
-------
vm : VirtualMachineProfiler
The profiling VM runtime.
"""
target = _update_target(target)
self._compile(mod, target, target_host)
return VirtualMachineProfiler(self._get_vm())
class VirtualMachineProfiler(vm.VirtualMachine):
"""Relay profile VM runtime."""
def __init__(self, mod):
super().__init__(mod)
self._get_stat = self.mod["get_stat"]
def get_stat(self):
return self._get_stat()
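Note that get_stat() returns the report as one formatted string rather than structured data. A hypothetical helper (not part of this change; the parse_stat name and row layout are assumptions derived from the report format emitted in src/runtime/vm/profiler/vm.cc below) could recover per-op rows from it:

def parse_stat(report):
    """Parse the textual get_stat() report into (name, invokes, sum, mean, min, max) rows."""
    rows = []
    for line in report.strip().splitlines()[1:]:  # skip the "#OpName ..." header line
        if line.startswith("Total Duration"):
            break
        name, invokes, durations = line.split("\t")
        # Durations are reported in microseconds as "Sum/Mean/Min/Max".
        sum_us, mean_us, min_us, max_us = (float(x) for x in durations.split("/"))
        rows.append((name.strip(), int(invokes), sum_us, mean_us, min_us, max_us))
    return rows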
...@@ -30,12 +30,17 @@
#include <tvm/relay/transform.h>
#include <tvm/runtime/vm.h>
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "../../../runtime/vm/naive_allocator.h"
#include "../../backend/compile_engine.h"
#include "../../pass/pass_util.h"
#include "compiler.h"
namespace tvm {
namespace relay {
...@@ -56,36 +61,6 @@ using namespace relay::transform;
// (@jroesch): VM passes, eventually declare as passes.
bool IsClosure(const Function& func);

// Compute the constant pool, i.e a mapping from Constant node to constant index.
struct ConstantPool : ExprVisitor {
  std::set<GlobalVar> visited;
...@@ -664,152 +639,131 @@ class VMFunctionCompiler : ExprFunctor<void(const Expr& expr)> {
};
PackedFunc VMCompiler::GetFunction(const std::string& name,
                                   const std::shared_ptr<ModuleNode>& sptr_to_self) {
  if (name == "compile") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      CHECK_EQ(args.num_args, 3);
      this->Compile(args[0], args[1], args[2]);
    });
  } else if (name == "get_vm") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      *rv = runtime::Module(vm_);
    });
  } else {
    LOG(FATAL) << "Unknown packed function: " << name;
    return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {});
  }
}

void VMCompiler::Compile(const Module& mod_ref,
                         const TargetsMap& targets,
                         const tvm::Target& target_host) {
  CHECK_EQ(targets.size(), 1)
    << "Currently VM compiler doesn't support heterogeneous compilation";
  InitVM();
  targets_ = targets;
  target_host_ = target_host;

  // Run some optimizations first, this code should
  // be moved to pass manager.
  context_.module = OptimizeModule(mod_ref);

  // Populate the global map.
  //
  // This maps global variables to a global index
  // in the VMFunction table.
  PopulateGlobalMap();

  // Next we populate constant map.
  auto constant_analysis_result = LayoutConstantPool(context_.module);
  context_.const_map = std::get<0>(constant_analysis_result);
  context_.const_tensor_shape_map = std::get<1>(constant_analysis_result);

  // Next we get ready by allocating space for
  // the global state.
  vm_->functions.resize(context_.module->functions.size());
  vm_->constants.resize(context_.const_map.size() + context_.const_tensor_shape_map.size());

  for (auto pair : context_.const_map) {
    vm_->constants[pair.second] = Object::Tensor(pair.first->data);
  }

  for (auto pair : context_.const_tensor_shape_map) {
    vm_->constants[pair.second.first] = Object::Tensor(pair.second.second);
  }

  for (auto named_func : context_.module->functions) {
    auto gvar = named_func.first;
    auto func = named_func.second;
    VMFunctionCompiler func_compiler(&context_, targets_);
    auto vm_func = func_compiler.Compile(gvar, func);

    size_t func_index = context_.global_map.at(gvar);
    CHECK(func_index < vm_->functions.size());
    vm_->functions[func_index] = vm_func;
  }

#if USE_RELAY_DEBUG
  for (auto vm_func : vm_->functions) {
    DLOG(INFO) << vm_func << "-------------";
  }
#endif  // USE_RELAY_DEBUG

  LibraryCodegen();

  for (auto gv : context_.global_map) {
    vm_->global_map.insert({gv.first->name_hint, gv.second});
  }
}

Module VMCompiler::OptimizeModule(const Module& mod) {
  // TODO(@icemelon9): check number of targets and build config, add more optimization pass
  transform::Sequential seq({transform::SimplifyInference(),
                             transform::ToANormalForm(),
                             transform::InlinePrimitives(),
                             transform::LambdaLift(),
                             transform::InlinePrimitives(),
                             transform::FuseOps()});
  auto pass_ctx = transform::PassContext::Create();
  tvm::With<relay::transform::PassContext> ctx(pass_ctx);
  return seq(mod);
}

void VMCompiler::PopulateGlobalMap() {
  // First we populate global map.
  size_t global_index = 0;
  for (auto named_func : context_.module->functions) {
    auto gvar = named_func.first;
    context_.global_map.insert({gvar, global_index++});
  }
}

void VMCompiler::LibraryCodegen() {
  auto const& lowered_funcs = context_.lowered_funcs;
  if (lowered_funcs.size() == 0) {
    return;
  }
  // TODO(@icemelon9): support heterogeneous targets
  Target target;
  for (auto kv : targets_) {
    target = kv.second;
  }
  if (const auto* f = runtime::Registry::Get("relay.backend.build")) {
    runtime::Module mod =
        (*f)(tvm::Array<LoweredFunc>(lowered_funcs.begin(), lowered_funcs.end()), target,
             target_host_);
    CHECK(mod.operator->());
    vm_->lib = mod;
  } else {
    LOG(FATAL) << "relay.backend.build is not registered";
  }
  size_t primitive_index = 0;
  for (auto lfunc : lowered_funcs) {
    vm_->primitive_map.insert({lfunc->name, primitive_index++});
  }
}

runtime::Module CreateVMCompiler() {
  std::shared_ptr<VMCompiler> exec = std::make_shared<VMCompiler>();
...
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2019 by Contributors
* \file src/relay/backend/vm/compiler.h
* \brief A compiler from relay::Module to the VM byte code.
*/
#ifndef TVM_RELAY_BACKEND_VM_COMPILER_H_
#define TVM_RELAY_BACKEND_VM_COMPILER_H_
#include <tvm/relay/error.h>
#include <tvm/relay/expr_functor.h>
#include <tvm/relay/interpreter.h>
#include <tvm/logging.h>
#include <tvm/relay/transform.h>
#include <tvm/runtime/vm.h>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "../../../runtime/vm/profiler/vm.h"
#include "../../../runtime/vm/naive_allocator.h"
#include "../../backend/compile_engine.h"
#include "../../pass/pass_util.h"
namespace tvm {
namespace relay {
namespace vm {
using namespace tvm::runtime;
using namespace tvm::runtime::vm;
using namespace relay::transform;
template <typename T, typename U>
using NodeMap = std::unordered_map<T, U, NodeHash, NodeEqual>;
using TagMap = NodeMap<tvm::relay::Constructor, Index>;
using TagNameMap = std::unordered_map<size_t, tvm::relay::Constructor>;
using GlobalMap = NodeMap<GlobalVar, Index>;
using ConstMap = NodeMap<Constant, Index>;
using ConstTensorShapeMap = NodeMap<TensorType, std::pair<Index, NDArray>>;
using TargetsMap = Map<tvm::Integer, tvm::Target>;
struct VMCompilerContext {
// The module context for the compilation
Module module;
// Error reporter
ErrorReporter err_reporter;
// Map from a unique integer to ADT constructor tag
TagNameMap tag_index_map;
// Map from ADT constructor tag to a unique integer
TagMap tag_map;
// Map from global var to a unique integer
GlobalMap global_map;
// Map from Const object to its index in const pool
ConstMap const_map;
// Map from Const tensor shape to its index in const pool
ConstTensorShapeMap const_tensor_shape_map;
// List of lowered functions
std::vector<LoweredFunc> lowered_funcs;
// The functions that have been lowered.
std::unordered_map<LoweredFunc, size_t, NodeHash, NodeEqual> seen_funcs;
};
class VMCompiler : public runtime::ModuleNode {
public:
virtual ~VMCompiler() {}
virtual PackedFunc GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self);
const char* type_key() const {
return "VMCompiler";
}
std::shared_ptr<VirtualMachine> GetVirtualMachine() const {
return vm_;
}
virtual void InitVM() {
vm_ = std::make_shared<VirtualMachine>();
}
void Compile(const Module& mod_ref,
const TargetsMap& targets,
const tvm::Target& target_host);
protected:
Module OptimizeModule(const Module& mod);
void PopulateGlobalMap();
void LibraryCodegen();
protected:
/*! \brief Target devices. */
TargetsMap targets_;
/*! \brief Target host device. */
tvm::Target target_host_;
/*! \brief Global shared meta data */
VMCompilerContext context_;
/*! \brief Compiled virtual machine. */
std::shared_ptr<VirtualMachine> vm_;
};
} // namespace vm
} // namespace relay
} // namespace tvm
#endif // TVM_RELAY_BACKEND_VM_COMPILER_H_
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2019 by Contributors
* \file src/relay/backend/vm/profiler/compiler.cc
* \brief A compiler from relay::Module to the VM byte code.
*/
#include "../../../../runtime/vm/profiler/vm.h"
#include "../compiler.h"
namespace tvm {
namespace relay {
namespace vm {
class VMCompilerDebug : public VMCompiler {
public:
VMCompilerDebug() {}
void InitVM() override { vm_ = std::make_shared<VirtualMachineDebug>(); }
virtual ~VMCompilerDebug() {}
};
runtime::Module CreateVMCompilerDebug() {
std::shared_ptr<VMCompilerDebug> exec = std::make_shared<VMCompilerDebug>();
return runtime::Module(exec);
}
TVM_REGISTER_GLOBAL("relay._vm._VMCompilerProfiler")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = CreateVMCompilerDebug();
});
} // namespace vm
} // namespace relay
} // namespace tvm
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2019 by Contributors
* \file src/runtime/vm/profiler/vm.cc
* \brief The Relay debug virtual machine.
*/
#include <tvm/runtime/registry.h>
#include <tvm/runtime/vm.h>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "vm.h"
namespace tvm {
namespace runtime {
namespace vm {
PackedFunc VirtualMachineDebug::GetFunction(
const std::string& name, const std::shared_ptr<ModuleNode>& sptr_to_self) {
if (name == "get_stat") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
double total_duration = 0.0;
std::ostringstream os;
os << std::setw(30) << std::left << "#OpName"
<< "\t" << std::setw(10) << std::left << "#InvokeCount"
<< "\t"
<< "#Duration(us): Sum/Mean/Min/Max" << std::endl;
for (auto kv : op_durations) {
auto vals = op_durations[kv.first];
auto sum = std::accumulate(vals.begin(), vals.end(), 0.0);
auto mean = sum / static_cast<double>(vals.size());
auto min_value = *std::min_element(vals.begin(), vals.end());
auto max_value = *std::max_element(vals.begin(), vals.end());
os << std::setw(30) << std::left << packed_index_map[kv.first] << "\t"
<< std::setw(10) << std::left << op_invokes[kv.first] << "\t"
<< sum << "/" << mean << "/" << min_value << "/" << max_value << std::endl;
total_duration += sum;
}
os << "Total Duration " << total_duration << " us" << std::endl;
*rv = os.str();
});
} else if (name == "init") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
CHECK_EQ(args.size() % 2, 0);
std::vector<TVMContext> contexts;
for (int i = 0; i < args.size() / 2; ++i) {
TVMContext ctx;
int device_type = args[i * 2];
ctx.device_type = DLDeviceType(device_type);
ctx.device_id = args[i * 2 + 1];
contexts.push_back(ctx);
}
this->Init(contexts);
});
} else {
return VirtualMachine::GetFunction(name, sptr_to_self);
}
}
void VirtualMachineDebug::Init(const std::vector<TVMContext>& ctxs) {
VirtualMachine::Init(ctxs);
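// Invert primitive_map (op name -> packed index) so the profiler can
// report op names, and start each op's invoke counter at zero.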
for (auto kv : primitive_map) {
packed_index_map[kv.second] = kv.first;
op_invokes[kv.second] = 0;
}
}
void VirtualMachineDebug::InvokePacked(Index packed_index,
const PackedFunc& func, Index arg_count,
Index output_size,
const std::vector<Object>& args) {
auto ctx = VirtualMachine::GetParamsContext();
auto op_begin = std::chrono::high_resolution_clock::now();
VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size,
args);
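// Synchronize the device so any asynchronously launched kernels are
// included in the measurement before the end timestamp is taken.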
TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
auto op_end = std::chrono::high_resolution_clock::now();
double op_duration =
std::chrono::duration_cast<std::chrono::duration<double> >(op_end -
op_begin)
.count();
op_durations[packed_index].push_back(op_duration * 1e6);
op_invokes[packed_index] += 1;
}
} // namespace vm
} // namespace runtime
} // namespace tvm
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2019 by Contributors
* \file src/runtime/vm/profiler/vm.h
* \brief The Relay debug virtual machine.
*/
#ifndef TVM_RUNTIME_VM_PROFILER_VM_H_
#define TVM_RUNTIME_VM_PROFILER_VM_H_
#include <tvm/runtime/vm.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
namespace tvm {
namespace runtime {
namespace vm {
class VirtualMachineDebug : public VirtualMachine {
public:
VirtualMachineDebug() : VirtualMachine() {}
PackedFunc GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) final;
void InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count,
Index output_size, const std::vector<Object>& args) final;
~VirtualMachineDebug() {}
private:
void Init(const std::vector<TVMContext>& ctxs);
std::unordered_map<Index, std::string> packed_index_map;
std::unordered_map<Index, std::vector<double>> op_durations;
std::unordered_map<Index, int> op_invokes;
};
} // namespace vm
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_VM_PROFILER_VM_H_
...@@ -721,8 +721,9 @@ Object VirtualMachine::Invoke(const std::string& name, const std::vector<Object>
  return Invoke(this->functions[func_index], args);
}

void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func,
                                  Index arg_count, Index output_size,
                                  const std::vector<Object>& args) {
  size_t arity = 0;
  for (Index i = 0; i < arg_count; i++) {
    if (args[i].ptr_->tag == ObjectTag::kDatatype) {
...@@ -846,7 +847,7 @@ void VirtualMachine::RunLoop() {
        for (Index i = 0; i < arity; ++i) {
          args.push_back(ReadRegister(instr.packed_args[i]));
        }
        InvokePacked(instr.packed_index, func, arity, instr.output_size, args);
        for (Index i = 0; i < instr.output_size; ++i) {
          WriteRegister(instr.packed_args[instr.arity - instr.output_size + i],
                        args[instr.arity - instr.output_size + i]);
...
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
import tvm
import numpy as np
from nose.tools import nottest
from tvm import relay
from tvm.relay.testing import resnet
@nottest
def test_basic():
mod, params = resnet.get_workload()
compiler = relay.profiler_vm.VMCompilerProfiler()
target = 'llvm'
ctx = tvm.cpu()
vm = compiler.compile(mod, target)
vm.init(ctx)
vm.load_params(params)
data = np.random.rand(1, 3, 224, 224).astype('float32')
res = vm.invoke("main", [data])
print("\n{}".format(vm.get_stat()))
if __name__ == "__main__":
test_basic()