Commit b8fa8f62 by Haichen Shen Committed by Jared Roesch

[Relay][VM] Add AllocTensor instruction and better instruction printer (#3306)

* Update vm print & add AllocTensor instruction

* patch

* fix invoke packed

* update cmake

* tweak move

* update invoke_closure

* lint

* add doc

* tweak
parent 59d8ba8f
......@@ -222,6 +222,7 @@ add_library(tvm_runtime SHARED ${RUNTIME_SRCS})
message(STATUS "Building Relay in debug mode...")
......@@ -56,13 +56,14 @@ enum class Opcode {
InvokeClosure = 3U,
InvokePacked = 4U,
AllocTensor = 5U,
AllocDatatype = 6U,
AllocClosure = 7U,
GetField = 8U,
If = 9U,
Select = 10U,
LoadConst = 11U,
Goto = 12U
AllocTensorReg = 6U,
AllocDatatype = 7U,
AllocClosure = 8U,
GetField = 9U,
If = 10U,
Select = 11U,
LoadConst = 12U,
Goto = 13U
/*! \brief A single virtual machine instruction.
......@@ -83,11 +84,19 @@ struct Instruction {
union {
struct /* AllocTensor Operands */ {
/*! \brief The number of dimensions. */
uint32_t ndim;
/*! \brief The shape of tensor. */
int64_t* shape;
/*! \brief The datatype of tensor to be allocated. */
DLDataType dtype;
} alloc_tensor;
struct /* AllocTensorReg Operands */ {
/*! \brief The register to read the shape out of. */
RegName shape_register;
/*! \brief The datatype of tensor to be allocated. */
DLDataType dtype;
} alloc_tensor_reg;
struct /* InvokeClosure Operands */ {
/*! \brief The register containing the closure. */
RegName closure;
......@@ -192,13 +201,20 @@ struct Instruction {
static Instruction InvokePacked(Index packed_index, Index arity, Index output_size,
const std::vector<RegName>& args);
/*! \brief Construct an allocate tensor instruction.
/*! \brief Construct an allocate tensor instruction with constant shape.
* \param shape The shape of the tensor.
* \param dtype The dtype of the tensor.
* \param dst The destination register.
* \return The allocate tensor instruction.
static Instruction AllocTensor(std::vector<int64_t> shape, DLDataType dtype, RegName dst);
/*! \brief Construct an allocate tensor instruction with register.
* \param shape_register The register containing the shape.
* \param dtype The dtype of the tensor.
* \param dst The destination register.
* \return The allocate tensor instruction.
static Instruction AllocTensor(RegName shape_register, DLDataType dtype, RegName dst);
static Instruction AllocTensorReg(RegName shape_register, DLDataType dtype, RegName dst);
/*! \brief Construct an allocate datatype instruction.
* \param tag The datatype tag.
* \param num_fields The number of fields for the datatype.
......@@ -103,13 +103,6 @@ struct ConstantPool : ExprVisitor {
void AddConstantTensorShape(TensorType expr, NDArray value) {
auto it = this->const_tensor_shape_map.find(expr);
if (it == this->const_tensor_shape_map.end()) {
this->const_tensor_shape_map.insert({expr, std::make_pair(index++, value)});
void VisitExpr_(const ConstantNode* const_node) {
auto konst = GetRef<Constant>(const_node);
auto it = this->const_map.find(konst);
......@@ -117,48 +110,6 @@ struct ConstantPool : ExprVisitor {
this->const_map.insert({konst, index++});
NDArray GetTensorConstant(const TensorTypeNode* ttype) {
std::vector<int64_t> shapes;
for (auto sh : ttype->shape) {
int64_t s = shapes.size();
DLContext cpu_ctx;
cpu_ctx.device_type = kDLCPU;
cpu_ctx.device_id = 0;
auto shape_tensor = NDArray::Empty({s}, Type2TVMType(Int(64)), cpu_ctx);
int64_t* dims = static_cast<int64_t*>(shape_tensor->data);
for (size_t i = 0; i < shapes.size(); ++i) {
dims[i] = shapes[i];
return shape_tensor;
void VisitExpr_(const CallNode* call_node) {
for (auto arg : call_node->args) {
Expr op = call_node->op;
auto func_node =<FunctionNode>();
if (func_node) {
auto ret_type = call_node->checked_type();
if (const TensorTypeNode* ttype =<TensorTypeNode>()) {
auto shape = GetTensorConstant(ttype);
auto tensor_type = GetRef<TensorType>(ttype);
AddConstantTensorShape(tensor_type, shape);
} else if (const TupleTypeNode* ttype =<TupleTypeNode>()) {
for (size_t i = 0; i < ttype->fields.size(); ++i) {
auto f = ttype->fields[i];
auto f_type =<TensorTypeNode>();
auto shape = GetTensorConstant(f_type);
auto tensor_type = GetRef<TensorType>(f_type);
AddConstantTensorShape(tensor_type, shape);
std::tuple<ConstMap, ConstTensorShapeMap> LayoutConstantPool(const Module& module) {
......@@ -206,6 +157,7 @@ struct VMCompiler : ExprFunctor<void(const Expr& expr)> {
switch (instr.op) {
case Opcode::AllocDatatype:
case Opcode::AllocTensor:
case Opcode::AllocTensorReg:
case Opcode::GetField:
case Opcode::LoadConst:
case Opcode::Select:
......@@ -259,14 +211,14 @@ struct VMCompiler : ExprFunctor<void(const Expr& expr)> {
void VisitExpr_(const MatchNode* match_node) {
auto match = GetRef<Match>(match_node);
LOG(FATAL) << "translation of match nodes to the VM is "
<< "currently unsupported" << std::endl;
LOG(FATAL) << "translation of match nodes to the VM is"
<< "currently unsupported";
void VisitExpr_(const LetNode* let_node) {
DLOG(INFO) << let_node->value << std::endl;
DLOG(INFO) << let_node->value;
DLOG(INFO) << this->last_register << std::endl;
DLOG(INFO) << this->last_register;
var_register_map.insert({let_node->var, this->last_register});
......@@ -327,18 +279,13 @@ struct VMCompiler : ExprFunctor<void(const Expr& expr)> {
Instruction AllocTensorFromType(const TensorTypeNode* ttype) {
DataType dtype = ttype->dtype;
TVMType dltype = Type2TVMType(dtype);
TVMType dltype = Type2TVMType(ttype->dtype);
auto tensor_type = GetRef<TensorType>(ttype);
auto it = this->context->const_tensor_shape_map.find(tensor_type);
if (it == this->context->const_tensor_shape_map.end()) {
DLOG(INFO) << "Can not find constant shape for " << tensor_type;
} else {
Emit(Instruction::LoadConst(it->second.first, NewRegister()));
std::vector<int64_t> shape;
for (auto dim : tensor_type->shape) {
return Instruction::AllocTensor(last_register, dltype, NewRegister());
return Instruction::AllocTensor(shape, dltype, NewRegister());
void EmitInvokePrimitive(const Function& func,
......@@ -532,7 +479,7 @@ void PopulatePackedFuncMap(const std::vector<LoweredFunc>& lowered_funcs,
VMFunction CompileFunc(VMCompilerContext* context, const GlobalVar& var, const Function& func) {
DLOG(INFO) << "CompileFunc: " << var << std::endl << AsText(func, false) << std::endl;
DLOG(INFO) << "CompileFunc: " << var << std::endl << AsText(func, false);
size_t params = func->params.size();
VMCompiler compiler(context);
......@@ -67,8 +67,14 @@ Instruction::Instruction(const Instruction& instr) {
this->result = instr.result;
case Opcode::AllocTensor:
this->shape_register = instr.shape_register;
this->dtype = instr.dtype;
this->alloc_tensor.ndim = instr.alloc_tensor.ndim;
this->alloc_tensor.shape = Duplicate<int64_t>(instr.alloc_tensor.shape,
this->alloc_tensor.dtype = instr.alloc_tensor.dtype;
case Opcode::AllocTensorReg:
this->alloc_tensor_reg.shape_register = instr.alloc_tensor_reg.shape_register;
this->alloc_tensor_reg.dtype = instr.alloc_tensor_reg.dtype;
case Opcode::AllocDatatype:
this->constructor_tag = instr.constructor_tag;
......@@ -142,8 +148,14 @@ Instruction& Instruction::operator=(const Instruction& instr) {
this->result = instr.result;
return *this;
case Opcode::AllocTensor:
this->shape_register = instr.shape_register;
this->dtype = instr.dtype;
this->alloc_tensor.ndim = instr.alloc_tensor.ndim;
this->alloc_tensor.shape = Duplicate<int64_t>(instr.alloc_tensor.shape,
this->alloc_tensor.dtype = instr.alloc_tensor.dtype;
return *this;
case Opcode::AllocTensorReg:
this->alloc_tensor_reg.shape_register = instr.alloc_tensor_reg.shape_register;
this->alloc_tensor_reg.dtype = instr.alloc_tensor_reg.dtype;
return *this;
case Opcode::AllocDatatype:
this->constructor_tag = instr.constructor_tag;
......@@ -203,12 +215,15 @@ Instruction::~Instruction() {
case Opcode::Move:
case Opcode::Select:
case Opcode::Ret:
case Opcode::AllocTensor:
case Opcode::AllocTensorReg:
case Opcode::If:
case Opcode::LoadConst:
case Opcode::GetField:
case Opcode::Goto:
case Opcode::AllocTensor:
delete this->alloc_tensor.shape;
case Opcode::AllocDatatype:
delete this->datatype_fields;
......@@ -226,8 +241,7 @@ Instruction::~Instruction() {
std::ostringstream out;
LOG(FATAL) << "Invalid instruction " << static_cast<int>(this->op)
<< "\n";
LOG(FATAL) << "Invalid instruction " << static_cast<int>(this->op);
......@@ -252,12 +266,25 @@ Instruction Instruction::InvokePacked(Index packed_index, Index arity, Index out
return instr;
Instruction Instruction::AllocTensor(RegName shape_register, DLDataType dtype, Index dst) {
Instruction Instruction::AllocTensor(std::vector<int64_t> shape, DLDataType dtype, Index dst) {
Instruction instr;
instr.op = Opcode::AllocTensor;
instr.dst = dst;
instr.shape_register = shape_register;
instr.dtype = dtype;
instr.alloc_tensor.ndim = shape.size();
instr.alloc_tensor.shape = new int64_t[shape.size()];
for (size_t i = 0; i < shape.size(); ++i) {
instr.alloc_tensor.shape[i] = shape[i];
instr.alloc_tensor.dtype = dtype;
return instr;
Instruction Instruction::AllocTensorReg(RegName shape_register, DLDataType dtype, Index dst) {
Instruction instr;
instr.op = Opcode::AllocTensorReg;
instr.dst = dst;
instr.alloc_tensor_reg.shape_register = shape_register;
instr.alloc_tensor_reg.dtype = dtype;
return instr;
......@@ -381,85 +408,92 @@ void DLDatatypePrint(std::ostream& os, const DLDataType& dtype) {
os << dtype.bits;
if (dtype.lanes != 0) {
os << "[" << dtype.lanes << "]";
os << int(dtype.bits);
if (dtype.lanes != 1) {
os << "x" << dtype.lanes;
/*!
 * \brief Join a run of array elements into one delimited string.
 * \param items Pointer to the array holding the elements.
 * \param offset Index of the first element to write.
 * \param cnt Number of elements to write, starting at offset.
 * \param delim Separator written between consecutive elements.
 * \return The joined string; empty when cnt is not positive.
 */
template<typename T>
std::string StrJoin(T* items, int offset, int cnt, const std::string& delim = ",") {
  // An empty (or negative) range must not read items[offset].
  if (cnt <= 0) {
    return "";
  }
  std::ostringstream oss;
  oss << items[offset];
  for (int i = 1; i < cnt; ++i) {
    oss << delim << items[offset + i];
  }
  return oss.str();
}
void InstructionPrint(std::ostream& os, const Instruction& instr) {
switch (instr.op) {
case Opcode::Move: {
os << "move " << instr.from << " " << instr.dst;
os << "move $" << instr.dst << " $" << instr.from;
case Opcode::Ret: {
os << "ret " << instr.result;
os << "ret $" << instr.result;
case Opcode::InvokePacked: {
os << "invoke_packed ";
os << instr.packed_index;
os << " " << instr.arity;
os << "(";
for (Index i = 0; i < instr.arity; ++i) {
os << instr.packed_args[i] << ",";
os << ")";
os << " " << instr.output_size;
os << "invoke_packed PackedFunc[" << instr.packed_index << "](in: $"
<< StrJoin<RegName>(instr.packed_args, 0, instr.arity - instr.output_size, ",$")
<< ", out: $"
<< StrJoin<RegName>(instr.packed_args, instr.arity - instr.output_size,
instr.output_size, ",$")
<< ")";
case Opcode::AllocTensor: {
os << "alloc_tensor ";
os << instr.dst << " ";
os << instr.shape_register << " ";
DLDatatypePrint(os, instr.dtype);
os << "alloc_tensor $" << instr.dst << " ["
<< StrJoin<int64_t>(instr.alloc_tensor.shape, 0, instr.alloc_tensor.ndim)
<< "] ";
DLDatatypePrint(os, instr.alloc_tensor.dtype);
case Opcode::AllocTensorReg: {
os << "alloc_tensor_reg $" << instr.dst << " $"
<< instr.alloc_tensor_reg.shape_register << " ";
DLDatatypePrint(os, instr.alloc_tensor_reg.dtype);
case Opcode::AllocDatatype: {
os << "alloc_data ";
os << instr.dst << " ";
os << instr.constructor_tag << " ";
os << instr.num_fields;
os << "alloc_data $" << instr.dst << " tag(" << instr.constructor_tag << ") [$"
<< StrJoin<RegName>(instr.datatype_fields, 0, instr.num_fields, ",$") << "]";
case Opcode::AllocClosure: {
os << "alloc_closure ";
os << instr.dst << " ";
os << instr.clo_index << " ";
os << instr.num_freevar << "(";
for (Index i = 0; i < instr.num_freevar; ++i) {
os << instr.free_vars[i] << ",";
os << ")";
os << "alloc_closure $" << instr.dst << " VMFunc[" << instr.clo_index
<< "]($" << StrJoin<RegName>(instr.free_vars, 0, instr.num_freevar, ",$")
<< ")";
case Opcode::If: {
os << "if "
<< "$" << instr.if_cond << " " << instr.true_offset << " " << instr.false_offset;
os << "if " << "$" << instr.if_cond << " " << instr.true_offset << " "
<< instr.false_offset;
case Opcode::Invoke: {
os << "invoke "
<< "$" << instr.dst << " " << instr.func_index << " " << instr.num_args << "(";
for (Index i = 0; i < instr.num_args; ++i) {
os << instr.invoke_args_registers[i] << ",";
os << ")";
os << "invoke $" << instr.dst << " VMFunc[" << instr.func_index << "]($"
<< StrJoin<RegName>(instr.invoke_args_registers, 0, instr.num_args, ",$")
<< ")";
case Opcode::InvokeClosure: {
os << "invoke_closure "
<< "$" << instr.dst << " " << instr.closure << " " << instr.closure_args_num << "()";
os << "invoke_closure $" << instr.dst << " $" << instr.closure << "($"
<< StrJoin<RegName>(instr.closure_args, 0, instr.closure_args_num, ",$")
<< ")";
case Opcode::LoadConst: {
os << "load_const "
<< "$" << instr.dst << " " << instr.const_index;
os << "load_const $" << instr.dst << " Const[" << instr.const_index << "]";
case Opcode::GetField: {
os << "get_field " << instr.dst << " " << instr.object << " " << instr.field_index;
os << "get_field $" << instr.dst << " $" << instr.object << "["
<< instr.field_index << "]";
case Opcode::Goto: {
......@@ -467,8 +501,8 @@ void InstructionPrint(std::ostream& os, const Instruction& instr) {
case Opcode::Select: {
os << "select " << instr.dst << " " << instr.select_cond << " " << instr.select_op1 << " "
<< instr.select_op2;
os << "select $" << instr.dst << " $" << instr.select_cond << " $"
<< instr.select_op1 << " $" << instr.select_op2;
......@@ -513,48 +547,64 @@ Index VirtualMachine::PopFrame() {
void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector<Object>& args) {
DLOG(INFO) << "===================\nInvoking global " << << " " << args.size()
<< std::endl;
DLOG(INFO) << "Invoking global " << << " " << args.size();
PushFrame(func.params, this->pc + 1, func);
for (size_t i = 0; i < args.size(); ++i) {
WriteRegister(i, args[i]);
DLOG(INFO) << "func.params= " << func.params << std::endl;
DLOG(INFO) << "func.params= " << func.params;
code =;
pc = 0;
Object VirtualMachine::Invoke(const VMFunction& func, const std::vector<Object>& args) {
DLOG(INFO) << "Executing Function: " << std::endl << func << std::endl;
DLOG(INFO) << "Executing Function: " << std::endl << func;
InvokeGlobal(func, args);
auto alloc = MemoryManager::Global()->GetAllocator(ctxs[0]);
DLOG(INFO) << "Memory used: " << alloc->UsedMemory() << " B\n";
DLOG(INFO) << "Memory used: " << alloc->UsedMemory() << " B";
return return_register;
Object VirtualMachine::Invoke(const std::string& name, const std::vector<Object>& args) {
auto func_index = this->global_map_[name];
DLOG(INFO) << "Invoke Global " << name << " at index " << func_index << std::endl;
DLOG(INFO) << "Invoke Global " << name << " at index " << func_index;
return Invoke(this->functions[func_index], args);
void InvokePacked(const PackedFunc& func, Index arg_count, Index output_size,
const std::vector<Object>& args) {
std::vector<TVMValue> values(arg_count);
std::vector<int> codes(arg_count);
runtime::TVMArgsSetter setter(,;
size_t arity = 0;
for (Index i = 0; i < arg_count; i++) {
if (args[i].ptr_->tag == ObjectTag::kDatatype) {
arity += args[i].AsDatatype()->fields.size();
} else {
std::vector<TVMValue> values(arity);
std::vector<int> codes(arity);
runtime::TVMArgsSetter setter(,;
int idx = 0;
for (Index i = 0; i < arg_count; i++) {
if (args[i].ptr_->tag == ObjectTag::kDatatype) {
auto dt_cell = args[i].AsDatatype();
for (auto obj : dt_cell->fields) {
NDArray data = ToNDArray(obj);
setter(idx++, data);
} else {
NDArray data = ToNDArray(args[i]);
setter(i, data);
setter(idx++, data);
TVMRetValue rv;
func.CallPacked(TVMArgs(,, arg_count), &rv);
func.CallPacked(TVMArgs(,, arity), &rv);
void VirtualMachine::Init(const std::vector<TVMContext>& ctxs) { this->ctxs = ctxs; }
......@@ -574,7 +624,7 @@ void VirtualMachine::Run() {
while (true) {
auto const& instr = this->code[this->pc];
DLOG(INFO) << "\nExecuting(" << pc << "): ";
DLOG(INFO) << "Executing(" << pc << "): ";
InstructionPrint(std::cout, instr);
......@@ -669,11 +719,23 @@ void VirtualMachine::Run() {
goto main_loop;
case Opcode::AllocTensor: {
auto shape = std::vector<int64_t>(instr.alloc_tensor.ndim);
for (uint i = 0; i < instr.alloc_tensor.ndim; ++i) {
shape[i] = instr.alloc_tensor.shape[i];
auto allocator = MemoryManager::Global()->GetAllocator(ctxs[0]);
auto data = allocator->Empty(shape, instr.alloc_tensor.dtype, ctxs[0]);
auto obj = Object::Tensor(data);
WriteRegister(instr.dst, obj);
goto main_loop;
case Opcode::AllocTensorReg: {
DLContext cpu_ctx;
cpu_ctx.device_type = kDLCPU;
cpu_ctx.device_id = 0;
auto shape_tensor_obj = ReadRegister(instr.shape_register);
auto shape_tensor_obj = ReadRegister(instr.alloc_tensor_reg.shape_register);
NDArray shape_tensor = ToNDArray(shape_tensor_obj).CopyTo(cpu_ctx);
int64_t* dims = static_cast<int64_t*>(shape_tensor->data);
......@@ -681,7 +743,7 @@ void VirtualMachine::Run() {
auto shape = std::vector<int64_t>(shape_tensor->shape[0]);
shape.assign(dims, dims + num_dims);
auto allocator = MemoryManager::Global()->GetAllocator(ctxs[0]);
auto data = allocator->Empty(shape, instr.dtype, ctxs[0]);
auto data = allocator->Empty(shape, instr.alloc_tensor_reg.dtype, ctxs[0]);
auto obj = Object::Tensor(data);
WriteRegister(instr.dst, obj);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment