Commit a12c556a by Denis Khalikov, committed by Tianqi Chen

[RUNTIME][TRACE] Support trace primitive. (#1973)

Add a trace call expression to allow tracing Tensor data
at runtime. By default, a handler that prints the traced
data to stdout is used; otherwise, the user should register
a callback as a global function (via @tvm.register_func).

Related discussion:
https://discuss.tvm.ai/t/idea-trace-expression/945
parent 851d8da7
@@ -395,6 +395,17 @@ constexpr const char* tvm_stack_make_array = "tvm_stack_make_array";
constexpr const char* tvm_call_packed = "tvm_call_packed";
/*!
* \brief See pseudo code
*
* int tvm_call_trace_packed(name, TVMValue* args) {
* ModuleNode* env = GetCurrentEnv();
* const PackedFunc* f = env->GetFuncFromEnv(name);
* (*f)(args, type_code_of(args), len(args));
* return 0;
* }
*/
constexpr const char* tvm_call_trace_packed = "tvm_call_trace_packed";
/*!
* \brief See pseudo code
* Mark the content as thread local context, can get optimized
* by only call the call once at thread start.
*
@@ -423,6 +434,25 @@ constexpr const char* tvm_thread_context = "tvm_thread_context";
*/
constexpr const char* tvm_call_packed_lowered = "tvm_call_packed_lowered";
/*!
* \brief Lowered version of the trace intrinsic: the space for the values and
*  type codes is explicitly allocated. The return value is the
*  (end - 1) value on the stack.
*
* int tvm_call_trace_packed_lowered(name,
* TVMValue* value_stack,
* int* tcode_stack,
* int begin,
* int end) {
* ModuleNode* env = GetCurrentEnv();
* const PackedFunc* f = env->GetFuncFromEnv(name);
* f->CallPacked(TVMArgs(value_stack[begin:end],
* tcode_stack[begin:end]),
* TVMRetValue(value_stack + end, tcode_stack + end));
* }
*/
constexpr const char* tvm_call_trace_packed_lowered =
    "tvm_call_trace_packed_lowered";
/*!
* \brief See pseudo code
*
* int tvm_storage_sync(std::string storage_scope) {
@@ -488,6 +488,42 @@ def _rule_float_direct(op):
return call_pure_extern(op.dtype, op.name, *op.args)
return None
@_register_func("tvm.default_trace_action")
def _tvm_default_trace_action(*args):
print(list(args))
def trace(args, trace_action="tvm.default_trace_action"):
"""Trace tensor data at the runtime.
The trace function allows to trace specific tensor at the
runtime. The tracing value should come as last argument.
The trace action should be specified, by default
tvm.default_trace_action is used.
Parameters
----------
args : list of Expr or Buffers
    Positional arguments.
trace_action : str
    The name of the trace action.
Returns
-------
call : Expr
The call expression.
See Also
--------
tvm.call_packed : Creates a packed function call.
"""
if not isinstance(args, list):
    raise TypeError("tvm.trace expects args to be a list")
call_args = [_pack_buffer(x) if isinstance(x, _Buffer) else x for x in args]
call_args.insert(0, trace_action)
return _make.Call(
args[-1].dtype, "tvm_call_trace_packed", call_args, _Call.Intrinsic, None, 0)
# opencl pattern for exp
register_intrin_rule("opencl", "exp", _rule_float_direct, override=True)
# default pattern for exp
@@ -526,42 +526,81 @@ llvm::Value* CodeGenCPU::GetPackedFuncHandle(const std::string& fname) {
return phi;
}
llvm::BasicBlock *
CodeGenCPU::MakeCallPacked(const Array<Expr> &args, llvm::Value **rvalue,
                           llvm::Value **ret_tcode, const Type &r_type,
                           const int64_t begin, const int64_t end) {
  using llvm::BasicBlock;
  std::string func_name = args[0].as<StringImm>()->value;
  llvm::Value *handle = GetPackedFuncHandle(func_name);
  // call the function
  int64_t nargs = end - begin;
  CHECK_GE(nargs, 0);
  llvm::Value *stack_value = MakeValue(args[1]);
  llvm::Value *stack_tcode = MakeValue(args[2]);
  llvm::Value *arg_value = builder_->CreateInBoundsGEP(
      builder_->CreatePointerCast(stack_value, t_tvm_value_->getPointerTo()),
      ConstInt32(begin));
  llvm::Value *arg_tcode =
      CreateBufferPtr(Int(32), stack_tcode, ConstInt32(begin));
  llvm::Value *ret_value = builder_->CreateInBoundsGEP(
      builder_->CreatePointerCast(stack_value, t_tvm_value_->getPointerTo()),
      ConstInt32(end));
  *ret_tcode = CreateBufferPtr(Int(32), stack_tcode, ConstInt32(end));
  BasicBlock *end_block = CheckCallSuccess(builder_->CreateCall(
      RuntimeTVMFuncCall(), {handle, arg_value, arg_tcode, ConstInt32(nargs),
                             ret_value, *ret_tcode}));
  Type r_api_type = ir::APIType(r_type);
  *rvalue = builder_->CreateAlignedLoad(
      builder_->CreatePointerCast(ret_value,
                                  LLVMType(r_api_type)->getPointerTo()),
      8);
  *rvalue = CreateCast(r_api_type, r_type, *rvalue);
  return end_block;
}
llvm::Value *CodeGenCPU::CreateCallPacked(const Call *op) {
CHECK_EQ(op->args.size(), 5U);
llvm::Value *rvalue = nullptr;
llvm::Value *ret_tcode = nullptr;
MakeCallPacked(op->args, &rvalue, &ret_tcode, op->type,
op->args[3].as<IntImm>()->value,
op->args[4].as<IntImm>()->value);
return rvalue;
}
llvm::Value *CodeGenCPU::CreateCallTracePacked(const Call *op) {
using llvm::BasicBlock;
CHECK_EQ(op->args.size(), 6U);
llvm::Value *rvalue = nullptr;
llvm::Value *ret_tcode = nullptr;
BasicBlock *end_block = MakeCallPacked(
op->args, &rvalue, &ret_tcode, op->type, op->args[3].as<IntImm>()->value,
op->args[4].as<IntImm>()->value);
// Get traced value.
llvm::Value *traced_value = MakeValue(op->args[5]);
// The update_block handles the case when we need to update the return value.
BasicBlock *update_block =
BasicBlock::Create(*ctx_, "update_block", function_);
// The continue_block handles the case when we return the original
// traced value.
BasicBlock *continue_block =
BasicBlock::Create(*ctx_, "continue_block", function_);
llvm::Value *ret_tcode_value = builder_->CreateAlignedLoad(ret_tcode, 8);
// Compare ret_tcode against kNull: a non-null type code means the trace
// callback returned a replacement value.
llvm::Value *cmp = builder_->CreateICmpNE(
ret_tcode_value, llvm::ConstantInt::get(t_int_, kNull));
builder_->CreateCondBr(cmp, update_block, continue_block);
builder_->SetInsertPoint(update_block);
builder_->CreateBr(continue_block);
builder_->SetInsertPoint(continue_block);
// The return value depends on which basic block we came from.
llvm::PHINode *phi_rvalue = builder_->CreatePHI(traced_value->getType(), 2);
phi_rvalue->addIncoming(rvalue, update_block);
phi_rvalue->addIncoming(traced_value, end_block);
return phi_rvalue;
}
llvm::Value* CodeGenCPU::RuntimeTVMFuncCall() {
if (f_tvm_func_call_ != nullptr) return f_tvm_func_call_;
return GetContextPtr(gv_tvm_func_call_);
@@ -608,6 +647,8 @@ void CodeGenCPU::AddStartupFunction() {
llvm::Value* CodeGenCPU::CreateIntrinsic(const Call* op) {
if (op->is_intrinsic(intrinsic::tvm_call_packed_lowered)) {
return CreateCallPacked(op);
} else if (op->is_intrinsic(intrinsic::tvm_call_trace_packed_lowered)) {
return CreateCallTracePacked(op);
} else if (op->is_intrinsic(intrinsic::tvm_static_handle)) {
return CreateStaticHandle();
} else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) {
@@ -79,8 +79,15 @@ class CodeGenCPU : public CodeGenLLVM {
void UnpackClosureData(llvm::Value*cdata,
const Array<Var>& fields,
std::unordered_map<const Variable*, llvm::Value*>* vmap);
// Make packed call.
llvm::BasicBlock *MakeCallPacked(const Array<Expr> &args,
llvm::Value **rvalue,
llvm::Value **ret_tcode, const Type &r_type,
const int64_t begin, const int64_t end);
// Create a call into a tvm packed function.
llvm::Value* CreateCallPacked(const Call* op);
// Create a trace call into a tvm packed function.
llvm::Value* CreateCallTracePacked(const Call *op);
// Create static initialization
void CreateStaticInit(const std::string& init_fname, const Stmt& body);
// Create parallel launch
@@ -54,7 +54,6 @@ class BuiltinLower : public IRMutator {
stmt = IRMutator::Mutate(stmt);
CHECK_EQ(run_shape_stack_, 0);
CHECK_EQ(run_array_stack_, 0);
CHECK_EQ(run_arg_stack_, 0);
while (prep_seq_.size() != 0) {
stmt = Block::make(prep_seq_.back(), stmt);
prep_seq_.pop_back();
@@ -140,6 +139,8 @@ class BuiltinLower : public IRMutator {
Expr Mutate_(const Call* op, const Expr &e) final {
if (op->is_intrinsic(intrinsic::tvm_call_packed)) {
return MakeCallPacked(op, e);
} else if (op->is_intrinsic(intrinsic::tvm_call_trace_packed)) {
return MakeCallTracePacked(op, e);
} else if (op->is_intrinsic(intrinsic::tvm_stack_make_shape)) {
return MakeShape(op, e);
} else if (op->is_intrinsic(intrinsic::tvm_stack_make_array)) {
@@ -256,6 +257,56 @@ class BuiltinLower : public IRMutator {
packed_args, Call::Intrinsic);
}
Expr MakeCallTracePacked(const Call *op, const Expr &e) {
size_t restore_shape_stack = run_shape_stack_;
size_t restore_array_stack = run_array_stack_;
size_t arg_stack_begin = run_arg_stack_;
run_arg_stack_ += op->args.size();
size_t args_size = op->args.size();
CHECK_GT(args_size, 0);
Expr expr = IRMutator::Mutate_(op, e);
op = expr.as<Call>();
for (size_t i = 1; i < op->args.size(); ++i) {
Expr stack_index = ConstInt32(arg_stack_begin + i - 1);
Expr arg = op->args[i];
Type t = arg.type();
Type api_type = APIType(t);
if (t != api_type) {
arg = Cast::make(api_type, arg);
}
prep_seq_.emplace_back(TVMStructSet(
stack_value_, static_cast<int>(arg_stack_begin + i - 1),
intrinsic::kTVMValueContent, arg));
int arg_tcode = api_type.code();
CHECK(!IsArrayHandle(arg)) << "Trace does not support Buffers";
prep_seq_.emplace_back(
Store::make(stack_tcode_,
ConstInt32(arg_tcode),
stack_index, const_true(1)));
}
// Update the maximum sizes of the stacks.
max_arg_stack_ = std::max(run_arg_stack_, max_arg_stack_);
max_shape_stack_ = std::max(run_shape_stack_, max_shape_stack_);
max_array_stack_ = std::max(run_array_stack_, max_array_stack_);
run_shape_stack_ = restore_shape_stack;
run_array_stack_ = restore_array_stack;
// Keep this call's argument slots reserved, so that the arguments of
// several packed calls within one expression can share the same stack.
run_arg_stack_ = arg_stack_begin + args_size - 1;
Array<Expr> packed_args = {
op->args[0],
stack_value_,
stack_tcode_,
ConstInt32(arg_stack_begin),
ConstInt32(arg_stack_begin + op->args.size() - 1),
// Pass traced value.
op->args[args_size - 1]
};
return Call::make(
op->type, intrinsic::tvm_call_trace_packed_lowered,
packed_args, Call::Intrinsic);
}
private:
bool IsArrayHandle(const Expr& arg) {
// specially set array handle.
@@ -80,6 +80,177 @@ def test_ctx():
x = tvm._api_internal._context_test(x, x.device_type, x.device_id)
assert x == tvm.opencl(10)
def test_trace_default_action():
n = 2
x = tvm.placeholder((n,n,n), name="X", dtype="float32")
y = tvm.compute(x.shape, lambda i, j, k: tvm.trace([i, j, k, x[i][j][k]]))
s = tvm.create_schedule(y.op)
f = tvm.build(s, [x, y], target="llvm")
xnd = tvm.nd.array(np.ones((n,n,n), dtype=x.dtype))
ynd = tvm.nd.array(np.zeros((n,n,n), dtype=y.dtype))
f(xnd, ynd)
def test_trace_expr_assign():
@tvm.register_func("tvm.trace_callback2")
def trace_buffer(x):
return
def check_assign(dtype):
n = 4
x = tvm.placeholder((n,n,n), name="X", dtype=dtype)
y = tvm.compute(x.shape, lambda i, j, k: tvm.trace([x[i][j][k]], "tvm.trace_callback2"))
z = tvm.compute(x.shape, lambda i, j, k: tvm.trace([y[i][j][k]], "tvm.trace_callback2"))
s = tvm.create_schedule(z.op)
f = tvm.build(s, [x, y, z], "llvm")
xnd = tvm.nd.array(np.ones((n,n,n), dtype=x.dtype))
ynd = tvm.nd.array(np.zeros((n,n,n), dtype=y.dtype))
znd = tvm.nd.array(np.zeros((n,n,n), dtype=z.dtype))
f(xnd, ynd, znd)
assert(np.array_equal(xnd.asnumpy(), np.ones((n,n,n))))
assert(np.array_equal(ynd.asnumpy(), np.ones((n,n,n))))
assert(np.array_equal(znd.asnumpy(), np.ones((n,n,n))))
for t in ["float64", "float32", "int64", "int32"]:
check_assign(t)
def test_trace_expr_sum_generated():
@tvm.register_func("tvm.trace_callback3")
def trace_buffer(x):
return
def check_expr_sum(dtype):
n = 4
a = tvm.placeholder((n,n,n), name="a", dtype=dtype)
b = tvm.placeholder((n,n,n), name="b", dtype=dtype)
c = tvm.compute(a.shape, lambda i, j, k: tvm.trace([a[i][j][k]], "tvm.trace_callback3")
+ tvm.trace([b[i][j][k]], "tvm.trace_callback3"))
s = tvm.create_schedule(c.op)
f = tvm.build(s, [a, b, c])
xnd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=a.dtype)))
ynd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=b.dtype)))
znd = tvm.nd.array(np.zeros((n,n,n), dtype=c.dtype))
f(xnd, ynd, znd)
assert(np.array_equal(znd.asnumpy(), xnd.asnumpy() + ynd.asnumpy()))
for t in ["float64", "float32", "int64", "int32"]:
check_expr_sum(t)
def test_trace_expr_sum_args():
@tvm.register_func("tvm.trace_silent")
def silent(*args):
return
def check_expr_sum(dtype):
n = 4
a = tvm.placeholder((n,n,n), name="a", dtype=dtype)
b = tvm.placeholder((n,n,n), name="b", dtype=dtype)
e = tvm.placeholder((n,n,n), name="e", dtype=dtype)
d = tvm.placeholder((n,n,n), name="d", dtype=dtype)
c = tvm.compute(a.shape, lambda i, j, k: tvm.trace([i, j, k, a[i][j][k]], "tvm.trace_silent")
+ tvm.trace([i, j, k, b[i][j][k]], "tvm.trace_silent")
+ tvm.trace([i, j, k, d[i][j][k]], "tvm.trace_silent")
+ tvm.trace([i, j, k, e[i][j][k]], "tvm.trace_silent"))
s = tvm.create_schedule(c.op)
f = tvm.build(s, [a, b, d, e, c])
a_nd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=a.dtype)))
b_nd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=b.dtype)))
d_nd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=d.dtype)))
e_nd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=e.dtype)))
c_nd = tvm.nd.array(np.zeros((n,n,n), dtype=c.dtype))
f(a_nd, b_nd, d_nd, e_nd, c_nd)
assert(np.array_equal(c_nd.asnumpy(), a_nd.asnumpy()
+ b_nd.asnumpy()
+ d_nd.asnumpy()
+ e_nd.asnumpy()))
for t in ["float64", "float32", "int64", "int32"]:
check_expr_sum(t)
def test_trace_expr_sum_custom():
@tvm.register_func("tvm.trace_callback4")
def trace_buffer(x):
return
def check_expr_sum_custom(dtype):
n = 4
a = tvm.placeholder((n,n), name="a", dtype=dtype)
b = tvm.placeholder((n,n), name="b", dtype=dtype)
c = tvm.compute(a.shape, lambda i, j: tvm.trace([a[i][j]], "tvm.trace_callback4")
+ tvm.trace([b[i][j]], "tvm.trace_callback4"))
s = tvm.create_schedule(c.op)
f = tvm.build(s, [a, b, c])
npa = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=a.dtype)
npb = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=b.dtype)
xnd = tvm.nd.array(npa)
ynd = tvm.nd.array(npb)
znd = tvm.nd.array(np.zeros((n,n), dtype=c.dtype))
f(xnd, ynd, znd)
assert(np.array_equal(znd.asnumpy(), npa + npb))
for t in ["float64", "float32", "int64", "int32"]:
check_expr_sum_custom(t)
def test_trace_can_change_traced_value_int():
@tvm.register_func("tvm.trace_change_int_first")
def trace_buffer(x):
return 13
@tvm.register_func("tvm.trace_change_int_second")
def trace_buffer(x):
return 14
def check_assign(dtype):
n = 4
x = tvm.placeholder((n,), name="X", dtype=dtype)
y = tvm.compute(x.shape, lambda i: tvm.trace([x[i]], "tvm.trace_change_int_first"))
z = tvm.compute(x.shape, lambda i: tvm.trace([y[i]], "tvm.trace_change_int_second"))
s = tvm.create_schedule(z.op)
f = tvm.build(s, [x, y, z], "llvm")
xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype))
ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype))
znd = tvm.nd.array(np.zeros((n,), dtype=z.dtype))
f(xnd, ynd, znd)
check_array_first = np.array([13, 13, 13, 13])
check_array_second = np.array([14, 14, 14, 14])
assert(np.array_equal(ynd.asnumpy(), check_array_first))
assert(np.array_equal(znd.asnumpy(), check_array_second))
for t in ["int64", "int32"]:
check_assign(t)
def test_trace_can_change_traced_value_float():
@tvm.register_func("tvm.trace_change_float_first")
def trace_buffer(x):
return 13.0
@tvm.register_func("tvm.trace_change_float_second")
def trace_buffer(x):
return 14.0
def check_assign(dtype):
n = 4
x = tvm.placeholder((n,), name="X", dtype=dtype)
y = tvm.compute(x.shape, lambda i: tvm.trace([x[i]], "tvm.trace_change_float_first"))
z = tvm.compute(x.shape, lambda i: tvm.trace([y[i]], "tvm.trace_change_float_second"))
s = tvm.create_schedule(z.op)
f = tvm.build(s, [x, y, z], "llvm")
xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype))
ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype))
znd = tvm.nd.array(np.zeros((n,), dtype=z.dtype))
f(xnd, ynd, znd)
check_array_first = np.array([13.0, 13.0, 13.0, 13.0])
check_array_second = np.array([14.0, 14.0, 14.0, 14.0])
assert(np.array_equal(ynd.asnumpy(), check_array_first))
assert(np.array_equal(znd.asnumpy(), check_array_second))
for t in ["float64", "float32"]:
check_assign(t)
if __name__ == "__main__":
test_empty_array()
test_get_global()
@@ -88,3 +259,11 @@ if __name__ == "__main__":
test_return_func()
test_byte_array()
test_ctx()
test_trace_expr_assign()
test_trace_expr_sum_generated()
test_trace_expr_sum_custom()
test_trace_expr_sum_args()
test_trace_default_action()
test_trace_can_change_traced_value_int()
test_trace_can_change_traced_value_float()