Unverified Commit 05b0f7e0 by Wei Pan Committed by GitHub

[CodeGen][CUDA] Vectorization for intrinsics (#5101)

- This allows to emit vectorized loads/stores
  for CUDA math intrinsics.

- A few intrinsics should be lowered as CUDAMath not CUDAFastMath ones.

- Fixed the code block identation.
parent a5d7bdab
...@@ -257,6 +257,29 @@ class CodeGenC : ...@@ -257,6 +257,29 @@ class CodeGenC :
/*! \brief the data type of allocated buffers */ /*! \brief the data type of allocated buffers */
std::unordered_map<const VarNode*, DataType> handle_data_type_; std::unordered_map<const VarNode*, DataType> handle_data_type_;
/*!
* \brief A RAII utility class for emitting code in a scoped region.
*/
class EnterScopeRAII {
// The codegen context.
CodeGenC* cg;
// The new scope level.
int scope;
public:
explicit EnterScopeRAII(CodeGenC* cg) : cg(cg) {
cg->PrintIndent();
cg->stream << "{\n";
scope = cg->BeginScope();
}
~EnterScopeRAII() {
cg->EndScope(scope);
cg->PrintIndent();
cg->stream << "}\n";
}
};
private: private:
/*! \brief whether to print in SSA form */ /*! \brief whether to print in SSA form */
bool print_ssa_form_{false}; bool print_ssa_form_{false};
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <tvm/runtime/registry.h> #include <tvm/runtime/registry.h>
#include <cmath> #include <cmath>
#include <utility>
#include <vector> #include <vector>
#include <string> #include <string>
#include "literal/cuda_half_t.h" #include "literal/cuda_half_t.h"
...@@ -235,25 +236,19 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) ...@@ -235,25 +236,19 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*)
void CodeGenCUDA::PrintVecBinaryOp( void CodeGenCUDA::PrintVecBinaryOp(
const std::string& op, DataType t, const std::string& op, DataType t,
PrimExpr lhs, PrimExpr rhs, std::ostream& os) { // NOLINT(*) PrimExpr lhs, PrimExpr rhs, std::ostream& os) { // NOLINT(*)
// unpacking operations. // Delcare the result.
int lanes = t.lanes();
{
// The assignment below introduces side-effect, and the resulting value cannot
// be reused across multiple expression, thus a new scope is needed
int vec_scope = BeginScope();
// default: unpack into individual ops.
std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.dtype());
std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.dtype());
std::string sret = GetUniqueName("_"); std::string sret = GetUniqueName("_");
{
// delcare type.
this->PrintIndent(); this->PrintIndent();
this->PrintType(t, stream); this->PrintType(t, stream);
stream << ' ' << sret << ";\n"; stream << ' ' << sret << ";\n";
} {
for (int i = 0; i < lanes; ++i) { EnterScopeRAII scope(this);
// Unpack into individual ops.
std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.dtype());
std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.dtype());
for (int i = 0, lanes = t.lanes(); i < lanes; ++i) {
std::ostringstream value_temp; std::ostringstream value_temp;
if (isalpha(op[0])) { if (isalpha(op[0])) {
value_temp << op << "("; value_temp << op << "(";
...@@ -270,9 +265,8 @@ void CodeGenCUDA::PrintVecBinaryOp( ...@@ -270,9 +265,8 @@ void CodeGenCUDA::PrintVecBinaryOp(
} }
PrintVecElemStore(sret, t, i, value_temp.str()); PrintVecElemStore(sret, t, i, value_temp.str());
} }
os << sret;
EndScope(vec_scope);
} }
os << sret;
} }
void CodeGenCUDA::PrintVecElemLoad( void CodeGenCUDA::PrintVecElemLoad(
...@@ -418,6 +412,54 @@ void CodeGenCUDA::VisitExpr_(const CallNode *op, std::ostream& os) { ...@@ -418,6 +412,54 @@ void CodeGenCUDA::VisitExpr_(const CallNode *op, std::ostream& os) {
this->PrintExpr(op->args[i * 2 + 1], os); this->PrintExpr(op->args[i * 2 + 1], os);
os << "]" << ((i < 3) ? ", ": ")"); os << "]" << ((i < 3) ? ", ": ")");
} }
} else if (op->call_type == CallNode::PureExtern && op->dtype.is_vector()) {
//
// Emit an unsupported vector call
//
// v = intrin_f((float4*)A[0], (float4*)B[0])
//
// as
//
// float4 __ret;
// {
// float4 __arg0 = ((float4*)A)[0];
// float4 __arg1 = ((float4*)B)[0];
// __ret.x = intrin_f(__arg0.x, __arg1.x);
// __ret.y = intrin_f(__arg0.y, __arg1.y);
// __ret.z = intrin_f(__arg0.z, __arg1.z);
// __ret.w = intrin_f(__arg0.w, __arg1.w);
// }
// v = __ret;
//
// Declare the result vector.
std::string sret = GetUniqueName("_");
this->PrintIndent();
this->PrintType(op->dtype, stream);
stream << ' ' << sret << ";\n";
{
EnterScopeRAII scope(this);
// Load arguments.
std::vector<std::string> sargs;
for (size_t i = 0; i < op->args.size(); ++i) {
std::string val = SSAGetID(PrintExpr(op->args[i]), op->args[i].dtype());
sargs.push_back(std::move(val));
}
// Emit a scalar call for each lane.
for (int i = 0; i < op->dtype.lanes(); ++i) {
std::ostringstream scall;
scall << op->name << "(";
for (size_t j = 0; j < op->args.size(); ++j) {
if (j > 0)
scall << ", ";
PrintVecElemLoad(sargs[j], op->args[j].dtype(), i, scall);
}
scall << ")";
PrintVecElemStore(sret, op->dtype, i, scall.str());
}
}
os << sret;
} else { } else {
CodeGenC::VisitExpr_(op, os); CodeGenC::VisitExpr_(op, os);
} }
...@@ -580,19 +622,19 @@ void CodeGenCUDA::VisitExpr_(const SelectNode* op, std::ostream &os) { ...@@ -580,19 +622,19 @@ void CodeGenCUDA::VisitExpr_(const SelectNode* op, std::ostream &os) {
op->true_value->dtype == op->dtype && op->true_value->dtype == op->dtype &&
op->dtype.lanes() == op->condition.dtype().lanes()); op->dtype.lanes() == op->condition.dtype().lanes());
int lanes = op->dtype.lanes();
int scope = BeginScope();
std::string c_var = SSAGetID(PrintExpr(op->condition), op->dtype);
std::string t_var = SSAGetID(PrintExpr(op->true_value), op->dtype);
std::string f_var = SSAGetID(PrintExpr(op->false_value), op->dtype);
std::string r_var = GetUniqueName("_"); std::string r_var = GetUniqueName("_");
this->PrintIndent(); this->PrintIndent();
this->PrintType(op->dtype, stream); this->PrintType(op->dtype, stream);
stream << ' ' << r_var << ";\n"; stream << ' ' << r_var << ";\n";
{
EnterScopeRAII scope(this);
std::string c_var = SSAGetID(PrintExpr(op->condition), op->dtype);
std::string t_var = SSAGetID(PrintExpr(op->true_value), op->dtype);
std::string f_var = SSAGetID(PrintExpr(op->false_value), op->dtype);
// The condition is stored as an ushort vector. // The condition is stored as an ushort vector.
int lanes = op->dtype.lanes();
DataType memory_ty(DataType::TypeCode::kUInt, 16, lanes); DataType memory_ty(DataType::TypeCode::kUInt, 16, lanes);
for (int i = 0; i < lanes; ++i) { for (int i = 0; i < lanes; ++i) {
...@@ -606,8 +648,8 @@ void CodeGenCUDA::VisitExpr_(const SelectNode* op, std::ostream &os) { ...@@ -606,8 +648,8 @@ void CodeGenCUDA::VisitExpr_(const SelectNode* op, std::ostream &os) {
item << ')'; item << ')';
PrintVecElemStore(r_var, op->dtype, i, item.str()); PrintVecElemStore(r_var, op->dtype, i, item.str());
} }
}
os << r_var; os << r_var;
EndScope(scope);
} }
inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p) { // NOLINT(*) inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p) { // NOLINT(*)
......
...@@ -29,7 +29,6 @@ namespace intrin { ...@@ -29,7 +29,6 @@ namespace intrin {
// Add float suffix to the intrinsics, CUDA fast math. // Add float suffix to the intrinsics, CUDA fast math.
struct CUDAMath { struct CUDAMath {
std::string operator()(DataType t, std::string name) const { std::string operator()(DataType t, std::string name) const {
if (t.lanes() == 1) {
if (t.is_float()) { if (t.is_float()) {
switch (t.bits()) { switch (t.bits()) {
case 64: return name; case 64: return name;
...@@ -38,14 +37,13 @@ struct CUDAMath { ...@@ -38,14 +37,13 @@ struct CUDAMath {
default: return ""; default: return "";
} }
} }
}
return ""; return "";
} }
}; };
struct CUDAFastMath : public CUDAMath { struct CUDAFastMath : public CUDAMath {
std::string operator()(DataType t, std::string name) const { std::string operator()(DataType t, std::string name) const {
if (t.lanes() == 1 && t.is_float() && t.bits() == 32) { if (t.is_float() && t.bits() == 32) {
return "__" + name + 'f'; return "__" + name + 'f';
} else { } else {
return CUDAMath::operator()(t, name); return CUDAMath::operator()(t, name);
...@@ -56,7 +54,7 @@ struct CUDAFastMath : public CUDAMath { ...@@ -56,7 +54,7 @@ struct CUDAFastMath : public CUDAMath {
struct CUDAFastMathTan : public CUDAMath { struct CUDAFastMathTan : public CUDAMath {
std::string operator()(DataType t, std::string name) const { std::string operator()(DataType t, std::string name) const {
if (t.lanes() == 1 && t.is_float()) { if (t.is_float()) {
switch (t.bits()) { switch (t.bits()) {
case 64: return name; case 64: return name;
// `__tanf` seems to produce some values too deviant from numpy tan version. // `__tanf` seems to produce some values too deviant from numpy tan version.
...@@ -72,7 +70,7 @@ struct CUDAFastMathTan : public CUDAMath { ...@@ -72,7 +70,7 @@ struct CUDAFastMathTan : public CUDAMath {
struct CUDAPopcount { struct CUDAPopcount {
std::string operator()(DataType t, std::string name) const { std::string operator()(DataType t, std::string name) const {
if (t.lanes() == 1 && t.is_uint()) { if (t.is_uint()) {
switch (t.bits()) { switch (t.bits()) {
case 32: return "__popc"; case 32: return "__popc";
case 64: return "__popcll"; case 64: return "__popcll";
...@@ -108,7 +106,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp") ...@@ -108,7 +106,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp")
.set_body(DispatchExtern<CUDAFastMath>); .set_body(DispatchExtern<CUDAFastMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp2") TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp2")
.set_body(DispatchExtern<CUDAFastMath>); .set_body(DispatchExtern<CUDAMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp10") TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp10")
.set_body(DispatchExtern<CUDAFastMath>); .set_body(DispatchExtern<CUDAFastMath>);
...@@ -132,13 +130,13 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cos") ...@@ -132,13 +130,13 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cos")
.set_body(DispatchExtern<CUDAFastMath>); .set_body(DispatchExtern<CUDAFastMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cosh") TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cosh")
.set_body(DispatchExtern<CUDAFastMath>); .set_body(DispatchExtern<CUDAMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sin") TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sin")
.set_body(DispatchExtern<CUDAFastMath>); .set_body(DispatchExtern<CUDAFastMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sinh") TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sinh")
.set_body(DispatchExtern<CUDAFastMath>); .set_body(DispatchExtern<CUDAMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.atan") TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.atan")
.set_body(DispatchExtern<CUDAMath>); .set_body(DispatchExtern<CUDAMath>);
......
...@@ -348,6 +348,125 @@ def test_cuda_floordiv_with_vectorization(): ...@@ -348,6 +348,125 @@ def test_cuda_floordiv_with_vectorization():
func(a_nd, b_nd) func(a_nd, b_nd)
tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3) tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3)
def sched(B):
s = te.create_schedule(B.op)
io, ii = s[B].split(s[B].op.axis[0], nparts=1)
iio, iii = s[B].split(ii, nparts=32)
_, iiii = s[B].split(iii, factor=4)
s[B].vectorize(iiii)
s[B].bind(io, bx)
s[B].bind(iio, tx)
return s
def test_vectorized_intrin1():
test_funcs = [
(tvm.tir.floor, lambda x : np.floor(x)),
(tvm.tir.ceil, lambda x : np.ceil(x)),
(tvm.tir.trunc, lambda x : np.trunc(x)),
(tvm.tir.abs, lambda x : np.fabs(x)),
(tvm.tir.round, lambda x : np.round(x)),
(tvm.tir.exp, lambda x : np.exp(x)),
(tvm.tir.exp2, lambda x : np.exp2(x)),
(tvm.tir.exp10, lambda x : np.power(10,x)),
(tvm.tir.log, lambda x : np.log(x)),
(tvm.tir.log2, lambda x : np.log2(x)),
(tvm.tir.log10, lambda x : np.log10(x)),
(tvm.tir.tan, lambda x : np.tan(x)),
(tvm.tir.cos, lambda x : np.cos(x)),
(tvm.tir.cosh, lambda x : np.cosh(x)),
(tvm.tir.sin, lambda x : np.sin(x)),
(tvm.tir.sinh, lambda x : np.sinh(x)),
(tvm.tir.atan, lambda x : np.arctan(x)),
(tvm.tir.tanh, lambda x : np.tanh(x)),
(tvm.tir.sqrt, lambda x : np.sqrt(x)),
]
def run_test(tvm_intrin, np_func, dtype):
if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
print("skip because cuda is not enabled..")
return
if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
print("Skip because gpu does not have fp16 support")
return
# set of intrinsics does not support fp16 yet.
skip_set = {tvm.tir.abs,
tvm.tir.round,
tvm.tir.tan,
tvm.tir.atan,
tvm.tir.tanh,
tvm.tir.cosh,
tvm.tir.sinh}
if dtype == "float16" and tvm_intrin in skip_set:
print("Skip because '{0}' does not support fp16 yet".format(tvm_intrin.__name__))
return
n = 128
A = te.placeholder((n,), dtype=dtype, name='A')
B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name='B')
s = sched(B)
f = tvm.build(s, [A, B], "cuda")
ctx = tvm.gpu(0)
a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx)
f(a, b)
tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-3, rtol=1e-3)
for func in test_funcs:
run_test(*func, "float32")
run_test(*func, "float16")
def test_vectorized_intrin2(dtype="float32"):
c2 = tvm.tir.const(2, dtype=dtype)
test_funcs = [
(tvm.tir.power, lambda x : np.power(x, 2.0)),
(tvm.tir.fmod, lambda x : np.fmod(x, 2.0))
]
def run_test(tvm_intrin, np_func):
if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
print("skip because cuda is not enabled..")
return
n = 128
A = te.placeholder((n,), dtype=dtype, name='A')
B = te.compute((n,), lambda i: tvm_intrin(A[i], c2), name='B')
s = sched(B)
f = tvm.build(s, [A, B], "cuda")
ctx = tvm.gpu(0)
a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx)
f(a, b)
tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-3, rtol=1e-3)
for func in test_funcs:
run_test(*func)
def test_vectorized_popcount():
def ref_popcount(x):
cnt = 0
while x:
x -= x & -x
cnt += 1
return cnt
def run_test(dtype):
if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
print("skip because cuda is not enabled..")
return
n = 128
A = te.placeholder((n,), dtype=dtype, name='A')
B = te.compute((n,), lambda i: tvm.tir.popcount(A[i]), name='B')
s = sched(B)
f = tvm.build(s, [A, B], "cuda")
ctx = tvm.gpu(0)
a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), ctx)
f(a, b)
ref = np.vectorize(ref_popcount)(a.asnumpy())
tvm.testing.assert_allclose(b.asnumpy(), ref)
run_test("uint32")
run_test("uint64")
if __name__ == "__main__": if __name__ == "__main__":
test_cuda_vectorize_add() test_cuda_vectorize_add()
test_cuda_multiply_add() test_cuda_multiply_add()
...@@ -360,3 +479,6 @@ if __name__ == "__main__": ...@@ -360,3 +479,6 @@ if __name__ == "__main__":
test_cuda_const_float_to_half() test_cuda_const_float_to_half()
test_cuda_reduction() test_cuda_reduction()
test_cuda_floordiv_with_vectorization() test_cuda_floordiv_with_vectorization()
test_vectorized_intrin1()
test_vectorized_intrin2()
test_vectorized_popcount()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment