/*! * Copyright (c) 2017 by Contributors * \file codegen_cuda.cc */ #include <tvm/base.h> #include <tvm/runtime/config.h> #include <tvm/runtime/registry.h> #include <tvm/packed_func_ext.h> #include <vector> #include <string> #include "./codegen_cuda.h" #include "../arithmetic/compute_expr.h" namespace tvm { namespace codegen { CodeGenCUDA::CodeGenCUDA() { restrict_keyword_ = "__restrict__"; } void CodeGenCUDA::Init(bool output_ssa) { CodeGenC::Init(output_ssa); vid_global_barrier_state_ = GetUniqueName(runtime::symbol::tvm_global_barrier_state); vid_global_barrier_expect_ = GetUniqueName("__barrier_expect"); CHECK_EQ(vid_global_barrier_state_, runtime::symbol::tvm_global_barrier_state); } void CodeGenCUDA::AddFunction(LoweredFunc f) { this->stream << "extern \"C\" __global__ "; CodeGenC::AddFunction(f); } void CodeGenCUDA::VisitStmt_(const ir::For* op) { CHECK(is_zero(op->min)); if (op->for_type == ir::ForType::Unrolled) { PrintIndent(); stream << "#pragma unroll\n"; } CodeGenC::VisitStmt_(op); } void CodeGenCUDA::BindThreadIndex(const IterVar& iv) { CHECK(!var_idmap_.count(iv->var.get())); var_idmap_[iv->var.get()] = CastFromTo(iv->thread_tag, UInt(32), iv->var.type()); } void CodeGenCUDA::PrintType(Type t, std::ostream& os) const { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { CHECK_EQ(lanes, 1) << "do not yet support vector types"; os << "void*"; return; } bool fail = false; if (t.is_float()) { switch (t.bits()) { case 16: os << "half"; break; case 32: os << "float"; break; case 64: os << "double"; break; default: fail = true; break; } if (!fail && lanes == 1) return; if (!fail && (lanes >= 2 && lanes <= 4)) { os << lanes; return; } } else if (t.is_uint() || t.is_int()) { if (t.is_uint()) { os << 'u'; } if (t.bits() == 8 && t.lanes() == 4) { // directly 4 8 bit int in integer. os << "int"; return; } switch (t.bits()) { case 8: os << "char"; break; case 16: os << "short"; break; case 32: os << "int"; break; case 64: { if (lanes != 1 && sizeof(long) == 64) { // NOLINT(*) os << "long"; break; } else { os << "int64_t"; break; } } case 1: os << "int"; break; default: fail = true; break; } if (!fail && lanes == 1) return; if (!fail && (lanes >= 2 && lanes <= 4)) { os << lanes; return; } } LOG(FATAL) << "Cannot convert type " << t << " to CUDA type"; } void CodeGenCUDA::PrintVecBinaryOp( const std::string&op, Type t, Expr lhs, Expr rhs, std::ostream& os) { // NOLINT(*) // unpacking operations. int lanes = t.lanes(); { // default: unpack into individual ops. std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.type()); std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.type()); std::string sret = GetUniqueName("_"); { // delcare type. this->PrintIndent(); this->PrintType(t, stream); stream << ' ' << sret << ";\n"; } for (int i = 0; i < lanes; ++i) { std::ostringstream value_temp; if (isalpha(op[0])) { value_temp << op << "("; PrintVecElemLoad(vlhs, lhs.type(), i, value_temp); value_temp << ", "; PrintVecElemLoad(vrhs, rhs.type(), i, value_temp); value_temp << ")"; } else { value_temp << "("; PrintVecElemLoad(vlhs, lhs.type(), i, value_temp); value_temp << op; PrintVecElemLoad(vrhs, rhs.type(), i, value_temp); value_temp << ")"; } PrintVecElemStore(sret, t, i, value_temp.str()); } os << sret; } } void CodeGenCUDA::PrintVecElemLoad( const std::string& vec, Type t, int i, std::ostream& os) { // NOLINT(*) const char access[] = {'x', 'y', 'z', 'w'}; CHECK(i >= 0 && i < 4); os << vec << "." << access[i]; } void CodeGenCUDA::PrintVecElemStore( const std::string& vec, Type t, int i, const std::string& value) { this->PrintIndent(); const char access[] = {'x', 'y', 'z', 'w'}; CHECK(i >= 0 && i < 4); stream << vec << "." << access[i] << " = " << value << ";\n"; } void CodeGenCUDA::PrintStorageSync(const Call* op) { const std::string& sync = op->args[0].as<StringImm>()->value; if (sync == "warp") { // DO nothing. } else if (sync == "shared") { this->PrintIndent(); this->stream << "__syncthreads();\n"; } else if (sync == "global") { if (!need_global_barrier_) { need_global_barrier_ = true; this->decl_stream << "extern \"C\" __device__ unsigned " << vid_global_barrier_state_ << ";\n"; } // global synchronizer std::string is_load = PrintExpr(op->args[1]); std::string num_blocks = PrintExpr(op->args[2]); this->PrintIndent(); // In theory only threadfence is needed // but we observed problems with only threadfence this->stream <<"__threadfence_system();\n"; this->PrintIndent(); this->stream <<"if (" << is_load << ") {\n"; int wb = this->BeginScope(); this->PrintIndent(); this->stream << "atomicAdd(&" << vid_global_barrier_state_ << ", 1);\n"; this->PrintIndent(); std::string ptr = GetUniqueName("pf"); this->stream << "volatile unsigned* " << ptr << " = &" << vid_global_barrier_state_<< ";\n"; this->PrintIndent(); this->stream << vid_global_barrier_expect_ << " += " << num_blocks << ";\n"; this->PrintIndent(); this->stream <<"while (" << ptr << "[0] < " << vid_global_barrier_expect_ << ");\n"; this->EndScope(wb); this->PrintIndent(); this->stream <<"}\n"; this->PrintIndent(); this->stream <<"__syncthreads();\n"; } } void CodeGenCUDA::PrintStorageScope( const std::string& scope, std::ostream& os) { // NOLINT(*) CHECK_NE(scope, "global"); if (scope == "shared") { os << "__shared__"; } } void CodeGenCUDA::VisitStmt_(const Evaluate *op) { if (is_const(op->value)) return; const Call* call = op->value.as<Call>(); if (call && call->is_intrinsic(intrinsic::tvm_global_barrier_kinit)) { PrintIndent(); stream << "__shared__ unsigned " << vid_global_barrier_expect_ << ";\n"; PrintIndent(); stream << "if (threadIdx.x == 0) {\n"; PrintIndent(); stream << " " << vid_global_barrier_expect_ << " = 0;\n"; PrintIndent(); stream << "}\n"; } else { CodeGenC::VisitStmt_(op); } } void CodeGenCUDA::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLINT(*) std::string v = PrintExpr(op->value); os << "make_"; PrintType(op->type, os); os << "("; for (int i = 0; i < op->lanes; ++i) { if (i != 0) os << ", "; os << v; } os << ')'; } } // namespace codegen } // namespace tvm