Commit 2c512ca7 authored by Tianqi Chen, committed by GitHub

[LLVM] Vectorized load/store (#60)

parent 2111bbf3
...
@@ -80,7 +80,7 @@ inline bool GetConstInt(Expr e, int* out) {
   }                                                \
   uint64_t ua = 0, ub = 0;                         \
   if (GetConst(a, &ua) && GetConst(b, &ub)) {      \
-    return ir::UIntImm::make(a.type(), ua + ub);   \
+    return ir::UIntImm::make(a.type(), ua OP ub);  \
   }                                                \
 template<>
...
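The one-character fix above replaces a hard-coded `+` with the macro's `OP` parameter, so the constant-folding path now applies whatever operator the macro is instantiated for instead of always producing a sum. A minimal Python sketch of the intended behavior (`try_const_fold` is a hypothetical helper for illustration, not TVM API):

```python
import operator

def try_const_fold(a, b, op):
    # Fold only when both operands are compile-time integer constants,
    # and apply the actual operator rather than a hard-coded '+'.
    if isinstance(a, int) and isinstance(b, int):
        return op(a, b)
    return None  # fold not possible; the caller builds an IR node instead

assert try_const_fold(6, 2, operator.sub) == 4   # the old code would yield 8
assert try_const_fold(6, 'x', operator.sub) is None
```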
...
@@ -113,7 +113,7 @@ class ModularEvaluator
  private:
   const std::unordered_map<
       const Variable*, ModularEntry>& mod_map_;
+  friend struct ModularEntry;
   // simplify the base by putting it in range.
   static int BaseSimplify(int base, int coeff) {
     if (coeff == 0) return base;
@@ -136,6 +136,15 @@ class ModularEvaluator
   }
 };
 
+ModularEntry ModularEntry::Add(const ModularEntry& a,
+                               const ModularEntry& b) {
+  ModularEntry ret;
+  ret.coeff = ModularEvaluator::ZeroAwareGCD(a.coeff, b.coeff);
+  ret.base = ModularEvaluator::BaseSimplify(a.base + b.base, ret.coeff);
+  return ret;
+}
+
 ModularEntry EvalModular(
     const Expr& e,
     const std::unordered_map<const Variable*, ModularEntry>& mod_map) {
...
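`ModularEntry` tracks an index expression as `base + coeff * k`. The new `Add` implements the standard combination rule: a value known to be `a.base (mod a.coeff)` plus one known to be `b.base (mod b.coeff)` is known modulo `gcd(a.coeff, b.coeff)`, where a coefficient of 0 means the value is exactly `base`. A quick Python check of that rule (Python's `math.gcd` is already zero-aware, matching `ZeroAwareGCD`):

```python
from math import gcd

def modular_add(a_base, a_coeff, b_base, b_coeff):
    coeff = gcd(a_coeff, b_coeff)   # gcd(0, n) == n, so coeff 0 propagates exactness
    base = a_base + b_base
    if coeff != 0:
        base %= coeff               # BaseSimplify: put the base back in [0, coeff)
    return base, coeff

# x in {1, 5, 9, ...} plus y in {3, 9, 15, ...} is always even:
assert modular_add(1, 4, 3, 6) == (0, 2)
# an exactly-known operand (coeff 0) just shifts the base:
assert modular_add(2, 0, 1, 4) == (3, 4)
```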
...
@@ -37,6 +37,14 @@ struct ModularEntry {
     e.base = 0; e.coeff = 1;
     return e;
   }
+  /*!
+   * \brief Add two modular entries together to get a new modular entry.
+   * \param a The left operand.
+   * \param b The right operand.
+   * \return The combined modular entry.
+   */
+  static ModularEntry Add(const ModularEntry& a,
+                          const ModularEntry& b);
 };
 
 /*!
...
...
@@ -14,6 +14,7 @@
 #include <vector>
 #include <string>
 #include "./llvm_common.h"
+#include "../../arithmetic/modular.h"
 
 namespace tvm {
 namespace codegen {
@@ -109,18 +110,29 @@ class CodeGenLLVM :
   virtual llvm::Value* CreateCallExtern(const Call* op);
   // create call into tvm packed function.
   virtual llvm::Value* CreateCallPacked(const Call* op);
+  // Scalarize e by iterating elements of e.
+  // f is a callback that takes index and v.
+  virtual void Scalarize(const Expr& e,
+                         std::function<void(int i, llvm::Value* v)> f);
 
  protected:
   /*!
    * \param t The original type.
    * \return LLVM type of t
    */
   llvm::Type* LLVMType(const Type& t) const;
+  // initialize the function state.
+  void InitFuncState();
+  // Get alignment given index.
+  void GetAlignment(
+      Type t, const Variable* buf_var, const Expr& index,
+      int* p_alignment, int* p_native_bits);
   // do a scalarize call with f
   llvm::Value* CreateScalarizedCall(
       const Call* op, llvm::Function* f, const std::vector<llvm::Value*>& args);
   // apply optimization on the module.
   virtual void Optimize();
+  // Get the maximum storage align bits of buffer pointer given storage scope.
+  virtual int NativeVectorBits(const std::string& storage_scope) const;
   // The IRBuilder.
   using IRBuilder = llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>;
   // The current function
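`Scalarize` and `CreateScalarizedCall` are the fallback path: when an operation has no vector form, the generator evaluates it lane by lane and reassembles the result. A hedged sketch of that shape, with LLVM vectors modeled as plain Python lists:

```python
import math

def scalarize(vec, f):
    # Visit each lane with its index, mirroring the (int i, llvm::Value* v)
    # callback signature of Scalarize above.
    out = [None] * len(vec)
    for i, v in enumerate(vec):
        out[i] = f(i, v)
    return out

# e.g. a sqrt with no vector intrinsic available:
assert scalarize([1.0, 4.0, 9.0], lambda i, v: math.sqrt(v)) == [1.0, 2.0, 3.0]
```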
@@ -162,6 +174,8 @@ class CodeGenLLVM :
   llvm::Function* f_tvm_parallel_for_{nullptr};
   // The acting body
   llvm::BasicBlock* block_{nullptr};
+  /*! \brief the storage scope of allocation */
+  std::unordered_map<const Variable*, std::string> alloc_storage_scope_;
 
  private:
   // comparison op
@@ -178,6 +192,11 @@ class CodeGenLLVM :
   llvm::Value* CreateBufferPtr(Type t, llvm::Value* buffer, llvm::Value* index);
   llvm::Value* CreateCast(Type from, Type to, llvm::Value* value);
   llvm::Value* GetPackedFuncHandle(const std::string& str);
+  // Vector manipulation helpers: slice, flip, concat and pad.
+  llvm::Value* CreateVecSlice(llvm::Value* vec, int begin, int extent);
+  llvm::Value* CreateVecFlip(llvm::Value* vec);
+  llvm::Value* CreateVecConcat(std::vector<llvm::Value*> vecs);
+  llvm::Value* CreateVecPad(llvm::Value* vec, int target_lanes);
   // Create parallel for.
   void CreateParallelFor(const For* op);
   // Create serial for
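The four private helpers above are the building blocks for non-trivial vector loads and stores: slice out a subvector, flip lane order (presumably for the negative-stride ramps exercised by the flip test below), concatenate pieces, and pad to a legal width. Their semantics, sketched on Python lists rather than LLVM shuffle instructions:

```python
def vec_slice(vec, begin, extent):
    return vec[begin:begin + extent]        # lanes [begin, begin + extent)

def vec_flip(vec):
    return vec[::-1]                        # reverse lane order

def vec_concat(vecs):
    return [lane for v in vecs for lane in v]

def vec_pad(vec, target_lanes):
    # extend with undefined lanes (None here) up to target_lanes
    return vec + [None] * (target_lanes - len(vec))

assert vec_slice([0, 1, 2, 3], 1, 2) == [1, 2]
assert vec_concat([[0, 1], [2, 3]]) == [0, 1, 2, 3]
assert vec_pad([0, 1, 2], 4) == [0, 1, 2, None]
```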
@@ -197,6 +216,8 @@ class CodeGenLLVM :
   std::unordered_map<const Variable*, llvm::Value*> var_map_;
   // global strings
   std::unordered_map<std::string, llvm::Constant*> str_map_;
+  // The alignment information
+  std::unordered_map<const Variable*, arith::ModularEntry> align_map_;
   // The local module_context
   llvm::GlobalVariable* gv_mod_ctx_{nullptr};
   // global to packed function handle
...
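`align_map_` ties the pieces together: `GetAlignment` looks up the modular pattern of a buffer's index expression and derives how many bytes of alignment a vectorized load/store may legally assume, capped by the native vector width. A rough model of that derivation (the helper name and the 16-byte cap are illustrative assumptions, not the actual `GetAlignment` code):

```python
from math import gcd

def infer_alignment(base, coeff, elem_bytes, native_bytes=16):
    # Reachable byte offsets are (base + k * coeff) * elem_bytes, so their
    # gcd is a provable alignment; gcd(0, 0) == 0 means the offset is exactly 0.
    align = gcd(base * elem_bytes, coeff * elem_bytes)
    if align == 0:
        align = native_bytes
    return min(align, native_bytes)

assert infer_alignment(0, 4, 4) == 16   # float32, index = 0 mod 4: aligned vector load
assert infer_alignment(1, 4, 4) == 4    # index = 1 mod 4: element-aligned only
```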
...
@@ -355,7 +355,9 @@ class Vectorizer : public IRMutator {
     const Ramp* a_ramp = a.as<Ramp>();
     if (a.type().lanes() == 1 && b_ramp) {
       return Ramp::make(
-          arith::ComputeExpr<T>(a, b_ramp->base), b_ramp->stride, b_ramp->lanes);
+          arith::ComputeExpr<T>(a, b_ramp->base),
+          arith::ComputeExpr<T>(make_zero(b_ramp->stride.type()), b_ramp->stride),
+          b_ramp->lanes);
     }
     if (b.type().lanes() == 1 && a_ramp) {
       return Ramp::make(
...
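The `Vectorizer` change fixes the scalar-OP-ramp case: the old code reused the ramp's stride unchanged, which is correct only for `+`. Feeding the stride through the same operator with a zero left operand (`0 OP stride`) also yields the right stride when the scalar is on the left of a subtraction. A small Python check of the algebra for the additive operators this branch handles:

```python
import operator

def scalar_op_ramp(a, base, stride, lanes, op):
    new_base = op(a, base)
    new_stride = op(0, stride)   # mirrors ComputeExpr<T>(make_zero(...), stride)
    return [new_base + i * new_stride for i in range(lanes)]

# 10 - Ramp(1, 2, 4) should be [9, 7, 5, 3], not Ramp(9, 2, 4) = [9, 11, 13, 15]:
assert scalar_op_ramp(10, 1, 2, 4, operator.sub) == \
       [10 - (1 + i * 2) for i in range(4)]
```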
...
@@ -2,13 +2,15 @@ import tvm
 import numpy as np
 
 def test_llvm_add_pipeline():
-    n = tvm.Var('n')
+    nn = 1024
+    n = tvm.convert(nn)
     A = tvm.placeholder((n,), name='A')
     B = tvm.placeholder((n,), name='B')
     C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
     s = tvm.Schedule(C.op)
-    s[C].parallel(C.op.axis[0])
+    xo, xi = s[C].split(C.op.axis[0], factor=4)
+    s[C].parallel(xo)
+    s[C].vectorize(xi)
     def check_llvm():
         if not tvm.codegen.enabled("llvm"):
             return
@@ -16,16 +18,71 @@ def test_llvm_add_pipeline():
         f = tvm.build(s, [A, B, C], "llvm")
         ctx = tvm.cpu(0)
         # launch the kernel.
-        n = 1027 * 1024
+        n = nn
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
-        for i in range(1000):
-            f(a, b, c)
+        f(a, b, c)
         np.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
     check_llvm()
+def test_llvm_flip_pipeline():
+    def check_llvm(nn, base):
+        if not tvm.codegen.enabled("llvm"):
+            return
+        n = tvm.convert(nn)
+        A = tvm.placeholder((n + base), name='A')
+        C = tvm.compute((n,), lambda i: A(nn + base - i - 1), name='C')
+        s = tvm.Schedule(C.op)
+        xo, xi = s[C].split(C.op.axis[0], factor=4)
+        s[C].parallel(xo)
+        s[C].vectorize(xi)
+        # build and invoke the kernel.
+        f = tvm.build(s, [A, C], "llvm")
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        n = nn
+        a = tvm.nd.array(np.random.uniform(size=(n + base)).astype(A.dtype), ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        f(a, c)
+        np.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy()[::-1][:n])
+    check_llvm(4, 0)
+    check_llvm(128, 8)
+    check_llvm(3, 0)
+    check_llvm(128, 1)
+def test_llvm_madd_pipeline():
+    def check_llvm(nn, base, stride):
+        if not tvm.codegen.enabled("llvm"):
+            return
+        n = tvm.convert(nn)
+        A = tvm.placeholder((n + base, stride), name='A')
+        C = tvm.compute((n, stride), lambda i, j: A(base + i, j) + 1, name='C')
+        s = tvm.Schedule(C.op)
+        xo, xi = s[C].split(C.op.axis[0], factor=4)
+        s[C].parallel(xo)
+        s[C].vectorize(xi)
+        # build and invoke the kernel.
+        f = tvm.build(s, [A, C], "llvm")
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        n = nn
+        a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), ctx)
+        c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), ctx)
+        f(a, c)
+        np.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy()[base:] + 1)
+    check_llvm(64, 0, 2)
+    check_llvm(4, 0, 1)
+    check_llvm(4, 0, 3)
 
 if __name__ == "__main__":
     test_llvm_add_pipeline()
+    test_llvm_flip_pipeline()
+    test_llvm_madd_pipeline()