Commit 2c512ca7 by Tianqi Chen Committed by GitHub

[LLVM] Vectorized load/store (#60)

parent 2111bbf3
......@@ -80,7 +80,7 @@ inline bool GetConstInt(Expr e, int* out) {
} \
uint64_t ua = 0, ub = 0; \
if (GetConst(a, &ua) && GetConst(b, &ub)) { \
return ir::UIntImm::make(a.type(), ua + ub); \
return ir::UIntImm::make(a.type(), ua OP ub); \
} \
template<>
......
......@@ -113,7 +113,7 @@ class ModularEvaluator
private:
const std::unordered_map<
const Variable*, ModularEntry>& mod_map_;
friend struct ModularEntry;
// simplify the base by putting it in range.
static int BaseSimplify(int base, int coeff) {
if (coeff == 0) return base;
......@@ -136,6 +136,15 @@ class ModularEvaluator
}
};
// Combine two modular entries under addition. The coefficient of the
// sum is the (zero-aware) GCD of both coefficients; the base is the
// sum of the bases, simplified with respect to the new coefficient.
ModularEntry ModularEntry::Add(const ModularEntry& a,
                               const ModularEntry& b) {
  ModularEntry out;
  out.coeff = ModularEvaluator::ZeroAwareGCD(a.coeff, b.coeff);
  out.base = ModularEvaluator::BaseSimplify(a.base + b.base, out.coeff);
  return out;
}
ModularEntry EvalModular(
const Expr& e,
const std::unordered_map<const Variable*, ModularEntry>& mod_map) {
......
......@@ -37,6 +37,14 @@ struct ModularEntry {
e.base = 0; e.coeff = 1;
return e;
}
/*!
* \brief Add two modular entries together to get a new modular entry.
* \param a The left operand.
* \param b The right operand.
* \return The combined modular entry.
*/
static ModularEntry Add(const ModularEntry& a,
const ModularEntry& b);
};
/*!
......
......@@ -14,6 +14,7 @@
#include <vector>
#include <string>
#include "./llvm_common.h"
#include "../../arithmetic/modular.h"
namespace tvm {
namespace codegen {
......@@ -109,18 +110,29 @@ class CodeGenLLVM :
virtual llvm::Value* CreateCallExtern(const Call* op);
// create call into tvm packed function.
virtual llvm::Value* CreateCallPacked(const Call* op);
// Scalarize e by iterating elements of e.
// f is a callback that takes index and v.
virtual void Scalarize(const Expr& e,
std::function<void(int i, llvm::Value* v)> f);
protected:
/*!
* \param t The original type.
* \return LLVM type of t
*/
llvm::Type* LLVMType(const Type& t) const;
// initialize the function state.
void InitFuncState();
// Get alignment given index.
void GetAlignment(
Type t, const Variable* buf_var, const Expr& index,
int* p_alignment, int* p_native_bits);
// do a scalarize call with f
llvm::Value* CreateScalarizedCall(
const Call* op, llvm::Function* f, const std::vector<llvm::Value*>& args);
// apply optimization on the module.
virtual void Optimize();
// Get the maximim storage align bits of buffer pointer given storage scope.
virtual int NativeVectorBits(const std::string& storage_scope) const;
// The IRBuilder.
using IRBuilder = llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>;
// The current function
......@@ -162,6 +174,8 @@ class CodeGenLLVM :
llvm::Function* f_tvm_parallel_for_{nullptr};
// The acting body
llvm::BasicBlock* block_{nullptr};
/*! \brief the storage scope of allocation */
std::unordered_map<const Variable*, std::string> alloc_storage_scope_;
private:
// comparison op
......@@ -178,6 +192,11 @@ class CodeGenLLVM :
llvm::Value* CreateBufferPtr(Type t, llvm::Value* buffer, llvm::Value* index);
llvm::Value* CreateCast(Type from, Type to, llvm::Value* value);
llvm::Value* GetPackedFuncHandle(const std::string& str);
// Vector concatenation.
llvm::Value* CreateVecSlice(llvm::Value* vec, int begin, int extent);
llvm::Value* CreateVecFlip(llvm::Value* vec);
llvm::Value* CreateVecConcat(std::vector<llvm::Value*> vecs);
llvm::Value* CreateVecPad(llvm::Value* vec, int target_lanes);
// Create parallel for.
void CreateParallelFor(const For* op);
// Create serial for
......@@ -197,6 +216,8 @@ class CodeGenLLVM :
std::unordered_map<const Variable*, llvm::Value*> var_map_;
// global strings
std::unordered_map<std::string, llvm::Constant*> str_map_;
// The alignment information
std::unordered_map<const Variable*, arith::ModularEntry> align_map_;
// The local module_context
llvm::GlobalVariable* gv_mod_ctx_{nullptr};
// global to packed function handle
......
......@@ -355,7 +355,9 @@ class Vectorizer : public IRMutator {
const Ramp* a_ramp = a.as<Ramp>();
if (a.type().lanes() == 1 && b_ramp) {
return Ramp::make(
arith::ComputeExpr<T>(a, b_ramp->base), b_ramp->stride, b_ramp->lanes);
arith::ComputeExpr<T>(a, b_ramp->base),
arith::ComputeExpr<T>(make_zero(b_ramp->stride.type()), b_ramp->stride),
b_ramp->lanes);
}
if (b.type().lanes() == 1 && a_ramp) {
return Ramp::make(
......
......@@ -2,13 +2,15 @@ import tvm
import numpy as np
def test_llvm_add_pipeline():
    """Elementwise add compiled through the LLVM backend, with the
    outer axis parallelized and the inner axis vectorized.

    Note: the rendered diff fused pre- and post-commit lines; this is
    the post-commit version (dead `n = tvm.Var('n')`, the stale
    `n = 1027 * 1024`, and the leftover 1000-iteration benchmark loop
    are dropped).
    """
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.Schedule(C.op)
    # split so the inner loop (factor 4) can be vectorized.
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(xo)
    s[C].vectorize(xi)

    def check_llvm():
        # Skip silently when the LLVM backend is not compiled in.
        if not tvm.codegen.enabled("llvm"):
            return
        f = tvm.build(s, [A, B, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, b, c)
        np.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_llvm()
def test_llvm_flip_pipeline():
    """Reversed (descending-index) read compiled via LLVM, vectorized.

    C[i] = A[nn + base - i - 1], i.e. C is the reversal of the last
    `nn` elements of A; checked against the numpy slice a[::-1][:n].
    """
    def check_llvm(nn, base):
        # Skip silently when the LLVM backend is not compiled in.
        if not tvm.codegen.enabled("llvm"):
            return
        n = tvm.convert(nn)
        A = tvm.placeholder((n + base), name='A')
        C = tvm.compute((n,), lambda i: A(nn + base - i - 1), name='C')
        s = tvm.Schedule(C.op)
        # split so the inner loop (factor 4) can be vectorized.
        xo, xi = s[C].split(C.op.axis[0], factor=4)
        s[C].parallel(xo)
        s[C].vectorize(xi)
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=(n + base)).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        np.testing.assert_allclose(
            c.asnumpy(), a.asnumpy()[::-1][:n])

    # Cover aligned and unaligned sizes/offsets.
    check_llvm(4, 0)
    check_llvm(128, 8)
    check_llvm(3, 0)
    check_llvm(128, 1)
def test_llvm_madd_pipeline():
    """Strided 2-D add-one kernel compiled via LLVM, vectorized.

    C[i, j] = A[base + i, j] + 1; checked against a[base:] + 1 for a
    mix of row strides and offsets.
    """
    def check_llvm(nn, base, stride):
        # Skip silently when the LLVM backend is not compiled in.
        if not tvm.codegen.enabled("llvm"):
            return
        n = tvm.convert(nn)
        A = tvm.placeholder((n + base, stride), name='A')
        C = tvm.compute((n, stride), lambda i, j: A(base + i, j) + 1, name='C')
        s = tvm.Schedule(C.op)
        # split so the inner loop (factor 4) can be vectorized.
        xo, xi = s[C].split(C.op.axis[0], factor=4)
        s[C].parallel(xo)
        s[C].vectorize(xi)
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), ctx)
        f(a, c)
        np.testing.assert_allclose(
            c.asnumpy(), a.asnumpy()[base:] + 1)

    # Cover even, single, and odd row strides.
    check_llvm(64, 0, 2)
    check_llvm(4, 0, 1)
    check_llvm(4, 0, 3)
# Run all LLVM codegen tests when executed as a script.
if __name__ == "__main__":
    test_llvm_add_pipeline()
    test_llvm_flip_pipeline()
    test_llvm_madd_pipeline()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment