Commit 2c512ca7 authored by Tianqi Chen, committed by GitHub

[LLVM] Vectorized load/store (#60)

parent 2111bbf3
...
@@ -80,7 +80,7 @@ inline bool GetConstInt(Expr e, int* out) {
   }                                                \
   uint64_t ua = 0, ub = 0;                         \
   if (GetConst(a, &ua) && GetConst(b, &ub)) {      \
-    return ir::UIntImm::make(a.type(), ua + ub);   \
+    return ir::UIntImm::make(a.type(), ua OP ub);  \
   }                                                \
 template<>
...
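The one-character fix above replaces a hard-coded `+` with the macro's `OP` parameter, so the constant-folding path now applies whatever operator the macro is instantiated for instead of always producing a sum. A minimal Python sketch of the intended behavior (`try_const_fold` is a hypothetical helper for illustration, not TVM API):

```python
import operator

def try_const_fold(a, b, op):
    # Fold only when both operands are compile-time integer constants,
    # and apply the actual operator rather than a hard-coded '+'.
    if isinstance(a, int) and isinstance(b, int):
        return op(a, b)
    return None  # fold not possible; the caller builds an IR node instead

assert try_const_fold(6, 2, operator.sub) == 4   # the old code would yield 8
assert try_const_fold(6, 'x', operator.sub) is None
```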
...
@@ -113,7 +113,7 @@ class ModularEvaluator
  private:
   const std::unordered_map<
       const Variable*, ModularEntry>& mod_map_;
+  friend struct ModularEntry;
   // simplify the base by putting it in range.
   static int BaseSimplify(int base, int coeff) {
     if (coeff == 0) return base;
@@ -136,6 +136,15 @@ class ModularEvaluator
   }
 };
 
+ModularEntry ModularEntry::Add(const ModularEntry& a,
+                               const ModularEntry& b) {
+  ModularEntry ret;
+  ret.coeff = ModularEvaluator::ZeroAwareGCD(a.coeff, b.coeff);
+  ret.base = ModularEvaluator::BaseSimplify(a.base + b.base, ret.coeff);
+  return ret;
+}
+
 ModularEntry EvalModular(
     const Expr& e,
     const std::unordered_map<const Variable*, ModularEntry>& mod_map) {
...
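`ModularEntry` tracks an index expression as `base + coeff * k`. The new `Add` implements the standard combination rule: a value known to be `a.base (mod a.coeff)` plus one known to be `b.base (mod b.coeff)` is known modulo `gcd(a.coeff, b.coeff)`, where a coefficient of 0 means the value is exactly `base`. A quick Python check of that rule (Python's `math.gcd` is already zero-aware, matching `ZeroAwareGCD`):

```python
from math import gcd

def modular_add(a_base, a_coeff, b_base, b_coeff):
    coeff = gcd(a_coeff, b_coeff)   # gcd(0, n) == n, so coeff 0 propagates exactness
    base = a_base + b_base
    if coeff != 0:
        base %= coeff               # BaseSimplify: put the base back in [0, coeff)
    return base, coeff

# x in {1, 5, 9, ...} plus y in {3, 9, 15, ...} is always even:
assert modular_add(1, 4, 3, 6) == (0, 2)
# an exactly-known operand (coeff 0) just shifts the base:
assert modular_add(2, 0, 1, 4) == (3, 4)
```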
...
@@ -37,6 +37,14 @@ struct ModularEntry {
     e.base = 0; e.coeff = 1;
     return e;
   }
+  /*!
+   * \brief Add two modular entries together to get a new modular entry.
+   * \param a The left operand.
+   * \param b The right operand.
+   * \return The combined modular entry.
+   */
+  static ModularEntry Add(const ModularEntry& a,
+                          const ModularEntry& b);
 };
 
 /*!
...
...
@@ -14,6 +14,7 @@
 #include <vector>
 #include <string>
 #include "./llvm_common.h"
+#include "../../arithmetic/modular.h"
 
 namespace tvm {
 namespace codegen {
@@ -109,18 +110,29 @@ class CodeGenLLVM :
   virtual llvm::Value* CreateCallExtern(const Call* op);
   // create call into tvm packed function.
   virtual llvm::Value* CreateCallPacked(const Call* op);
+  // Scalarize e by iterating elements of e.
+  // f is a callback that takes index and v.
+  virtual void Scalarize(const Expr& e,
+                         std::function<void(int i, llvm::Value* v)> f);
 
  protected:
   /*!
    * \param t The original type.
    * \return LLVM type of t
    */
   llvm::Type* LLVMType(const Type& t) const;
+  // initialize the function state.
+  void InitFuncState();
+  // Get alignment given index.
+  void GetAlignment(
+      Type t, const Variable* buf_var, const Expr& index,
+      int* p_alignment, int* p_native_bits);
   // do a scalarize call with f
   llvm::Value* CreateScalarizedCall(
       const Call* op, llvm::Function* f, const std::vector<llvm::Value*>& args);
   // apply optimization on the module.
   virtual void Optimize();
+  // Get the maximum storage align bits of buffer pointer given storage scope.
+  virtual int NativeVectorBits(const std::string& storage_scope) const;
   // The IRBuilder.
   using IRBuilder = llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>;
   // The current function
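`Scalarize` and `CreateScalarizedCall` are the fallback path: when an operation has no vector form, the generator evaluates it lane by lane and reassembles the result. A hedged sketch of that shape, with LLVM vectors modeled as plain Python lists:

```python
import math

def scalarize(vec, f):
    # Visit each lane with its index, mirroring the (int i, llvm::Value* v)
    # callback signature of Scalarize above.
    out = [None] * len(vec)
    for i, v in enumerate(vec):
        out[i] = f(i, v)
    return out

# e.g. a sqrt with no vector intrinsic available:
assert scalarize([1.0, 4.0, 9.0], lambda i, v: math.sqrt(v)) == [1.0, 2.0, 3.0]
```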
@@ -162,6 +174,8 @@ class CodeGenLLVM :
   llvm::Function* f_tvm_parallel_for_{nullptr};
   // The acting body
   llvm::BasicBlock* block_{nullptr};
+  /*! \brief the storage scope of allocation */
+  std::unordered_map<const Variable*, std::string> alloc_storage_scope_;
 
  private:
   // comparison op
@@ -178,6 +192,11 @@ class CodeGenLLVM :
   llvm::Value* CreateBufferPtr(Type t, llvm::Value* buffer, llvm::Value* index);
   llvm::Value* CreateCast(Type from, Type to, llvm::Value* value);
   llvm::Value* GetPackedFuncHandle(const std::string& str);
+  // Vector manipulation helpers: slice, flip, concat and pad.
+  llvm::Value* CreateVecSlice(llvm::Value* vec, int begin, int extent);
+  llvm::Value* CreateVecFlip(llvm::Value* vec);
+  llvm::Value* CreateVecConcat(std::vector<llvm::Value*> vecs);
+  llvm::Value* CreateVecPad(llvm::Value* vec, int target_lanes);
   // Create parallel for.
   void CreateParallelFor(const For* op);
   // Create serial for
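The four private helpers above are the building blocks for non-trivial vector loads and stores: slice out a subvector, flip lane order (presumably for the negative-stride ramps exercised by the flip test below), concatenate pieces, and pad to a legal width. Their semantics, sketched on Python lists rather than LLVM shuffle instructions:

```python
def vec_slice(vec, begin, extent):
    return vec[begin:begin + extent]        # lanes [begin, begin + extent)

def vec_flip(vec):
    return vec[::-1]                        # reverse lane order

def vec_concat(vecs):
    return [lane for v in vecs for lane in v]

def vec_pad(vec, target_lanes):
    # extend with undefined lanes (None here) up to target_lanes
    return vec + [None] * (target_lanes - len(vec))

assert vec_slice([0, 1, 2, 3], 1, 2) == [1, 2]
assert vec_concat([[0, 1], [2, 3]]) == [0, 1, 2, 3]
assert vec_pad([0, 1, 2], 4) == [0, 1, 2, None]
```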
@@ -197,6 +216,8 @@ class CodeGenLLVM :
   std::unordered_map<const Variable*, llvm::Value*> var_map_;
   // global strings
   std::unordered_map<std::string, llvm::Constant*> str_map_;
+  // The alignment information
+  std::unordered_map<const Variable*, arith::ModularEntry> align_map_;
   // The local module_context
   llvm::GlobalVariable* gv_mod_ctx_{nullptr};
   // global to packed function handle
...
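`align_map_` ties the pieces together: `GetAlignment` looks up the modular pattern of a buffer's index expression and derives how many bytes of alignment a vectorized load/store may legally assume, capped by the native vector width. A rough model of that derivation (the helper name and the 16-byte cap are illustrative assumptions, not the actual `GetAlignment` code):

```python
from math import gcd

def infer_alignment(base, coeff, elem_bytes, native_bytes=16):
    # Reachable byte offsets are (base + k * coeff) * elem_bytes, so their
    # gcd is a provable alignment; gcd(0, 0) == 0 means the offset is exactly 0.
    align = gcd(base * elem_bytes, coeff * elem_bytes)
    if align == 0:
        align = native_bytes
    return min(align, native_bytes)

assert infer_alignment(0, 4, 4) == 16   # float32, index = 0 mod 4: aligned vector load
assert infer_alignment(1, 4, 4) == 4    # index = 1 mod 4: element-aligned only
```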
...
@@ -355,7 +355,9 @@ class Vectorizer : public IRMutator {
     const Ramp* a_ramp = a.as<Ramp>();
     if (a.type().lanes() == 1 && b_ramp) {
       return Ramp::make(
-          arith::ComputeExpr<T>(a, b_ramp->base), b_ramp->stride, b_ramp->lanes);
+          arith::ComputeExpr<T>(a, b_ramp->base),
+          arith::ComputeExpr<T>(make_zero(b_ramp->stride.type()), b_ramp->stride),
+          b_ramp->lanes);
     }
     if (b.type().lanes() == 1 && a_ramp) {
       return Ramp::make(
...
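The `Vectorizer` change fixes the scalar-OP-ramp case: the old code reused the ramp's stride unchanged, which is correct only for `+`. Feeding the stride through the same operator with a zero left operand (`0 OP stride`) also yields the right stride when the scalar is on the left of a subtraction. A small Python check of the algebra for the additive operators this branch handles:

```python
import operator

def scalar_op_ramp(a, base, stride, lanes, op):
    new_base = op(a, base)
    new_stride = op(0, stride)   # mirrors ComputeExpr<T>(make_zero(...), stride)
    return [new_base + i * new_stride for i in range(lanes)]

# 10 - Ramp(1, 2, 4) should be [9, 7, 5, 3], not Ramp(9, 2, 4) = [9, 11, 13, 15]:
assert scalar_op_ramp(10, 1, 2, 4, operator.sub) == \
       [10 - (1 + i * 2) for i in range(4)]
```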
...
@@ -2,13 +2,15 @@ import tvm
 import numpy as np
 
 def test_llvm_add_pipeline():
-    n = tvm.Var('n')
+    nn = 1024
+    n = tvm.convert(nn)
     A = tvm.placeholder((n,), name='A')
     B = tvm.placeholder((n,), name='B')
     C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
     s = tvm.Schedule(C.op)
-    s[C].parallel(C.op.axis[0])
+    xo, xi = s[C].split(C.op.axis[0], factor=4)
+    s[C].parallel(xo)
+    s[C].vectorize(xi)
     def check_llvm():
         if not tvm.codegen.enabled("llvm"):
             return
@@ -16,16 +18,71 @@ def test_llvm_add_pipeline():
         f = tvm.build(s, [A, B, C], "llvm")
         ctx = tvm.cpu(0)
         # launch the kernel.
-        n = 1027 * 1024
+        n = nn
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
-        for i in range(1000):
-            f(a, b, c)
+        f(a, b, c)
         np.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
     check_llvm()
+def test_llvm_flip_pipeline():
+    def check_llvm(nn, base):
+        if not tvm.codegen.enabled("llvm"):
+            return
+        n = tvm.convert(nn)
+        A = tvm.placeholder((n + base), name='A')
+        C = tvm.compute((n,), lambda i: A(nn + base - i - 1), name='C')
+        s = tvm.Schedule(C.op)
+        xo, xi = s[C].split(C.op.axis[0], factor=4)
+        s[C].parallel(xo)
+        s[C].vectorize(xi)
+        # build and invoke the kernel.
+        f = tvm.build(s, [A, C], "llvm")
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        n = nn
+        a = tvm.nd.array(np.random.uniform(size=(n + base)).astype(A.dtype), ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        f(a, c)
+        np.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy()[::-1][:n])
+    check_llvm(4, 0)
+    check_llvm(128, 8)
+    check_llvm(3, 0)
+    check_llvm(128, 1)
+def test_llvm_madd_pipeline():
+    def check_llvm(nn, base, stride):
+        if not tvm.codegen.enabled("llvm"):
+            return
+        n = tvm.convert(nn)
+        A = tvm.placeholder((n + base, stride), name='A')
+        C = tvm.compute((n, stride), lambda i, j: A(base + i, j) + 1, name='C')
+        s = tvm.Schedule(C.op)
+        xo, xi = s[C].split(C.op.axis[0], factor=4)
+        s[C].parallel(xo)
+        s[C].vectorize(xi)
+        # build and invoke the kernel.
+        f = tvm.build(s, [A, C], "llvm")
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        n = nn
+        a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), ctx)
+        c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), ctx)
+        f(a, c)
+        np.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy()[base:] + 1)
+    check_llvm(64, 0, 2)
+    check_llvm(4, 0, 1)
+    check_llvm(4, 0, 3)
 
 if __name__ == "__main__":
     test_llvm_add_pipeline()
+    test_llvm_flip_pipeline()
+    test_llvm_madd_pipeline()