Commit 54450614 by Tianqi Chen, committed by ziheng

[CODEGEN] More storage alignment info aware generation (#186)

* [CODEGEN] More storage alignment info aware generation

* fix

* fix

* fix warning
parent 3b8e70ae
......@@ -146,6 +146,8 @@ constexpr const char* virtual_thread = "virtual_thread";
constexpr const char* volatile_scope = "volatile_scope";
/*! \brief Mark storage scope of buffers */
constexpr const char* storage_scope = "storage_scope";
/*! \brief Mark storage alignment requirement of buffers */
constexpr const char* storage_alignment = "storage_alignment";
/*! \brief Mark storage scope of realization */
constexpr const char* realize_scope = "realize_scope";
/*! \brief The allocation context for global malloc in host. */
......
......@@ -20,6 +20,10 @@ enum DeviceAttrKind : int {
kMaxThreadsPerBlock = 1,
kWarpSize = 2
};
/*! \brief Number of bytes each allocation must align to */
constexpr int kAllocAlignment = 64;
/*!
* \brief TVM Runtime Device API, abstracts the device
* specific interface for memory management.
......
......@@ -562,7 +562,7 @@ def comm_reducer(fcombine, fidentity, name="reduce"):
result = convert(result)
id_elem = convert(id_elem)
combiner = _make.CommReducer(lhs, rhs, result, id_elem)
axis = convert(axis if isinstance(axis, list) else [axis])
axis = convert(axis if isinstance(axis, (list, tuple)) else [axis])
if where is None:
where = convert(True)
outputs = tuple(_make.Reduce(combiner, expr, axis, where, i)
......@@ -570,7 +570,7 @@ def comm_reducer(fcombine, fidentity, name="reduce"):
return outputs[0] if size == 1 else outputs
def reducer(expr, axis, where=None, *args):
if isinstance(axis, (_schedule.IterVar, list)):
if isinstance(axis, (_schedule.IterVar, list, tuple)):
assert not args
return _make_reduce(expr, axis, where)
if where is None:
......
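The two changes above let reducers built through comm_reducer (such as tvm.sum) accept a tuple of reduction axes in addition to a list or a single IterVar. A minimal usage sketch with the API exercised by the test added at the end of this commit (variable names are illustrative):

    import tvm

    m = tvm.var('m')
    n = tvm.var('n')
    A = tvm.placeholder((m, n), name='A')
    k1 = tvm.reduce_axis((0, m), name='k1')
    k2 = tvm.reduce_axis((0, n), name='k2')
    # axis may now be a tuple as well as a list or a single IterVar
    C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=(k1, k2)))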
......@@ -91,6 +91,20 @@ void CodeGenLLVM::InitTarget(llvm::TargetMachine* tm) {
module_->setTargetTriple(tm->getTargetTriple().str());
module_->setDataLayout(tm->createDataLayout());
data_layout_.reset(new llvm::DataLayout(module_.get()));
// initialize native vector bits
std::string target = tm->getTarget().getName();
if (target == "arm") {
native_vector_bits_ = 16 * 8;
} else if (target == "x86-64") {
// for avx512
native_vector_bits_ = 64 * 8;
} else if (target == "x86") {
native_vector_bits_ = 32 * 8;
} else {
native_vector_bits_ = 32 * 8;
LOG(WARNING) << "set native vector to be " << native_vector_bits_ / 8
<< " for target " << target;
}
}
void CodeGenLLVM::InitGlobalContext() {
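The target-specific defaults above pick the widest SIMD register the backend is assumed to have: 128-bit NEON for arm, 512-bit AVX-512 for x86-64, and 256 bits otherwise. A rough sketch of the same mapping (pure Python; the helper name is hypothetical, the widths are taken from the C++ above):

    def native_vector_bits(target_name):
        # mirrors CodeGenLLVM::InitTarget above; values are in bits
        widths = {
            "arm": 16 * 8,     # 128-bit NEON registers
            "x86-64": 64 * 8,  # assume AVX-512
            "x86": 32 * 8,
        }
        # unknown targets fall back to 256 bits (the C++ code also logs a warning)
        return widths.get(target_name, 32 * 8)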
......@@ -104,7 +118,7 @@ void CodeGenLLVM::InitGlobalContext() {
void CodeGenLLVM::InitFuncState() {
var_map_.clear();
align_map_.clear();
alloc_storage_scope_.clear();
alloc_storage_info_.clear();
}
void CodeGenLLVM::AddFunction(const LoweredFunc& f) {
......@@ -750,7 +764,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinstic(const Call* op) {
int CodeGenLLVM::NativeVectorBits(const std::string& storage_scope) const {
// By default, we ask the buffer to be aligned to 64 bytes
return 64 * 8;
return native_vector_bits_;
}
void CodeGenLLVM::GetAlignment(
......@@ -759,17 +773,20 @@ void CodeGenLLVM::GetAlignment(
int& alignment = *p_alignment;
int& native_bits = *p_native_bits;
// The storage scope.
std::string scope;
auto it = alloc_storage_scope_.find(buf_var);
if (it != alloc_storage_scope_.end()) {
scope = it->second;
StorageInfo info;
auto it = alloc_storage_info_.find(buf_var);
if (it != alloc_storage_info_.end()) {
info = it->second;
}
arith::ModularEntry m = EvalModular(index, align_map_);
native_bits = NativeVectorBits(scope);
native_bits = NativeVectorBits(info.scope);
alignment = t.element_of().bits();
// find alignment
// find alignment, cannot exceed allocated alignment
int max_align_bits = std::min(
info.alignment * 8, alignment * t.lanes());
while ((m.coeff & 1) == 0 &&
(m.base & 1) == 0 &&
alignment < max_align_bits &&
alignment < native_bits) {
m.coeff /= 2;
m.base /= 2;
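GetAlignment derives the largest alignment it can prove for an access from the modular form of the index (coeff * x + base): it starts from the element width and keeps doubling while both coeff and base are even, capped by the allocated alignment and the native vector width. A standalone sketch of that search (pure Python; the doubling step after the divisions shown above is inferred from the loop's purpose, and all names are illustrative):

    def prove_alignment(coeff, base, elem_bits, lanes, alloc_align_bytes, native_bits):
        alignment = elem_bits
        # cannot exceed what the allocation guarantees, nor the full vector width
        max_align_bits = min(alloc_align_bytes * 8, elem_bits * lanes)
        while (coeff % 2 == 0 and base % 2 == 0 and
               alignment < max_align_bits and alignment < native_bits):
            coeff //= 2
            base //= 2
            alignment *= 2
        return alignment  # provable alignment of the access, in bits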
......@@ -1291,8 +1308,19 @@ void CodeGenLLVM::VisitStmt_(const Allocate* op) {
int32_t constant_size = op->constant_allocation_size();
CHECK_GT(constant_size, 0)
<< "Can only handle constant size stack allocation for now";
buf = builder_->CreateAlloca(
llvm::AllocaInst* alloca = builder_->CreateAlloca(
LLVMType(op->type), ConstInt32(constant_size));
buf = alloca;
StorageInfo& info = alloc_storage_info_[op->buffer_var.get()];
// Align stack allocation to 4 elements if the constant size is a multiple of 4
// TODO(tqchen) have pass to detect vector access and pre-set alignment
if (constant_size % 4 == 0 && info.alignment == 0) {
info.alignment = op->type.bytes() * 4;
}
if (alloca->getAlignment() < static_cast<uint32_t>(info.alignment)) {
alloca->setAlignment(info.alignment);
}
info.alignment = alloca->getAlignment();
}
buf = builder_->CreatePointerCast(buf, LLVMType(op->type)->getPointerTo());
CHECK(!var_map_.count(op->buffer_var.get()));
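When no storage_alignment attribute has been recorded for a stack allocation, the visitor above defaults to a four-element alignment whenever the constant size is divisible by four, then widens the alloca if needed. A sketch of that default rule (pure Python, names illustrative):

    def default_stack_alignment(constant_size, elem_bytes, requested_alignment=0):
        # mirrors the Allocate visitor above
        if constant_size % 4 == 0 and requested_alignment == 0:
            return elem_bytes * 4
        return requested_alignment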
......@@ -1304,7 +1332,13 @@ void CodeGenLLVM::VisitStmt_(const AttrStmt* op) {
if (op->attr_key == ir::attr::storage_scope) {
const Variable* v = op->node.as<Variable>();
CHECK(v);
alloc_storage_scope_[v] = op->value.as<StringImm>()->value;
alloc_storage_info_[v].scope = op->value.as<StringImm>()->value;
this->VisitStmt(op->body);
} else if (op->attr_key == ir::attr::storage_alignment) {
const Variable* v = op->node.as<Variable>();
CHECK(v);
alloc_storage_info_[v].alignment =
static_cast<int>(op->value.as<IntImm>()->value);
this->VisitStmt(op->body);
} else {
this->VisitStmt(op->body);
......
......@@ -115,6 +115,13 @@ class CodeGenLLVM :
virtual void Scalarize(const Expr& e,
std::function<void(int i, llvm::Value* v)> f);
protected:
/*! \brief The storage information */
struct StorageInfo {
/*! \brief The storage scope */
std::string scope;
/*! \brief The alignment of allocation */
int alignment{0};
};
/*!
* \param t The original type.
* \return LLVM type of t
......@@ -174,8 +181,10 @@ class CodeGenLLVM :
llvm::Function* f_tvm_parallel_for_{nullptr};
// The acting body
llvm::BasicBlock* block_{nullptr};
/*! \brief native vector bits of current target */
int native_vector_bits_{0};
/*! \brief the storage scope of allocation */
std::unordered_map<const Variable*, std::string> alloc_storage_scope_;
std::unordered_map<const Variable*, StorageInfo> alloc_storage_info_;
private:
// comparison op
......
......@@ -19,7 +19,6 @@ namespace codegen {
class VPIDeviceAPI final : public runtime::DeviceAPI {
public:
VPIDeviceAPI() {
static const size_t kAllocAlign = 32U;
const char* s_ram_size = getenv("TVM_VPI_RAM_SIZE_MB");
// 16 MB ram.
int ram_size = 32;
......@@ -27,7 +26,7 @@ class VPIDeviceAPI final : public runtime::DeviceAPI {
ram_size = atoi(s_ram_size);
}
ram_.resize(ram_size << 17);
ram_head_ = kAllocAlign;
ram_head_ = runtime::kAllocAlignment;
ram_max_ = ram_.size() * sizeof(int64_t);
LOG(INFO) << "Initialize VPI simulated ram " << ram_size << "MB ...";
}
......@@ -51,10 +50,9 @@ class VPIDeviceAPI final : public runtime::DeviceAPI {
}
}
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
static const size_t kAllocAlign = 32U;
// always align to 32 bytes at least.
CHECK_LE(alignment, kAllocAlign);
alignment = kAllocAlign;
CHECK_LE(alignment, runtime::kAllocAlignment);
alignment = runtime::kAllocAlignment;
// always allocate block with aligned size.
size += alignment - (size % alignment);
// This is not thread safe, but fine for simulation.
......@@ -67,7 +65,7 @@ class VPIDeviceAPI final : public runtime::DeviceAPI {
b.is_free = false;
return reinterpret_cast<void*>(head);
} else {
CHECK_EQ(ram_head_ % kAllocAlign, 0U);
CHECK_EQ(ram_head_ % runtime::kAllocAlignment, 0U);
Block b;
b.size = size;
b.is_free = false;
......
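With the per-file kAllocAlign constant removed, the VPI simulator now allocates with runtime::kAllocAlignment (64 bytes) and rounds every request up to an aligned block size. A sketch of the rounding used in AllocDataSpace above (pure Python; it follows the shown C++ expression, so a size that is already aligned still grows by one extra alignment block):

    K_ALLOC_ALIGNMENT = 64  # runtime::kAllocAlignment

    def aligned_block_size(size, alignment=K_ALLOC_ALIGNMENT):
        # mirrors `size += alignment - (size % alignment)` above
        return size + alignment - (size % alignment)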
......@@ -6,7 +6,7 @@
#include <tvm/ir.h>
#include <tvm/ir_visitor.h>
#include <tvm/buffer.h>
#include <tvm/runtime/device_api.h>
#include <vector>
#include <utility>
#include <unordered_set>
......@@ -180,6 +180,10 @@ LoweredFunc MakeAPI(Stmt body,
v_arg->name_hint + ".data")) {
Var vptr(buf->data);
handle_data_type.Set(vptr, make_const(buf->dtype, 0));
// mark storage alignment of external buffer arguments.
seq_init.emplace_back(AttrStmt::make(
vptr, ir::attr::storage_alignment,
IntImm::make(Int(32), runtime::kAllocAlignment), nop));
}
// shape field
Var v_shape(v_arg->name_hint + ".shape", Handle());
......
......@@ -136,7 +136,7 @@ inline size_t GetDataSize(TVMArray* arr) {
inline size_t GetDataAlignment(TVMArray* arr) {
size_t align = (arr->dtype.bits / 8) * arr->dtype.lanes;
if (align < 8) return 8;
if (align < kAllocAlignment) return kAllocAlignment;
return align;
}
......
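On the runtime side, the data alignment of a TVMArray is now its element size in bytes (bits / 8 times lanes), raised to at least kAllocAlignment instead of the old minimum of 8. A sketch (pure Python, names illustrative):

    def get_data_alignment(dtype_bits, dtype_lanes, k_alloc_alignment=64):
        # mirrors GetDataAlignment above
        align = (dtype_bits // 8) * dtype_lanes
        if align < k_alloc_alignment:
            return k_alloc_alignment
        return align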
......@@ -22,7 +22,7 @@ class CUDADeviceAPI final : public DeviceAPI {
CUDA_CALL(cudaSetDevice(ctx.device_id));
}
void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final {
int value;
int value = 0;
switch (kind) {
case kExist:
value = (
......
......@@ -40,7 +40,7 @@ def test_llvm_add_pipeline():
print("Skip because llvm is not enabled..")
return
temp = util.tempdir()
target = "llvm -target=arm-none-linux-gnueabihf"
target = "llvm -target=armv7-none-linux-gnueabihf"
f = tvm.build(s, [A, B, C], target)
path = temp.relpath("myadd.o")
f.save(path)
......
......@@ -33,6 +33,16 @@ def test_tensor_slice():
B = tvm.compute((n,), lambda i: A[0][i] + A[0][i])
def test_tensor_reduce_multi_axis():
m = tvm.var('m')
n = tvm.var('n')
A = tvm.placeholder((m, n), name='A')
k1 = tvm.reduce_axis((0, n), "k")
k2 = tvm.reduce_axis((0, m), "k")
C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=(k1, k2)))
C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=[k1, k2]))
def test_tensor_comm_reducer():
m = tvm.var('m')
n = tvm.var('n')
......@@ -157,6 +167,7 @@ def test_tuple_with_different_deps():
assert stmt.node == C.op and len(ret) == 1
if __name__ == "__main__":
test_tensor_reduce_multi_axis()
test_conv1d()
test_tensor_slice()
test_tensor()
......