Commit bf8a5c07 by Tianqi Chen, committed by GitHub

[SCHEDULE] Add store_predicate (#131)

parent 2112a1f9
......@@ -76,7 +76,7 @@ else
endif
# llvm configuration
ifeq ($(USE_LLVM), 1)
ifdef LLVM_CONFIG
LLVM_VERSION=$(shell $(LLVM_CONFIG) --version| cut -b 1,3)
LLVM_INCLUDE=$(filter -I%, $(shell $(LLVM_CONFIG) --cxxflags))
LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags --libs --system-libs)
......
......@@ -82,10 +82,22 @@ class Stage : public NodeRef {
*/
Stage& bind(IterVar ivar, IterVar thread_ivar);
/*!
* \brief Set the predicate under which a store to the array can be performed.
* Use this when there are duplicated threads doing the same store and we only
* need one of them to do the store.
*
* \note This is a dangerous scheduling primitive that can change the behavior of the program.
* Only use it when we are certain that the stores are duplicated.
* \param predicate The condition to be checked.
* \return reference to self.
*/
Stage& set_store_predicate(Expr predicate);
/*!
* \brief Specify environment threads that are launched around the group's scope.
* This can only be used in a group stage.
* \param threads The threads to be launched around the scope.
* \note Each thread can only appear in one env_threads.
* This is a beta feature.
* \return reference to self.
*/
Stage& env_threads(Array<IterVar> threads);
......@@ -341,8 +353,15 @@ class StageNode : public Node {
/*!
* \brief Specify threads to be launched at the stage.
* This is only valid for composite ops such as Scan.
* \note Experimental primitive: used for thread persistence.
*/
Array<IterVar> env_threads;
/*!
* \brief The predicate under which a store can happen.
* Use this when there can be duplicated threads doing the same store.
* \note Experimental primitive: used by cross-thread reduction.
*/
Expr store_predicate;
/*! \brief The relations between IterVars */
Array<IterVarRelation> relations;
/*! \brief additional attributes about iter var. */
......
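For context, here is a minimal Python sketch of how the new primitive is intended to be used; the tensor and schedule names are illustrative and not part of this commit. Once a reduce axis is bound to a thread axis, every participating thread holds the same reduced value, and the store predicate restricts the final write to a single lane.

import tvm

n = tvm.var("n")
m = 128
A = tvm.placeholder((n, m), name="A")
k = tvm.reduce_axis((0, m), name="k")
B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")

s = tvm.create_schedule(B.op)
block_x = tvm.thread_axis("blockIdx.x")
thread_x = tvm.thread_axis("threadIdx.x")
s[B].bind(B.op.axis[0], block_x)
# Cross-thread reduction: every thread along threadIdx.x computes the same sum.
s[B].bind(s[B].op.reduce_axis[0], thread_x)
# The store is duplicated across threads; keep only lane 0.
s[B].set_store_predicate(thread_x.var.equal(0))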
......@@ -39,9 +39,9 @@ USE_METAL = 0
# whether build with LLVM support
# Requires LLVM version >= 4.0
# Set LLVM_CONFIG to your version
# LLVM_CONFIG = llvm-config-4.0
USE_LLVM = 0
# Set LLVM_CONFIG to your version, uncomment to build with llvm support
#
# LLVM_CONFIG = llvm-config
#---------------------------------------------
# Contrib optional libraries.
......
......@@ -85,7 +85,7 @@ def build(sch,
target_host=None,
name="default_function",
binds=None,
max_auto_unroll_step=8,
max_auto_unroll_step=0,
detect_global_barrier=True):
"""Build a function with arguments as signiture.
......
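Since the default for max_auto_unroll_step changes from 8 to 0 here, schedules that relied on automatic unrolling now have to request it explicitly. A hedged sketch of such a call, reusing the illustrative s, A and B from the sketch above (none of these names come from the commit):

f = tvm.build(s, [A, B], "cuda",
              name="my_reduce",          # hypothetical function name
              max_auto_unroll_step=8)    # restore the previous default explicitly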
......@@ -63,7 +63,7 @@ class ExprOp(object):
return _make.LE(self, other)
def __eq__(self, other):
return _make.EQ(self, other)
return self.equal(other)
def __ne__(self, other):
return _make.NE(self, other)
......@@ -74,6 +74,21 @@ class ExprOp(object):
def __ge__(self, other):
return _make.GE(self, other)
def equal(self, other):
"""Build an equal check expression with other expr.
Parameters
----------
other : Expr
The other expression
Returns
-------
ret : Expr
The equality expression.
"""
return _make.EQ(self, other)
class Expr(NodeBase, ExprOp):
"""Base class of all tvm Expressions"""
......
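With the change above, __eq__ delegates to the new equal() helper, so both spellings construct an EQ expression node rather than comparing Python object identity. A small illustrative sketch (the variable name is arbitrary):

import tvm

x = tvm.var("x")
cond1 = x.equal(0)   # explicit form: always builds an EQ expression
cond2 = (x == 0)     # __eq__ now routes through equal() and builds the same kind of node

The explicit equal() form is the one used for store predicates later in this commit, e.g. thread_x.var.equal(0) in the rfactor test.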
......@@ -276,6 +276,19 @@ class Stage(NodeBase):
threads = [threads]
_api_internal._StageEnvThreads(self, threads)
def set_store_predicate(self, predicate):
"""Set predicate under which store to the array can be performed.
Use this when there are duplicated threads doing the same store and we only
need one of them to do the store.
Parameters
----------
predicate : Expr
The guard condition for the store.
"""
_api_internal._StageSetStorePredicate(self, predicate)
def compute_at(self, parent, scope):
"""Attach the stage at parent's scope
......
......@@ -307,6 +307,12 @@ TVM_REGISTER_API("_StageEnvThreads")
.env_threads(args[1]);
});
TVM_REGISTER_API("_StageSetStorePredicate")
.set_body([](TVMArgs args, TVMRetValue* ret) {
args[0].operator Stage()
.set_store_predicate(args[1]);
});
TVM_REGISTER_API("_StageUnroll")
.set_body([](TVMArgs args, TVMRetValue* ret) {
args[0].operator Stage()
......
......@@ -38,7 +38,7 @@ class CodeGenCUDA final : public CodeGenC {
private:
// Magic number used to decide when to add pragma unroll;
// it keeps the generated code compact while still unrolling.
int max_auto_unroll_{64};
int max_auto_unroll_{256};
// Whether global barrier is needed.
bool need_global_barrier_{false};
// Global barrier state
......
......@@ -9,13 +9,13 @@ namespace tvm {
namespace codegen {
namespace intrin {
// Add float suffix to the intrinsics, CUDA fast math.
struct CUDAFastMath {
struct CUDAMath {
std::string operator()(Type t, std::string name) const {
if (t.lanes() == 1) {
if (t.is_float()) {
switch (t.bits()) {
case 64: return name;
case 32: return "__" + name + 'f';
case 32: return name + 'f';
case 16: return 'h' + name;
default: return "";
}
......@@ -25,6 +25,17 @@ struct CUDAFastMath {
}
};
struct CUDAFastMath : public CUDAMath {
std::string operator()(Type t, std::string name) const {
if (t.lanes() == 1 && t.is_float() && t.bits() == 32) {
return "__" + name + 'f';
} else {
return CUDAMath::operator()(t, name);
}
return "";
}
};
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp")
.set_body(DispatchExtern<CUDAFastMath>);
......@@ -32,7 +43,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.log")
.set_body(DispatchExtern<CUDAFastMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.tanh")
.set_body(DispatchExtern<CUDAFastMath>);
.set_body(DispatchExtern<CUDAMath>);
} // namespace intrin
} // namespace codegen
......
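The split above means exp and log keep the fast-math, double-underscore names for float32, while tanh falls back to the plain CUDA math name. The following standalone Python sketch mirrors the naming rule (ignoring vector lanes) purely for illustration; it is not TVM code:

def cuda_math(bits, name):
    # CUDAMath: plain CUDA math library names.
    if bits == 64:
        return name            # double keeps the base name, e.g. "tanh"
    if bits == 32:
        return name + "f"      # float32 gets an 'f' suffix, e.g. "tanhf"
    if bits == 16:
        return "h" + name      # half precision gets an 'h' prefix
    return ""

def cuda_fast_math(bits, name):
    # CUDAFastMath: fast-math intrinsics only exist for float32.
    if bits == 32:
        return "__" + name + "f"   # e.g. "__expf"
    return cuda_math(bits, name)

print(cuda_fast_math(32, "exp"))   # -> __expf
print(cuda_math(32, "tanh"))       # -> tanhf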
......@@ -242,7 +242,6 @@ Stmt MakeCrossThreadReduction(
freduce_args.push_back(reduce->source);
freduce_args.push_back(cond);
std::vector<Expr> thread_head_check;
for (IterVar iv : stage->leaf_iter_vars) {
if (iv->iter_type == kCommReduce) {
auto it = stage->iter_var_attrs.find(iv);
......@@ -250,10 +249,14 @@ Stmt MakeCrossThreadReduction(
(*it).second->bind_thread.defined()) {
IterVar tv = (*it).second->bind_thread;
freduce_args.push_back(tv->var);
thread_head_check.push_back(tv->var == 0);
}
}
}
// Checks for the thread.
std::vector<Expr> thread_head_check;
if (stage->store_predicate.defined()) {
thread_head_check.emplace_back(stage->store_predicate);
}
Type t = reduce->type;
Expr pred = const_true(t.lanes());
Stmt reduce_body = Store::make(res_handle,
......@@ -311,6 +314,9 @@ Stmt ComputeOpNode::BuildProvide(
nest.push_back(op::MakeIfNest(op::MakeBoundCheck(
stage, dom_map, false,
std::unordered_set<IterVar>(), value_map)));
if (stage->store_predicate.defined()) {
nest.emplace_back(op::MakeIfNest({stage->store_predicate}));
}
provide = Substitute(provide, value_map);
if (init.defined()) {
......
......@@ -200,6 +200,12 @@ Stage& Stage::env_threads(Array<IterVar> threads) {
return *this;
}
Stage& Stage::set_store_predicate(Expr predicate) {
StageNode* self = operator->();
self->store_predicate = predicate;
return *this;
}
Stage& Stage::split(
IterVar parent, Expr factor, IterVar* p_outer, IterVar* p_inner) { // NOLINT(*)
Split(operator->(), parent, factor, Expr(), p_outer, p_inner);
......
......@@ -98,8 +98,10 @@ def test_rfactor_threads():
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(ty, tvm.thread_axis("threadIdx.y"))
tx = s[B].op.reduce_axis[0]
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
thread_x = tvm.thread_axis("threadIdx.x")
s[B].bind(tx, thread_x)
s[BF].compute_at(s[B], tx)
s[B].set_store_predicate(thread_x.var.equal(0))
# one line to build the function.
def check_target(device, host="stackvm"):
......
"""LSTM Example, still work in progress.."""
import tvm
import time
import os
import argparse
from tvm.contrib import nvcc_compiler
import numpy as np
# Quick knobs
TASK="lstm"
USE_MANUAL_CODE = False
PERSIST_KERNEL = True
DETECT_GLOBAL_BARRIER = PERSIST_KERNEL
SKIP_CHECK = False
UNROLL_WLOAD = True
@tvm.register_func
def tvm_callback_cuda_compile(code):
"""Use nvcc compiler for better perf."""
ptx = nvcc_compiler.compile_source(code, target="ptx", options=["-arch=sm_52"])
return ptx
def write_code(code, fname):
with open(fname, "w") as f:
f.write(code)
@tvm.register_func
def tvm_callback_cuda_postproc(code):
if not os.path.exists("perf"):
os.mkdir("perf")
write_code(code, "perf/%s_generated.cu" % TASK)
if USE_MANUAL_CODE:
code = open("perf/%s_manual.cu" % TASK).read()
return code
def lstm():
if not PERSIST_KERNEL:
raise ValueError("Non persist LSTM not yet supported")
detect_global_barrier = DETECT_GLOBAL_BARRIER
num_thread_y = 8
num_thread_x = 16 * 3 // 2  # integer division so the thread extent stays an int
num_sm = 24
n_num_step = 128
num_step = tvm.var('num_step')
num_hidden = 1152 // 2
batch_size = 1
# Global transition matrix
# Input hidden channel can be pre-calculated by a GEMM
Xi2h = tvm.placeholder((num_step, batch_size, 4, num_hidden), name="Xi2h")
# Only handle hidden transition, saves space.
Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h")
# h: output hidden state, c: cell state.
s_state_h = tvm.placeholder((num_step, batch_size, num_hidden))
s_state_c = tvm.placeholder((num_step, batch_size, num_hidden))
s_init_c = tvm.compute((1, batch_size, num_hidden),
lambda *i: 0.0, name="init_c")
s_init_h = tvm.compute((1, batch_size, num_hidden),
lambda *i: 0.0, name="init_h")
# LSTM transition
k = tvm.reduce_axis((0, num_hidden), name="ki2h")
s_h2h = tvm.compute(
(num_step, batch_size, 4, num_hidden),
lambda t, i, x, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
name="s_h2h")
# Gate rules
gates = tvm.compute(Xi2h.shape, lambda *i:
Xi2h(*i) + s_h2h(*i), name="gates")
gshape = (num_step, batch_size, num_hidden)
in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, i, 0, j]), name="in_gate")
in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, i, 1, j]), name="in_transform")
forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, i, 2, j]), name="forget_gate")
out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, i, 3, j]), name="out_gate")
next_c = tvm.compute(gshape,
lambda t, i, j:
forget_gate[t, i, j] * s_state_c[t - 1, i, j] +
in_gate[t, i, j] * in_transform[t, i, j], name="next_c")
next_h = tvm.compute(gshape,
lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h")
update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c")
update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h")
# schedule
scan_h, scan_c = tvm.scan(
[s_init_h, s_init_c],
[update_h, update_c],
[s_state_h, s_state_c],
inputs=[Xi2h],
name="lstm_scan")
# schedule
s = tvm.create_schedule(scan_h.op)
# Inline gate computations
s[gates].compute_inline()
s[in_gate].compute_inline()
s[in_transform].compute_inline()
s[forget_gate].compute_inline()
s[out_gate].compute_inline()
block_x = tvm.thread_axis((0, num_sm), "blockIdx.x")
thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
s_state_h_S = s.cache_read(s_state_h, "shared", [s_h2h])
s_state_c_S = s.cache_read(s_state_c, "shared", [next_c])
Wh2hL = s.cache_read(Wh2h, "local", [s_h2h])
ko, ki = s[s_h2h].split(s[s_h2h].op.reduce_axis[0], nparts=num_thread_y)
s_h2h_rf = s.rfactor(s_h2h, ko)
s[s_h2h].bind(s[s_h2h].op.reduce_axis[0], thread_y)
s[s_h2h_rf].compute_at(s[s_h2h], s[s_h2h].op.reduce_axis[0])
if PERSIST_KERNEL:
s[scan_h.op].env_threads([block_x, thread_y, thread_x])
s[Wh2hL].compute_at(s[scan_h.op], thread_x)
else:
s[Wh2hL].compute_at(s[s_h2h], s[s_h2h].op.axis[3])
if UNROLL_WLOAD:
s[Wh2hL].unroll(Wh2hL.op.axis[0])
s[Wh2hL].unroll(Wh2hL.op.axis[2])
s[s_state_h_S].compute_at(s[s_h2h_rf], s[s_h2h_rf].op.axis[3])
s[s_state_c_S].compute_at(s[scan_h.op], s[scan_h].op.scan_axis)
for ss in [s_state_h_S]:
xo, xi = s[ss].split(ss.op.axis[2], factor=num_thread_x * num_thread_y)
ty, xi = s[ss].split(xi, nparts=num_thread_y)
tx, xi = s[ss].split(xi, nparts=num_thread_x)
s[ss].bind(ty, thread_y)
s[ss].bind(tx, thread_x)
for init in [s_init_c, s_init_h]:
bx, xi = s[init].split(init.op.axis[2], nparts=num_sm)
tx, xi = s[init].split(xi, nparts=num_thread_x)
s[init].bind(bx, block_x)
s[init].bind(tx, thread_x)
s[next_c].set_store_predicate(thread_y.var.equal(0))
s[next_h].set_store_predicate(thread_y.var.equal(0))
for update in [update_c, update_h]:
bx, xi = s[update].split(s[update].op.axis[2], nparts=num_sm)
tx, xi = s[update].split(xi, nparts=num_thread_x)
s[update].bind(bx, block_x)
s[update].bind(tx, thread_x)
s[update].set_store_predicate(thread_y.var.equal(0))
# verify we can lower correctly
def check_device(target):
num_step = n_num_step
flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c],
target,
detect_global_barrier=DETECT_GLOBAL_BARRIER)
ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
# launch the kernel.
scan_h_np = np.zeros(
(num_step, batch_size, num_hidden)).astype("float32")
scan_c_np = np.zeros(
(num_step, batch_size, num_hidden)).astype("float32")
Xi2h_np = np.random.normal(
size=(num_step, batch_size, 4, num_hidden)).astype("float32")
Wh2h_np = np.random.normal(
size=(4, num_hidden, num_hidden)).astype("float32")
scan_h_a = tvm.nd.array(scan_h_np, ctx)
scan_c_a = tvm.nd.array(scan_c_np, ctx)
Xi2h_a = tvm.nd.array(Xi2h_np, ctx)
Wh2h_a = tvm.nd.array(Wh2h_np, ctx)
flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
ctx.sync()
# measure time cost of second step.
tstart = time.time()
flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
ctx.sync()
tgap = time.time() - tstart
print("Time cost=%g" % tgap)
check_device("cuda")
if __name__ == "__main__":
lstm()
......@@ -90,6 +90,8 @@ def rnn_matexp():
s[s_update].bind(tx, thread_x)
s[CL].bind(s[CL].op.reduce_axis[0], thread_y)
s[CLF].compute_at(s[CL], s[CL].op.reduce_axis[0])
# Predicate for the duplicated store: only thread_y == 0 writes.
s[CL].set_store_predicate(thread_y.var.equal(0))
if PERSIST_KERNEL:
s[WhhL].compute_at(s[s_scan], thread_x)
......@@ -109,7 +111,6 @@ def rnn_matexp():
s[SS].bind(tx, thread_x)
def check_device(target):
codes = []
f = tvm.build(s, [s_scan, Whh],
target,
max_auto_unroll_step=max_auto_unroll_step,
......