Commit 61cdf903 by libing4752, committed by Tianqi Chen

[SCHEDULE] Add factor_axis to rfactor (#895)

parent 12d1ab5a
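For context: factor_axis controls where the newly created (factored) axis ends up in the shape of the tensor returned by rfactor; previously it was always placed first. A minimal usage sketch in Python, closely mirroring the integration test added in this commit (the names A, B, k, kf, ki are only illustrative, and the final line just prints the lowered IR for inspection):

import tvm

n = 1027
A = tvm.placeholder((n,), name='A')
k = tvm.reduce_axis((0, n), name='k')
B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B')

s = tvm.create_schedule(B.op)
kf, ki = s[B].split(k, nparts=4)
# factor_axis=1 puts the factored axis after the data axis,
# so BF has shape (1, 4) instead of the default (4, 1)
BF = s.rfactor(B, kf, 1)
print(tvm.lower(s, [A, B], simple_mode=True))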
@@ -313,10 +313,12 @@ class Schedule : public NodeRef {
*
* \param tensor The tensor to be factored.
* \param axis The reduction axis in tensor's schedule to be factored.
* \param factor_axis The position where the new axis is placed.
* \return The created factored tensors.
*/
EXPORT Array<Tensor> rfactor(const Tensor& tensor,
const IterVar& axis);
const IterVar& axis,
int factor_axis = 0);
/*!
* \brief Normalize the schedule.
* This is needed before bound inference.
......
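Since factor_axis defaults to 0 in the C++ signature, existing callers keep the old layout where the factored axis comes first; the unit test near the bottom of this diff checks both placements. A short sketch of the two layouts under that assumption, reusing the unit test's setup:

import tvm

n = tvm.var('n')
k1 = tvm.reduce_axis((0, n), name='k1')
k2 = tvm.reduce_axis((0, n), name='k2')
A = tvm.placeholder((n, n, n), name='A')
B = tvm.compute((n,), lambda i: tvm.sum(A[i, k1, k2], axis=[k1, k2]), name='B')

# default placement: factored axis leads, BF.shape == (4, n)
s = tvm.create_schedule(B.op)
ko, ki = s[B].split(k1, factor=4)
BF = s.rfactor(B, ki)

# factor_axis=1: factored axis placed after the data axis, BF.shape == (n, 4)
s = tvm.create_schedule(B.op)
ko, ki = s[B].split(k1, factor=4)
BF = s.rfactor(B, ki, 1)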
@@ -279,7 +279,7 @@ class Schedule(NodeBase):
"""
return _api_internal._ScheduleCacheWrite(self, tensor, scope)
def rfactor(self, tensor, axis):
def rfactor(self, tensor, axis, factor_axis=0):
""" Factor a reduction axis in tensor's schedule to be an explicit axis.
This will create a new stage that generated the new tensor with axis
@@ -292,13 +292,15 @@ class Schedule(NodeBase):
The tensor to be factored.
axis : IterVar
The reduction axis in the schedule to be factored.
factor_axis : int
The position where the new axis is placed.
Returns
-------
tfactor : Tensor or Array of Tensor
The created factored tensor.
"""
factored = _api_internal._ScheduleRFactor(self, tensor, axis)
factored = _api_internal._ScheduleRFactor(self, tensor, axis, factor_axis)
return factored[0] if len(factored) == 1 else factored
......
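For intuition about what the returned tensor holds (an analogy only, not TVM code): rfactor materializes partial reductions over the factored axis, and factor_axis chooses where that partial-result axis sits in the intermediate shape; reducing over it afterwards reproduces the original result. A rough NumPy sketch with a length-n row sum split into 4 chunks:

import numpy as np

m, n, parts = 8, 1024, 4          # assumes n is divisible by parts
a = np.random.rand(m, n)          # reduce over the last axis

chunks = a.reshape(m, parts, n // parts)
bf0 = chunks.sum(axis=2).T        # partial sums, shape (parts, m): like factor_axis=0
bf1 = chunks.sum(axis=2)          # partial sums, shape (m, parts): like factor_axis=1

# reducing over the factored axis recovers the full row sums either way
assert np.allclose(bf0.sum(axis=0), a.sum(axis=1))
assert np.allclose(bf1.sum(axis=1), a.sum(axis=1))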
@@ -432,7 +432,7 @@ TVM_REGISTER_API("_ScheduleCacheWrite")
TVM_REGISTER_API("_ScheduleRFactor")
.set_body([](TVMArgs args, TVMRetValue* ret) {
*ret = args[0].operator Schedule()
.rfactor(args[1], args[2]);
.rfactor(args[1], args[2], args[3]);
});
TVM_REGISTER_API("_CommReducerCombine")
......
@@ -395,7 +395,8 @@ Schedule Schedule::normalize() {
// Handle reduction factor.
Array<Tensor> Schedule::rfactor(const Tensor& tensor,
const IterVar& axis) {
const IterVar& axis,
int factor_axis) {
(*this)->InvalidateCache();
using ir::Reduce;
CHECK_EQ(axis->iter_type, kCommReduce)
@@ -448,6 +449,9 @@ Array<Tensor> Schedule::rfactor(const Tensor& tensor,
reduce_stage, dom_map, value_map, true, skip_bound_check);
// Get the factored op node.
const int factor_axis_pos = \
factor_axis >= 0 ? factor_axis : static_cast<int>(compute_op->axis.size() + 1) + factor_axis;
CHECK_LE(factor_axis_pos, compute_op->axis.size());
auto n = std::make_shared<ComputeOpNode>();
n->name = compute_op->name + ".rf";
{
@@ -458,10 +462,16 @@ Array<Tensor> Schedule::rfactor(const Tensor& tensor,
<< "Can only factor reduction domain starting from 0";
iv_node->var = axis->var;
iv_node->iter_type = kDataPar;
n->axis.push_back(IterVar(iv_node));
for (IterVar iv : compute_op->axis) {
n->axis.push_back(iv);
const int size = compute_op->axis.size();
for (int idx = 0; idx < size; ++idx) {
if (factor_axis_pos == idx) {
n->axis.push_back(IterVar(iv_node));
}
n->axis.push_back(compute_op->axis[idx]);
}
if (factor_axis_pos == size) {
n->axis.push_back(IterVar(iv_node));
}
}
// predicate generation, copy not touched axis.
@@ -548,9 +558,15 @@ Array<Tensor> Schedule::rfactor(const Tensor& tensor,
Array<Tensor> repl_tensors = compute(old_tensors[0]->shape,
[&](const Array<Var>& i) {
Array<Expr> indices;
const int idx_size = static_cast<int>(i.size());
for (int idx = 0; idx < idx_size; ++idx) {
if (factor_axis_pos == idx) {
indices.push_back(repl_red_axis->var);
}
indices.push_back(i[idx]);
}
if (factor_axis_pos == idx_size) {
indices.push_back(repl_red_axis->var);
}
for (Var v : i) {
indices.push_back(v);
}
Array<Expr> factor_exprs;
for (int idx = 0; idx < size; ++idx) {
......
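The heart of the change above is the index arithmetic: a negative factor_axis counts from the end (there are size + 1 valid positions), and the new iteration variable is spliced into the original data axes at factor_axis_pos; the same insertion pattern is reused when building the indices of the replacement tensor. A small Python sketch of that logic, mirroring the C++ above (illustrative only):

def place_factored_axis(data_axes, factored_axis, factor_axis):
    # mirrors: factor_axis >= 0 ? factor_axis : (size + 1) + factor_axis
    size = len(data_axes)
    pos = factor_axis if factor_axis >= 0 else (size + 1) + factor_axis
    assert 0 <= pos <= size, "factor_axis out of range"
    return data_axes[:pos] + [factored_axis] + data_axes[pos:]

# with one data axis i and factored axis kf:
print(place_factored_axis(['i'], 'kf', 0))    # ['kf', 'i']  (default, old behaviour)
print(place_factored_axis(['i'], 'kf', 1))    # ['i', 'kf']
print(place_factored_axis(['i'], 'kf', -1))   # ['i', 'kf']  (negative counts from the end)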
@@ -83,6 +83,36 @@ def test_rfactor():
check_target()
def test_rfactor_factor_axis():
n = tvm.convert(1027)
A = tvm.placeholder((n,), name='A')
k = tvm.reduce_axis((0, n))
B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B')
# schedule
s = tvm.create_schedule(B.op)
kf, ki = s[B].split(k, nparts=4)
BF = s.rfactor(B, kf, 1)
s[BF].parallel(BF.op.axis[0])
# one line to build the function.
def check_target(target="llvm"):
if not tvm.module.enabled(target):
return
ctx = tvm.cpu(0)
fapi = tvm.lower(s, args=[A, B])
fsum = tvm.build(fapi,
target=target,
name="mysum")
# launch the kernel.
n = 1027
a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
fsum(a, b)
res = np.sum(a.asnumpy(), axis=0)
np.testing.assert_allclose(
b.asnumpy(), res, rtol=1e-4)
check_target()
def test_rfactor_threads():
nn = 1027
@@ -294,6 +324,7 @@ def test_rfactor_argmax():
if __name__ == "__main__":
test_rfactor_elemwise_threads()
test_rfactor_threads()
test_rfactor_factor_axis()
test_rfactor()
test_reduce_prims()
test_argmax()
......
@@ -137,6 +137,16 @@ def test_rfactor():
assert(BF.op.body[0].axis[0] == k2)
assert(BF.op.body[0].axis[1].var == ko.var)
assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
# schedule with factor_axis
s = tvm.create_schedule(B.op)
ko, ki = s[B].split(k1, factor=4)
xo, xi = s[B].split(B.op.axis[0], factor=8)
BF = s.rfactor(B, ki, 1)
assert(n == BF.shape[0])
assert(BF.shape[1].value == 4)
assert(BF.op.body[0].axis[0] == k2)
assert(BF.op.body[0].axis[1].var == ko.var)
assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
def test_tensor_intrin():
n = 16
......
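The tests only exercise factor_axis values of 0 and 1, but the normalization in the implementation also admits negative values, which count from the end. A hedged sketch of that case (assumed behaviour, not covered by this commit's tests):

import tvm

n = tvm.var('n')
k = tvm.reduce_axis((0, n), name='k')
A = tvm.placeholder((n, n), name='A')
B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name='B')

s = tvm.create_schedule(B.op)
ko, ki = s[B].split(k, factor=4)
BF = s.rfactor(B, ki, -1)    # assumed: appended after the data axis, i.e. BF.shape == (n, 4)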