Commit c8ec4111 by Tianqi Chen Committed by GitHub

[SCAN/Refactor] Refactor scan interface, enable fix point analysis. (#47)

parent 5198c100
......@@ -152,14 +152,12 @@ Tensor compute(Array<Expr> shape, FCompute fcompute, std::string name = "tensor"
/*!
 * \brief Construct new tensors by scanning over the scan axis.
*
* \param scan_axis The iteration representing the scan.
 * \param init The initial state tensors for the first K timesteps.
 * \param update The update tensors, indicating the updated result after each timestamp.
* \param state_placeholder The placeholder for the states.
* \param name The optional name of the tensor.
*/
Array<Tensor> scan(IterVar scan_axis,
Array<Tensor> init,
Array<Tensor> scan(Array<Tensor> init,
Array<Tensor> update,
Array<Tensor> state_placeholder,
std::string name = "scan");
......
......@@ -26,7 +26,9 @@ enum AttachType : int {
kNone = 0,
kRoot = 1,
kInline = 2,
kScope = 3
kInlinedAlready = 3,
kScope = 4,
kScanUpdate = 5
};
/*! \brief IterVar type */
......
......@@ -175,6 +175,8 @@ class OperationNode : public FunctionBaseNode {
virtual Type output_dtype(size_t i) const = 0;
/*! \return shape of i-th output */
virtual Array<Expr> output_shape(size_t i) const = 0;
static constexpr const char* _type_key = "Operation";
};
// Implementations of inline functions
......
......@@ -4,7 +4,7 @@ import sys
import tempfile
import subprocess
def compile_source(code, target="cubin"):
def compile_source(code, target="cubin", options=None):
"""Compile cuda code with NVCC from env.
Parameters
......@@ -12,9 +12,12 @@ def compile_source(code, target="cubin"):
code : str
The cuda code.
target: str
target : str
The target format
options : list of str
The additional options to pass to nvcc.
Return
------
cubin : bytearray
......@@ -32,6 +35,8 @@ def compile_source(code, target="cubin"):
cmd = ["nvcc"]
cmd += ["--%s" % target, "-O3"]
cmd += ["-o", path_target]
if options:
cmd += options
cmd += [path_code]
args = ' '.join(cmd)
......
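Since the new options argument is appended to the nvcc command with cmd += options, it is expected to be a list of flag strings rather than a single string. A minimal usage sketch; the kernel source and the -arch value are illustrative only, and compile_source is assumed to be imported from the module shown in this hunk:

# hypothetical kernel source; any valid CUDA C would do here
code = """
extern "C" __global__ void add_one(float* x) {
  x[threadIdx.x] += 1.0f;
}
"""
# extra nvcc flags go in as a list; the architecture flag is only an example
cubin = compile_source(code, target="cubin", options=["-arch=sm_52"])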
......@@ -140,14 +140,11 @@ def compute(shape, fcompute, name="compute"):
return op_node.output(0)
def scan(axis, init, update, state_placeholder, name="scan"):
def scan(init, update, state_placeholder, name="scan"):
"""Construct new tensors by scanning over axis.
Parameters
----------
axis: IterVar
The scanning axis.
init: Tensor or list of Tensor
The initial condition for the first init.shape[0] timestamps
......@@ -170,12 +167,11 @@ def scan(axis, init, update, state_placeholder, name="scan"):
# The following code is equivalent to numpy.cumsum
m = tvm.Var("m")
n = tvm.Var("n")
t = tvm.IterVar((1, m), name="t")
X = tvm.placeholder((m, n), name="X")
s_state = tvm.placeholder((m, n))
s_init = tvm.compute((1, n), lambda _, i: X[0, i])
s_update = tvm.compute((n,), lambda i: s_state[t-1, i] + X[t, i])
res = tvm.scan(t, s_init, s_update, s_state)
s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])
res = tvm.scan(s_init, s_update, s_state)
"""
if isinstance(init, _tensor.Tensor):
init = [init]
......@@ -185,6 +181,7 @@ def scan(axis, init, update, state_placeholder, name="scan"):
state_placeholder = [state_placeholder]
if len(init) != len(update) or len(init) != len(state_placeholder):
raise ValueError("init, update, state_placeholder must have same length")
axis = IterVar((init[0].shape[0], update[0].shape[0]), "%s.idx" % name)
op = _api_internal._ScanOp(name, axis, init, update, state_placeholder)
res = [op.output(i) for i in range(len(update))]
return (res[0] if len(res) == 1 else res)
......
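The scan axis is no longer supplied by the caller; it is derived as the half-open range [init[0].shape[0], update[0].shape[0]). A short sketch checking this for the cumsum example above; the scan_axis attribute and Range fields accessed here follow the names used in the tests added by this commit:

import tvm

m = tvm.Var("m")
n = tvm.Var("n")
X = tvm.placeholder((m, n), name="X")
s_state = tvm.placeholder((m, n))
s_init = tvm.compute((1, n), lambda _, i: X[0, i])
s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])
res = tvm.scan(s_init, s_update, s_state)
assert tuple(res.shape) == (m, n)
# the derived time axis: min = init[0].shape[0] = 1, extent = m - 1
print(res.op.scan_axis.dom)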
......@@ -63,7 +63,8 @@ def build(sch,
arg_list.append(x)
else:
raise ValueError("args must be Tensor, Buffer or Var")
# lowering
# normalize schedule first
sch.normalize()
bounds = schedule.InferBound(sch)
stmt = schedule.ScheduleOps(sch, bounds)
stmt = ir_pass.StorageFlatten(stmt, binds)
......
......@@ -34,6 +34,9 @@ TVM_REGISTER_API(_schedule_AutoInlineElemWise)
REGISTER_SCHEDULE_PASS1(InferBound);
REGISTER_SCHEDULE_PASS1(CreateReadGraph);
REGISTER_SCHEDULE_PASS2(PostDFSOrder);
REGISTER_SCHEDULE_PASS1(ScanGetBody);
REGISTER_SCHEDULE_PASS1(CreateAttachPath);
REGISTER_SCHEDULE_PASS2(ScanFixPointAnalysis);
REGISTER_SCHEDULE_PASS2(ScheduleOps);
} // namespace schedule
......
......@@ -166,7 +166,15 @@ IntSet Union(const Array<IntSet>& set) {
if (set.size() == 1) return set[0];
Interval x = set[0].cover_interval().as<IntervalSet>()->i;
for (size_t i = 1; i < set.size(); ++i) {
x.include(set[i].cover_interval().as<IntervalSet>()->i);
IntSet s = set[i].cover_interval();
const Interval& y = s.as<IntervalSet>()->i;
if (can_prove(x.max + 1 >= y.min)) {
x.max = y.max;
} else if (can_prove(y.max + 1 >= x.min)) {
x.min = y.min;
} else {
x.include(y);
}
}
return IntervalSet::make(x);
}
......
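The extra can_prove branches let two bound sets that touch each other fuse into one exact interval rather than a loose cover. A toy arithmetic check of the rule for the scan case in this commit, written with plain Python integers (not the tvm::arith API) and a concrete m = 10:

# s_init writes rows [0, 1) and s_update writes rows [1, 10) of the state tensor.
init_rows = (0, 0)      # closed interval [min, max]
update_rows = (1, 9)
# adjacency test used above: x.max + 1 >= y.min
assert init_rows[1] + 1 >= update_rows[0]
fused = (init_rows[0], update_rows[1])
assert fused == (0, 9)  # one exact contiguous range covering all 10 rows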
......@@ -51,8 +51,6 @@ Operation PlaceholderOpNode::make(std::string name,
return Operation(n);
}
Tensor placeholder(Array<Expr> shape, Type dtype, std::string name) {
return PlaceholderOpNode::make(name, shape, dtype).output(0);
}
......@@ -162,24 +160,25 @@ Operation ScanOpNode::make(std::string name,
<< " scan_axis.dom.min + scan_axis.dom.extent";
CHECK_EQ(state_placeholder[i].ndim(), init[i].ndim())
<< "The dimension of init need to match state_placeholder";
CHECK_EQ(update[i].ndim() + 1, state_placeholder[i].ndim())
CHECK_EQ(update[i].ndim(), state_placeholder[i].ndim())
<< "The update.ndim need to be state_placeholder.ndim - 1";
for (size_t k = 0; k < update[i].ndim(); ++k) {
CHECK(prove_equal(
update[i]->shape[k], state_placeholder[i]->shape[k + 1]));
// setup spatial axis
std::ostringstream spatial_name;
spatial_name << name << ".out" << i << ".i" << k + 1;
n->spatial_axis_.push_back(
IterVar(Range::make_with_min_extent(0, update[i]->shape[k]),
spatial_name.str()));
update[i]->shape[k], state_placeholder[i]->shape[k]));
if (k != 0) {
// setup spatial axis
std::ostringstream spatial_name;
spatial_name << name << ".out" << i << ".i" << k;
n->spatial_axis_.push_back(
IterVar(Range::make_with_min_extent(0, update[i]->shape[k]),
spatial_name.str()));
}
}
for (size_t k = 1; k < init[i].ndim(); ++k) {
CHECK(prove_equal(
init[i]->shape[k], state_placeholder[i]->shape[k]));
}
}
n->name = name;
n->scan_axis = axis;
n->init = init;
......@@ -188,11 +187,14 @@ Operation ScanOpNode::make(std::string name,
return Operation(n);
}
Array<Tensor> scan(IterVar scan_axis,
Array<Tensor> init,
Array<Tensor> scan(Array<Tensor> init,
Array<Tensor> update,
Array<Tensor> state_placeholder,
std::string name) {
IterVar scan_axis(
Range::make_with_min_extent(
init[0]->shape[0], update[0]->shape[0] - init[0]->shape[0]),
name + ".idx");
Operation op = ScanOpNode::make(
name, scan_axis, init, update, state_placeholder);
Array<Tensor> res;
......
......@@ -61,7 +61,9 @@ Stmt Inline(Stmt stmt,
Expr body) {
CHECK_EQ(f->num_outputs(), 1)
<< "can only inline output single value operation";
return ConvertSSA(IRInline(f, args, body).Mutate(stmt));
Stmt ret = IRInline(f, args, body).Mutate(stmt);
if (ret.same_as(stmt)) return ret;
return ConvertSSA(ret);
}
} // namespace ir
} // namespace tvm
/*!
* Copyright (c) 2016 by Contributors
* \file bound.cc
......@@ -259,11 +260,14 @@ void BoundProp(const Operation& op,
init_dom->data[0].push_back(IntSet::range(
Range::make_with_min_extent(0, scan->init[i]->shape[0])));
}
if (update_dom) {
update_dom->data[0].push_back(dom_map.at(scan->scan_axis->var.get()));
}
// The update dimensions
for (size_t k = 0; k < scan->update[i]->shape.size(); ++k, ++sp_idx) {
for (size_t k = 1; k < scan->update[i]->shape.size(); ++k, ++sp_idx) {
IterVar sp_ax = scan->spatial_axis_[sp_idx];
if (init_dom) {
init_dom->data[k + 1].push_back(dom_map.at(sp_ax->var.get()));
init_dom->data[k].push_back(dom_map.at(sp_ax->var.get()));
}
if (update_dom) {
update_dom->data[k].push_back(dom_map.at(sp_ax->var.get()));
......@@ -277,10 +281,12 @@ void BoundProp(const Operation& op,
}
}
// Given the bound of output of op
// Pass the bound to the related axis in op.
void GatherOpBound(const ScanOpNode* scan,
const Operation& op,
const FeedGraph& fg,
const std::unordered_map<Tensor, TensorDom>& tmap,
std::unordered_map<IterVar, Range>* rmap) {
CHECK(!rmap->count(scan->scan_axis));
......@@ -299,21 +305,29 @@ void GatherOpBound(const ScanOpNode* scan,
Range r = arith::Union(time_dom).cover_range(sdom);
(*rmap)[scan->scan_axis] = Range::make_with_min_extent(
sdom->min, ir::Simplify(r->extent + r->min - sdom->min));
Array<Operation> body = ScanGetBody_(scan, fg);
Map<IterVar, Expr> fix_pt = ScanFixPointAnalysis(op, body);
// Update for spatial axis.
size_t sp_idx = 0;
for (size_t i = 0; i < output.size(); ++i) {
for (size_t k = 0; k < scan->update[i]->shape.size(); ++k, ++sp_idx) {
const TensorDom& d = tmap.at(output[i]);
for (size_t k = 1; k < scan->update[i]->shape.size(); ++k, ++sp_idx) {
IterVar sp_ax = scan->spatial_axis_[sp_idx];
CHECK(!rmap->count(sp_ax));
// By default, we always need all spatial axes,
// unless the axis only refers back to itself as a fixed point.
// TODO(tqchen): Add fix point detection.
(*rmap)[sp_ax] = sp_ax->dom;
CHECK(fix_pt.count(sp_ax));
if (fix_pt[sp_ax].as<ir::IntImm>()->value) {
// fix point, we can slice it.
(*rmap)[sp_ax] = arith::Union(d.data[k + 1]).cover_range(sp_ax->dom);
} else {
// not a fix point, need to include everything.
(*rmap)[sp_ax] = sp_ax->dom;
}
}
}
}
void GatherOpBound(const Operation& op,
const FeedGraph& fg,
const std::unordered_map<Tensor, TensorDom>& tmap,
std::unordered_map<IterVar, Range>* rmap) {
if (op.as<ComputeOpNode>()) {
......@@ -329,7 +343,7 @@ void GatherOpBound(const Operation& op,
(*rmap)[compute->reduce_axis[i]] = compute->reduce_axis[i]->dom;
}
} else if (op.as<ScanOpNode>()) {
GatherOpBound(op.as<ScanOpNode>(), op, tmap, rmap);
GatherOpBound(op.as<ScanOpNode>(), op, fg, tmap, rmap);
} else if (op.as<PlaceholderOpNode>()) {
// do nothing
} else {
......@@ -347,20 +361,14 @@ inline bool ScopeRelax(const IterVar& iv, const std::string& scope) {
return StorageScope::make(scope).rank <= ThreadScope::make(iv->thread_tag).rank;
}
// The map between a tensor and the operations it feeds to
using FeedGraph = std::unordered_map<Tensor, std::vector<Operation> >;
// AttachPath maps op -> the list of IterVars that represents
// the loop nest the op sits in, from innermost to outermost
using AttachPath = Map<Operation, Array<IterVar> >;
void InferRootBound(const Stage& stage,
const FeedGraph& feed_graph,
const AttachPath& attach_path,
std::unordered_map<IterVar, Range>* rmap) {
if (stage->attach_type == kInline) return;
if (stage->attach_type == kRoot || stage->attach_type == kNone) {
CHECK_NE(stage->attach_type, kInline)
<< "call schedule.normalize before scheduleops";
if (stage->attach_type == kInlinedAlready) return;
if (stage->is_output || stage->op.as<PlaceholderOpNode>()) {
for (auto iv : OutputRelatedIterVars(stage->op)) {
CHECK(iv->dom.defined());
CHECK(!rmap->count(iv));
......@@ -368,11 +376,11 @@ void InferRootBound(const Stage& stage,
}
return;
}
// Infer root bounds for the attached node.
CHECK_EQ(stage->attach_type, kScope);
Stage parent = stage->attach_stage;
CHECK(parent.defined());
// parent stage, if any
Stage parent;
if (stage->attach_type == kScope || stage->attach_type == kScanUpdate) {
parent = stage->attach_stage;
}
// The tensor domain.
std::unordered_map<Tensor, TensorDom> tmap;
// consumers other than parent
......@@ -385,7 +393,7 @@ void InferRootBound(const Stage& stage,
auto it = feed_graph.find(t);
if (it != feed_graph.end()) {
for (const Operation& op : it->second) {
if (op != parent->op) {
if (!parent.defined() || op != parent->op) {
consumers.insert(op);
} else {
direct_consume_by_parent = true;
......@@ -404,16 +412,20 @@ void InferRootBound(const Stage& stage,
relax_set[iv->var.get()] = IntSet::range(rmap->at(iv));
}
}
if (direct_consume_by_parent) {
// parent stage if exist
Stage parent = stage->attach_stage;
// Bound inference logics in parent.
std::unordered_map<IterVar, IntSet> up_state;
bool fix_value = true;
for (auto iv : parent->leaf_iter_vars) {
Range vrange = rmap->at(iv);
auto it = rmap->find(iv);
CHECK(it != rmap->end());
Range vrange = it->second;
CHECK(is_zero(vrange->min))
<< "InferBound requires every leaf iter var's min equals 0, "
<< "call schedule.normalize to achieve this.";
<< " call schedule.normalize to achieve this. "
<< " stage=" << parent;
// special optimization to remove trivial loop
if (is_one(vrange->extent)) {
up_state[iv] = IntSet::single_point(vrange->min);
......@@ -464,8 +476,9 @@ void InferRootBound(const Stage& stage,
for (const Operation& op : consumers) {
std::unordered_map<const Variable*, IntSet> dom_map;
bool found = false;
Array<IterVar> attach = attach_path.at(stage->op);
for (IterVar iv : attach_path.at(op)) {
if (iv == stage->attach_ivar) {
if (attach.size() != 0 && iv == attach[0]) {
found = true; break;
}
Range vrange = rmap->at(iv);
......@@ -474,7 +487,7 @@ void InferRootBound(const Stage& stage,
<< "call schedule.normalize to achieve this.";
relax_set[iv->var.get()] = IntSet::range(vrange);
}
CHECK(found)
CHECK(found || attach.size() == 0)
<< "Invalid Schedule, cannot find the producer " << stage->op
<< " along the loop nest specified by compute_at of consumer " << op;
for (auto iv : OutputRelatedIterVars(op)) {
......@@ -483,50 +496,15 @@ void InferRootBound(const Stage& stage,
}
BoundProp(op, dom_map, &tmap);
}
GatherOpBound(stage->op, tmap, rmap);
GatherOpBound(stage->op, feed_graph, tmap, rmap);
}
FeedGraph CreateFeedGraph(const Schedule& sch) {
Map<IterVar, Range> InferBound(const Schedule& sch) {
Array<Operation> roots;
for (Operation op : sch->outputs) {
roots.push_back(sch->stage_map[op]->op);
}
auto g = CreateReadGraph(roots);
FeedGraph fg;
for (auto kv : g) {
for (Tensor t : kv.second) {
fg[t].push_back(kv.first);
}
}
return fg;
}
// Create AttachPath that maps op -> the list of IterVars
// that represents the loop nest the op sits in, from innermost to outermost
AttachPath CreateAttachPath(const Schedule& sch) {
AttachPath ret;
for (Stage stage : sch->stages) {
Array<IterVar> path;
for (Stage s = stage; s->attach_type == kScope;) {
IterVar attach_ivar = s->attach_ivar;
s = s->attach_stage;
bool start_attach = false;
for (size_t i = s->leaf_iter_vars.size(); i != 0; --i) {
IterVar iv = s->leaf_iter_vars[i - 1];
if (iv == attach_ivar) start_attach = true;
if (start_attach) path.push_back(iv);
}
CHECK(start_attach)
<< "Invalid Schedule: cannot find attach point " << attach_ivar
<< " in the schedule of " << s->op;
}
ret.Set(stage->op, path);
}
return ret;
}
Map<IterVar, Range> InferBound(const Schedule& sch) {
FeedGraph feed_graph = CreateFeedGraph(sch);
FeedGraph feed_graph = CreateFeedGraph(CreateReadGraph(roots));
AttachPath attach_path = CreateAttachPath(sch);
std::unordered_map<IterVar, Range> ret;
......@@ -535,6 +513,11 @@ Map<IterVar, Range> InferBound(const Schedule& sch) {
InferRootBound(stage, feed_graph, attach_path, &ret);
// pass down to get bound of all iter vars.
PassDown(stage, &ret);
// set up the outermost threads.
for (IterVar iv : stage->outermost_threads) {
CHECK(iv->dom.defined());
ret[iv] = iv->dom;
}
}
return Map<IterVar, Range>(ret.begin(), ret.end());
}
......
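With CreateAttachPath and the scan fixed-point analysis wired into bound inference, a reader cached inside the scan update can now be bounded to just the slice it needs, and the schedule must be normalized before InferBound. The sketch below mirrors the test_bound_scan test added in this commit:

import tvm

m = tvm.Var("m")
n = tvm.Var("n")
X = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
s_state = tvm.placeholder((m, n))
s_init = tvm.compute((1, n), lambda _, i: X[0, i])
s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])
s_scan = tvm.scan(s_init, s_update, s_state)

s = tvm.Schedule(s_scan.op)
XX = s.cache_read(X, "local", s_update)
xo, xi = s[s_update].split(s_update.op.axis[1], factor=4)
s[XX].compute_at(s[s_update], xo)

s.normalize()                              # required before bound inference
bounds = tvm.schedule.InferBound(s)
stmt = tvm.schedule.ScheduleOps(s, bounds)
# the spatial axis is a fixed point, so the cached read is sliced to the split factor
assert bounds[XX.op.axis[1]].extent.value == 4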
......@@ -9,6 +9,7 @@
#include <tvm/expr.h>
#include <tvm/schedule.h>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace tvm {
......@@ -20,6 +21,16 @@ namespace schedule {
using ReadGraph = Map<Operation, Array<Tensor> >;
/*!
 * \brief The map between a tensor and the operations it feeds to
*/
using FeedGraph = std::unordered_map<Tensor, std::vector<Operation> >;
/*!
 * \brief AttachPath maps op -> the list of IterVars representing the loop nest the op sits in, from innermost to outermost.
*/
using AttachPath = Map<Operation, Array<IterVar> >;
/*!
* \brief Get read graph of each operation to all the
* Tensors that it directly depends on.
*
......@@ -41,6 +52,49 @@ ReadGraph CreateReadGraph(const Array<Operation>& roots);
Array<Operation> PostDFSOrder(
const Array<Operation>& roots, const ReadGraph& g);
/*!
 * \brief Create a feed graph from the given read graph.
 * \param g The read graph.
 * \return The created feed graph.
*/
FeedGraph CreateFeedGraph(const ReadGraph& g);
/*!
 * \brief Create an AttachPath that maps op -> the list of IterVars
 *  representing the loop nest the op sits in, from innermost to outermost.
* Also inserts attach_stage for scan updates when needed.
*
* \param sch The schedule.
* \return The attach path.
*/
AttachPath CreateAttachPath(Schedule sch);
/*!
* \brief Get all operations inside the recursion of scan.
* \param scan The scan node.
* \param feed_graph The feed graph to help analysis.
* \return The body operations, in read dependency order.
*/
Array<Operation> ScanGetBody_(
const ScanOpNode* scan, const FeedGraph& feed_graph);
// Same as ScanGetBody_, but creates the FeedGraph internally.
Array<Operation> ScanGetBody(const Operation& scan);
/*!
 * \brief Analyze each spatial dimension of the scan's result
 *  and check whether each dimension is a fixed point.
 *  An axis is a fixed point if it only refers back to itself in the recursion
 *  and is not used in the axes of other recursion fields.
*
 * next_state[t, ..., axis, ...] = f(prev_state[t-1, ..., axis, ...])
*
* \param scan The scan node.
* \param body The body of scan, sorted in reverse PostDFSOrder.
* \return Map of spatial_axis -> IntImm
*/
Map<IterVar, Expr> ScanFixPointAnalysis(
const Operation& scan, const Array<Operation>& body);
} // namespace schedule
} // namespace tvm
......
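A short sketch of the two analyses declared above, following the new test_scan_fix_point test: when every spatial location only reads its own position at t-1, both spatial axes are reported as fixed points.

import tvm

l = tvm.Var("l")
m = tvm.Var("m")
n = tvm.Var("n")
x = tvm.compute((l, m, n), lambda *i: tvm.const(1, "float32"), name="x")
s_state = tvm.placeholder((l, m, n))
s_init = tvm.compute((1, m, n), lambda _, i, j: x[0, i, j], name="s_init")
# position (i, j) only reads itself at t-1, so both spatial axes are fixed points
s_update = tvm.compute((l, m, n),
                       lambda t, i, j: x[t, j, i] + s_state[t-1, i, j], name="update")
s_scan = tvm.scan(s_init, s_update, s_state)

body = tvm.schedule.ScanGetBody(s_scan.op)
fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body)
assert fxpt[s_scan.op.spatial_axis_[0]].value == 1
assert fxpt[s_scan.op.spatial_axis_[1]].value == 1
# indexing the state as s_state[t-1, j, i] instead would drive both values to 0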
......@@ -369,7 +369,7 @@ Stmt MakeRealize(const ScanOpNode* op,
CHECK_EQ(static_cast<size_t>(t->value_index), i);
Halide::Internal::Region bounds;
bounds.push_back(tdom);
for (size_t k = 0; k < op->update[i]->shape.size(); ++k, ++sp_idx) {
for (size_t k = 1; k < op->update[i]->shape.size(); ++k, ++sp_idx) {
IterVar sp_ax = op->spatial_axis_[sp_idx];
bounds.push_back(dom_map.at(sp_ax));
}
......@@ -561,6 +561,7 @@ class InjectScanStep : public IRMutator {
Stmt InjectInline(const Operation op, Stmt body) {
CHECK(body.defined());
const ComputeOpNode* compute = op.as<ComputeOpNode>();
CHECK(compute != nullptr)
<< "can only inline compute op";
......@@ -614,7 +615,7 @@ class SchedulePostProc : public IRMutator {
if (it->second.defined()) {
Stmt ret = AttrStmt::make(
it->second, op->type_key, op->value, op->body);
return this->Mutate_(ret.as<AttrStmt>(), ret);
return this->Mutate(ret);
} else {
return this->Mutate(op->body);
}
......@@ -631,7 +632,7 @@ class SchedulePostProc : public IRMutator {
Stmt ret = Realize::make(
it->second->op, it->second->value_index,
op->type, op->bounds, op->condition, op->body);
return this->Mutate_(ret.as<Realize>(), ret);
return this->Mutate(ret);
} else {
return this->Mutate(op->body);
}
......@@ -644,11 +645,10 @@ class SchedulePostProc : public IRMutator {
TensorKey key{op->func, op->value_index};
auto it = replace_buffer_.find(key);
if (it != replace_buffer_.end()) {
const Tensor& dst = it->second.first;
const Tensor& dst = it->second;
Stmt ret = Provide::make(
dst->op, dst->value_index, op->value,
RewriteArgs(it->second.second, op->args));
return IRMutator::Mutate_(ret.as<Provide>(), ret);
dst->op, dst->value_index, op->value, op->args);
return this->Mutate(ret);
} else {
return IRMutator::Mutate_(op, s);
}
......@@ -659,12 +659,11 @@ class SchedulePostProc : public IRMutator {
TensorKey key{op->func, op->value_index};
auto it = replace_buffer_.find(key);
if (it != replace_buffer_.end()) {
const Tensor& dst = it->second.first;
const Tensor& dst = it->second;
Expr ret = Call::make(
op->type, dst->op->name,
RewriteArgs(it->second.second, op->args),
op->type, dst->op->name, op->args,
op->call_type, dst->op, dst->value_index);
return IRMutator::Mutate_(ret.as<Call>(), ret);
return this->Mutate(ret);
}
}
return IRMutator::Mutate_(op, e);
......@@ -685,14 +684,14 @@ class SchedulePostProc : public IRMutator {
const ScanOpNode* scan = s->op.as<ScanOpNode>();
for (size_t i = 0; i < scan->update.size(); ++i) {
Tensor t = s->origin_op.output(i);
AddReplace(scan->init[i], t, Expr());
AddReplace(scan->update[i], t, scan->scan_axis->var);
AddReplace(scan->state_placeholder[i], t, Expr());
AddReplace(scan->init[i], t);
AddReplace(scan->update[i], t);
AddReplace(scan->state_placeholder[i], t);
}
} else if (!s->op.same_as(s->origin_op)) {
Tensor target = s->origin_op.output(0);
AddReplace(s->op.output(0), target,
Expr(), target, s->origin_op);
target, s->origin_op);
}
}
}
......@@ -700,26 +699,17 @@ class SchedulePostProc : public IRMutator {
private:
void AddReplace(Tensor src,
Tensor dst,
Expr head_idx,
Tensor repl_realize = Tensor(),
Operation repl_op = Operation()) {
TensorKey key{src->op, src->value_index};
replace_buffer_[key] = std::make_pair(dst, head_idx);
replace_buffer_[key] = dst;
replace_realize_[key] = repl_realize;
replace_op_[src->op.get()] = repl_op;
}
Array<Expr> RewriteArgs(Expr head, Array<Expr> args) {
if (!head.defined()) return args;
Array<Expr> new_args{head};
for (Expr e : args) {
new_args.push_back(e);
}
return new_args;
}
// The scan value
std::unordered_map<const Variable*, Expr> var_value_;
// buffer replacement
std::unordered_map<TensorKey, std::pair<Tensor, Expr> > replace_buffer_;
std::unordered_map<TensorKey, Tensor> replace_buffer_;
// buffer realization to be replaced
std::unordered_map<TensorKey, Tensor> replace_realize_;
// replace producer consumer.
......@@ -755,10 +745,13 @@ Stmt ScheduleOps(
// reverse the post DFS order.
for (size_t i = sch->stages.size(); i != 0; --i) {
Stage s = sch->stages[i - 1];
CHECK_NE(s->attach_type, kInline)
<< "call schedule.normalize before scheduleops";
// no need to specify placeholder op.
if (s->op.as<PlaceholderOpNode>()) continue;
if (scan_attach.count(s->op)) {
CHECK(s->attach_type == kNone || s->attach_type == kInline)
CHECK(s->attach_type == kNone ||
s->attach_type == kScanUpdate)
<< "Cannot specify compute_at for scan's init/update";
CHECK(body.defined());
const auto& p = scan_attach.at(s->op);
......@@ -766,8 +759,8 @@ Stmt ScheduleOps(
body = mu.Mutate(body);
CHECK(mu.found_attach)
<< "did not find attachment point for scan.init/update";
} else if (s->attach_type == kInline) {
body = InjectInline(s->op, body);
} else if (s->attach_type == kInlinedAlready) {
// do nothing
} else if (s->attach_type == kRoot || s-> attach_type == kNone) {
body = MakePipeline(s, dom_map, body);
} else if (s->attach_type == kScope) {
......
......@@ -8,8 +8,8 @@ def test_scan():
X = tvm.placeholder((m, n), name="X")
s_state = tvm.placeholder((m, n))
s_init = tvm.compute((1, n), lambda _, i: X[0, i])
s_update = tvm.compute((n,), lambda i: s_state[t-1, i] + X[t, i])
res = tvm.scan(t, s_init, s_update, s_state)
s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])
res = tvm.scan(s_init, s_update, s_state)
# schedule
s = tvm.Schedule(res.op)
......@@ -18,7 +18,7 @@ def test_scan():
thread_x = tvm.IterVar((0, num_thread), thread_tag="threadIdx.x")
_, x = s[s_init].split(s_init.op.axis[1], factor=num_thread, outer=block_x)
_, x = s[s_init].split(x, outer=thread_x)
_, x = s[s_update].split(s_update.op.axis[0], factor=num_thread, outer=block_x)
_, x = s[s_update].split(s_update.op.axis[1], factor=num_thread, outer=block_x)
_, x = s[s_update].split(x, outer=thread_x)
# one line to build the function.
......
......@@ -40,9 +40,8 @@ def test_tensor_scan():
t = tvm.IterVar((1, m), "t")
x = tvm.placeholder((m, n))
s = tvm.placeholder((m, n))
res = tvm.scan(t,
tvm.compute((1, n), lambda _, i: x[0, i]),
tvm.compute((n,), lambda i: s[t-1, i] + x[t, i]),
res = tvm.scan(tvm.compute((1, n), lambda _, i: x[0, i]),
tvm.compute((m, n), lambda t, i: s[t-1, i] + x[t, i]),
s)
assert tuple(res.shape) == (m, n)
......
......@@ -50,25 +50,30 @@ def test_bound3():
assert(bounds[A1.op.axis[0]].extent.value==32)
assert(bounds[A1.op.axis[1]].extent.value==16)
def test_bound_scan():
m = tvm.Var("m")
n = tvm.Var("n")
X = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
s_state = tvm.placeholder((m, n))
s_init = tvm.compute((1, n), lambda _, i: X[0, i])
s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])
s_scan = tvm.scan(s_init, s_update, s_state)
def test_create_read_graph():
m = tvm.Var('m')
l = tvm.Var('l')
A = tvm.placeholder((m, l), name='A')
A1 = tvm.compute((m, l), lambda i, j: A[i, j])
A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3)
assert tuple(s_scan.shape) == (m, n)
g = tvm.schedule.CreateReadGraph([A2.op])
s = tvm.Schedule(s_scan.op)
XX = s.cache_read(X, "local", s_update)
xo, xi = s[s_update].split(s_update.op.axis[1], factor=4)
s[XX].compute_at(s[s_update], xo)
assert g[A2.op][0] == A1
assert g[A1.op][0] == A
post_order = tvm.schedule.PostDFSOrder([A2.op], g)
assert(post_order[0] == A.op)
assert(post_order[1] == A1.op)
s.normalize()
bounds = tvm.schedule.InferBound(s)
stmt = tvm.schedule.ScheduleOps(s, bounds)
assert bounds[XX.op.axis[1]].extent.value == 4
if __name__ == "__main__":
test_create_read_graph()
test_bound_scan()
test_bound3()
test_bound1()
test_bound2()
import tvm
def test_scan():
m = tvm.Var("m")
n = tvm.Var("n")
x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
s_state = tvm.placeholder((m, n))
s_init = tvm.compute((1, n), lambda _, i: x[0, i], name="s_init")
x_trans = tvm.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans")
s_up1 = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1")
s_update = tvm.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update")
s_scan = tvm.scan(s_init, s_update, s_state)
def test_getbody():
body = tvm.schedule.ScanGetBody(s_scan.op)
assert set(body) == set([s_scan.op, s_update.op, s_up1.op])
def test_attach_path():
s = tvm.Schedule(s_scan.op)
s[x_trans].compute_at(s[s_update], s_update.op.axis[0])
apath = tvm.schedule.CreateAttachPath(s)
assert(tuple(apath[s_update.op]) == tuple([s_scan.op.scan_axis]))
assert(tuple(apath[x_trans.op]) == tuple([s_update.op.axis[0], s_scan.op.scan_axis]))
def test_fix_pt():
body = tvm.schedule.ScanGetBody(s_scan.op)
fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body)
assert(fxpt[s_scan.op.spatial_axis_[0]].value != 0)
def test_scan_fix_point():
m = tvm.Var("m")
n = tvm.Var("n")
l = tvm.Var("l")
x = tvm.compute((l, m, n), lambda *i: tvm.const(1, "float32"), name="x")
s_state = tvm.placeholder((l, m, n))
s_init = tvm.compute((1, m, n), lambda _, i, j: x[0, i, j], name="s_init")
def test_scan0():
s_update = tvm.compute((l, m, n),
lambda t, i, j: x[t, j, i] + s_state[t-1, i, j], name="update")
s_scan = tvm.scan(s_init, s_update, s_state)
body = tvm.schedule.ScanGetBody(s_scan.op)
fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body)
assert(fxpt[s_scan.op.spatial_axis_[0]].value == 1)
assert(fxpt[s_scan.op.spatial_axis_[1]].value == 1)
def test_scan1():
s_update = tvm.compute((l, m, n),
lambda t, i, j: x[t, j, i] + s_state[t-1, j, i], name="update")
s_scan = tvm.scan(s_init, s_update, s_state)
body = tvm.schedule.ScanGetBody(s_scan.op)
fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body)
assert(fxpt[s_scan.op.spatial_axis_[0]].value == 0)
assert(fxpt[s_scan.op.spatial_axis_[1]].value == 0)
def test_scan3_not_exact_reach():
s_h1 = tvm.compute((l, n, m), lambda t, j, i: s_state[t-1, i, j], name="h1")
s_h2 = tvm.compute((l, m, n), lambda t, i, j: s_state[t-1, i, 10] * 2, name="h1")
s_update = tvm.compute((l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update")
s_scan = tvm.scan(s_init, s_update, s_state)
body = tvm.schedule.ScanGetBody(s_scan.op)
fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body)
assert(fxpt[s_scan.op.spatial_axis_[0]].value == 1)
assert(fxpt[s_scan.op.spatial_axis_[1]].value == 0)
def test_scan4_reach_other():
s_h1 = tvm.compute((l, n, m), lambda t, j, i: s_state[t-1, j, j], name="h1")
s_h2 = tvm.compute((l, m, n), lambda t, i, j: s_state[t-1, i, j] * 2, name="h1")
s_update = tvm.compute((l, m, n),
lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update")
s_scan = tvm.scan(s_init, s_update, s_state)
body = tvm.schedule.ScanGetBody(s_scan.op)
fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body)
assert(fxpt[s_scan.op.spatial_axis_[0]].value == 0)
assert(fxpt[s_scan.op.spatial_axis_[1]].value == 0)
test_scan0()
test_scan1()
test_scan3_not_exact_reach()
test_scan4_reach_other()
def test_create_read_graph():
m = tvm.Var('m')
l = tvm.Var('l')
A = tvm.placeholder((m, l), name='A')
A1 = tvm.compute((m, l), lambda i, j: A[i, j])
A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3)
g = tvm.schedule.CreateReadGraph([A2.op])
assert g[A2.op][0] == A1
assert g[A1.op][0] == A
post_order = tvm.schedule.PostDFSOrder([A2.op], g)
assert(post_order[0] == A.op)
assert(post_order[1] == A1.op)
if __name__ == "__main__":
test_scan()
test_create_read_graph()
test_scan_fix_point()
......@@ -43,13 +43,11 @@ def test_schedule2():
def test_schedule_scan():
m = tvm.Var("m")
n = tvm.Var("n")
l = tvm.Var("l")
t = tvm.IterVar((1, m), name="t")
x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
s_state = tvm.placeholder((m, n))
s_init = tvm.compute((1, n), lambda _, i: x[0, i])
s_update = tvm.compute((n,), lambda i: s_state[t-1, i] + x[t, i])
res = tvm.scan(t, s_init, s_update, s_state)
s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i])
res = tvm.scan(s_init, s_update, s_state)
assert tuple(res.shape) == (m, n)
s = tvm.Schedule(res.op)
......@@ -59,7 +57,6 @@ def test_schedule_scan():
stmt = tvm.schedule.ScheduleOps(s, bounds)
print(stmt)
def test_auto_inline():
m = tvm.Var('m')
n = tvm.Var('n')
......@@ -71,9 +68,27 @@ def test_auto_inline():
s = tvm.Schedule(T2.op)
tvm.schedule.AutoInlineElemWise(s)
s.normalize()
bounds = tvm.schedule.InferBound(s)
stmt = tvm.schedule.ScheduleOps(s, bounds)
def test_inline_mixed():
n = tvm.Var('n')
A = tvm.placeholder((n, ), name='A')
A1 = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='A1')
A2 = tvm.compute(A.shape, lambda *i: A1(*i) + 2, name='A2')
C = tvm.compute((n,), lambda i: A2[i] + A1[i], name='C')
s = tvm.Schedule(C.op)
xo, xi = s[C].split(C.op.axis[0], factor=8)
s[A1].compute_at(s[C], xo)
s[A2].compute_inline()
s.normalize()
bounds = tvm.schedule.InferBound(s)
stmt = tvm.schedule.ScheduleOps(s, bounds)
print(stmt)
def test_schedule_cache():
m = tvm.Var('m')
n = tvm.Var('n')
......@@ -90,9 +105,10 @@ def test_schedule_cache():
if __name__ == "__main__":
test_inline_mixed()
test_auto_inline()
test_schedule_scan()
test_schedule0()
test_schedule1()
test_schedule2()
test_auto_inline()
test_schedule_cache()