Commit 45597d00 by Tianqi Chen, committed by GitHub

[LANG/PASS] Support Vectorize (#37)

parent 6a62beb2
......@@ -62,6 +62,7 @@ class IRMutator {
virtual Stmt Mutate_(const Realize* op, const Stmt& s);
virtual Stmt Mutate_(const Store* op, const Stmt& s);
virtual Stmt Mutate_(const Free* op, const Stmt& s);
virtual Stmt Mutate_(const IfThenElse* op, const Stmt& s);
virtual Expr Mutate_(const Call* op, const Expr& e);
virtual Expr Mutate_(const Load* op, const Expr& s);
virtual Expr Mutate_(const Variable* op, const Expr& e);
......
......@@ -112,6 +112,12 @@ Stmt StorageFlatten(Stmt stmt,
Stmt UnrollLoop(Stmt stmt, int max_auto_step);
/*!
* \brief Vectorize loops with constant extent.
* \param stmt The statement to be vectorized.
*/
Stmt VectorizeLoop(Stmt stmt);
/*!
* \brief Make a user-callable API LoweredFunc.
*
* The main task of this function is to create code to :
......
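The new pass only rewrites loops whose for_type is Vectorized and whose extent is a known constant. A minimal sketch of driving it directly, condensed from the test_vectorize_loop unit test added at the end of this commit (the literal 2 is the numeric value of ForType::Vectorized here):

    import tvm

    dtype = 'int64'
    n = tvm.Var('n')
    Ab = tvm.Buffer((n, ), dtype)
    i = tvm.Var('i')
    j = tvm.Var('j')
    VECTORIZE = 2  # numeric value of ForType::Vectorized
    # outer serial loop over i; inner vectorized loop over j, constant extent 4
    stmt = tvm.make.For(
        i, n, 2, 0, 0,
        tvm.make.For(j, 0, 4, VECTORIZE, 0,
                     tvm.make.Store(Ab.data,
                                    tvm.make.Load(dtype, Ab.data, i) + 1,
                                    j + 1)))
    stmt = tvm.ir_pass.VectorizeLoop(stmt)
    # the inner For disappears; its body now uses Ramp/Broadcast vector ops
    assert not isinstance(stmt.body, tvm.stmt.For)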
......@@ -18,6 +18,8 @@ class StageNode;
class ScheduleNode;
// Node container for IterVarRelation
class IterVarRelationNode;
// Attribute of itervar.
class IterVarAttrNode;
/*! \brief the attachment type */
enum AttachType : int {
......@@ -27,6 +29,12 @@ enum AttachType : int {
kScope = 3
};
/*! \brief IterVar type */
enum IterVarType : int {
kUnrolled = 1,
kVectorized = 2
};
/*! \brief Stage, contains scheduling for a stage of computation. */
class Stage : public NodeRef {
public:
......@@ -124,11 +132,22 @@ class Stage : public NodeRef {
IterVar* p_x_inner, IterVar* p_y_inner,
Expr x_factor, Expr y_factor);
/*!
* \brief Vectorize iteration.
* \param var The axis to be vectorized.
* \return reference to self.
*/
Stage& vectorize(IterVar var); // NOLINT(*)
/*!
* \brief Unroll iteration.
* \param var The axis to be unrolled.
* \return reference to self.
*/
Stage& unroll(IterVar var); // NOLINT(*)
/*!
* \brief whether the stage has been scheduled.
* \return whether the stage has been scheduled.
*/
inline bool is_scheduled() const;
// declare container type
using ContainerType = StageNode;
};
......@@ -193,6 +212,21 @@ class IterVarRelation : public NodeRef {
inline const IterVarRelationNode* operator->() const;
};
/*!
* \brief Additional schedulable attributes of an IterVar.
*/
class IterVarAttr : public NodeRef {
public:
IterVarAttr() {}
explicit IterVarAttr(IterVarType t);
explicit IterVarAttr(std::shared_ptr<Node> n) : NodeRef(n) {}
/*!
* \brief access the internal node container
* \return the pointer to the internal node container
*/
inline const IterVarAttrNode* operator->() const;
};
// definition of node containers
/*!
* \brief represents the schedule of the tensor
......@@ -223,6 +257,8 @@ class StageNode : public Node {
Array<IterVar> leaf_iter_vars;
/*! \brief The relations between IterVars */
Array<IterVarRelation> relations;
/*! \brief additional attributes about iter var. */
Map<IterVar, IterVarAttr> iter_var_attrs;
/*! \brief The attachment type of the schedule */
AttachType attach_type{kNone};
/*! \brief The attach point of this schedule. */
......@@ -236,6 +272,7 @@ class StageNode : public Node {
v->Visit("all_iter_vars", &all_iter_vars);
v->Visit("leaf_iter_vars", &leaf_iter_vars);
v->Visit("relations", &relations);
v->Visit("iter_var_attrs", &iter_var_attrs);
v->Visit("attach_type", &attach_type);
v->Visit("attach_ivar", &attach_ivar);
v->Visit("attach_stage", &attach_stage);
......@@ -268,6 +305,20 @@ class ScheduleNode : public Node {
TVM_DECLARE_NODE_TYPE_INFO(ScheduleNode);
};
/*! \brief node container for IterVar attr */
class IterVarAttrNode : public Node {
public:
/*! \brief The iteration type. */
IterVarType iter_type;
void VisitAttrs(AttrVisitor* v) final {
v->Visit("iter_type", &iter_type);
}
static constexpr const char* _type_key = "IterVarAttr";
TVM_DECLARE_NODE_TYPE_INFO(IterVarAttrNode);
};
/*! \brief base node of iter var relation */
class IterVarRelationNode : public Node {
};
......@@ -372,5 +423,9 @@ inline const IterVarRelationNode* IterVarRelation::operator->() const {
return static_cast<const IterVarRelationNode*>(node_.get());
}
inline const IterVarAttrNode* IterVarAttr::operator->() const {
return static_cast<const IterVarAttrNode*>(node_.get());
}
} // namespace tvm
#endif // TVM_SCHEDULE_H_
......@@ -69,6 +69,7 @@ def build(sch,
stmt = schedule.ScheduleOps(sch, bounds)
stmt = ir_pass.StorageFlatten(stmt, binds)
stmt = ir_pass.CanonicalSimplify(stmt)
stmt = ir_pass.VectorizeLoop(stmt)
stmt = ir_pass.UnrollLoop(stmt, max_auto_unroll_step)
stmt = ir_pass.Simplify(stmt)
fapi = ir_pass.MakeAPI(stmt, name, arg_list, len(arg_list))
......
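Pass ordering matters here: VectorizeLoop runs before UnrollLoop, so vectorized loops are rewritten into Ramp/Broadcast form before automatic unrolling duplicates any loop bodies. From the user's side the whole pipeline is driven by tvm.build; a condensed sketch based on the updated test_add below:

    s[C].vectorize(x)  # only tags the IterVar; the rewrite happens in VectorizeLoop
    fadd = tvm.build(s, [A, B, C], "cuda", name="myadd")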
......@@ -177,3 +177,23 @@ class Stage(NodeBase):
x_outer, y_outer, x_inner, y_inner = _api_internal._StageTile(
self, x_parent, y_parent, x_factor, y_factor)
return x_outer, y_outer, x_inner, y_inner
def vectorize(self, var):
"""Vectorize the iteration.
Parameters
----------
var : IterVar
The iteration to be vectorized
"""
_api_internal._StageVectorize(self, var)
def unroll(self, var):
"""Unroll the iteration.
Parameters
----------
var : IterVar
The iteration to be unrolled.
"""
_api_internal._StageUnroll(self, var)
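A short usage sketch, condensed from the test_vectorize schedule test added later in this commit; the marks are recorded in the stage's iter_var_attrs map and consumed when the loop nest is generated:

    import tvm

    m = tvm.Var('m')
    n = tvm.Var('n')
    A = tvm.placeholder((m, n), name='A')
    T = tvm.compute((m, n), lambda i, j: A[i, j])
    s = tvm.Schedule(T.op)
    xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5)
    s[T].vectorize(yi)  # the innermost axis becomes the vector lane dimension
    s[T].unroll(xi)
    assert s[T].iter_var_attrs[yi].iter_type == 2  # kVectorized
    assert s[T].iter_var_attrs[xi].iter_type == 1  # kUnrolled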
......@@ -253,6 +253,18 @@ TVM_REGISTER_API(_StageTile)
*ret = Array<IterVar>({x_outer, y_outer, x_inner, y_inner});
});
TVM_REGISTER_API(_StageUnroll)
.set_body([](TVMArgs args, TVMRetValue* ret) {
args[0].operator Stage()
.unroll(args[1]);
});
TVM_REGISTER_API(_StageVectorize)
.set_body([](TVMArgs args, TVMRetValue* ret) {
args[0].operator Stage()
.vectorize(args[1]);
});
TVM_REGISTER_API(_ScheduleNormalize)
.set_body([](TVMArgs args, TVMRetValue* ret) {
args[0].operator Schedule()
......
......@@ -62,6 +62,7 @@ REGISTER_PASS1(VerifySSA);
REGISTER_PASS1(CanonicalSimplify);
REGISTER_PASS4(Inline);
REGISTER_PASS2(StorageFlatten);
REGISTER_PASS1(VectorizeLoop);
REGISTER_PASS2(UnrollLoop);
REGISTER_PASS2(StorageSync);
REGISTER_PASS4(MakeAPI);
......
......@@ -9,6 +9,7 @@
#include <tvm/ir.h>
#include <pass/Interval.h>
#include <limits>
namespace tvm {
namespace arith {
......@@ -52,6 +53,23 @@ inline bool GetConst<uint64_t>(Expr e, uint64_t *out) {
}
}
// Get a constant int, failing if the value does not fit in an int.
inline bool GetConstInt(Expr e, int* out) {
int64_t v1 = 0;
uint64_t v2 = 0;
if (GetConst(e, &v1)) {
if (v1 > static_cast<int64_t>(std::numeric_limits<int>::max()) ||
v1 < static_cast<int64_t>(std::numeric_limits<int>::min())) return false;
*out = static_cast<int>(v1); return true;
}
if (GetConst(e, &v2)) {
if (v2 > static_cast<uint64_t>(
std::numeric_limits<int>::max())) return false;
*out = static_cast<int>(v2); return true;
}
return false;
}
#define TVM_CONST_PROPAGATION(OP_NAME, OP) \
int64_t ia = 0, ib = 0; \
if (GetConst(a, &ia) && GetConst(b, &ib)) { \
......
......@@ -3,7 +3,9 @@
* \file codegen_c.cc
*/
#include <iomanip>
#include <cctype>
#include "./codegen_c.h"
#include "../arithmetic/compute_expr.h"
namespace tvm {
namespace codegen {
......@@ -14,10 +16,10 @@ std::string CodeGenC::Compile(LoweredFunc f,
bool output_ssa) {
print_ssa_form_ = output_ssa;
// skip the first underscore, so SSA variable starts from _1
if (print_ssa_form_) GetUniqueName("_");
GetUniqueName("_");
// add to alloc buffer type.
for (const auto & kv : f->handle_data_type) {
HandleTypeRegister(kv.first.get(), kv.second.type());
RegisterHandleType(kv.first.get(), kv.second.type());
}
this->stream << "void " << f->name << "(";
......@@ -26,7 +28,11 @@ std::string CodeGenC::Compile(LoweredFunc f,
std::string vid = AllocVarID(v.get());
if (i != 0) stream << ", ";
if (v.type().is_handle()) {
stream << arg_addr_space_;
auto it = alloc_storage_scope_.find(v.get());
if (it != alloc_storage_scope_.end()) {
PrintStorageScope(it->second, stream);
}
stream << ' ';
}
if (handle_data_type_.count(v.get())) {
PrintType(handle_data_type_.at(v.get()), stream);
......@@ -126,7 +132,7 @@ bool CodeGenC::HandleTypeMatch(const Variable* buf_var, Type t) const {
return it->second == t;
}
void CodeGenC::HandleTypeRegister(const Variable* buf_var, Type t) {
void CodeGenC::RegisterHandleType(const Variable* buf_var, Type t) {
auto it = handle_data_type_.find(buf_var);
if (it == handle_data_type_.end()) {
handle_data_type_[buf_var] = t;
......@@ -259,23 +265,39 @@ inline void PrintBinaryExpr(const T* op,
const char *opstr,
std::ostream& os, // NOLINT(*)
CodeGenC* p) {
if (op->type.lanes() == 1) {
if (isalpha(opstr[0])) {
os << opstr << '(';
p->PrintExpr(op->a, os);
os << ", ";
p->PrintExpr(op->b, os);
os << ')';
} else {
os << '(';
p->PrintExpr(op->a, os);
os << opstr;
os << ' ' << opstr << ' ';
p->PrintExpr(op->b, os);
os << ')';
}
} else {
p->PrintVecBinaryOp(opstr, op->type, op->a, op->b, os);
}
}
inline void PrintBinaryIntrinsitc(const Call* op,
const char *opstr,
std::ostream& os, // NOLINT(*)
CodeGenC* p) {
if (op->type.lanes() == 1) {
CHECK_EQ(op->args.size(), 2U);
os << '(';
p->PrintExpr(op->args[0], os);
os << opstr;
p->PrintExpr(op->args[1], os);
os << ')';
} else {
p->PrintVecBinaryOp(opstr, op->type, op->args[0], op->args[1], os);
}
}
TVM_STATIC_IR_FUNCTOR(CodeGenC, vtable_print_expr)
......@@ -289,57 +311,49 @@ TVM_STATIC_IR_FUNCTOR(CodeGenC, vtable_print_expr)
os << p->GetVarID(op);
})
.set_dispatch<Add>([](const Add *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " + ", os, p);
PrintBinaryExpr(op, "+", os, p);
})
.set_dispatch<Sub>([](const Sub *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " - ", os, p);
PrintBinaryExpr(op, "-", os, p);
})
.set_dispatch<Mul>([](const Mul *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " * ", os, p);
PrintBinaryExpr(op, "*", os, p);
})
.set_dispatch<Div>([](const Div *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " / ", os, p);
PrintBinaryExpr(op, "/", os, p);
})
.set_dispatch<Mod>([](const Mod *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " % ", os, p);
PrintBinaryExpr(op, "%", os, p);
})
.set_dispatch<Min>([](const Min *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
os << "min(";
p->PrintExpr(op->a, os);
os << ", ";
p->PrintExpr(op->b, os);
os << ")";
PrintBinaryExpr(op, "min", os, p);
})
.set_dispatch<Max>([](const Max *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
os << "max(";
p->PrintExpr(op->a, os);
os << ", ";
p->PrintExpr(op->b, os);
os << ")";
PrintBinaryExpr(op, "max", os, p);
})
.set_dispatch<EQ>([](const EQ *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " == ", os, p);
PrintBinaryExpr(op, "==", os, p);
})
.set_dispatch<NE>([](const NE *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " != ", os, p);
PrintBinaryExpr(op, "!=", os, p);
})
.set_dispatch<LT>([](const LT *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " < ", os, p);
PrintBinaryExpr(op, "<", os, p);
})
.set_dispatch<LE>([](const LE *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " <= ", os, p);
PrintBinaryExpr(op, "<=", os, p);
})
.set_dispatch<GT>([](const GT *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " > ", os, p);
PrintBinaryExpr(op, ">", os, p);
})
.set_dispatch<GE>([](const GE *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " >= ", os, p);
PrintBinaryExpr(op, ">=", os, p);
})
.set_dispatch<And>([](const And *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " && ", os, p);
PrintBinaryExpr(op, "&&", os, p);
})
.set_dispatch<Or>([](const Or *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
PrintBinaryExpr(op, " || ", os, p);
PrintBinaryExpr(op, "||", os, p);
})
.set_dispatch<Not>([](const Not *op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
os << '!';
......@@ -460,18 +474,179 @@ void CodeGenC::PrintExpr(const Call *op, std::ostream& os) { // NOLINT(*)
}
}
void CodeGenC::PrintExpr(const Load* op, std::ostream& os) { // NOLINT(*)
std::string vid = GetVarID(op->buffer_var.get());
if (!HandleTypeMatch(op->buffer_var.get(), op->type)) {
os << "((const ";
PrintType(op->type, os);
void CodeGenC::PrintVecBinaryOp(
const std::string& op, Type t,
Expr lhs, Expr rhs, std::ostream& os) { // NOLINT(*)
if (isalpha(op[0])) {
os << op << "(";
this->PrintExpr(lhs, os);
os << ", ";
this->PrintExpr(rhs, os);
os << ")";
} else {
os <<"(";
this->PrintExpr(lhs, os);
os << ' ' << op << ' ';
this->PrintExpr(rhs, os);
os << ")";
}
}
inline bool TryGetRamp1Base(Expr index, int lanes, Expr *base) {
const Ramp* r = index.as<Ramp>();
if (!r) return false;
if (!is_one(r->stride)) return false;
CHECK_EQ(r->lanes, lanes);
*base = r->base;
return true;
}
// Print a reference expression to a buffer.
void CodeGenC::PrintBufferRef(
const Variable* buffer,
Type t, Expr index,
std::ostream& os) { // NOLINT(*)
std::string vid = GetVarID(buffer);
if (t.lanes() == 1) {
if (!HandleTypeMatch(buffer, t)) {
os << "((";
PrintType(t, os);
os << "*)" << vid << ')';
} else {
os << vid;
}
os << '[';
PrintExpr(op->index, os);
PrintExpr(index, os);
os << ']';
} else {
// Buffer declared as vector type.
// optimize for the case where it is in a register,
if (HandleTypeMatch(buffer, t)) {
// optimize for constant access
int offset;
if (arith::GetConstInt(index, &offset)) {
CHECK_EQ(offset % t.lanes(), 0)
<< "Find unaligned vector load to a vector type";
os << vid << '[' << (offset / t.lanes()) << ']';
return;
}
}
os << "((";
PrintType(t, os);
os << "*)(";
if (!HandleTypeMatch(buffer, t.element_of())) {
os << '(';
PrintType(t.element_of(), os);
os << "*)";
}
os << vid << " + ";
PrintExpr(index, os);
os << "))[0]";
}
}
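Concretely, for a float32x4 reference this prints one of three forms: vid[offset / 4] when the buffer itself is declared with the vector type and the index is an aligned constant, ((float4*)(vid + index))[0] when the buffer's registered handle type already matches the element type, and ((float4*)((float*)vid + index))[0] when the element pointer must be cast as well.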
void CodeGenC::PrintExpr(const Load* op, std::ostream& os) { // NOLINT(*)
int lanes = op->type.lanes();
if (op->type.lanes() == 1) {
this->PrintBufferRef(op->buffer_var.get(), op->type, op->index, os);
} else {
Expr base;
if (TryGetRamp1Base(op->index, op->type.lanes(), &base)) {
this->PrintVecLoad(op->buffer_var.get(), op->type, base, os);
} else {
// Load elements separately
std::string sindex = SSAGetID(PrintExpr(op->index), op->index.type());
std::string svalue = GetUniqueName("_");
{
// declare type.
this->PrintIndent();
this->PrintType(op->type, stream);
stream << ' ' << svalue << ";\n";
}
std::string vid = GetVarID(op->buffer_var.get());
Type elem_type = op->type.element_of();
for (int i = 0; i < lanes; ++i) {
std::ostringstream value_temp;
if (!HandleTypeMatch(op->buffer_var.get(), elem_type)) {
value_temp << "((";
PrintType(elem_type, value_temp);
value_temp << "*)" << vid << ')';
} else {
value_temp << vid;
}
value_temp << '[';
PrintVecElemLoad(sindex, op->index.type(), i, value_temp);
value_temp << ']';
PrintVecElemStore(svalue, op->type, i, value_temp.str());
}
os << svalue;
}
}
}
void CodeGenC::PrintStmt(const Store* op) {
Type t = op->value.type();
if (t.lanes() == 1) {
this->PrintIndent();
std::string value = this->PrintExpr(op->value);
this->PrintBufferRef(op->buffer_var.get(), t, op->index, stream);
stream << " = " << value << ";\n";
} else {
Expr base;
if (TryGetRamp1Base(op->index, t.lanes(), &base)) {
std::string value = this->PrintExpr(op->value);
this->PrintVecStore(op->buffer_var.get(), t, base, value);
} else {
// store elements separately
std::string index = SSAGetID(PrintExpr(op->index), op->index.type());
std::string value = SSAGetID(PrintExpr(op->value), op->value.type());
std::string vid = GetVarID(op->buffer_var.get());
for (int i = 0; i < t.lanes(); ++i) {
this->PrintIndent();
Type elem_type = t.element_of();
if (!HandleTypeMatch(op->buffer_var.get(), elem_type)) {
stream << "((";
PrintType(elem_type, stream);
stream << "*)" << vid << ')';
} else {
stream << vid;
}
stream << '[';
PrintVecElemLoad(index, op->index.type(), i, stream);
stream << "] = ";
PrintVecElemLoad(value, op->value.type(), i, stream);
stream << ";\n";
}
}
}
}
void CodeGenC::PrintVecElemLoad(const std::string& vec,
Type t, int i,
std::ostream& os) { // NOLINT(*)
os << vec << ".s" << std::hex << i;
}
void CodeGenC::PrintVecElemStore(const std::string& vec,
Type t, int i,
const std::string& value) {
this->PrintIndent();
stream << vec << ".s" << std::hex << i
<< " = " << value << ";\n";
}
void CodeGenC::PrintVecLoad(const Variable* buffer,
Type t, Expr base,
std::ostream& os) {
PrintBufferRef(buffer, t, base, os);
}
void CodeGenC::PrintVecStore(const Variable* buffer,
Type t, Expr base,
const std::string& value) {
this->PrintIndent();
PrintBufferRef(buffer, t, base, stream);
stream << " = " << value << ";\n";
}
void CodeGenC::PrintExpr(const Let* op, std::ostream& os) { // NOLINT(*)
......@@ -483,15 +658,15 @@ void CodeGenC::PrintExpr(const Let* op, std::ostream& os) { // NOLINT(*)
}
void CodeGenC::PrintExpr(const Ramp* op, std::ostream& os) { // NOLINT(*)
LOG(FATAL) << "not supported ";
LOG(FATAL) << "Ramp: not supported ";
}
void CodeGenC::PrintExpr(const Broadcast* op, std::ostream& os) { // NOLINT(*)
LOG(FATAL) << "not supported ";
LOG(FATAL) << "Broadcast: not supported ";
}
void CodeGenC::PrintExpr(const Select* op, std::ostream& os) { // NOLINT(*)
LOG(FATAL) << "not supported ";
LOG(FATAL) << "Select: not supported ";
}
// Dispatch back to member functions
......@@ -541,23 +716,6 @@ void CodeGenC::PrintStmt(const LetStmt* op) {
PrintStmt(op->body);
}
void CodeGenC::PrintStmt(const Store* op) {
std::string index = this->PrintExpr(op->index);
std::string value = this->PrintExpr(op->value);
this->PrintIndent();
std::string vid = GetVarID(op->buffer_var.get());
if (!HandleTypeMatch(op->buffer_var.get(), op->value.type())) {
this->stream << "((";
PrintType(op->value.type(), this->stream);
this->stream << "*)" << vid << ')';
} else {
this->stream << vid;
}
this->stream << '[' << index
<< "] = " << value
<< ";\n";
}
void CodeGenC::PrintStmt(const Allocate* op) {
CHECK(!is_zero(op->condition));
std::string vid = AllocVarID(op->buffer_var.get());
......@@ -580,7 +738,7 @@ void CodeGenC::PrintStmt(const Allocate* op) {
stream << ' ' << vid << '['
<< constant_size << "];\n";
}
HandleTypeRegister(op->buffer_var.get(), op->type);
RegisterHandleType(op->buffer_var.get(), op->type);
this->PrintStmt(op->body);
}
......
......@@ -102,6 +102,20 @@ class CodeGenC {
virtual void PrintExpr(const ir::Ramp* op, std::ostream& os); // NOLINT(*)
virtual void PrintExpr(const ir::Broadcast* op, std::ostream& os); // NOLINT(*)
virtual void PrintExpr(const ir::Select* op, std::ostream& os); // NOLINT(*)
// Binary vector op.
virtual void PrintVecBinaryOp(
const std::string& op, Type op_type,
Expr lhs, Expr rhs, std::ostream& os); // NOLINT(*)
virtual void PrintVecLoad(const Variable* buffer,
Type t, Expr base,
std::ostream& os); // NOLINT(*)
virtual void PrintVecStore(const Variable* buffer,
Type t, Expr base,
const std::string& value); // NOLINT(*)
virtual void PrintVecElemLoad(
const std::string& vec, Type t, int i, std::ostream& os); // NOLINT(*)
virtual void PrintVecElemStore(
const std::string& vec, Type t, int i, const std::string& value);
/*! \brief function print into the ostream */
using FPrintExpr = IRFunctor<void(const NodeRef&, std::ostream& os, CodeGenC *)>; // NOLINT(*)
/*! \brief function to print normal code */
......@@ -116,17 +130,10 @@ class CodeGenC {
std::ostringstream stream;
protected:
// additional string for arg addr_space.
std::string arg_addr_space_;
private:
/*! \brief entry in ssa assign map */
struct SSAEntry {
/*! \brief The value id */
std::string vid;
/*! \brief The scope id */
int scope_id;
};
// print reference to a buffer as type t in index.
void PrintBufferRef(const Variable* buffer,
Type t, Expr index,
std::ostream& os); // NOLINT(*)
/*!
* \brief Get the SSA ID corresponds to src
* If necessary, generate new assignment
......@@ -135,6 +142,19 @@ class CodeGenC {
*/
std::string SSAGetID(std::string src, Type t);
/*!
* \brief get a unique name with the corresponding prefix
* \param prefix The prefix of the name
* \return The returned name.
*/
std::string GetUniqueName(std::string prefix);
/*! \brief entry in ssa assign map */
struct SSAEntry {
/*! \brief The value id */
std::string vid;
/*! \brief The scope id */
int scope_id;
};
/*!
* \brief mark the beginning of a new scope
* \return The scope id.
*/
......@@ -155,25 +175,28 @@ class CodeGenC {
* \param buf_var The buffer variable.
* \param t The type to be checked.
*/
void HandleTypeRegister(const Variable* buf_var, Type t);
void RegisterHandleType(const Variable* buf_var, Type t);
/*!
* \brief get a unique name with the corresponding prefix
* \param prefix The prefix of the name
* \return The returned name.
* \brief Get the storage scope of buf_var.
* \param buf_var The buf_var to be queryed.
* \return The storage scope.
*/
std::string GetUniqueName(std::string prefix);
/*! \brief whether to print in SSA form */
bool print_ssa_form_{true};
/*! \brief name of each variable */
std::unordered_map<const Variable*, std::string> var_idmap_;
/*! \brief the data type of allocated buffers */
std::unordered_map<const Variable*, Type> handle_data_type_;
std::string GetStorageScope(const Variable* buf_var) const;
/*! \brief the storage scope of allocation */
std::unordered_map<const Variable*, std::string> alloc_storage_scope_;
private:
/*! \brief whether to print in SSA form */
bool print_ssa_form_{true};
/*! \brief name allocation map */
std::unordered_map<std::string, int> name_alloc_map_;
/*! \brief assignment map of ssa */
std::unordered_map<std::string, SSAEntry> ssa_assign_map_;
/*! \brief name of each variable */
std::unordered_map<const Variable*, std::string> var_idmap_;
/*! \brief the data type of allocated buffers */
std::unordered_map<const Variable*, Type> handle_data_type_;
/*! \brief array to check whether we are inside certain scope */
std::vector<bool> scope_mark_;
};
......
......@@ -22,6 +22,108 @@ std::string CodeGenCUDA::Compile(
return CodeGenC::Compile(f, output_ssa);
}
void CodeGenCUDA::PrintType(Type t, std::ostream& os) const { // NOLINT(*)
int lanes = t.lanes();
if (t.is_handle()) {
CHECK_EQ(lanes, 1)
<< "do not yet support vector types";
os << "void*"; return;
}
bool fail = false;
if (t.is_float()) {
switch (t.bits()) {
case 16: os << "half"; break;
case 32: os << "float"; break;
case 64: os << "double"; break;
default: fail = true; break;
}
if (!fail && lanes == 1) return;
if (!fail && (lanes >= 2 && lanes <= 4)) {
os << lanes; return;
}
} else if (t.is_uint() || t.is_int()) {
if (t.is_uint()) {
os << 'u';
}
if (t.bits() == 8 && t.lanes() == 4) {
// pack four 8-bit ints into a single 32-bit int.
os << "int"; return;
}
switch (t.bits()) {
case 8: os << "char"; break;
case 16: os << "short"; break;
case 32: os << "int"; break;
case 64: {
if (lanes != 1 && sizeof(long) == 8) { // NOLINT(*)
os << "long"; break;
} else {
os << "int64_t"; break;
}
}
case 1: os << "int"; break;
default: fail = true; break;
}
if (!fail && lanes == 1) return;
if (!fail && (lanes >= 2 && lanes <= 4)) {
os << lanes; return;
}
}
LOG(FATAL) << "Cannot convert type " << t << " to CUDA type";
}
void CodeGenCUDA::PrintVecBinaryOp(
const std::string& op, Type t,
Expr lhs, Expr rhs, std::ostream& os) { // NOLINT(*)
// unpacking operations.
int lanes = t.lanes();
{
// default: unpack into individual ops.
std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.type());
std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.type());
std::string sret = GetUniqueName("_");
{
// declare type.
this->PrintIndent();
this->PrintType(t, stream);
stream << ' ' << sret << ";\n";
}
for (int i = 0; i < lanes; ++i) {
std::ostringstream value_temp;
if (isalpha(op[0])) {
value_temp << op << "(";
PrintVecElemLoad(vlhs, lhs.type(), i, value_temp);
value_temp << ", ";
PrintVecElemLoad(vrhs, rhs.type(), i, value_temp);
value_temp << ")";
} else {
value_temp << "(";
PrintVecElemLoad(vlhs, lhs.type(), i, value_temp);
value_temp << op;
PrintVecElemLoad(vrhs, rhs.type(), i, value_temp);
value_temp << ")";
}
PrintVecElemStore(sret, t, i, value_temp.str());
}
os << sret;
}
}
void CodeGenCUDA::PrintVecElemLoad(
const std::string& vec, Type t, int i, std::ostream& os) { // NOLINT(*)
const char access[] = {'x', 'y', 'z', 'w'};
CHECK(i >= 0 && i < 4);
os << vec << "." << access[i];
}
void CodeGenCUDA::PrintVecElemStore(
const std::string& vec, Type t, int i, const std::string& value) {
this->PrintIndent();
const char access[] = {'x', 'y', 'z', 'w'};
CHECK(i >= 0 && i < 4);
stream << vec << "." << access[i] << " = " << value << ";\n";
}
void CodeGenCUDA::PrintStorageSync(const std::string& sync) {
if (sync == "shared") {
this->PrintIndent();
......@@ -43,8 +145,6 @@ void CodeGenCUDA::PrintStorageScope(
std::unordered_map<LoweredFunc, PackedFunc>
MakeNVRTC(Array<LoweredFunc> funcs) {
std::ostringstream os;
os << "typedef int int32_t;\n"
<< "typedef unsigned unt32_t;\n";
bool output_ssa = false;
for (LoweredFunc f : funcs) {
os << CodeGenCUDA().Compile(f, output_ssa);
......@@ -56,6 +156,7 @@ MakeNVRTC(Array<LoweredFunc> funcs) {
const auto& f = PackedFunc::GetGlobal("tvm_callback_cuda_postproc");
code = f(code).operator std::string();
}
LOG(INFO) << code;
std::string ptx;
if (PackedFunc::GlobalExist("tvm_callback_cuda_compile")) {
const auto& f = PackedFunc::GetGlobal("tvm_callback_cuda_compile");
......
......@@ -25,9 +25,18 @@ class CodeGenCUDA : public CodeGenC {
*/
std::string Compile(LoweredFunc f,
bool output_ssa);
// override behavior
void PrintStorageSync(const std::string& sync) final;
void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*)
void PrintVecBinaryOp(
const std::string& op, Type t,
Expr lhs, Expr rhs, std::ostream& os) final; // NOLINT(*)
void PrintType(Type t, std::ostream& os) const final; // NOLINT(*)
void PrintVecElemLoad(
const std::string& vec, Type t, int i, std::ostream& os) final; // NOLINT(*)
void PrintVecElemStore(
const std::string& vec, Type t, int i, const std::string& value) final;
};
} // namespace codegen
......
......@@ -19,7 +19,11 @@ std::string CodeGenOpenCL::Compile(
LoweredFunc f,
bool output_ssa) {
this->stream << " __kernel ";
this->arg_addr_space_ = "__global ";
for (Var arg : f->args) {
if (arg.type().is_handle()) {
alloc_storage_scope_[arg.get()] = "global";
}
}
return CodeGenC::Compile(f, output_ssa);
}
......@@ -34,6 +38,80 @@ void CodeGenOpenCL::PrintThreadIndexExpr(
}
}
void CodeGenOpenCL::PrintType(Type t, std::ostream& os) const { // NOLINT(*)
int lanes = t.lanes();
if (t.is_handle()) {
CHECK_EQ(lanes, 1)
<< "do not yet support vector types";
os << "void*"; return;
}
bool fail = false;
if (t.is_float()) {
switch (t.bits()) {
case 16: os << "half"; break;
case 32: os << "float"; break;
case 64: os << "double"; break;
default: fail = true; break;
}
if (!fail && lanes == 1) return;
if (!fail && (lanes >= 2 && lanes <= 16)) {
os << lanes; return;
}
} else if (t.is_uint() || t.is_int()) {
if (t.is_uint()) {
os << 'u';
}
if (t.bits() == 8 && t.lanes() == 4) {
// pack four 8-bit ints into a single 32-bit int.
os << "int"; return;
}
switch (t.bits()) {
case 8: os << "char"; break;
case 16: os << "short"; break;
case 32: os << "int"; break;
case 64: os << "long"; break;
case 1: os << "int"; break;
default: fail = true; break;
}
if (!fail && lanes == 1) return;
if (!fail && (lanes >= 2 && lanes <= 16)) {
os << lanes; return;
}
}
LOG(FATAL) << "Cannot convert type " << t << " to OpenCL type";
}
void CodeGenOpenCL::PrintVecAddr(const Variable* buffer, Type t,
Expr base, std::ostream& os) { // NOLINT(*)
if (!HandleTypeMatch(buffer, t.element_of())) {
os << '(';
auto it = alloc_storage_scope_.find(buffer);
if (it != alloc_storage_scope_.end()) {
PrintStorageScope(it->second, os);
}
os << ' ';
PrintType(t.element_of(), os);
os << "*)";
}
os << GetVarID(buffer) << " + ";
PrintExpr(base, os);
}
void CodeGenOpenCL::PrintVecLoad(const Variable* buffer,
Type t, Expr base,
std::ostream& os) {
os << "vload" << t.lanes() << "(0, ";
PrintVecAddr(buffer, t, base, os);
os << ")";
}
void CodeGenOpenCL::PrintVecStore(const Variable* buffer,
Type t, Expr base,
const std::string& value) {
this->PrintIndent();
stream << "vstore" << t.lanes() << "(" << value << ", 0, ";
PrintVecAddr(buffer, t, base, stream);
stream << ");\n";
}
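The net effect on generated OpenCL is a vloadN/vstoreN pair per vector access, with the address printed through PrintVecAddr above. One way to eyeball this from Python is the record_codes hook that the updated test_add below already uses (the printed form shown in the comment is illustrative):

    codes = []
    fadd = tvm.build(s, [A, B, C], "opencl", record_codes=codes, name="myadd")
    for c in codes[1:]:
        print(c)  # expect stores like vstore4(..., 0, (__global float*)C + ...)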
void CodeGenOpenCL::PrintStorageSync(const std::string& sync) {
if (sync == "shared") {
......@@ -45,8 +123,9 @@ void CodeGenOpenCL::PrintStorageSync(const std::string& sync) {
}
void CodeGenOpenCL::PrintStorageScope(const std::string& scope, std::ostream& os) { // NOLINT(*)
CHECK_NE(scope, "global");
if (scope == "shared") {
if (scope == "global") {
os << "__global";
} else if (scope == "shared") {
os << "__local ";
}
}
......@@ -55,8 +134,6 @@ void CodeGenOpenCL::PrintStorageScope(const std::string& scope, std::ostream& os
std::unordered_map<LoweredFunc, PackedFunc>
MakeOpenCL(Array<LoweredFunc> funcs) {
std::ostringstream os;
os << "typedef int int32_t;\n"
<< "typedef unsigned unt32_t;\n";
bool output_ssa = false;
for (LoweredFunc f : funcs) {
os << CodeGenOpenCL().Compile(f, output_ssa);
......
......@@ -30,6 +30,16 @@ class CodeGenOpenCL : public CodeGenC {
std::string tag, std::ostream& os) final; // NOLINT(*)
void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*)
void PrintStorageSync(const std::string& scope) final; // NOLINT(*)
void PrintType(Type t, std::ostream& os) const final; // NOLINT(*)
void PrintVecLoad(const Variable* buffer,
Type t, Expr base,
std::ostream& os) final; // NOLINT(*)
void PrintVecStore(const Variable* buffer,
Type t, Expr base,
const std::string& value) final; // NOLINT(*)
// the address of load/store
void PrintVecAddr(const Variable* buffer, Type t,
Expr base, std::ostream& os); // NOLINT(*)
};
} // namespace codegen
......
......@@ -74,6 +74,7 @@ TVM_STATIC_IR_FUNCTOR(IRMutator, vtable_stmt)
.DISPATCH_TO_MUTATE_STMT(Provide)
.DISPATCH_TO_MUTATE_STMT(Realize)
.DISPATCH_TO_MUTATE_STMT(Store)
.DISPATCH_TO_MUTATE_STMT(IfThenElse)
.DISPATCH_TO_MUTATE_STMT(For)
.DISPATCH_TO_MUTATE_STMT(Allocate)
.DISPATCH_TO_MUTATE_STMT(Free);
......@@ -195,6 +196,22 @@ Stmt IRMutator::Mutate_(const Free *op, const Stmt& s) {
return s;
}
Stmt IRMutator::Mutate_(const IfThenElse *op, const Stmt& s) {
Expr condition = this->Mutate(op->condition);
Stmt then_case = this->Mutate(op->then_case);
Stmt else_case;
if (op->else_case.defined()) {
else_case = this->Mutate(op->else_case);
}
if (condition.same_as(op->condition) &&
then_case.same_as(op->then_case) &&
else_case.same_as(op->else_case)) {
return s;
} else {
return IfThenElse::make(condition, then_case, else_case);
}
}
TVM_STATIC_IR_FUNCTOR(IRMutator, vtable_expr)
.DISPATCH_TO_MUTATE_EXPR(Call)
.DISPATCH_TO_MUTATE_EXPR(Let)
......@@ -363,21 +380,6 @@ TVM_STATIC_IR_FUNCTOR(IRMutator, vtable_stmt)
return Block::make(first, rest);
}
})
.set_dispatch<IfThenElse>([](const IfThenElse *op, const Stmt& s, IRMutator* m) {
Expr condition = m->Mutate(op->condition);
Stmt then_case = m->Mutate(op->then_case);
Stmt else_case;
if (else_case.defined()) {
else_case = m->Mutate(op->else_case);
}
if (condition.same_as(op->condition) &&
then_case.same_as(op->then_case) &&
else_case.same_as(op->else_case)) {
return s;
} else {
return IfThenElse::make(condition, then_case, else_case);
}
})
.set_dispatch<Evaluate>([](const Evaluate *op, const Stmt& s, IRMutator* m) {
Expr v = m->Mutate(op->value);
if (v.same_as(op->value)) {
......
......@@ -101,9 +101,14 @@ class IRUseDefAnalysis : public IRMutator {
}
void HandleDef(const Variable* v) {
CHECK(!def_count_.count(v))
<< "variable " << v->name_hint
<< " has already been defined, the Stmt is not SSA";
CHECK(!use_count_.count(v))
<< "variable is already defined";
<< "variable " << v->name_hint
<< " has been used before definition!";
use_count_[v] = 0;
def_count_[v] = 1;
}
void HandleUse(const Expr& v) {
......@@ -127,6 +132,7 @@ class IRUseDefAnalysis : public IRMutator {
Array<IterVar> thread_axis_;
Array<Expr> thread_extent_;
std::unordered_map<const Variable*, int> use_count_;
std::unordered_map<const Variable*, int> def_count_;
};
class HostDeviceSplitter : public IRMutator {
......
/*!
* Copyright (c) 2016 by Contributors
* SSA related checks and pass.
* \file ssa.cc
* Copyright (c) 2017 by Contributors
* Loop unrolling.
* \file unroll_loop.cc
*/
#include <tvm/ir.h>
#include <tvm/ir_pass.h>
......@@ -9,7 +9,7 @@
#include <unordered_set>
#include <unordered_map>
#include <vector>
#include "../arithmetic//compute_expr.h"
#include "../arithmetic/compute_expr.h"
namespace tvm {
namespace ir {
......@@ -33,7 +33,8 @@ class LoopUnroller : public IRMutator {
if (v2 != nullptr) {
value = static_cast<int>(v2->value);
}
bool allow_unroll = value >= 0 && value <= max_auto_step_;
bool allow_unroll = (op->for_type == ForType::Serial &&
value >= 0 && value <= max_auto_step_);
if (op->for_type == ForType::Unrolled) {
CHECK_GE(value, 0)
<< "Cannot unroll non-constant loop";
......
/*!
* Copyright (c) 2017 by Contributors
* Vectorize the loop
* \file vectorize_loop.cc
*/
#include <tvm/ir.h>
#include <tvm/ir_pass.h>
#include <tvm/ir_mutator.h>
#include <unordered_set>
#include <unordered_map>
#include <vector>
#include "../arithmetic/compute_expr.h"
namespace tvm {
namespace ir {
inline Expr BroadcastTo(Expr e, int lanes) {
if (e.type().lanes() == lanes) return e;
CHECK_EQ(e.type().lanes(), 1)
<< "Cannot broadcast lane=" << e.type().lanes()
<< " to " << lanes;
return Broadcast::make(e, lanes);
}
// Rewrite vectorized allocation access
// s[i] -> s[i * lanes + var]
class VecAllocAccess : public IRMutator {
public:
VecAllocAccess(const Variable* buf, Var var, int var_lanes)
: buf_(buf), var_(var), var_lanes_(var_lanes) {}
// Load
Expr Mutate_(const Load* op, const Expr& e) final {
Expr expr = IRMutator::Mutate_(op, e);
op = expr.as<Load>();
if (op->buffer_var.get() == buf_) {
return Load::make(op->type, op->buffer_var,
op->index * var_lanes_ + var_);
} else {
return expr;
}
}
// Store
Stmt Mutate_(const Store* op, const Stmt& s) final {
Stmt stmt = IRMutator::Mutate_(op, s);
op = stmt.as<Store>();
if (op->buffer_var.get() == buf_) {
return Store::make(op->buffer_var,
op->value,
op->index * var_lanes_ + var_);
} else {
return stmt;
}
}
private:
// buffer var
const Variable* buf_;
// variable to be replaced
Var var_;
// the lanes.
int var_lanes_;
};
class Vectorizer : public IRMutator {
public:
Vectorizer(Var var, int var_lanes)
: var_(var), var_lanes_(var_lanes) {
ramp_ = Ramp::make(0, 1, var_lanes);
}
// use Mutate from the parent class.
using IRMutator::Mutate;
// override mutate
Expr Mutate(Expr expr) final {
static const FMutateExpr& f = Vectorizer::vtable_expr();
return (f.can_dispatch(expr) ?
f(expr, expr, this) : IRMutator::Mutate(expr));
}
// Variable
Expr Mutate_(const Variable* v, const Expr& e) final {
if (v == var_.get()) {
return ramp_;
} else if (lets_.count(v)) {
return lets_[v];
} else {
return e;
}
}
// Call
Expr Mutate_(const Call* op, const Expr& e) final {
int lane = 0;
Array<Expr> new_args = MutateArray(op->args, &lane);
if (op->args.same_as(new_args)) {
return e;
} else {
return Call::make(
op->type.with_lanes(lane), op->name, new_args,
op->call_type, op->func, op->value_index);
}
}
// Load
Expr Mutate_(const Load* op, const Expr& e) final {
Expr index = this->Mutate(op->index);
if (index.same_as(op->index)) {
return e;
} else {
return Load::make(op->type.with_lanes(index.type().lanes()),
op->buffer_var, index);
}
}
// Let
Expr Mutate_(const Let* op, const Expr& e) final {
Expr value = this->Mutate(op->value);
CHECK(!lets_.count(op->var.get())) << "not SSA";
if (value.type().lanes() != op->value.type().lanes()) {
Var v(op->var->name_hint, value.type());
lets_[op->var.get()] = v;
return Let::make(v, value, Mutate(op->body));
} else {
Expr body = this->Mutate(op->body);
if (value.same_as(op->value) &&
body.same_as(op->body)) {
return e;
} else {
return Let::make(op->var, value, body);
}
}
}
// Provide
Stmt Mutate_(const Provide* op, const Stmt& s) final {
Expr new_value = this->Mutate(op->value);
int lane = new_value.type().lanes();
Array<Expr> new_args = MutateArray(op->args, &lane);
if (op->args.same_as(new_args) && op->value.same_as(new_value)) {
return s;
} else {
new_value = BroadcastTo(new_value, lane);
return Provide::make(op->func, op->value_index, new_value, new_args);
}
}
// Store
Stmt Mutate_(const Store* op, const Stmt& s) final {
Expr value = this->Mutate(op->value);
Expr index = this->Mutate(op->index);
if (value.same_as(op->value) && index.same_as(op->index)) {
return s;
} else {
int lanes = std::max(value.type().lanes(), index.type().lanes());
return Store::make(op->buffer_var,
BroadcastTo(value, lanes),
BroadcastTo(index, lanes));
}
}
// For
Stmt Mutate_(const For* op, const Stmt& s) final {
if (op->for_type == ForType::Vectorized) {
LOG(WARNING) << "Detect vectorize inside vectorized loop, ignoring...";
}
CHECK(is_zero(op->min));
CHECK(!op->extent.type().is_vector());
Expr extent = Mutate(op->extent);
if (extent.type().is_vector()) {
LOG(WARNING) << "Detect vectorized extent type, scalarizing...";
return Scalarize(s);
}
Stmt body = Mutate(op->body);
if (extent.same_as(op->extent) &&
body.same_as(op->body)) {
return s;
} else {
return For::make(
op->loop_var, op->min, extent,
op->for_type, op->device_api, body);
}
}
// IfThenElse
Stmt Mutate_(const IfThenElse* op, const Stmt& s) final {
CHECK(!op->condition.type().is_vector());
Expr condition = this->Mutate(op->condition);
if (condition.type().is_vector()) {
LOG(WARNING) << "Detect vector condition in Vectorized Loop, scalarizing...";
return Scalarize(s);
}
Stmt then_case = this->Mutate(op->then_case);
Stmt else_case;
if (op->else_case.defined()) {
else_case = this->Mutate(op->else_case);
}
if (condition.same_as(op->condition) &&
then_case.same_as(op->then_case) &&
else_case.same_as(op->else_case)) {
return s;
} else {
return IfThenElse::make(condition, then_case, else_case);
}
}
// LetStmt
Stmt Mutate_(const LetStmt* op, const Stmt& s) final {
LOG(WARNING) << "Cannot vectorize with LetStmt, remove it with Simplify Before Vectorize";
return Scalarize(s);
}
// Allocate
Stmt Mutate_(const Allocate* op, const Stmt& s) final {
if (op->new_expr.defined()) {
LOG(WARNING) << "Cannot vectorize with new expr";
return Scalarize(s);
}
Expr condition = Mutate(op->condition);
if (condition.type().is_vector()) {
LOG(WARNING) << "Cannot handle vector extent in alloc ";
return Scalarize(s);
}
Array<Expr> extents;
for (size_t i = 0; i < op->extents.size(); i++) {
Expr new_ext = Mutate(op->extents[i]);
if (new_ext.type().is_vector()) {
LOG(WARNING) << "Cannot handle vector extent in alloc ";
return Scalarize(s);
}
extents.push_back(new_ext);
}
// place the vector lanes in least significant dimension.
extents.push_back(var_lanes_);
// rewrite access to buffer internally.
Stmt body = VecAllocAccess(
op->buffer_var.get(), var_, var_lanes_).Mutate(op->body);
body = Mutate(body);
return Allocate::make(
op->buffer_var, op->type,
extents, condition, body,
op->new_expr, op->free_function);
}
// scalarize the statement
Stmt Scalarize(Stmt stmt) {
Var idx(var_->name_hint + ".s", var_->type);
stmt = Substitute(stmt, {{var_, idx}});
return For::make(idx, 0, var_lanes_, ForType::Serial, DeviceAPI::None, stmt);
}
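Scalarize is the uniform fallback: it substitutes the vectorized variable with a fresh serial loop variable named var.s and wraps the statement in an ordinary For, so any construct the vectorizer cannot handle still executes correctly, just without vector code.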
// The overloads for vectorize.
static FMutateExpr& vtable_expr() { // NOLINT(*)
static FMutateExpr inst; return inst;
}
private:
// variable to be replaced
Var var_;
// the lanes.
int var_lanes_;
// ramp representing the var.
Expr ramp_;
// The lets
std::unordered_map<const Variable*, Expr> lets_;
// Mutate an array with a given lane requirement;
// when finished, *p_lanes holds the updated lane requirement.
Array<Expr> MutateArray(Array<Expr> arr, int* p_lanes) {
if (arr.size() == 0) return arr;
int& lanes = *p_lanes;
bool changed = false;
std::vector<Expr> new_arr(arr.size());
for (size_t i = 0; i < arr.size(); i++) {
Expr old_elem = arr[i];
Expr new_elem = this->Mutate(old_elem);
if (!new_elem.same_as(old_elem)) changed = true;
new_arr[i] = new_elem;
lanes = std::max(lanes, new_elem.type().lanes());
}
for (size_t i = 0; i < arr.size(); ++i) {
if (new_arr[i].type().lanes() != lanes) {
new_arr[i] = BroadcastTo(new_arr[i], lanes);
changed = true;
}
}
if (!changed) return arr;
return Array<Expr>(new_arr);
}
};
// binary vectorize
template<typename T>
inline Expr BinaryVec(const T* op, const Expr& e, IRMutator* m) {
Expr a = m->Mutate(op->a);
Expr b = m->Mutate(op->b);
if (a.same_as(op->a) &&
b.same_as(op->b)) {
return e;
} else {
int lanes = std::max(a.type().lanes(), b.type().lanes());
return T::make(BroadcastTo(a, lanes), BroadcastTo(b, lanes));
}
}
template<typename T>
inline Expr AddSubVec(const T* op, const Expr& e, IRMutator* m) {
Expr a = m->Mutate(op->a);
Expr b = m->Mutate(op->b);
if (a.same_as(op->a) &&
b.same_as(op->b)) {
return e;
} else {
int lanes = std::max(a.type().lanes(), b.type().lanes());
if (lanes != 1) {
const Ramp* b_ramp = b.as<Ramp>();
const Ramp* a_ramp = a.as<Ramp>();
if (a.type().lanes() == 1 && b_ramp) {
return Ramp::make(
arith::ComputeExpr<T>(a, b_ramp->base), b_ramp->stride, b_ramp->lanes);
}
if (b.type().lanes() == 1 && a_ramp) {
return Ramp::make(
arith::ComputeExpr<T>(a_ramp->base, b), a_ramp->stride, a_ramp->lanes);
}
}
return T::make(BroadcastTo(a, lanes), BroadcastTo(b, lanes));
}
}
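For example, with var_lanes_ = 4 the vectorized variable j itself mutates to Ramp(0, 1, 4), so j + 1 folds to Ramp(1, 1, 4) rather than Broadcast(1, 4) + Ramp(0, 1, 4). Keeping indices in ramp form is what later lets codegen recognize unit-stride accesses via TryGetRamp1Base and emit a single vector load or store.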
TVM_STATIC_IR_FUNCTOR(Vectorizer, vtable_expr)
.set_dispatch<Add>(AddSubVec<Add>)
.set_dispatch<Sub>(AddSubVec<Sub>)
.set_dispatch<Mul>(BinaryVec<Mul>)
.set_dispatch<Div>(BinaryVec<Div>)
.set_dispatch<Mod>(BinaryVec<Mod>)
.set_dispatch<Min>(BinaryVec<Min>)
.set_dispatch<Max>(BinaryVec<Max>)
.set_dispatch<EQ>(BinaryVec<EQ>)
.set_dispatch<NE>(BinaryVec<NE>)
.set_dispatch<LT>(BinaryVec<LT>)
.set_dispatch<LE>(BinaryVec<LE>)
.set_dispatch<GT>(BinaryVec<GT>)
.set_dispatch<GE>(BinaryVec<GE>)
.set_dispatch<And>(BinaryVec<And>)
.set_dispatch<Or>(BinaryVec<Or>);
TVM_STATIC_IR_FUNCTOR(Vectorizer, vtable_expr)
.set_dispatch<Select>([](const Select *op, const Expr& e, IRMutator* m) {
Expr cond = m->Mutate(op->condition);
Expr t = m->Mutate(op->true_value);
Expr f = m->Mutate(op->false_value);
if (cond.same_as(op->condition) &&
t.same_as(op->true_value) &&
f.same_as(op->false_value)) {
return e;
} else {
int lanes = std::max(std::max(
cond.type().lanes(),
t.type().lanes()), f.type().lanes());
return Select::make(cond, BroadcastTo(t, lanes), BroadcastTo(f, lanes));
}
})
.set_dispatch<Cast>([](const Cast *op, const Expr& e, IRMutator* m) {
Expr value = m->Mutate(op->value);
if (value.same_as(op->value)) {
return e;
} else {
return Cast::make(op->type.with_lanes(value.type().lanes()), value);
}
});
class LoopVectorizer : public IRMutator {
public:
Stmt Mutate_(const For* op, const Stmt& s) final {
if (op->for_type == ForType::Vectorized) {
CHECK(is_zero(op->min));
CHECK(is_positive_const(op->extent));
int lanes = 0;
bool succ = arith::GetConstInt(op->extent, &lanes);
if (!succ || lanes < 1) {
LOG(FATAL) << "Failed to vectorize loop with extent " << op->extent;
}
Var var(op->loop_var.node_);
return Vectorizer(var, lanes).Mutate(op->body);
} else {
return IRMutator::Mutate_(op, s);
}
}
};
Stmt VectorizeLoop(Stmt stmt) {
return LoopVectorizer().Mutate(stmt);
}
} // namespace ir
} // namespace tvm
......@@ -57,6 +57,14 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
<< ")";
});
TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
.set_dispatch<IterVarAttrNode>([](const IterVarAttrNode *op, IRPrinter *p) {
switch (op->iter_type) {
case kUnrolled: p->stream << "unroll"; break;
case kVectorized: p->stream << "vectorize"; break;
}
});
Stage::Stage(Operation op) {
auto n = std::make_shared<StageNode>();
n->op = op;
......@@ -246,7 +254,38 @@ void Schedule::normalize() {
}
}
IterVarAttr::IterVarAttr(IterVarType t) {
std::shared_ptr<IterVarAttrNode> n = std::make_shared<IterVarAttrNode>();
n->iter_type = t;
node_ = n;
}
inline void SetAttr(StageNode* self, IterVar var, IterVarAttr attr) {
ArrayNode* all_vars = self->all_iter_vars.CopyOnWrite();
ArrayNode* leaf_vars = self->leaf_iter_vars.CopyOnWrite();
FindLeafVar(all_vars, leaf_vars, var);
auto it = self->iter_var_attrs.find(var);
if (it != self->iter_var_attrs.end()) {
CHECK_EQ((*it).second->iter_type, attr->iter_type)
<< "IterVar's is already set to "
<< (*it).second << " instead of " << attr;
} else {
self->iter_var_attrs.Set(var, attr);
}
}
Stage& Stage::vectorize(IterVar var) { // NOLINT(*)
SetAttr(operator->(), var, IterVarAttr(kVectorized));
return *this;
}
Stage& Stage::unroll(IterVar var) { // NOLINT(*)
SetAttr(operator->(), var, IterVarAttr(kUnrolled));
return *this;
}
TVM_REGISTER_NODE_TYPE(StageNode);
TVM_REGISTER_NODE_TYPE(IterVarAttrNode);
TVM_REGISTER_NODE_TYPE(SplitNode);
TVM_REGISTER_NODE_TYPE(FuseNode);
TVM_REGISTER_NODE_TYPE(RebaseNode);
......
......@@ -177,6 +177,13 @@ MakeLoopNest(const Stage& sch,
}
// Mark the iter var in the IR, to remember the point
if (iv->thread_tag.length() == 0) {
ForType for_type = ForType::Serial;
if (sch->iter_var_attrs.count(iv)) {
switch (sch->iter_var_attrs[iv]->iter_type) {
case kUnrolled: for_type = ForType::Unrolled; break;
case kVectorized: for_type = ForType::Vectorized; break;
}
}
if (is_one(dom->extent)) {
nest[i + 1].emplace_back(
LetStmt::make(var, dom->min, no_op));
......@@ -184,13 +191,13 @@ MakeLoopNest(const Stage& sch,
} else if (is_zero(dom->min)) {
nest[i + 1].emplace_back(
For::make(var, 0, dom->extent,
ForType::Serial, DeviceAPI::None, no_op));
for_type, DeviceAPI::None, no_op));
value_map[iv] = var;
} else {
Var idx(iv->var->name_hint + ".idx", iv->var.type());
nest[i + 1].emplace_back(
For::make(idx, 0, dom->extent,
ForType::Serial, DeviceAPI::None, no_op));
for_type, DeviceAPI::None, no_op));
Expr new_value = dom->min + idx;
value_map[iv] = new_value;
nest[i + 1].emplace_back(
......
......@@ -3,7 +3,7 @@ import numpy as np
def test_add():
# graph
n = tvm.Var('n')
n = tvm.convert(1024)
A = tvm.placeholder((n,), name='A')
B = tvm.placeholder((n,), name='B')
C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
......@@ -13,26 +13,28 @@ def test_add():
num_thread = 256
block_x = tvm.IterVar(thread_tag="blockIdx.x")
thread_x = tvm.IterVar((0, num_thread), thread_tag="threadIdx.x")
_, x = s[C].split(C.op.axis[0], factor=num_thread, outer=block_x)
_, x = s[C].split(C.op.axis[0], factor=num_thread*4, outer=block_x)
_, x = s[C].split(x, outer=thread_x)
_, x = s[C].split(x, factor=4)
s[C].vectorize(x)
# one line to build the function.
def check_device(target):
codes = []
fadd = tvm.build(s,
args=[A, B, C],
target="cuda", name="myadd",
record_codes=codes)
for c in codes:
print(c)
# call the function
num_device = 1
for i in range(num_device):
ctx = tvm.gpu(i)
fadd = tvm.build(s, [A, B, C],
target, record_codes=codes,
name="myadd")
if target == "cuda":
ctx = tvm.gpu(0)
else:
ctx = tvm.cl(0)
if not ctx.enabled:
continue
return
for c in codes[1:]:
print(c)
# launch the kernel.
n = 1027
n = 1024
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
......@@ -40,6 +42,10 @@ def test_add():
np.testing.assert_allclose(
c.asnumpy(), a.asnumpy() + b.asnumpy())
tvm.init_opencl()
check_device("cuda")
check_device("opencl")
if __name__ == "__main__":
test_add()
......@@ -76,6 +76,21 @@ def test_fuse():
assert any(isinstance(x, tvm.schedule.Fuse) for x in s[T].relations)
assert tuple(s[T].leaf_iter_vars) == (fused, xi, yi)
def test_vectorize():
m = tvm.Var('m')
n = tvm.Var('n')
A = tvm.placeholder((m, n), name='A')
T = tvm.compute((m, n), lambda i, j: A[i, j])
s = tvm.Schedule(T.op)
xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5)
s[T].vectorize(yi)
s[T].unroll(xi)
UNROLL = 1
VECTORIZE = 2
assert s[T].iter_var_attrs[xi].iter_type == UNROLL
assert s[T].iter_var_attrs[yi].iter_type == VECTORIZE
if __name__ == "__main__":
test_schedule_create()
......@@ -83,3 +98,4 @@ if __name__ == "__main__":
test_tile()
test_split()
test_fuse()
test_vectorize()
......@@ -9,11 +9,13 @@ def test_unroll_loop():
# for i in 0 to n-1:
stmt = tvm.make.For(
i, n, 2, 0, 0,
tvm.make.For(j, 0, n, 0, 0,
tvm.make.For(j, 0, 8, 3, 0,
tvm.make.Store(Ab.data,
tvm.make.Load(dtype, Ab.data, i) + 1,
j + 1)))
stmt = tvm.ir_pass.UnrollLoop(stmt, 8)
assert isinstance(stmt, tvm.stmt.For)
stmt = tvm.ir_pass.UnrollLoop(stmt, 4)
assert not isinstance(stmt, tvm.stmt.For)
print(stmt)
if __name__ == "__main__":
......
import tvm
def test_vectorize_loop():
dtype = 'int64'
n = tvm.Var('n')
Ab = tvm.Buffer((n, ), dtype)
i = tvm.Var('i')
j = tvm.Var('j')
VECTORIZE = 2
# for i in 0 to n-1:
stmt = tvm.make.For(
i, n, 2, 0, 0,
tvm.make.For(j, 0, 4, VECTORIZE, 0,
tvm.make.Store(Ab.data,
tvm.make.Load(dtype, Ab.data, i) + 1,
j + 1)))
assert isinstance(stmt.body, tvm.stmt.For)
stmt = tvm.ir_pass.VectorizeLoop(stmt)
assert isinstance(stmt, tvm.stmt.For)
assert not isinstance(stmt.body, tvm.stmt.For)
print(stmt)
if __name__ == "__main__":
test_vectorize_loop()