Commit 338bfd45 by Tianqi Chen, committed by GitHub

[CODEGEN] More robust llvm intrin handling, remove graph executor (#519)

parent 4468c576
@@ -3,9 +3,9 @@
 This folder contains various extension projects using TVM,
 they also serve as examples on how to use TVM in your own project.
 If you are interested in writing optimized kernels with TVM, checkout [TOPI: TVM Operator Inventory](../topi).
+If you are interested in end to end deep learning model compilation, checkout [NNVM Compiler](https://github.com/dmlc/nnvm).
 
 - [extension](extension) How to extend TVM C++ api along with python API.
-- [graph_executor](graph_executor) Build nnvm graph executor with TVM.
 - [ios_rpc](ios_rpc) iOS RPC server.
 - [android_rpc](android_rpc) Android RPC server.
 - [howto_deploy](howto_deploy) Tutorial on how to deploy TVM with minimum code dependency.
\ No newline at end of file
# Minimum Makefile for the graph executor package
TVM_ROOT=$(shell cd ../..; pwd)
NNVM_PATH=nnvm
DMLC_CORE=${TVM_ROOT}/dmlc-core
PKG_CFLAGS = -std=c++11 -O2 -fPIC\
-I${TVM_ROOT}/include\
-I${DMLC_CORE}/include\
-I${TVM_ROOT}/dlpack/include\
-I${TVM_ROOT}/HalideIR/src
PKG_LDFLAGS =
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S), Darwin)
PKG_LDFLAGS += -undefined dynamic_lookup
WHOLE_ARCH= -all_load
NO_WHOLE_ARCH= -noall_load
else
WHOLE_ARCH= --whole-archive
NO_WHOLE_ARCH= --no-whole-archive
endif
NNVM_CONTRIB_SRC = $(wildcard src/*.cc)
NNVM_CONTRIB_OBJ = $(patsubst src/%.cc, build/%.o, $(NNVM_CONTRIB_SRC))
include $(DMLC_CORE)/make/dmlc.mk
ALL_DEP = $(NNVM_CONTRIB_OBJ)
PKG_CFLAGS += -I${NNVM_PATH}/include
ALL_DEP += ${DMLC_CORE}/libdmlc.a ${NNVM_PATH}/lib/libnnvm.a
.PHONY: clean all
all: lib/libtvm_graph_exec.so
nnvm:
git clone https://github.com/dmlc/nnvm --recursive
nnvm/lib/libnnvm.a: | nnvm
+ cd nnvm; make ; cd -
$(DMLC_CORE)/libdmlc.a:
+ cd $(DMLC_CORE); make libdmlc.a; cd $(TVM_ROOT)
build/%.o: src/%.cc | nnvm
@mkdir -p $(@D)
$(CXX) $(PKG_CFLAGS) -MM -MT build/$*.o $< >build/$*.d
$(CXX) -c $(PKG_CFLAGS) -c $< -o $@
lib/libtvm_graph_exec.so: $(ALL_DEP)
@mkdir -p $(@D)
$(CXX) $(PKG_CFLAGS) -shared -o $@ $(filter %.o, $^) $(PKG_LDFLAGS) \
-Wl,${WHOLE_ARCH} $(filter %.a, $^) -Wl,${NO_WHOLE_ARCH} $(PKG_LDFLAGS)
clean:
$(RM) -rf build lib bin *~ */*~ */*/*~ */*/*/*~ */*.o */*/*.o */*/*/*.o */*.d */*/*.d */*/*/*.d
-include build/*.d
-include build/*/*.d
Example Graph Executor
======================
This folder contains a minimal example of a graph executor library based on TVM and NNVM.
It demonstrates how to build a computation graph compilation and execution framework.
- To build the library, run `make`; the Makefile first clones and builds nnvm in the root of this folder.
"""The graph build library"""
from __future__ import absolute_import as _abs
import tvm
from . import _base
from nnvm.symbol import *
from . import op_tvm_def
from .build import build, bind, save_params, compile_graph, remote_load_exec
from __future__ import absolute_import as _abs
import os
import sys
if sys.version_info[0] == 3:
import builtins as __builtin__
else:
import __builtin__
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
if hasattr(__builtin__, "NNVM_BASE_PATH"):
assert __builtin__.NNVM_BASE_PATH == curr_path
else:
__builtin__.NNVM_BASE_PATH = curr_path
if hasattr(__builtin__, "NNVM_LIBRARY_NAME"):
    assert __builtin__.NNVM_LIBRARY_NAME == "libtvm_graph_exec"
else:
__builtin__.NNVM_LIBRARY_NAME = "libtvm_graph_exec"
"""Logics related to build."""
import nnvm.graph as graph
import tvm
import json
DTYPE_DICT = {
"float32": 0
}
_create_exec = tvm.get_global_func("tvm_graph._create_executor")
def build(sym, target, shape, dtype="float32"):
# Do shape inference in python.
g = graph.create(sym)
jgraph = json.loads(g.apply('SaveJSON').json_attr('json'))
jnodes = jgraph['nodes']
jnode_row_ptr = jgraph['node_row_ptr']
nindex = {n['name']: i for i, n in enumerate(jnodes)}
list_shape = [[]] * jnode_row_ptr[-1]
list_dtype = [DTYPE_DICT[dtype]] * jnode_row_ptr[-1]
for k, v in shape.items():
list_shape[jnode_row_ptr[nindex[k]]] = v
g._set_json_attr("shape", list_shape, 'list_shape')
g._set_json_attr("dtype", list_dtype, 'list_int')
g._set_json_attr("target", target, 'str')
g = g.apply("InferShape").apply("InferType")
g = g.apply("GraphPartition").apply("GraphFuse")
return g
def bind(g, ctx):
m = _create_exec(g.handle, ctx.device_type, ctx.device_id)
return m
_get_module = tvm.get_global_func("tvm_graph._get_module_from_graph")
def compile_graph(lib_fname, sym, target, shape, dtype="float32"):
g = build(sym, target, shape, dtype)
m = _get_module(g.handle)
m.save(lib_fname)
json_str = g.apply('SaveJSON').json_attr('json')
return json_str
@tvm.register_func("tvm_graph.lower")
def _lower(sch, inputs, func_name):
f = tvm.lower(sch, inputs, name=func_name)
return f if isinstance(
f, (tvm.container.Array, tuple, list)) else [f]
@tvm.register_func("tvm_graph.build_target")
def _build(funcs, target):
return tvm.build(funcs, target=target)
_save_param_dict = tvm.get_global_func("tvm_graph._save_param_dict")
def save_params(fname, params):
args = []
args.append(fname)
args.append(len(params))
for kv in params.items():
args.append(kv[0])
args.append(kv[1])
_save_param_dict(*args)
def remote_load_exec(sess, sym_json, remote_module_name, param_blob, ctx):
"""Load a remote graph executor, with the local files.
Parameters
----------
sym_json : str
The symbol json file.
remote_module_fname : str
The relative library location to remote temp folder. The
library need to be uploaded first.
param_blob : bytes or bytearray
The binary file to the local parameters.
Returns
-------
exec : GraphExecutor
The remote graph executor containing remote function.
"""
if "load_executor" not in sess._remote_funcs:
sess._remote_funcs["load_executor"] = sess.get_function("tvm_graph._load_executor")
    assert ctx.device_type // tvm.contrib.rpc.RPC_SESS_MASK == sess._tbl_index + 1
device_type = ctx.device_type % tvm.contrib.rpc.RPC_SESS_MASK
return sess._remote_funcs["load_executor"](sym_json,
remote_module_name,
bytearray(param_blob),
device_type,
ctx.device_id)
"""NNVM operator definitions."""
import tvm
@tvm.register_func("tvm_graph.compute.add")
def compute_add(a, b):
return tvm.compute(a.shape, lambda *i: a(*i) + b(*i))
@tvm.register_func("tvm_graph.compute.exp")
def compute_exp(a):
return tvm.compute(a.shape, lambda *i: tvm.exp(a(*i)))
@tvm.register_func("tvm_graph.schedule.ewise")
def schedule_ewise(outs, target):
s = tvm.create_schedule([x.op for x in outs])
tvm.schedule.AutoInlineElemWise(s)
return s
/*!
* Copyright (c) 2017 by Contributors
* \file graph_executor.cc
*/
#include "./graph_executor.h"
namespace tvm {
namespace contrib {
PackedFunc GraphExecutor::GetFunction(
const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) {
// return member functions during query.
if (name == "set_input") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
if (args[0].type_code() == kStr) {
this->SetInput(this->GetIndex(args[0]), args[1]);
} else {
this->SetInput(args[0], args[1]);
}
});
} else if (name == "get_output") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
this->GetOutput(args[0], args[1]);
});
} else if (name == "run") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
this->Run();
});
} else if (name == "load_params") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
this->LoadParamsFromBlob(args[0]);
});
} else {
return PackedFunc();
}
}
GraphExecutor::~GraphExecutor() {
for (DLTensor* t : storage_pool_) {
TVM_CCALL(TVMArrayFree(t));
}
}
void GraphExecutor::Run() {
  // Run each op execution function in topological order.
for (size_t i = 0; i < op_execs_.size(); ++i) {
if (op_execs_[i]) op_execs_[i]();
}
}
void GraphExecutor::Init(const nnvm::Graph& g, TVMContext ctx) {
graph_ = g;
ctx_ = ctx;
module_ = g.GetAttr<tvm::runtime::Module>("module");
this->SetupNameIndex();
this->SetupStorage();
this->SetupOpExecs();
}
int GraphExecutor::GetIndex(std::string name) {
CHECK(name_idx_.count(name))
<< name << " is not in the graph.";
return name_idx_.at(name);
}
void GraphExecutor::SetInput(int index, DLTensor* data_in) {
const auto& idx = graph_.indexed_graph();
CHECK_LT(static_cast<size_t>(index), idx.input_nodes().size());
uint32_t eid = idx.entry_id(idx.input_nodes()[index], 0);
TVM_CCALL(TVMArrayCopyFromTo(data_in, &data_entry_[eid], nullptr));
}
void GraphExecutor::GetOutput(int index, DLTensor* data_out) {
const auto& idx = graph_.indexed_graph();
CHECK_LT(static_cast<size_t>(index), idx.outputs().size());
uint32_t eid = idx.entry_id(idx.outputs()[index]);
TVM_CCALL(TVMArrayCopyFromTo(&data_entry_[eid], data_out, nullptr));
}
bool LoadDLTensor(dmlc::Stream* strm, DLTensor* tensor) {
uint64_t header, reserved;
CHECK(strm->Read(&header, sizeof(header)))
<< "Invalid DLTensor file format";
CHECK(strm->Read(&reserved, sizeof(reserved)))
<< "Invalid DLTensor file format";
CHECK(header == kTVMNDArrayMagic)
<< "Invalid DLTensor file format";
CHECK(strm->Read(&tensor->ctx, sizeof(tensor->ctx)))
<< "Invalid DLTensor file format";
CHECK(strm->Read(&tensor->ndim, sizeof(tensor->ndim)))
<< "Invalid DLTensor file format";
CHECK(strm->Read(&tensor->dtype, sizeof(tensor->dtype)))
<< "Invalid DLTensor file format";
int ndim = tensor->ndim;
CHECK(strm->Read(tensor->shape, sizeof(int64_t) * ndim))
<< "Invalid DLTensor file format";
int64_t size = 1;
int type_size = tensor->dtype.bits / 8;
for (int i = 0; i < ndim; ++i) {
size *= tensor->shape[i];
}
int64_t data_byte_size;
CHECK(strm->Read(&data_byte_size, sizeof(data_byte_size)))
<< "Invalid DLTensor file format";
CHECK(data_byte_size == type_size * size)
<< "Invalid DLTensor file format";
CHECK(strm->Read(tensor->data, type_size * size))
<< "Invalid DLTensor file format";
return true;
}
void GraphExecutor::LoadParams(dmlc::Stream *strm) {
uint64_t header, reserved;
CHECK(strm->Read(&header))
<< "Invalid parameters file format";
CHECK(header == kTVMNDArrayListMagic)
<< "Invalid parameters file format";
CHECK(strm->Read(&reserved))
<< "Invalid parameters file format";
std::vector<std::string> names;
CHECK(strm->Read(&names))
<< "Invalid parameters file format";
std::unordered_map<std::string, size_t> name_eid;
const auto& idx = graph_.indexed_graph();
for (int nid : idx.input_nodes()) {
name_eid.emplace(idx[nid].source->attrs.name, idx.entry_id(nid, 0));
}
uint64_t sz;
strm->Read(&sz, sizeof(sz));
size_t size = static_cast<size_t>(sz);
CHECK(size == names.size())
<< "Invalid parameters file format";
for (size_t i = 0; i < size; ++i) {
auto iter = name_eid.find(names[i]);
CHECK(iter != name_eid.end());
CHECK(LoadDLTensor(strm, &data_entry_[iter->second]))
<< "Invalid parameters file format";
}
}
void GraphExecutor::LoadParamsFromBlob(std::string param_blob) {
dmlc::MemoryStringStream strm(&param_blob);
this->LoadParams(&strm);
}
void GraphExecutor::SetupNameIndex() {
nnvm::Symbol s;
s.outputs = graph_.outputs;
std::vector<std::string> input_names = s.ListInputNames(nnvm::Symbol::kAll);
for (size_t i = 0; i < input_names.size(); ++i) {
name_idx_[input_names[i]] = i;
}
}
void GraphExecutor::SetupStorage() {
const auto& idx = graph_.indexed_graph();
// Grab saved optimization plan from graph.
auto vstorage = graph_.MoveCopyAttr<StorageVector>("storage_id");
const auto& vtype = graph_.GetAttr<DLTypeVector>("dltype");
data_shape_ = graph_.GetAttr<ShapeVector>("shape");
data_entry_.resize(idx.num_node_entries());
// Find the maximum space size.
int max_id = 0;
for (size_t i = 0; i < data_shape_.size(); ++i) {
max_id = std::max(vstorage[i] + 1, max_id);
}
for (const auto& e : idx.input_nodes()) {
vstorage[idx.entry_id(e, 0)] = max_id++;
}
// size of each storage pool entry
std::vector<size_t> pool_entry_bytes;
// Find the maximum space size.
for (size_t i = 0; i < data_shape_.size(); ++i) {
int storage_id = vstorage[i];
size_t size = data_shape_[i].Size();
CHECK_GE(storage_id, 0) << "Do not support runtime shape op";
DLDataType t = vtype[i];
size_t bits = t.bits * t.lanes;
CHECK_EQ(bits % 8U, 0U);
size_t bytes = (bits / 8U) * size;
size_t sid = static_cast<size_t>(storage_id);
if (sid >= pool_entry_bytes.size()) {
pool_entry_bytes.resize(sid + 1, 0);
}
pool_entry_bytes[sid] = std::max(pool_entry_bytes[sid], bytes);
}
// Allocate the space.
for (size_t i = 0; i < pool_entry_bytes.size(); ++i) {
TShape shape{static_cast<int64_t>(pool_entry_bytes[i] + 3) / 4};
DLTensor* tensor;
TVM_CCALL(TVMArrayAlloc(
shape.data(), 1, kFloat, 32, 1, ctx_.device_type, ctx_.device_id, &tensor));
storage_pool_.push_back(tensor);
}
// Assign the pooled entries.
for (size_t i = 0; i < data_entry_.size(); ++i) {
int storage_id = vstorage[i];
data_entry_[i] = *storage_pool_[storage_id];
data_entry_[i].shape = const_cast<int64_t*>(data_shape_[i].data());
data_entry_[i].ndim = data_shape_[i].ndim();
data_entry_[i].dtype = vtype[i];
}
}
void GraphExecutor::SetupOpExecs() {
static const nnvm::Op* tvm_op = nnvm::Op::Get("tvm_op");
const auto& idx = graph_.indexed_graph();
op_execs_.resize(idx.num_nodes());
// setup the array and requirements.
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
const auto& inode = idx[nid];
if (inode.source->is_variable()) continue;
std::vector<DLTensor> args;
for (const auto& e : inode.inputs) {
args.push_back(data_entry_[idx.entry_id(e)]);
}
for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
uint32_t eid = idx.entry_id(nid, index);
args.push_back(data_entry_[eid]);
}
CHECK_EQ(inode.source->op(), tvm_op)
<< "transform the graph to tvm op";
op_execs_[nid] = CreateTVMOp(
inode.source->attrs, args, inode.inputs.size());
}
}
FOpExec GraphExecutor::CreateTVMOp(const nnvm::NodeAttrs& attrs,
std::vector<DLTensor> args,
size_t num_inputs) {
struct OpArgs {
std::vector<DLTensor> args;
std::vector<TVMValue> arg_values;
std::vector<int> arg_tcodes;
std::vector<int64_t> shape_data;
};
auto it = attrs.dict.find("func_name");
CHECK(it != attrs.dict.end())
<< "tvm_op must need func_name attr";
bool flatten = (attrs.dict.at("flatten_data") == "1");
std::shared_ptr<OpArgs> arg_ptr = std::make_shared<OpArgs>();
// setup address.
arg_ptr->args = std::move(args);
if (flatten) {
arg_ptr->shape_data.resize(arg_ptr->args.size());
}
for (size_t i = 0; i < arg_ptr->args.size(); ++i) {
TVMValue v;
DLTensor* t = &(arg_ptr->args[i]);
v.v_handle = t;
arg_ptr->arg_values.push_back(v);
arg_ptr->arg_tcodes.push_back(kArrayHandle);
    if (flatten) {
      arg_ptr->shape_data[i] = std::accumulate(
          t->shape, t->shape + t->ndim, int64_t(1), std::multiplies<int64_t>());
      t->ndim = 1;
      t->shape = &(arg_ptr->shape_data[i]);
    }
}
// get compiled function from module.
runtime::PackedFunc pf = module_.GetFunction(it->second, false);
CHECK(pf != nullptr) << "no such function in module: " << it->second;
auto fexec = [arg_ptr, pf] () {
runtime::TVMRetValue rv;
runtime::TVMArgs targs(arg_ptr->arg_values.data(),
arg_ptr->arg_tcodes.data(),
static_cast<int>(arg_ptr->arg_values.size()));
pf.CallPacked(targs, &rv);
};
return fexec;
}
/*! \brief Parse keyword arguments as PType arguments and save to parsed */
template<typename PType>
inline void ParamParser(nnvm::NodeAttrs* attrs) {
PType param;
try {
param.Init(attrs->dict);
} catch (const dmlc::ParamError& e) {
std::ostringstream os;
os << e.what();
os << ", in operator " << attrs->op->name << "("
<< "name=\"" << attrs->name << "\"";
for (const auto& k : attrs->dict) {
os << ", " << k.first << "=\"" << k.second << "\"";
}
os << ")";
throw dmlc::ParamError(os.str());
}
attrs->parsed = std::move(param);
}
DMLC_REGISTER_PARAMETER(TVMOpParam);
// ewise tvm op
NNVM_REGISTER_OP(tvm_op)
.set_attr_parser(ParamParser<TVMOpParam>)
.set_num_inputs([](const NodeAttrs& attrs) {
const TVMOpParam& param = nnvm::get<TVMOpParam>(attrs.parsed);
return param.num_inputs;
})
.set_num_outputs([](const NodeAttrs& attrs) {
const TVMOpParam& param = nnvm::get<TVMOpParam>(attrs.parsed);
return param.num_outputs;
});
TVM_REGISTER_GLOBAL("tvm_graph._load_executor")
.set_body([](TVMArgs args, TVMRetValue *rv) {
std::string sym_json = args[0];
std::string lib_fname = args[1];
std::string param_blob = args[2];
TVMContext ctx;
ctx.device_type = static_cast<DLDeviceType>(args[3].operator int());
ctx.device_id = args[4];
// load graph from json string
nnvm::Graph g;
g.attrs["json"] = std::make_shared<nnvm::any>(sym_json);
g = nnvm::ApplyPass(std::move(g), "LoadJSON");
// load module from file
static const PackedFunc* fsys_load_ = nullptr;
if (fsys_load_ == nullptr) {
fsys_load_ = runtime::Registry::Get("tvm.contrib.rpc.server.load_module");
CHECK(fsys_load_ != nullptr);
}
runtime::Module m = (*fsys_load_)(lib_fname);
g.attrs["module"] = std::make_shared<nnvm::any>(m);
std::shared_ptr<GraphExecutor> exec =
std::make_shared<GraphExecutor>();
exec->Init(g, ctx);
  // load params from the serialized string blob
exec->LoadParamsFromBlob(std::move(param_blob));
*rv = tvm::runtime::Module(exec);
});
} // namespace contrib
} // namespace tvm
namespace dmlc {
namespace json {
template<>
struct Handler<DLDataType> {
static void Write(JSONWriter *writer, const DLDataType& data) {
std::vector<int> tmp({data.code, data.bits, data.lanes});
writer->Write(tmp);
}
static void Read(JSONReader *reader, DLDataType* data) {
std::vector<int> tmp;
reader->Read(&tmp);
data->code = tmp[0];
data->bits = tmp[1];
data->lanes = tmp[2];
}
};
DMLC_JSON_ENABLE_ANY(std::vector<DLDataType>, list_dltype);
}  // namespace json
}  // namespace dmlc
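
The Handler<DLDataType> specialization above stores a DLPack type as a three-element integer array [code, bits, lanes]. A minimal sketch of what this yields for float32, assuming a dlpack version that names the float type code kDLFloat (value 2); the standalone main is illustrative only:

#include <dlpack/dlpack.h>
#include <dmlc/json.h>
#include <iostream>
#include <sstream>

int main() {
  // float32 in DLPack terms: code = kDLFloat (2), bits = 32, lanes = 1.
  DLDataType t{kDLFloat, 32, 1};
  std::ostringstream os;
  dmlc::JSONWriter writer(&os);
  writer.Write(t);  // dispatches to the Handler<DLDataType> defined above
  std::cout << os.str() << std::endl;  // prints [2, 32, 1] (modulo whitespace)
  return 0;
}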
/*!
* Copyright (c) 2017 by Contributors
* \file graph_executor.h
*/
#ifndef TVM_GRAPH_EXECUTOR_H_
#define TVM_GRAPH_EXECUTOR_H_
#include <dmlc/io.h>
#include <dmlc/memory_io.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/module.h>
#include <nnvm/graph.h>
#include <nnvm/graph_attr_types.h>
#include <nnvm/tuple.h>
#include <nnvm/pass.h>
#include <numeric>
#include <string>
namespace tvm {
namespace contrib {
using tvm::runtime::TVMArgs;
using tvm::runtime::TVMRetValue;
using tvm::runtime::PackedFunc;
using nnvm::StorageVector;
using nnvm::ShapeVector;
using nnvm::TShape;
using nnvm::NodeAttrs;
/*! \brief DLPack compatible data types */
using DLTypeVector = std::vector<DLDataType>;
/*! \brief The executor function */
using FOpExec = std::function<void()>;
/*! \brief macro to do C API call */
#define TVM_CCALL(func) \
{ \
int ret = (func); \
CHECK_EQ(ret, 0) \
<< TVMGetLastError(); \
}
constexpr uint64_t kTVMNDArrayMagic = 0xDD5E40F096B4A13F;
constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7;
/*! \brief Graph Executor with TVM runtime */
class GraphExecutor : public runtime::ModuleNode {
public:
const char* type_key() const {
return "GraphExecutor";
}
PackedFunc GetFunction(
const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self);
// Destructor
~GraphExecutor();
// Setup with a given graph
void Init(const nnvm::Graph& g, TVMContext ctx);
// Get index of variable
int GetIndex(std::string name);
// Copy data to index-th input
void SetInput(int index, DLTensor* data_in);
// Copy index-th output to data_out
void GetOutput(int index, DLTensor* data_out);
// Load parameters from stream
void LoadParams(dmlc::Stream* strm);
// Load parameters from binary file blob
void LoadParamsFromBlob(std::string param_blob);
// Execute the graph.
void Run();
private:
// functions
void SetupNameIndex();
void SetupStorage();
void SetupOpExecs();
// Constructor to create TVM op
FOpExec CreateTVMOp(const nnvm::NodeAttrs& attrs,
std::vector<DLTensor> inputs,
size_t num_inputs);
// The graph to be executed.
nnvm::Graph graph_;
// The execution context
TVMContext ctx_;
// Common storage pool
std::vector<DLTensor*> storage_pool_;
// The data shape
std::vector<TShape> data_shape_;
// The data entry
std::vector<DLTensor> data_entry_;
// The operation lambda on each node
std::vector<FOpExec> op_execs_;
// The code module.
tvm::runtime::Module module_;
std::unordered_map<std::string, size_t> name_idx_;
};
struct TVMOpParam : public dmlc::Parameter<TVMOpParam> {
std::string func_name;
uint32_t num_inputs;
uint32_t num_outputs;
bool flatten_data;
DMLC_DECLARE_PARAMETER(TVMOpParam) {
DMLC_DECLARE_FIELD(func_name);
DMLC_DECLARE_FIELD(num_inputs)
.set_default(1);
DMLC_DECLARE_FIELD(num_outputs)
.set_default(1);
DMLC_DECLARE_FIELD(flatten_data)
.set_default(false);
}
};
} // namespace contrib
} // namespace tvm
#endif // TVM_GRAPH_EXECUTOR_H_
/*!
* Copyright (c) 2017 by Contributors
* \file graph_executor_ext.cc
*/
#include "./graph_executor.h"
namespace tvm {
namespace contrib {
bool SaveDLTensor(dmlc::Stream* strm, DLTensor* tensor) {
uint64_t header = kTVMNDArrayMagic, reserved = 0;
strm->Write(&header, sizeof(header));
strm->Write(&reserved, sizeof(reserved));
strm->Write(&tensor->ctx, sizeof(tensor->ctx));
strm->Write(&tensor->ndim, sizeof(tensor->ndim));
strm->Write(&tensor->dtype, sizeof(tensor->dtype));
int ndim = tensor->ndim;
strm->Write(tensor->shape, sizeof(int64_t) * ndim);
int type_size = tensor->dtype.bits / 8;
int64_t size = 1;
for (int i = 0; i < ndim; ++i) {
size *= tensor->shape[i];
}
int64_t data_byte_size = type_size * size;
strm->Write(&data_byte_size, sizeof(data_byte_size));
strm->Write(tensor->data, data_byte_size);
return true;
}
TVM_REGISTER_GLOBAL("tvm_graph._save_param_dict")
.set_body([](TVMArgs args, TVMRetValue *rv) {
std::string fname = args[0];
int num_params = args[1];
std::vector<std::string> names;
names.reserve(num_params);
std::vector<DLTensor*> arrays;
arrays.reserve(num_params);
for (int i = 2; i < (2 + 2*num_params); i += 2) {
names.emplace_back(args[i].operator std::string());
arrays.emplace_back(args[i+1].operator DLTensor*());
}
std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
uint64_t header = kTVMNDArrayListMagic, reserved = 0;
fo->Write(&header, sizeof(header));
fo->Write(&reserved, sizeof(reserved));
fo->Write(names);
{
uint64_t sz = static_cast<uint64_t>(arrays.size());
fo->Write(&sz, sizeof(sz));
for (size_t i = 0; i < sz; ++i) {
SaveDLTensor(fo.get(), arrays[i]);
}
}
});
// Create executor
tvm::runtime::Module CreateExecutor(nnvm::Graph g, TVMContext ctx) {
std::shared_ptr<GraphExecutor> exec =
std::make_shared<GraphExecutor>();
exec->Init(g, ctx);
return tvm::runtime::Module(exec);
}
TVM_REGISTER_GLOBAL("tvm_graph._create_executor")
.set_body([](TVMArgs args, TVMRetValue *rv) {
void* graph_handle = args[0];
int device_type = args[1];
int device_id = args[2];
TVMContext ctx{static_cast<DLDeviceType>(device_type), device_id};
nnvm::Graph g = static_cast<nnvm::Graph*>(graph_handle)[0];
*rv = CreateExecutor(g, ctx);
});
TVM_REGISTER_GLOBAL("tvm_graph._get_module_from_graph")
.set_body([](TVMArgs args, TVMRetValue *rv) {
void* graph_handle = args[0];
nnvm::Graph* g = static_cast<nnvm::Graph*>(graph_handle);
*rv = g->MoveCopyAttr<tvm::runtime::Module>("module");
});
} // namespace contrib
} // namespace tvm
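
For reference, everything the executor exposes goes through the PackedFunc interface returned by GraphExecutor::GetFunction; the Python tests later in this diff drive it the same way via m['set_input'], m['run'], and m['get_output']. A minimal C++ sketch, assuming exec is a module produced by CreateExecutor above and the DLTensors match the graph's shapes:

#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>

// Hedged sketch: drive a GraphExecutor purely through its packed functions.
void RunOnce(tvm::runtime::Module exec, DLTensor* x, DLTensor* out) {
  tvm::runtime::PackedFunc set_input = exec.GetFunction("set_input");
  tvm::runtime::PackedFunc run = exec.GetFunction("run");
  tvm::runtime::PackedFunc get_output = exec.GetFunction("get_output");
  set_input("x", x);   // by name; an integer input index also works
  run();               // executes every fused op in topological order
  get_output(0, out);  // copies the 0-th output into out
}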
/*!
* Copyright (c) 2017 by Contributors
* \file graph_handle.cc
*/
#include <tvm/packed_func_ext.h>
#include "./graph_handle.h"
namespace tvm {
TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
.set_dispatch<GraphHandleNode>([](const GraphHandleNode *op, IRPrinter *p) {
p->stream << "graph-handle("
<< "handle=0x" << std::hex
<< reinterpret_cast<uint64_t>(op->graph_handle) << ")";
});
TVM_REGISTER_NODE_TYPE(GraphHandleNode);
} // namespace tvm
/*!
* Copyright (c) 2017 by Contributors
* \file graph.h
* \brief Data structure about computational graph.
*/
#ifndef TVM_GRAPH_HANDLE_H_
#define TVM_GRAPH_HANDLE_H_
#include <string>
#include <tvm/base.h>
namespace tvm {
/*!
* \brief Computational graph handle.
* Use GraphHandle as its container type
*/
struct GraphHandleNode : public Node {
void *graph_handle;
void VisitAttrs(AttrVisitor* v) final {
v->Visit("graph_handle", &graph_handle);
}
static constexpr const char* _type_key = "GraphHandle";
TVM_DECLARE_NODE_TYPE_INFO(GraphHandleNode, Node);
};
/*! \brief Defines graph handle */
TVM_DEFINE_NODE_REF(GraphHandle, GraphHandleNode);
} // namespace tvm
#endif // TVM_GRAPH_HANDLE_H_
/*!
* Copyright (c) 2017 by Contributors
 * \file Additional optimization passes of NNVM.
*/
#include <dmlc/json.h>
#include <nnvm/graph.h>
#include <nnvm/op_attr_types.h>
#include <nnvm/graph_attr_types.h>
#include <nnvm/tuple.h>
#include <nnvm/pass.h>
#include <tvm/operation.h>
#include <tvm/lowered_func.h>
#include "./op_attr_types.h"
namespace tvm {
namespace contrib {
using nnvm::any;
using nnvm::IndexedGraph;
// The single fuse rule.
enum class FuseRule {
  kUnknown,
kFuseToMaster,
kRealize
};
DLDataType GetDLType(int type_flag) {
if (type_flag == 0) return Type2TVMType(Float(32));
LOG(FATAL) << "unknown type_flag=" << type_flag;
return Type2TVMType(Float(32));
}
// Partition the graph into segments.
// Each segment will be compiled into one operator.
// We also need to mark the property of each segment.
nnvm::Graph GraphPartition(nnvm::Graph g) {
// setup ref counter
const IndexedGraph& idx = g.indexed_graph();
// Get attributes from the graph
const ShapeVector& shape_vec = g.GetAttr<ShapeVector>("shape");
const DTypeVector& dtype_vec = g.GetAttr<DTypeVector>("dtype");
// Transform to dltype
// In the future, directly do type inference in dltype.
DLTypeVector dltype_vec = DLTypeVector(dtype_vec.size());
for (size_t i = 0; i < dtype_vec.size(); ++i) {
dltype_vec[i] = GetDLType(dtype_vec[i]);
}
// Reference counter of each op node
// For now, always store result when an op is referred more than once.
std::vector<uint32_t> ref_count(idx.num_nodes(), 0);
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
const auto& inode = idx[nid];
if (inode.source->is_variable()) continue;
for (const auto& e : inode.inputs) {
++ref_count[e.node_id];
}
}
for (const auto& e : idx.outputs()) {
// this line will realize all the outputs
ref_count[e.node_id] += 2;
}
// Pattern of the subgraph
std::vector<TOpPattern> pattern_vec(idx.num_nodes(), kExtern);
// Whether node can be fused to parent.
  std::vector<FuseRule> fuse_vec(idx.num_nodes(), FuseRule::kUnknown);
// Master node id of fusion segment.
std::vector<int> master_vec(idx.num_nodes(), -1);
// Operator pattern
static auto& op_pattern = nnvm::Op::GetAttr<TOpPattern>("TOpPattern");
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
const auto& inode = idx[nid];
if (inode.source->is_variable()) {
fuse_vec[nid] = FuseRule::kRealize; continue;
}
TOpPattern pt = op_pattern.get(inode.source->op(), kExtern);
if (pt <= kBroadcast) {
int chosen_master = -1;
bool ewise = inode.source->num_outputs() == 1;
for (const auto& e : inode.inputs) {
        if (fuse_vec[e.node_id] == FuseRule::kUnknown) {
TOpPattern ipt = pattern_vec[e.node_id];
if (ipt != kElemWise) ewise = false;
if (ipt <= kBroadcast) {
fuse_vec[e.node_id] = FuseRule::kFuseToMaster;
} else if (ipt == kComplex && chosen_master == -1 &&
shape_vec[idx.entry_id(nid, 0)] == shape_vec[idx.entry_id(e)]) {
chosen_master = master_vec[e.node_id];
fuse_vec[e.node_id] = FuseRule::kFuseToMaster;
} else {
fuse_vec[e.node_id] = FuseRule::kRealize;
}
}
if (ewise) {
if (shape_vec[idx.entry_id(nid, 0)] != shape_vec[idx.entry_id(e)]) {
ewise = false;
}
}
}
master_vec[nid] = chosen_master;
if (chosen_master != -1) {
pt = kComplex;
} else {
pt = ewise ? kElemWise : kBroadcast;
}
} else {
master_vec[nid] = nid;
for (const auto& e : inode.inputs) {
        if (fuse_vec[e.node_id] == FuseRule::kUnknown) {
fuse_vec[e.node_id] = FuseRule::kRealize;
if (master_vec[e.node_id] == -1) {
master_vec[e.node_id] = e.node_id;
}
}
}
}
pattern_vec[nid] = pt;
if (ref_count[nid] > 1) {
fuse_vec[nid] = FuseRule::kRealize;
if (master_vec[nid] == -1) {
master_vec[nid] = nid;
}
}
}
// point to the group root id of each node
std::vector<int> group_vec(idx.num_nodes(), -1);
for (uint32_t i = idx.num_nodes(); i != 0; --i) {
uint32_t nid = i - 1;
const auto& inode = idx[nid];
if (group_vec[nid] == -1) {
group_vec[nid] = nid;
}
// propagate the group id.
for (const auto& e : inode.inputs) {
if (fuse_vec[e.node_id] == FuseRule::kFuseToMaster) {
CHECK(group_vec[e.node_id] == -1||
group_vec[e.node_id] == group_vec[nid]);
group_vec[e.node_id] = group_vec[nid];
}
}
}
g.attrs["group_root"] = std::make_shared<any>(std::move(group_vec));
g.attrs["group_master"] = std::make_shared<any>(std::move(master_vec));
g.attrs["pattern"] = std::make_shared<any>(std::move(pattern_vec));
g.attrs["dltype"] = std::make_shared<any>(std::move(dltype_vec));
return g;
}
NNVM_REGISTER_PASS(GraphPartition)
.set_body(GraphPartition)
.depend_graph_attr("shape")
.depend_graph_attr("dtype")
.provide_graph_attr("dltype");
struct NodeEntryHash {
size_t operator()(const IndexedGraph::NodeEntry& e) const {
return e.node_id;
}
};
struct NodeEntryEqual {
size_t operator()(const IndexedGraph::NodeEntry& a,
const IndexedGraph::NodeEntry& b) const {
return a.node_id == b.node_id && a.index == b.index;
}
};
// Auxiliary data structure for representing fused op.
struct FuseEntry {
// The inputs
std::vector<IndexedGraph::NodeEntry> inputs;
// The input map
std::unordered_map<IndexedGraph::NodeEntry, Tensor,
NodeEntryHash, NodeEntryEqual> imap;
// Output tensors
Array<Tensor> outputs;
// Placeholder for inputs
Array<Tensor> placeholder;
// Computing schedule
Schedule schedule;
// Function name
std::string func_name;
};
// Fuse the partitioned graph into segments.
// Create a new graph with fused nodes.
// Also inherit the shape and dltype attributes from the previous graph.
nnvm::Graph GraphFuse(nnvm::Graph g) {
// setup ref counter
const IndexedGraph& idx = g.indexed_graph();
// Get attributes from the graph
const ShapeVector& shape_vec = g.GetAttr<ShapeVector>("shape");
const DLTypeVector& dltype_vec = g.GetAttr<DLTypeVector>("dltype");
const DTypeVector& dtype_vec = g.GetAttr<DTypeVector>("dtype");
const std::vector<int>& group_vec = g.GetAttr<std::vector<int> >("group_root");
const std::vector<int>& master_vec = g.GetAttr<std::vector<int> >("group_master");
const std::vector<TOpPattern>& pattern_vec =
g.GetAttr<std::vector<TOpPattern> >("pattern");
std::string target = g.GetAttr<std::string>("target");
std::vector<FuseEntry> fuse_vec(idx.num_nodes());
// setup inputs and placeholder.
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
const auto& inode = idx[nid];
if (inode.source->is_variable()) continue;
CHECK_GE(group_vec[nid], 0);
int root_id = group_vec[nid];
FuseEntry& fe = fuse_vec[root_id];
TOpPattern pt = pattern_vec[root_id];
for (const auto& e : inode.inputs) {
if (group_vec[e.node_id] != root_id && fe.imap.count(e) == 0) {
Array<Expr> shape;
if (pt == kElemWise) {
// elementwise support flatten
int64_t prod = 1;
for (int64_t x : shape_vec[idx.entry_id(e)]) {
prod *= x;
}
CHECK_LE(prod, static_cast<int64_t>(std::numeric_limits<int>::max()));
shape.push_back(make_const(Int(32), prod));
} else {
for (int64_t x : shape_vec[idx.entry_id(e)]) {
CHECK_LE(x, static_cast<int64_t>(std::numeric_limits<int>::max()));
shape.push_back(make_const(Int(32), x));
}
}
std::ostringstream os_name;
os_name << "input" << fe.inputs.size();
Tensor data = placeholder(
shape, TVMType2Type(dltype_vec[idx.entry_id(e)]),
os_name.str());
fe.imap[e] = data;
fe.inputs.push_back(e);
fe.placeholder.push_back(data);
}
}
}
// Setup the Tensor
std::vector<Tensor> tensor_vec(idx.num_node_entries());
static auto& fcompute =
nnvm::Op::GetAttr<FTVMCompute>("FTVMCompute");
static auto& fschedule =
nnvm::Op::GetAttr<FTVMSchedule>("FTVMSchedule");
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
const auto& inode = idx[nid];
if (inode.source->is_variable()) continue;
int root_id = group_vec[nid];
FuseEntry& fe = fuse_vec[root_id];
Array<Tensor> inputs;
// input loading
for (const auto& e : inode.inputs) {
if (group_vec[e.node_id] != root_id) {
auto it = fe.imap.find(e);
CHECK(it != fe.imap.end());
inputs.push_back(it->second);
} else {
Tensor t = tensor_vec[idx.entry_id(e)];
CHECK(t.defined());
inputs.push_back(t);
}
}
Array<Tensor> out = fcompute[inode.source->op()](
inode.source->attrs, inputs);
CHECK_EQ(out.size(), inode.source->num_outputs());
// schedule on root node, and use master's schedule
if (nid != root_id) {
for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
uint32_t eid = idx.entry_id(nid, index);
tensor_vec[eid] = out[index];
}
} else {
fe.outputs = out;
int master = master_vec[root_id];
CHECK_GE(master, 0);
fe.schedule = fschedule[idx[master].source->op()](
idx[master].source->attrs, fe.outputs, target);
std::ostringstream os;
os << idx[master].source->attrs.name + "_id" << nid;
fe.func_name = os.str();
}
}
static const PackedFunc& flower = GetPackedFunc("tvm_graph.lower");
static const PackedFunc& fbuild = GetPackedFunc("tvm_graph.build_target");
Array<tvm::LoweredFunc> funcs;
for (const FuseEntry& fe : fuse_vec) {
if (fe.schedule.defined()) {
Array<tvm::Tensor> args = fe.placeholder;
for (tvm::Tensor x : fe.outputs) {
args.push_back(x);
}
Array<tvm::LoweredFunc> ret = flower(fe.schedule, args, fe.func_name);
for (LoweredFunc x : ret) {
funcs.push_back(x);
}
}
}
tvm::runtime::Module module = fbuild(funcs, target);
// Final step: Remap the node, with given attribute
const nnvm::Op* tvm_op = nnvm::Op::Get("tvm_op");
std::unordered_map<uint32_t, nnvm::NodePtr> old_new;
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
const auto& inode = idx[nid];
if (inode.source->is_variable()) {
nnvm::NodePtr np = nnvm::Node::Create();
np->attrs = inode.source->attrs;
old_new[nid] = np;
} else {
int root_id = group_vec[nid];
if (nid != root_id) continue;
FuseEntry& fe = fuse_vec[root_id];
nnvm::NodePtr np = nnvm::Node::Create();
np->attrs.op = tvm_op;
np->attrs.name = inode.source->attrs.name;
np->attrs.dict["num_inputs"] = std::to_string(fe.inputs.size());
np->attrs.dict["num_outputs"] = std::to_string(fe.outputs.size());
np->attrs.dict["func_name"] = fuse_vec[nid].func_name;
np->attrs.dict["flatten_data"] = std::to_string(pattern_vec[nid] == kElemWise);
np->op()->attr_parser(&(np->attrs));
for (const auto& e : fe.inputs) {
auto it = old_new.find(e.node_id);
CHECK(it != old_new.end())
<< "cannot find node_id=" << e.node_id;
np->inputs.emplace_back(
nnvm::NodeEntry{it->second, e.index, e.version});
}
for (const uint32_t node_id : inode.control_deps) {
auto it = old_new.find(node_id);
CHECK(it != old_new.end());
np->control_deps.emplace_back(it->second);
}
old_new[nid] = np;
}
}
nnvm::Graph ret;
for (const auto& e : idx.outputs()) {
auto it = old_new.find(group_vec[e.node_id]);
CHECK(it != old_new.end())
<< "cannot find node_id=" << e.node_id;
ret.outputs.emplace_back(
nnvm::NodeEntry{it->second, e.index, e.version});
}
const IndexedGraph& new_idx = ret.indexed_graph();
ShapeVector new_shape_vec = ShapeVector(new_idx.num_node_entries(), TShape());
DTypeVector new_dtype_vec = DTypeVector(new_idx.num_node_entries());
DLTypeVector new_dltype_vec = DLTypeVector(new_idx.num_node_entries());
for (const auto& kv : old_new) {
uint32_t nid = kv.first;
const auto& inode = idx[nid];
for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
uint32_t new_eid = new_idx.entry_id(new_idx.node_id(kv.second.get()), i);
uint32_t old_eid = idx.entry_id(nid, i);
new_shape_vec[new_eid] = shape_vec[old_eid];
new_dtype_vec[new_eid] = dtype_vec[old_eid];
new_dltype_vec[new_eid] = dltype_vec[old_eid];
}
}
ret.attrs["shape"] = std::make_shared<any>(std::move(new_shape_vec));
ret.attrs["dtype"] = std::make_shared<any>(std::move(new_dtype_vec));
ret.attrs["dltype"] = std::make_shared<any>(std::move(new_dltype_vec));
ret.attrs["module"] = std::make_shared<any>(std::move(module));
ret = nnvm::ApplyPass(ret, "PlanMemory");
return ret;
}
NNVM_REGISTER_PASS(GraphFuse)
.set_body(GraphFuse);
const TLayoutInfo& GetDefaultLayout() {
static TLayoutInfo default_layout = "default";
return default_layout;
}
nnvm::NodePtr CreateLayoutTransformNode(const std::string& src,
const std::string& dst) {
static const nnvm::Op* trans_op = nnvm::Op::Get("layout_transform");
static int count = 0;
nnvm::NodePtr n = nnvm::Node::Create();
n->attrs.op = trans_op;
n->attrs.name = src + "_to_" + dst + std::to_string(count++);
n->attrs.dict["src_layout"] = src;
n->attrs.dict["dst_layout"] = dst;
n->op()->attr_parser(&(n->attrs));
return n;
}
/*!
* \brief A simple layout transform pass that will
* insert layout transform nodes automatically.
*/
nnvm::Graph LayoutTransform(nnvm::Graph src) {
static auto& op_layout_request =
nnvm::Op::GetAttr<FTVMLayoutRequest>("FTVMLayoutRequest");
static auto& op_vecop =
nnvm::Op::GetAttr<FTVMVectorizedOp>("FTVMVectorizedOp");
static auto& op_pattern = nnvm::Op::GetAttr<TOpPattern>("TOpPattern");
const ShapeVector& shape_vec = src.GetAttr<ShapeVector>("shape");
const std::vector<TLayoutInfo>& input_layouts =
src.GetAttr<std::vector<TLayoutInfo> >("layout");
const IndexedGraph& idx = src.indexed_graph();
std::vector<TLayoutInfo> produce_vec(idx.num_node_entries(), GetDefaultLayout());
std::vector<nnvm::NodePtr> mirror_vec(idx.num_nodes(), nullptr);
  // use op pattern to decide whether an op is a map op
auto is_map_op = [&](size_t nid) {
TOpPattern pt = op_pattern.get(idx[nid].source->op(), kExtern);
bool is_map = (pt <= kBroadcast);
if (pt == kBroadcast) {
for (const auto& e : idx[nid].inputs) {
if (shape_vec[idx.entry_id(nid, 0)] != shape_vec[idx.entry_id(e)]) {
is_map = false;
break;
}
}
}
return is_map;
};
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
const auto& inode = idx[nid];
nnvm::NodePtr new_node = nnvm::Node::Create();
*new_node = *(inode.source);
if (new_node->is_variable()) {
auto input_iter = std::find(
idx.input_nodes().cbegin(), idx.input_nodes().cend(), nid);
CHECK(input_iter != idx.input_nodes().cend());
size_t input_id = std::distance(idx.input_nodes().cbegin(), input_iter);
produce_vec[idx.entry_id(nid, 0)] = input_layouts[input_id];
mirror_vec[nid] = new_node;
continue;
}
if (op_vecop.count(inode.source->op())) {
new_node = op_vecop[inode.source->op()](inode.source);
new_node->inputs.resize(new_node->num_inputs());
}
// set up output and input layouts
std::vector<TLayoutInfo> request_ilayouts(new_node->num_inputs(), GetDefaultLayout());
if (op_layout_request.count(new_node->op())) {
std::vector<TLayoutInfo> produce_olayouts(new_node->num_outputs(), GetDefaultLayout());
CHECK(op_layout_request[new_node->op()](new_node->attrs, &request_ilayouts, &produce_olayouts))
<< "Layout request fail";
CHECK_EQ(request_ilayouts.size(), new_node->num_inputs());
CHECK_EQ(produce_olayouts.size(), new_node->num_outputs());
for (size_t i = 0; i < new_node->num_outputs(); ++i) {
produce_vec[idx.entry_id(nid, i)] = produce_olayouts[i];
}
}
bool map_layout = is_map_op(nid);
if (map_layout) {
const TLayoutInfo& layout = produce_vec[idx.entry_id(inode.inputs[0])];
for (const auto& e : inode.inputs) {
if (produce_vec[idx.entry_id(e)] != layout) {
map_layout = false;
break;
}
}
if (map_layout) {
for (size_t i = 0; i < inode.source->num_outputs(); ++i) {
produce_vec[idx.entry_id(nid, i)] = layout;
}
}
}
for (size_t i = 0; i < inode.inputs.size(); ++i) {
const auto& e = inode.inputs[i];
const nnvm::NodePtr& in = mirror_vec[e.node_id];
new_node->inputs[i] =
nnvm::NodeEntry{in, e.index, e.version};
TLayoutInfo produce = produce_vec[idx.entry_id(e)];
TLayoutInfo request = request_ilayouts[i];
if (!map_layout && (produce != request)) {
nnvm::NodePtr tnode = CreateLayoutTransformNode(produce, request);
tnode->attrs.name =
idx[e.node_id].source->attrs.name + "_" + request;
tnode->inputs.emplace_back(new_node->inputs[i]);
new_node->inputs[i] = nnvm::NodeEntry{tnode, 0, 0};
}
}
mirror_vec[nid] = new_node;
}
std::vector<nnvm::NodeEntry> outputs;
for (const auto& e : idx.outputs()) {
TLayoutInfo produce = produce_vec[idx.entry_id(e)];
if (produce != GetDefaultLayout()) {
nnvm::NodePtr tnode = CreateLayoutTransformNode(produce, GetDefaultLayout());
tnode->attrs.name =
idx[e.node_id].source->attrs.name + "_default";
tnode->inputs.emplace_back(
nnvm::NodeEntry{mirror_vec[e.node_id], e.index, e.version});
outputs.emplace_back(nnvm::NodeEntry{tnode, 0, 0});
} else {
outputs.emplace_back(
nnvm::NodeEntry{mirror_vec[e.node_id], e.index, e.version});
}
}
nnvm::Graph ret;
ret.outputs = std::move(outputs);
return ret;
}
NNVM_REGISTER_PASS(LayoutTransform)
.set_body(LayoutTransform);
DMLC_REGISTER_PARAMETER(LayoutTransformParam);
/*! \brief Parse keyword arguments as PType arguments and save to parsed */
template<typename PType>
inline void ParamParser(nnvm::NodeAttrs* attrs) {
PType param;
try {
param.Init(attrs->dict);
} catch (const dmlc::ParamError& e) {
std::ostringstream os;
os << e.what();
os << ", in operator " << attrs->op->name << "("
<< "name=\"" << attrs->name << "\"";
for (const auto& k : attrs->dict) {
os << ", " << k.first << "=\"" << k.second << "\"";
}
os << ")";
throw dmlc::ParamError(os.str());
}
attrs->parsed = std::move(param);
}
NNVM_REGISTER_OP(layout_transform)
.set_attr_parser(ParamParser<LayoutTransformParam>)
.set_num_inputs(1)
.set_num_outputs(1)
.add_argument("data", "NDArray-or-Symbol", "Input data")
.add_arguments(LayoutTransformParam::__FIELDS__());
nnvm::Graph PruneGraph(nnvm::Graph src) {
const auto& params = src.GetAttr<std::unordered_set<std::string> >("params");
std::unordered_set<nnvm::Node*> pruned;
nnvm::NodeEntryMap<nnvm::NodePtr> entry_var;
DFSVisit(src.outputs, [&](const nnvm::NodePtr& n) {
bool can_be_pruned = true;
if (n->is_variable()) {
if (params.count(n->attrs.name)) {
pruned.emplace(n.get());
}
can_be_pruned = false;
}
for (const auto& e : n->inputs) {
if (!pruned.count(e.node.get())) {
can_be_pruned = false;
}
}
if (can_be_pruned) {
pruned.emplace(n.get());
} else {
// scan again to find edge nodes, skip variables
for (auto& e : n->inputs) {
if (!e.node->is_variable() && pruned.count(e.node.get())) {
if (!entry_var.count(e)) {
nnvm::NodePtr var = nnvm::Node::Create();
var->attrs.name = e.node->attrs.name + "_output" + std::to_string(e.index);
entry_var.emplace(e, var);
}
e = nnvm::NodeEntry{entry_var.at(e), 0, 0};
}
}
}
});
nnvm::Graph pre_graph;
pre_graph.outputs.reserve(entry_var.size());
std::vector<std::string> output_names;
output_names.reserve(entry_var.size());
for (auto kv : entry_var) {
if (kv.first.node->is_variable()) continue;
pre_graph.outputs.emplace_back(kv.first);
output_names.emplace_back(kv.second->attrs.name);
}
pre_graph.attrs["pruned_params"] =
std::make_shared<dmlc::any>(std::move(output_names));
src.attrs["pre_graph"] =
std::make_shared<dmlc::any>(std::move(pre_graph));
return src;
}
NNVM_REGISTER_PASS(PruneGraph)
.set_body(PruneGraph);
} // namespace contrib
} // namespace tvm
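
To make the remap step in GraphFuse concrete: for the exp(x + y) graph used by the tests below, GraphPartition marks both ops as elementwise and GraphFuse collapses them into a single tvm_op node. A hedged sketch of that node; the name and func_name values are assumed for illustration:

#include <nnvm/graph.h>
#include <nnvm/op.h>

// Sketch of the single tvm_op node GraphFuse emits for a fused
// exp(x + y) segment; the attrs mirror the remap loop above.
nnvm::NodePtr MakeFusedExpAdd() {
  nnvm::NodePtr np = nnvm::Node::Create();
  np->attrs.op = nnvm::Op::Get("tvm_op");
  np->attrs.name = "exp0";                   // master node's name (assumed)
  np->attrs.dict["num_inputs"] = "2";        // x and y feed the segment
  np->attrs.dict["num_outputs"] = "1";
  np->attrs.dict["func_name"] = "exp0_id2";  // lowered kernel name (assumed)
  np->attrs.dict["flatten_data"] = "1";      // elementwise segment: 1-D views
  np->op()->attr_parser(&(np->attrs));       // parses the dict into TVMOpParam
  return np;
}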
/*!
* Copyright (c) 2016 by Contributors
* \file op_attr_types.h
* \brief The Expr and related elements in DataFlow construction.
*/
#ifndef TVM_OP_ATTR_TYPES_H_
#define TVM_OP_ATTR_TYPES_H_
#include <tvm/expr.h>
#include <tvm/tensor.h>
#include <tvm/schedule.h>
#include <tvm/packed_func_ext.h>
#include <tvm/runtime/registry.h>
#include <nnvm/op_attr_types.h>
#include <nnvm/graph_attr_types.h>
#include <nnvm/graph.h>
#include <vector>
#include <string>
namespace tvm {
namespace contrib {
using runtime::PackedFunc;
using nnvm::StorageVector;
using nnvm::ShapeVector;
using nnvm::DTypeVector;
using nnvm::TShape;
using nnvm::NodeAttrs;
/*! \brief DLPack compatible data types */
using DLTypeVector = std::vector<DLDataType>;
/*!
* \brief Computation description interface
* \param attrs The attribute of the node.
* \param inputs The input tensors(placeholders)
* \return The output description of the tensor.
*/
using FTVMCompute = std::function<
Array<Tensor>
(const NodeAttrs& attrs, const Array<Tensor>& inputs)>;
/*!
* \brief Build the computation schedule for
* op whose root is at current op.
* \param attrs The attribute of the node.
* \param outs The output tensors.
* \param target The build target.
* \return schedule The computation schedule.
*/
using FTVMSchedule = std::function<
Schedule(const NodeAttrs& attrs,
const Array<Tensor>& outs,
const std::string& target)>;
/*! \brief Layout Information. */
using TLayoutInfo = std::string;
/*!
* \brief The producer consumer function of node layout
* \param attrs The attribute of the node.
* \param ilayouts The input layouts that the node request.
* \param olayouts The output layouts that the node produce.
* \return bool The success flag.
*/
using FTVMLayoutRequest = std::function<bool (const NodeAttrs& attrs,
std::vector<TLayoutInfo> *ilayouts,
std::vector<TLayoutInfo> *olayouts)>;
/*! \brief The default layout. */
const TLayoutInfo& GetDefaultLayout();
/*! \brief Parameters of layout transform operator */
struct LayoutTransformParam : public dmlc::Parameter<LayoutTransformParam> {
std::string src_layout;
std::string dst_layout;
DMLC_DECLARE_PARAMETER(LayoutTransformParam) {
DMLC_DECLARE_FIELD(src_layout);
DMLC_DECLARE_FIELD(dst_layout);
}
};
/*! \brief Transform from normal operator to vectorized operator */
using FTVMVectorizedOp = std::function<nnvm::NodePtr (const nnvm::Node*)>;
// The storage result of op
enum OpPatternKind : int {
// Elementwise operation
kElemWise,
// Broadcast operation
kBroadcast,
// Complex operation, can fuse bcast in input/outputs
// but cannot chain another complex op
kComplex,
// Extern operation, cannot fuse anything.
kExtern
};
using TOpPattern = int;
/*!
* \brief Get PackedFunction from global registry and
* report error if it does not exist
* \param name The name of the function.
* \return The created PackedFunc.
*/
inline const PackedFunc& GetPackedFunc(const std::string& name) {
const PackedFunc* pf = tvm::runtime::Registry::Get(name);
CHECK(pf != nullptr) << "Cannot find function " << name << " in registry";
return *pf;
}
/*!
* \brief Create a Graph execution module by a given graph and the code module.
* \param g The graph to be executed.
* \param m The tvm module containing the functions.
* \return The created executor module.
*/
tvm::runtime::Module CreateExecutor(nnvm::Graph g);
} // namespace contrib
} // namespace tvm
#endif // TVM_OP_ATTR_TYPES_H_
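
No op in this app registers FTVMLayoutRequest, so a hedged sketch of an implementation may help; the op name and the "vec4" layout tag are assumptions, and only "default" (GetDefaultLayout) carries meaning elsewhere in this diff:

#include <string>
#include <vector>
#include <nnvm/op_attr_types.h>
#include "./op_attr_types.h"

namespace tvm {
namespace contrib {
// Sketch of an FTVMLayoutRequest: ask for a vectorized layout on all
// inputs and declare the same layout on all outputs.
bool VecLayoutRequest(const nnvm::NodeAttrs& attrs,
                      std::vector<TLayoutInfo>* ilayouts,
                      std::vector<TLayoutInfo>* olayouts) {
  for (TLayoutInfo& l : *ilayouts) l = "vec4";  // assumed layout tag
  for (TLayoutInfo& l : *olayouts) l = "vec4";
  return true;  // success flag, checked by the LayoutTransform pass
}
// Registration would follow the same pattern used for FTVMCompute below:
// NNVM_REGISTER_OP(my_op)
// .set_attr<FTVMLayoutRequest>("FTVMLayoutRequest", VecLayoutRequest);
}  // namespace contrib
}  // namespace tvm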
/*!
* Copyright (c) 2017 by Contributors
* \file Operator Declarations.
*/
#include <nnvm/op.h>
#include <nnvm/op_attr_types.h>
#include "./op_attr_types.h"
namespace tvm {
namespace contrib {
using namespace nnvm;
inline bool SameShape(const NodeAttrs& attrs,
std::vector<TShape> *ishape,
std::vector<TShape> *oshape) {
if (ishape->size() == 0 || (*ishape)[0].ndim() == 0) return false;
for (TShape& pshape : *oshape) {
pshape = (*ishape)[0];
}
for (TShape& pshape : *ishape) {
pshape = (*ishape)[0];
}
return true;
}
NNVM_REGISTER_OP_GROUP(ElementwiseOpAttr)
.set_attr<TOpPattern>("TOpPattern", kBroadcast)
.set_attr<FInferShape>("FInferShape", SameShape);
NNVM_REGISTER_OP(__add_symbol__)
.describe("add two data together")
.set_num_inputs(2)
.include("ElementwiseOpAttr");
NNVM_REGISTER_OP(exp)
.describe("Take exp")
.set_num_inputs(1)
.include("ElementwiseOpAttr");
} // namespace contrib
} // namespace tvm
/*!
* Copyright (c) 2017 by Contributors
 * \file Operator definitions in TVM.
*/
#include <nnvm/op.h>
#include <nnvm/op_attr_types.h>
#include "./op_attr_types.h"
namespace tvm {
namespace contrib {
using namespace nnvm;
Array<Tensor>
ComputeAdd(const NodeAttrs& attrs,
const Array<Tensor>& inputs) {
static const PackedFunc& pf = GetPackedFunc("tvm_graph.compute.add");
CHECK_EQ(inputs.size(), 2U);
Tensor ret = pf(inputs[0], inputs[1]);
return {ret};
}
Array<Tensor>
ComputeExp(const NodeAttrs& attrs,
const Array<Tensor>& inputs) {
static const PackedFunc& pf = GetPackedFunc("tvm_graph.compute.exp");
CHECK_EQ(inputs.size(), 1U);
Tensor ret = pf(inputs[0]);
return {ret};
}
Schedule ScheduleEWise(const NodeAttrs& attrs,
const Array<Tensor>& outs,
const std::string& target) {
static const PackedFunc& pf = GetPackedFunc("tvm_graph.schedule.ewise");
return pf(outs, target);
}
NNVM_REGISTER_OP(__add_symbol__)
.set_attr<FTVMCompute>("FTVMCompute", ComputeAdd)
.set_attr<FTVMSchedule>("FTVMSchedule", ScheduleEWise);
NNVM_REGISTER_OP(exp)
.set_attr<FTVMCompute>("FTVMCompute", ComputeExp)
.set_attr<FTVMSchedule>("FTVMSchedule", ScheduleEWise);
} // namespace contrib
} // namespace tvm
import tvm_graph as tg
import numpy as np
import tvm
def test_compile():
x = tg.Variable('x')
y = tg.Variable('y')
z = tg.exp(y + x)
shape = (10, 128)
dtype = tvm.float32
g = tg.build(z, "llvm",
shape={'x': shape,
'y': shape})
m = tg.bind(g, tvm.cpu(0))
# get member functions
set_input, run, get_output = m['set_input'], m['run'], m['get_output']
na = tvm.nd.array(np.ones(shape).astype(dtype))
nb = tvm.nd.array(np.ones(shape).astype(dtype))
# set inputs
set_input('x', na)
set_input('y', nb)
# execute
run()
# get outputs
out = tvm.nd.array(np.zeros(shape).astype(dtype))
get_output(0, out)
np.testing.assert_allclose(
out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
if __name__ == "__main__":
test_compile()
import tvm
from tvm.contrib import util, rpc
import tvm_graph as tg
import numpy as np
import os
def test_rpc_executor():
host = 'localhost'
port = 9091
server = rpc.Server(host, port)
tmp = util.tempdir()
sym_fname = tmp.relpath('net.json')
lib_fname = tmp.relpath('net.o')
param_fname = tmp.relpath('net.param')
x = tg.Variable('x')
y = tg.Variable('y')
sym = tg.exp(y + x) + tg.exp(x + y)
shape = (10, 128)
dtype = tvm.float32
na = tvm.nd.array(np.ones(shape).astype(dtype))
nb = tvm.nd.array(np.ones(shape).astype(dtype))
tg.save_params(param_fname, {'x': na, 'y': nb})
remote = rpc.connect(host, port)
ctx = remote.cpu(0)
target = "llvm"
shapes = {'x': shape, 'y': shape}
sym_json = tg.compile_graph(lib_fname, sym, target, shapes)
remote.upload(lib_fname)
param_blob = bytearray(open(param_fname, "rb").read())
rm = tg.remote_load_exec(remote,
sym_json,
os.path.basename(lib_fname),
param_blob,
ctx)
run, get_output = rm['run'], rm['get_output']
nc = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
run()
get_output(0, nc)
npa = na.asnumpy()
npb = nb.asnumpy()
np.testing.assert_allclose(nc.asnumpy(),
np.exp(npa + npb) + np.exp(npb + npa))
server.terminate()
if __name__ == "__main__":
test_rpc_executor()
import tvm_graph as tg
import numpy as np
import tvm
def test_save_load():
shape = (10, 128)
dtype = tvm.float32
na = tvm.nd.array(np.ones(shape).astype(dtype))
nb = tvm.nd.array(np.ones(shape).astype(dtype))
x = tg.Variable('x')
y = tg.Variable('y')
z = tg.exp(y + x)
g = tg.build(z, "llvm", shape={'x': shape, 'y': shape})
m0 = tg.bind(g, tvm.cpu(0))
set_input0, run0, get_output0 = m0['set_input'], m0['run'], m0['get_output']
set_input0(0, na)
set_input0(1, nb)
run0()
out0 = tvm.nd.array(np.zeros(shape).astype(dtype))
get_output0(0, out0)
tg.save_params('test.params', {'x': na, 'y': nb})
# create another executor
m1 = tg.bind(g, tvm.cpu(0))
load_params1 = m1['load_params']
load_params1(bytearray(open('test.params', 'rb').read()))
run1, get_output1 = m1['run'], m1['get_output']
run1()
out1 = tvm.nd.array(np.zeros(shape).astype(dtype))
get_output1(0, out1)
np.testing.assert_allclose(out0.asnumpy(), out1.asnumpy())
if __name__ == "__main__":
test_save_load()
@@ -525,27 +525,20 @@ llvm::Value* CodeGenLLVM::CreateCallExtern(const Call* op) {
 llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) {
   if (op->is_intrinsic("llvm_intrin")) {
-    CHECK_GE(op->args.size(), 1U);
+    CHECK_GE(op->args.size(), 2U);
     llvm::Intrinsic::ID id = static_cast<llvm::Intrinsic::ID>(
         op->args[0].as<UIntImm>()->value);
+    uint64_t num_signature = op->args[1].as<UIntImm>()->value;
     std::vector<llvm::Value*> arg_value;
-    std::vector<llvm::Type*> arg_type;
-    for (size_t i = 1; i < op->args.size(); ++i) {
+    std::vector<llvm::Type*> sig_type;
+    for (size_t i = 2; i < op->args.size(); ++i) {
       arg_value.push_back(MakeValue(op->args[i]));
-      arg_type.push_back(arg_value.back()->getType());
+      if (i - 2 < num_signature) {
+        sig_type.push_back(arg_value.back()->getType());
+      }
     }
-    llvm::Function* f = llvm::Intrinsic::getDeclaration(
-        module_.get(), id, arg_type);
-    return builder_->CreateCall(f, arg_value);
-  } else if (op->is_intrinsic("llvm_builtin")) {
-    CHECK_GE(op->args.size(), 1U);
-    llvm::Intrinsic::ID id = static_cast<llvm::Intrinsic::ID>(
-        op->args[0].as<UIntImm>()->value);
-    std::vector<llvm::Value*> arg_value;
-    for (size_t i = 1; i < op->args.size(); ++i) {
-      arg_value.push_back(MakeValue(op->args[i]));
-    }
-    llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), id, {});
+    llvm::Function* f = llvm::Intrinsic::getDeclaration(
+        module_.get(), id, sig_type);
     return builder_->CreateCall(f, arg_value);
   } else if (op->is_intrinsic(Call::bitwise_and)) {
     return builder_->CreateAnd(MakeValue(op->args[0]), MakeValue(op->args[1]));
......
@@ -16,25 +16,8 @@ namespace llvm {
 using namespace ir;
 
-template<unsigned id>
-inline void DispatchLLVMBuildin(const TVMArgs& targs, TVMRetValue* rv) {
-  Expr e = targs[0];
-  const Call* call = e.as<Call>();
-  CHECK(call != nullptr);
-  Array<Expr> cargs;
-  // intrin id.
-  cargs.push_back(UIntImm::make(UInt(32), id));
-  for (Expr arg : call->args) {
-    cargs.push_back(arg);
-  }
-  *rv = Call::make(
-      call->type, "llvm_builtin", cargs, Call::Intrinsic);
-}
-
-TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.prefetch")
-.set_body(DispatchLLVMBuildin<::llvm::Intrinsic::prefetch>);
-
-template<unsigned id>
+// num_signature means number of arguments used to query signature
+template<unsigned id, int num_signature>
 inline void DispatchLLVMPureIntrin(const TVMArgs& targs, TVMRetValue* rv) {
   Expr e = targs[0];
   const Call* call = e.as<Call>();
@@ -42,6 +25,8 @@ inline void DispatchLLVMPureIntrin(const TVMArgs& targs, TVMRetValue* rv) {
   Array<Expr> cargs;
   // intrin id.
   cargs.push_back(UIntImm::make(UInt(32), id));
+  cargs.push_back(UIntImm::make(UInt(32), num_signature));
   for (Expr arg : call->args) {
     cargs.push_back(arg);
   }
@@ -49,7 +34,7 @@ inline void DispatchLLVMPureIntrin(const TVMArgs& targs, TVMRetValue* rv) {
       call->type, "llvm_intrin", cargs, Call::PureIntrinsic);
 }
 
-template<unsigned id>
+template<unsigned id, int num_signature>
 inline void DispatchLLVMIntrin(const TVMArgs& targs, TVMRetValue* rv) {
   Expr e = targs[0];
   const Call* call = e.as<Call>();
@@ -57,6 +42,7 @@ inline void DispatchLLVMIntrin(const TVMArgs& targs, TVMRetValue* rv) {
   Array<Expr> cargs;
   // intrin id.
   cargs.push_back(UIntImm::make(UInt(32), id));
+  cargs.push_back(UIntImm::make(UInt(32), num_signature));
   for (Expr arg : call->args) {
     cargs.push_back(arg);
   }
@@ -64,20 +50,23 @@ inline void DispatchLLVMIntrin(const TVMArgs& targs, TVMRetValue* rv) {
       call->type, "llvm_intrin", cargs, Call::Intrinsic);
 }
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.prefetch")
+.set_body(DispatchLLVMIntrin<::llvm::Intrinsic::prefetch, 0>);
+
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.exp")
-.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::exp>);
+.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::exp, 1>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.fma")
-.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::fmuladd>);
+.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::fmuladd, 1>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.log")
-.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::log>);
+.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::log, 1>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.sqrt")
-.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::sqrt>);
+.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::sqrt, 1>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.pow")
-.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::pow>);
+.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::pow, 1>);
 
 } // namespace llvm
 } // namespace codegen
......
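
The num_signature argument exists because some LLVM intrinsics are overloaded on their operand types while others are not: llvm::Intrinsic::getDeclaration mangles the intrinsic name with exactly the types it is given. A minimal sketch against the LLVM 4/5-era API TVM targeted at the time:

#include <llvm/IR/Function.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Type.h>

// Sketch: how the signature types passed to getDeclaration select the
// concrete intrinsic overload.
void DeclareExamples(llvm::Module* m) {
  llvm::Type* f32 = llvm::Type::getFloatTy(m->getContext());
  // Overloaded intrinsic: one signature type, mangled to "llvm.sqrt.f32".
  llvm::Function* sqrt_f32 =
      llvm::Intrinsic::getDeclaration(m, llvm::Intrinsic::sqrt, {f32});
  // Non-overloaded intrinsic: num_signature = 0 passes no types and
  // yields the plain "llvm.prefetch" declaration.
  llvm::Function* prefetch =
      llvm::Intrinsic::getDeclaration(m, llvm::Intrinsic::prefetch);
  (void)sqrt_f32;
  (void)prefetch;
}

This is why the dispatch rules above pass 1 for the unary math intrinsics and 0 for prefetch.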
 #!/bin/bash
 export PYTHONPATH=python:apps/extension/python
-export PYTHONPATH=${PYTHONPATH}:apps/graph_executor/python:apps/graph_executor/nnvm/python
 export LD_LIBRARY_PATH=lib:${LD_LIBRARY_PATH}
 
 rm -rf python/tvm/*.pyc python/tvm/*/*.pyc
@@ -14,12 +13,6 @@ make || exit -1
 cd ../..
 python -m nose -v apps/extension/tests || exit -1
 
-# Test NNVM integration
-cd apps/graph_executor
-make || exit -1
-cd ../..
-python -m nose -v apps/graph_executor/tests || exit -1
-
 TVM_FFI=cython python -m nose -v tests/python/integration || exit -1
 TVM_FFI=ctypes python3 -m nose -v tests/python/integration || exit -1
 TVM_FFI=cython python -m nose -v tests/python/contrib || exit -1
......