[COMPILER] Initial compiler infra (#12)

c6c42af0 · Tianqi Chen · f6f448e1 · c6c42af0 · c6c42af0 · c6c42af0
Commit c6c42af0 authored Sep 17, 2017 by Tianqi Chen
39 changed files
--- a/nnvm/Makefile
+++ b/nnvm/Makefile
@@ -11,7 +11,7 @@ include $(config)
 export LDFLAGS = -pthread -lm
 export CFLAGS = -std=c++11 -Wall -O2 -Iinclude -fPIC
-CFLAGS += -Itvm/include -Itvm/dlpack/include
+CFLAGS += -Itvm/include -Itvm/dlpack/include -Itvm/HalideIR/src
 ifdef DMLC_CORE_PATH
  CFLAGS += -I$(DMLC_CORE_PATH)/include
@@ -38,7 +38,7 @@ PLUGIN_OBJ =
 include $(NNVM_PLUGINS)
 # specify tensor path
-.PHONY: clean all test lint doc cython cython3 cyclean
+.PHONY: clean all test lint pylint doc cython cython3 cyclean
 UNAME_S := $(shell uname -s)
@@ -55,7 +55,7 @@ endif
 all: lib/libnnvm.a lib/libnnvm_top.$(SHARED_LIBRARY_SUFFIX) lib/libnnvm_top_runtime.$(SHARED_LIBRARY_SUFFIX)
 SRC = $(wildcard src/*.cc src/c_api/*.cc src/core/*.cc src/pass/*.cc)
-SRC_TOP = $(wildcard src/top/*.cc, src/top/*/*.cc src/runtime/*.cc)
+SRC_TOP = $(wildcard src/top/*/*.cc src/runtime/*.cc src/compiler/*.cc src/compiler/*/*.cc)
 ALL_OBJ = $(patsubst %.cc, build/%.o, $(SRC))
 TOP_OBJ = $(patsubst %.cc, build/%.o, $(SRC_TOP))
 ALL_DEP = $(ALL_OBJ)
@@ -90,9 +90,12 @@ cython3:
 cyclean:
 	rm -rf python/nnvm/*/*.so python/nnvm/*/*.dylib python/nnvm/*/*.cpp
-lint:
+lint: pylint
 	python dmlc-core/scripts/lint.py nnvm cpp include src
+pylint:
+	pylint python/nnvm --rcfile=$(ROOTDIR)/tests/lint/pylintrc
 doc:
 	doxygen docs/Doxyfile

--- a/nnvm/include/nnvm/compiler/contrib_op_param.h
+++ b/nnvm/include/nnvm/compiler/contrib_op_param.h
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file contrib_op_param.h
+ * \brief Additional parameters for compiler optimized operators.
+ */
+#ifndef NNVM_COMPILER_CONTRIB_OP_PARAM_H_
+#define NNVM_COMPILER_CONTRIB_OP_PARAM_H_
+#include <dmlc/parameter.h>
+#include <string>
+namespace nnvm {
+namespace compiler {
+/*! \brief Parameters of layout transform operator */
+struct LayoutTransformParam : public dmlc::Parameter<LayoutTransformParam> {
+  std::string src_layout;
+  std::string dst_layout;
+  DMLC_DECLARE_PARAMETER(LayoutTransformParam) {
+    DMLC_DECLARE_FIELD(src_layout);
+    DMLC_DECLARE_FIELD(dst_layout);
+  }
+};
+}  // namespace compiler
+}  // namespace nnvm
+#endif  // NNVM_COMPILER_CONTRIB_OP_PARAM_H_
--- a/nnvm/include/nnvm/compiler/op_attr_types.h
+++ b/nnvm/include/nnvm/compiler/op_attr_types.h
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file op_attr_types.h
+ * \brief Operator attribute types used by the NNVM compiler.
+ */
+#ifndef NNVM_COMPILER_OP_ATTR_TYPES_H_
+#define NNVM_COMPILER_OP_ATTR_TYPES_H_
+#include <tvm/expr.h>
+#include <tvm/tensor.h>
+#include <tvm/schedule.h>
+#include <tvm/packed_func_ext.h>
+#include <tvm/runtime/registry.h>
+#include <nnvm/op_attr_types.h>
+#include <nnvm/graph_attr_types.h>
+#include <nnvm/graph.h>
+#include <vector>
+#include <string>
+namespace nnvm {
+namespace compiler {
+using ::tvm::Array;
+using ::tvm::Tensor;
+using ::tvm::Schedule;
+/*! \brief operator pattern used in graph fusion */
+enum OpPatternKind : int {
+  // Elementwise operation
+  kElemWise = 0,
+  // Broadcast operation
+  kBroadcast = 1,
+  // Complex operation, can fuse bcast in input/outputs
+  // but cannot chain another complex op
+  kComplex = 2,
+  // Extern operation, cannot fuse anything.
+  kExtern = 3
+};
+/*! \brief the operator pattern */
+using TOpPattern = int;
+/*!
+ * \brief Computation description interface
+ * \param attrs The attribute of the node.
+ * \param inputs The input tensors(placeholders)
+ * \return The output description of the tensor.
+ */
+using FTVMCompute = std::function<
+  Array<Tensor>
+  (const NodeAttrs& attrs, const Array<Tensor>& inputs)>;
+/*!
+ * \brief Build the computation schedule for
+ *  op whose root is at current op.
+ * \param attrs The attribute of the node.
+ * \param outs The output tensors.
+ * \param target The build target.
+ * \return schedule The computation schedule.
+ */
+using FTVMSchedule = std::function<
+  Schedule(const NodeAttrs& attrs,
+           const Array<Tensor>& outs,
+           const std::string& target)>;
+/*! \brief Layout Information about an entry */
+using TLayoutInfo = std::string;
+/*!
+ * \brief The producer consumer function of node layout
+ * \param attrs The attribute of the node.
+ * \param ilayouts The input layouts that the node request.
+ * \param olayouts The output layouts that the node produce.
+ * \return bool The success flag.
+ */
+using FTVMLayoutRequest = std::function<bool (const NodeAttrs& attrs,
+                                              std::vector<TLayoutInfo> *ilayouts,
+                                              std::vector<TLayoutInfo> *olayouts)>;
+/*!
+ * \brief Transform from normal operator to vectorized operator
+ * \param node The source node.
+ * \return Transformed vectorized op.
+ */
+using FTVMVectorizedOp = std::function<nnvm::NodePtr (const nnvm::Node* node)>;
+}  // namespace compiler
+}  // namespace nnvm
+#endif  // NNVM_COMPILER_OP_ATTR_TYPES_H_
--- a/nnvm/include/nnvm/compiler/packed_func_ext.h
+++ b/nnvm/include/nnvm/compiler/packed_func_ext.h
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file packed_func_ext.h
+ * \brief Extension to enable packed function for nnvm types
+ */
+#ifndef NNVM_COMPILER_PACKED_FUNC_EXT_H_
+#define NNVM_COMPILER_PACKED_FUNC_EXT_H_
+#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/registry.h>
+#include <nnvm/graph.h>
+#include <nnvm/symbolic.h>
+#include <string>
+#include <unordered_map>
+namespace nnvm {
+namespace compiler {
+using tvm::runtime::PackedFunc;
+using AttrDict = std::unordered_map<std::string, std::string>;
+/*!
+ * \brief Get PackedFunction from global registry and
+ *  report error if it does not exist
+ * \param name The name of the function.
+ * \return The created PackedFunc.
+ */
+inline const PackedFunc& GetPackedFunc(const std::string& name) {
+  // Look the function up once; a null result is reported through CHECK.
+  const PackedFunc* found = tvm::runtime::Registry::Get(name);
+  CHECK(found != nullptr) << "Cannot find function " << name << " in registry";
+  return *found;
+}
+}  // namespace compiler
+}  // namespace nnvm
+// Enable the graph and symbol object exchange.
+namespace tvm {
+namespace runtime {
+template<>
+struct extension_class_info<nnvm::Symbol> {
+  static const int code = 16;
+};
+template<>
+struct extension_class_info<nnvm::Graph> {
+  static const int code = 17;
+};
+template<>
+struct extension_class_info<nnvm::compiler::AttrDict> {
+  static const int code = 18;
+};
+}  // namespace runtime
+}  // namespace tvm
+#endif  // NNVM_COMPILER_PACKED_FUNC_EXT_H_
--- a/nnvm/include/nnvm/op_attr_types.h
+++ b/nnvm/include/nnvm/op_attr_types.h
@@ -72,6 +72,18 @@ template<typename AttrType>
 using FInferNodeEntryAttr = std::function<bool (const NodeAttrs& attrs,
                                                std::vector<AttrType> *in_attrs,
                                                std::vector<AttrType> *out_attrs)>;
+/*!
+ * \brief Get attribute dictionary from node.
+ *
+ * \param attrs The attributes of the node.
+ * \return The attribute dict.
+ * \note Register under "FGetAttrDict"
+ */
+using FGetAttrDict = std::function<
+  std::unordered_map<std::string, std::string>
+  (const NodeAttrs& attrs)>;
 /*!
 * \brief Shape inference function.
 *  Update the shapes given the input shape information.

--- a/nnvm/include/nnvm/top/README
+++ b/nnvm/include/nnvm/top/README
-NNVM Core Operator Specs
+NNVM Core Operator and Compiler
--- a/nnvm/python/nnvm/_base.py
+++ b/nnvm/python/nnvm/_base.py
 # coding: utf-8
-# pylint: disable=invalid-name
+# pylint: disable=invalid-name, unused-import
 """ ctypes library of nnvm and helper functions """
 from __future__ import absolute_import
 import sys
-import os
 import ctypes
 import numpy as np
 from . import libinfo
-__all__ = ['NNNetError']
+try:
+    import tvm
+except ImportError:
+    pass
 #----------------------------
 # library loading
 #----------------------------
@@ -181,7 +184,7 @@ def ctypes2docstring(num_args, arg_names, arg_types, arg_descs, remove_dup=True)
        param_keys.add(key)
        type_info = py_str(arg_types[i])
        ret = '%s : %s' % (key, type_info)
-        if len(arg_descs[i]) != 0:
+        if arg_descs[i]:
            ret += '\n    ' + py_str(arg_descs[i])
        param_str.append(ret)
    doc_str = ('Parameters\n' +

--- a/nnvm/python/nnvm/_ctypes/symbol.py
+++ b/nnvm/python/nnvm/_ctypes/symbol.py
 # coding: utf-8
-# pylint: disable=invalid-name, protected-access, too-many-arguments, too-many-lines
+# pylint: disable=invalid-name, protected-access, too-many-arguments, too-many-lines,
+# pylint: disable=len-as-condition, consider-iterating-dictionary
 """Symbolic configuration API."""
 from __future__ import absolute_import as _abs
@@ -7,7 +8,7 @@ import copy
 import ctypes
 import sys
 from .._base import _LIB
-from .._base import c_array, c_str, nn_uint, py_str, string_types
+from .._base import c_array, c_str, nn_uint, py_str
 from .._base import SymbolHandle, OpHandle
 from .._base import check_call, ctypes2docstring
 from ..name import NameManager

--- a/nnvm/python/nnvm/compiler/__init__.py
+++ b/nnvm/python/nnvm/compiler/__init__.py
+"""Namespace for NNVM-TVM compiler toolchain"""
+from __future__ import absolute_import
+import tvm
+from . import build_module
+from . build_module import build
+from .. import symbol as _symbol
+from .. import graph as _graph
+from .registry import OpPattern
+from .registry import register_compute, register_schedule, register_pattern
+from .. import top as _top
+tvm.register_extension(_symbol.Symbol, _symbol.Symbol)
+tvm.register_extension(_graph.Graph, _graph.Graph)
--- a/nnvm/python/nnvm/compiler/build_module.py
+++ b/nnvm/python/nnvm/compiler/build_module.py
+# pylint: disable=invalid-name
+"""Namespace for building operators."""
+from __future__ import absolute_import as _abs
+import tvm
+from . import graph_attr
+from .. import graph as _graph
+@tvm.register_func("nnvm.compiler.lower")
+def _lower(sch, inputs, func_name):
+    f = tvm.lower(sch, inputs, name=func_name)
+    return f if isinstance(
+        f, (tvm.container.Array, tuple, list)) else [f]
+@tvm.register_func("nnvm.compiler.build_target")
+def _build(funcs, target):
+    return tvm.build(funcs, target=target)
+_move_module = tvm.get_global_func("nnvm.compiler._move_module")
+def optimize(graph):
+    """Perform graph optimization
+    Parameters
+    ----------
+    graph : Graph
+        The graph to be used in lowering.
+    Returns
+    -------
+    graph : Graph
+        The optimized execution graph.
+    """
+    return graph
+def build(graph, target, shape, dtype="float32"):
+    """Build graph into runtime library.
+    This is the final step of graph compilation.
+    Parameters
+    ----------
+    graph : Graph
+        The graph to be used in lowering. A value that is not already a
+        Graph is first passed through ``graph.create``.
+    target : str
+        The build target
+    shape : dict of str to tuple
+        The input shape to the graph
+    dtype : str or dict of str to str
+        The input types to the graph
+    Returns
+    -------
+    graph : Graph
+        The final execution graph.
+    libmod : tvm.Module
+        The module that comes with the execution graph
+    Raises
+    ------
+    TypeError
+        If target is not a str or shape is not a dict.
+    """
+    if not isinstance(target, str):
+        raise TypeError("require target to be str")
+    if not isinstance(shape, dict):
+        raise TypeError("require shape to be dict")
+    graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)
+    # Attach the input shape/dtype and the target as graph attributes
+    # before running the passes below.
+    graph = graph_attr.set_shape(graph, shape)
+    graph = graph_attr.set_dtype(graph, dtype)
+    graph._set_json_attr("target", target, "str")
+    graph = graph.apply("InferShape").apply("InferType")
+    graph = graph.apply("GraphFusePartition").apply("GraphFuse")
+    libmod = _move_module(graph)
+    return graph, libmod
--- a/nnvm/python/nnvm/compiler/graph_attr.py
+++ b/nnvm/python/nnvm/compiler/graph_attr.py
+"""Utilities to access graph attributes"""
+from __future__ import absolute_import as _abs
+def set_shape(g, shape):
+    """Record the given input shapes in the "shape" attribute of the graph.
+    Entries that are not named in ``shape`` are stored as an empty list.
+    Parameters
+    ----------
+    g : Graph
+        The input graph
+    shape : dict of str to tuple
+        The input shape, keyed by node name
+    Returns
+    -------
+    g : Graph
+        The same graph, with its shape attribute set.
+    """
+    graph_index = g.index
+    entry_shapes = [[]] * graph_index.num_node_entries
+    for name, dims in shape.items():
+        entry_shapes[graph_index.entry_id(name)] = dims
+    g._set_json_attr("shape", entry_shapes, 'list_shape')
+    return g
+# Mapping from dtype name to the integer type flag stored in the graph.
+DTYPE_DICT = {
+    "float32": 0
+}
+def set_dtype(g, dtype):
+    """Record the given dtypes in the "dtype" attribute of the graph.
+    A single dtype name is applied to every entry. With a dict, only the
+    named entries are set and all others are stored as -1.
+    Parameters
+    ----------
+    g : Graph
+        The input graph
+    dtype : dict of str to str or str
+        The input dtype
+    Returns
+    -------
+    g : Graph
+        The same graph, with its dtype attribute set.
+    """
+    graph_index = g.index
+    if not isinstance(dtype, dict):
+        type_flags = [DTYPE_DICT[dtype]] * graph_index.num_node_entries
+    else:
+        type_flags = [-1] * graph_index.num_node_entries
+        for name, type_name in dtype.items():
+            type_flags[graph_index.entry_id(name)] = DTYPE_DICT[type_name]
+    g._set_json_attr("dtype", type_flags, "list_int")
+    return g
--- a/nnvm/python/nnvm/compiler/graph_pass.py
+++ b/nnvm/python/nnvm/compiler/graph_pass.py
+"""Namespace of graph pass.
+Principle:
+- Graph in, graph out: always takes in graph as first argument and returns a graph
+- Composable API: break graph transformation pass as segments of small transformations.
+"""
+from __future__ import absolute_import as _abs
--- a/nnvm/python/nnvm/compiler/registry.py
+++ b/nnvm/python/nnvm/compiler/registry.py
+# pylint: disable=invalid-name
+"""Information registry to register operator information for compiler"""
+import tvm
+class OpPattern(object):
+    """Operator pattern codes used in graph fusion.
+    The values must stay in sync with the C++ enum OpPatternKind.
+    """
+    # Elementwise operation
+    ELEM_WISE = 0
+    # Broadcast operation
+    BROADCAST = 1
+    # Complex operation, can fuse bcast in input/outputs
+    # but cannot chain another complex op
+    COMPLEX = 2
+    # Extern operation, cannot fuse anything.
+    EXTERN = 3
+_register_compute = tvm.get_global_func("nnvm._register_compute")
+_register_schedule = tvm.get_global_func("nnvm._register_schedule")
+_register_pattern = tvm.get_global_func("nnvm._register_pattern")
+def register_compute(op_name, f=None, level=10):
+    """Register compute function for operator
+    Can be called directly with ``f``, or without ``f`` to obtain a decorator.
+    Parameters
+    ----------
+    op_name : str
+        The name of operator
+    f : function
+        The compute function
+    level : int
+        The priority level
+    Returns
+    -------
+    fregister : function
+        Register function if f is not specified.
+    """
+    def register(myf):
+        """internal register function"""
+        _register_compute(op_name, myf, level)
+        return myf
+    return register(f) if f else register
+def register_schedule(op_name, f=None, level=10):
+    """Register schedule function for operator
+    Parameters
+    ----------
+    op_name : str
+        The name of operator
+    f : function
+        The schedule function
+    level : int
+        The priority level
+    Returns
+    -------
+    fregister : function
+        Register function if f is not specified.
+    """
+    def register(myf):
+        """internal register function"""
+        _register_schedule(op_name, myf, level)
+        return myf
+    return register(f) if f else register
+def register_pattern(op_name, pattern, level=10):
+    """Register pattern code for operator
+    Parameters
+    ----------
+    op_name : str
+        The name of operator
+    pattern : int
+        The pattern code.
+    level : int
+        The priority level
+    """
+    _register_pattern(op_name, pattern, level)
--- a/nnvm/python/nnvm/graph.py
+++ b/nnvm/python/nnvm/graph.py
@@ -4,7 +4,6 @@
 from __future__ import absolute_import as _abs
 import ctypes
-import sys
 import json
 from ._base import _LIB
 from ._base import c_array, c_str, nn_uint, py_str, string_types
@@ -12,12 +11,73 @@ from ._base import GraphHandle, SymbolHandle
 from ._base import check_call
 from .symbol import Symbol, Group as _Group
+class GraphIndex(object):
+    """Index for quickly accessing graph attributes.
+    The index is built from the JSON form of the graph: ``nodes`` is the
+    node list and ``entry_ptr`` is the "node_row_ptr" array, where the
+    entries of node ``i`` occupy ``entry_ptr[i]`` up to (but excluding)
+    ``entry_ptr[i + 1]``.
+    Parameters
+    ----------
+    graph : Graph
+        The graph to create index.
+    """
+    def __init__(self, graph):
+        jgraph = json.loads(create(graph).apply("SaveJSON").json_attr("json"))
+        self.nodes = jgraph["nodes"]
+        self.entry_ptr = jgraph["node_row_ptr"]
+        self._name2nodeid = {n["name"]: i for i, n in enumerate(self.nodes)}
+    @property
+    def num_nodes(self):
+        """Number of nodes in graph."""
+        return len(self.entry_ptr) - 1
+    @property
+    def num_node_entries(self):
+        """Number of node entries (outputs of all nodes) in graph."""
+        return self.entry_ptr[-1]
+    def node_id(self, key):
+        """Get the node index for a given key.
+        Parameters
+        ----------
+        key : str
+            The node name
+        Returns
+        -------
+        index : int
+            The node index
+        """
+        return self._name2nodeid[key]
+    def entry_id(self, key, value_index=0):
+        """Get the entry id of a node entry.
+        Parameters
+        ----------
+        key : str or int
+            The node key or index
+        value_index : int
+            The value index of output
+        Returns
+        -------
+        index : int
+            The entry index
+        """
+        idx = self.node_id(key) if isinstance(key, str) else key
+        entry = self.entry_ptr[idx] + value_index
+        # value_index is relative to the node, so the bound must be checked
+        # on the absolute entry id, not on value_index itself.
+        assert value_index >= 0 and entry < self.entry_ptr[idx + 1], \
+            "value_index out of range for node"
+        return entry
 class Graph(object):
    """Graph is the graph object that can be used to apply optimization pass.
-    It contains additional graphwise attribute besides the internal symbol.
+    It contains additional graphwise attribute besides the internal symbol.
    """
+    _tvm_tcode = 17
    # pylint: disable=no-member
    def __init__(self, handle):
@@ -29,6 +89,7 @@ class Graph(object):
            the handle to the underlying C++ Graph
        """
        self.handle = handle
+        self._index = None
    def __del__(self):
        check_call(_LIB.NNGraphFree(self.handle))
@@ -53,7 +114,6 @@ class Graph(object):
        if success.value != 0:
            json_str = py_str(ret.value)
            return json.loads(json_str)[1]
-        else:
        return None
    def _set_symbol_list_attr(self, key, value):
@@ -96,17 +156,33 @@ class Graph(object):
            self.handle, c_str(key), c_str(json_value)))
    @property
+    def _tvm_handle(self):
+        return self.handle.value
+    @property
    def symbol(self):
        shandle = SymbolHandle()
        check_call(_LIB.NNGraphGetSymbol(self.handle, ctypes.byref(shandle)))
        return Symbol(shandle)
+    @property
+    def index(self):
+        if not self._index:
+            self._index = GraphIndex(self)
+        return self._index
    def apply(self, passes):
        """Apply passes to the graph
        Parameters
        ----------
+        passes : str or list of str
+            The passes to be applied
+        Returns
+        -------
+        g : Graph
+            The transformed graph.
        """
        if isinstance(passes, string_types):
            passes = [passes]

--- a/nnvm/python/nnvm/libinfo.py
+++ b/nnvm/python/nnvm/libinfo.py
@@ -52,7 +52,7 @@ def find_lib_path():
        dll_path = [os.path.join(p, '%s.so' % lib_name) for p in dll_path]
    lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
-    if len(lib_path) == 0:
+    if not lib_path:
        raise RuntimeError('Cannot find the files.\n' +
                           'List of candidates:\n' + str('\n'.join(dll_path)))
    return lib_path

--- a/nnvm/python/nnvm/runtime.py
+++ b/nnvm/python/nnvm/runtime.py
+"""Runtime environment for nnvm relies on TVM."""
+import tvm
+from tvm.contrib import rpc
+def create(graph, libmod, ctx):
+    """Create a runtime executor module given the graph and module.
+    Parameters
+    ----------
+    graph : The graph to be deployed
+        The graph to be loaded.
+    libmod : tvm.Module
+        The module of the corresponding function
+    ctx : TVMContext
+        The context to deploy the module, can be local or remote.
+    Returns
+    -------
+    graph_module : tvm.Module
+        Runtime graph module to execute the graph.
+    """
+    json_str = graph if isinstance(graph, str) else graph.apply("SaveJSON").json_attr("json")
+    device_type = ctx.device_type
+    device_id = ctx.device_id
+    if device_type >= rpc.RPC_SESS_MASK:
+        assert libmod.type_key == "rpc"
+        assert rpc._SessTableIndex(libmod) == ctx._rpc_sess._tbl_index
+        hmod = rpc._ModuleHandle(libmod)
+        fcreate = ctx._rpc_sess.get_function("nnvm.runtime.remote_create")
+        device_type = device_type % rpc.RPC_SESS_MASK
+        return fcreate(json_str, hmod, device_type, device_id)
+    fcreate = tvm.get_global_func("nnvm.runtime.create")
+    return fcreate(json_str, libmod, device_type, device_id)
--- a/nnvm/python/nnvm/symbol.py
+++ b/nnvm/python/nnvm/symbol.py
+# pylint: disable=invalid-name, unused-import
 """Symbolic configuration API."""
 from __future__ import absolute_import as _abs
 import sys as _sys
@@ -7,8 +8,8 @@ import ctypes as _ctypes
 from numbers import Number as _Number
 from . import _base
 from ._base import _LIB, check_call as _check_call
-from . import _symbol_internal as _internal
 from .attribute import AttrScope
+from . import _symbol_internal as _internal
 # Use different verison of SymbolBase
 # When possible, use cython to speedup part of computation.
@@ -29,6 +30,12 @@ class Symbol(SymbolBase):
    # disable dictionary storage, also do not have parent type.
    __slots__ = []
+    _tvm_tcode = 16
+    @property
+    def _tvm_handle(self):
+        return self.handle.value
    def __add__(self, other):
        if isinstance(other, Symbol):
            return __add_symbol__(self, other)
@@ -148,7 +155,6 @@ class Symbol(SymbolBase):
            self.handle, _base.c_str(key), _ctypes.byref(ret), _ctypes.byref(success)))
        if success.value != 0:
            return _base.py_str(ret.value)
-        else:
        return None
    def list_attr(self, recursive=False):

--- a/nnvm/python/nnvm/top/__init__.py
+++ b/nnvm/python/nnvm/top/__init__.py
+"""Declaration about Tensor operators"""
+from .attr_dict import AttrDict
+from . import tensor
+from . import nn
--- a/nnvm/python/nnvm/top/attr_dict.py
+++ b/nnvm/python/nnvm/top/attr_dict.py
+# pylint: disable=invalid-name
+"""Attr dictionary object used by schedule functions"""
+import json
+import tvm
+_dict_get = tvm.get_global_func("nnvm.compiler._dict_get")
+_dict_size = tvm.get_global_func("nnvm.compiler._dict_size")
+_dict_keys = tvm.get_global_func("nnvm.compiler._dict_keys")
+class AttrDict(object):
+    """Attribute dictionary in nnvm.
+    Used by python registration of compute and schedule function.
+    Values are looked up through the registered ``nnvm.compiler._dict_*``
+    functions; the typed getters below parse the looked-up value.
+    """
+    _tvm_tcode = 18
+    def __init__(self, handle):
+        self.handle = handle
+    def __del__(self):
+        # Use the class type code so it cannot drift from _tvm_tcode.
+        tvm.nd.free_extension_handle(self.handle, self._tvm_tcode)
+    @property
+    def _tvm_handle(self):
+        return self.handle.value
+    def __getitem__(self, key):
+        return _dict_get(self, key)
+    def keys(self):
+        """Get list of keys in the dict.
+        Returns
+        -------
+        keys : list of str
+            List of keys
+        """
+        return [x.value for x in _dict_keys(self)]
+    def get_int_tuple(self, key):
+        """Get tuple of integer from attr dict
+        The stored value is parsed as JSON.
+        Parameters
+        ----------
+        key : str
+            The attr key
+        Returns
+        -------
+        tuple : tuple of int
+            The result tuple
+        """
+        return tuple(json.loads(self[key]))
+    def get_int(self, key):
+        """Get integer from attr dict
+        Parameters
+        ----------
+        key : str
+            The attr key
+        Returns
+        -------
+        value : int
+            The result value
+        """
+        return int(self[key])
+    def get_bool(self, key):
+        """Get bool from attr dict
+        The value is False when it reads "false" (in any letter case) or
+        "0"; any other value yields True.
+        Parameters
+        ----------
+        key : str
+            The attr key
+        Returns
+        -------
+        value : bool
+            The result value
+        """
+        # Accept the common spellings of false, not only the exact "False".
+        return str(self[key]).strip().lower() not in ("false", "0")
+    def __repr__(self):
+        return str({k : self[k] for k in self.keys()})
+tvm.register_extension(AttrDict, AttrDict)
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
+"""Definition of nn ops"""
+from __future__ import absolute_import
+import tvm
+import topi
+from ..compiler import registry as reg
+from ..compiler import OpPattern
+# conv
+@reg.register_compute("conv2d")
+def compute_conv2d(attrs, inputs):
+    """Compute definition of conv2d"""
+    padding = attrs.get_int_tuple("padding")
+    strides = attrs.get_int_tuple("strides")
+    dilation = attrs.get_int_tuple("dilation")
+    layout = attrs["layout"]
+    assert layout == "NCHW", "only support nchw for now"
+    assert dilation == (1, 1), "not support dilate now"
+    out = topi.nn.conv2d_nchw(inputs[0], inputs[1], strides, padding)
+    if attrs.get_bool("use_bias"):
+        bias = inputs[2]
+        bias = topi.broadcast_to(bias, (1, bias.shape[0], 1, 1))
+        out = topi.broadcast_add(out, bias)
+    return out
+@reg.register_schedule("conv2d")
+def schedule_conv2d(_, outs, target):
+    """Schedule definition of conv2d"""
+    if target == "cuda":
+        return topi.cuda.schedule_conv2d_nchw(outs)
+    # naive schedule
+    return tvm.create_schedule([x.op for x in outs])
+reg.register_pattern("conv2d", OpPattern.COMPLEX)
--- a/nnvm/python/nnvm/top/tensor.py
+++ b/nnvm/python/nnvm/top/tensor.py
+# pylint: disable=invalid-name
+"""Tensor ops"""
+from __future__ import absolute_import
+import tvm
+import topi
+import topi.cuda
+from ..compiler import registry as reg
+from ..compiler import OpPattern
+def _schedule_broadcast(_, outs, target):
+    """Generic schedule for binary bcast"""
+    if target == "cuda":
+        return topi.cuda.schedule_elemwise(outs)
+    assert target.startswith("llvm")
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+    return s
+_fschedule_broadcast = tvm.convert(_schedule_broadcast)
+# exp
+reg.register_compute("exp",
+                     lambda _, x: topi.exp(x[0]))
+reg.register_pattern("exp", OpPattern.ELEM_WISE)
+reg.register_schedule("exp", _fschedule_broadcast)
+# broadcast_add
+reg.register_compute("broadcast_add",
+                     lambda _, x: topi.broadcast_add(x[0], x[1]))
+reg.register_pattern("broadcast_add", OpPattern.BROADCAST)
+reg.register_schedule("broadcast_add", _fschedule_broadcast)
+# broadcast_sub
+reg.register_compute("broadcast_sub",
+                     lambda _, x: topi.broadcast_sub(x[0], x[1]))
+reg.register_pattern("broadcast_sub", OpPattern.BROADCAST)
+reg.register_schedule("broadcast_sub", _fschedule_broadcast)
+# broadcast_mul
+reg.register_compute("broadcast_mul",
+                     lambda _, x: topi.broadcast_mul(x[0], x[1]))
+reg.register_pattern("broadcast_mul", OpPattern.BROADCAST)
+reg.register_schedule("broadcast_mul", _fschedule_broadcast)
+# broadcast_div
+reg.register_compute("broadcast_div",
+                     lambda _, x: topi.broadcast_div(x[0], x[1]))
+reg.register_pattern("broadcast_div", OpPattern.BROADCAST)
+reg.register_schedule("broadcast_div", _fschedule_broadcast)
--- a/nnvm/src/compiler/packed_func_ext.cc
+++ b/nnvm/src/compiler/packed_func_ext.cc
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file packed_func_ext.cc
+ * \brief Registration of extension type.
+ */
+#include <tvm/expr.h>
+#include <tvm/packed_func_ext.h>
+#include <nnvm/op.h>
+#include <nnvm/compiler/packed_func_ext.h>
+#include <nnvm/compiler/op_attr_types.h>
+namespace tvm {
+namespace runtime {
+TVM_REGISTER_EXT_TYPE(nnvm::Graph);
+TVM_REGISTER_EXT_TYPE(nnvm::Symbol);
+TVM_REGISTER_EXT_TYPE(nnvm::compiler::AttrDict);
+}  // namespace runtime
+}  // namespace tvm
+namespace nnvm {
+namespace compiler {
+using tvm::Tensor;
+using tvm::Array;
+using tvm::Node;
+using tvm::runtime::TVMArgs;
+using tvm::runtime::TVMRetValue;
+TVM_REGISTER_GLOBAL("nnvm.compiler._dict_get")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    const AttrDict& dict = args[0].AsExtension<AttrDict>();
+    std::string key = args[1];
+    auto it = dict.find(key);
+    if (it != dict.end()) {
+      *rv = it->second;
+    } else {
+      *rv = nullptr;
+    }
+  });
+TVM_REGISTER_GLOBAL("nnvm.compiler._dict_size")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    const AttrDict& dict = args[0].AsExtension<AttrDict>();
+    *rv = static_cast<int64_t>(dict.size());
+  });
+TVM_REGISTER_GLOBAL("nnvm.compiler._dict_keys")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    const AttrDict& dict = args[0].AsExtension<AttrDict>();
+    tvm::Array<tvm::Expr> keys;
+    for (const auto& kv : dict) {
+      keys.push_back(kv.first);
+    }
+    *rv = keys;
+  });
+// custom version of TVM compute
+inline std::unordered_map<std::string, std::string>
+GetAttrDict(const NodeAttrs& attrs) {
+  static auto& fgetdict = nnvm::Op::GetAttr<FGetAttrDict>("FGetAttrDict");
+  // Operators with no FGetAttrDict registered fall back to the raw dict.
+  if (!fgetdict.count(attrs.op)) {
+    return attrs.dict;
+  }
+  return fgetdict[attrs.op](attrs);
+}
+TVM_REGISTER_GLOBAL("nnvm._register_compute")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    PackedFunc f = args[1];
+    Op& op = ::dmlc::Registry<nnvm::Op>::Get()->__REGISTER_OR_GET__(args[0]);
+    auto fcompute = [f](const NodeAttrs& attrs, const Array<Tensor>& inputs)
+        -> Array<Tensor> {
+      TVMRetValue ret = f(GetAttrDict(attrs), inputs);
+      if ((*ret.ptr<std::shared_ptr<tvm::Node> >())->derived_from<tvm::TensorNode>()) {
+        return {ret.operator Tensor()};
+      } else {
+        return ret;
+      }
+    };
+    op.set_attr<FTVMCompute>("FTVMCompute", fcompute, args[2]);
+  });
+TVM_REGISTER_GLOBAL("nnvm._register_schedule")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    PackedFunc f = args[1];
+    Op& op = ::dmlc::Registry<nnvm::Op>::Get()->__REGISTER_OR_GET__(args[0]);
+    auto fschedule = [f](const NodeAttrs& attrs,
+                         const Array<Tensor>& outs,
+                         const std::string& target) {
+      return f(GetAttrDict(attrs), outs, target).operator Schedule();
+    };
+    op.set_attr<FTVMSchedule>("FTVMSchedule", fschedule, args[2]);
+  });
+TVM_REGISTER_GLOBAL("nnvm._register_pattern")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    Op& op = ::dmlc::Registry<nnvm::Op>::Get()->__REGISTER_OR_GET__(args[0]);
+    op.set_attr<TOpPattern>("TOpPattern", args[1].operator int(), args[2]);
+  });
+}  // namespace compiler
+}  // namespace nnvm
--- a/nnvm/src/compiler/pass/graph_fuse.cc
+++ b/nnvm/src/compiler/pass/graph_fuse.cc
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file graph_fuse.cc
+ * \brief Fuse the operators together.
+ */
+#include <nnvm/graph.h>
+#include <nnvm/op_attr_types.h>
+#include <nnvm/graph_attr_types.h>
+#include <nnvm/tuple.h>
+#include <nnvm/pass.h>
+#include <nnvm/compiler/op_attr_types.h>
+#include <nnvm/compiler/packed_func_ext.h>
+#include <tvm/runtime/packed_func.h>
+#include <tvm/operation.h>
+#include <tvm/lowered_func.h>
+#include "../../runtime/graph_executor.h"
+namespace nnvm {
+namespace compiler {
+using namespace tvm;
+using DLTypeVector = std::vector<DLDataType>;
+// The single fuse rule.
+enum class FuseRule {
+  kUknown,
+  kFuseToMaster,
+  kRealize
+};
+DLDataType GetDLType(int type_flag) {
+  // Flag 0 maps to float32; any other flag is reported as fatal.
+  if (type_flag != 0) {
+    LOG(FATAL) << "unknown type_flag=" << type_flag;
+  }
+  return Type2TVMType(Float(32));
+}
+// Partition the graph into fusion segments; each segment is later compiled into one operator.
+// Stores per-node "group_root", "group_master" and "pattern" plus per-entry "dltype"
+// on the returned graph. Reads the "shape" and "dtype" graph attributes.
+nnvm::Graph GraphFusePartition(nnvm::Graph g) {
+  // Indexed view of the graph used for all node/entry lookups below.
+  const IndexedGraph& idx = g.indexed_graph();
+  // Get attributes from the graph
+  const ShapeVector& shape_vec = g.GetAttr<ShapeVector>("shape");
+  const DTypeVector& dtype_vec = g.GetAttr<DTypeVector>("dtype");
+  // Transform to dltype
+  // In future, directly do type inference in dltype.
+  DLTypeVector dltype_vec = DLTypeVector(dtype_vec.size());
+  for (size_t i = 0; i < dtype_vec.size(); ++i) {
+    dltype_vec[i] = GetDLType(dtype_vec[i]);
+  }
+  // Reference counter of each node: number of op-node inputs that refer to it.
+  // For now, always store result when an op is referred more than once.
+  std::vector<uint32_t> ref_count(idx.num_nodes(), 0);
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const auto& inode = idx[nid];
+    if (inode.source->is_variable()) continue;
+    for (const auto& e : inode.inputs) {
+      ++ref_count[e.node_id];
+    }
+  }
+  for (const auto& e : idx.outputs()) {
+    // +2 puts every graph output above the `> 1` test below, so all outputs get realized
+    ref_count[e.node_id] += 2;
+  }
+  // Pattern of the subgraph ending at each node (kExtern until computed)
+  std::vector<TOpPattern> pattern_vec(idx.num_nodes(),  kExtern);
+  // Fuse decision per node; set when a consumer inspects the node or when it is realized.
+  std::vector<FuseRule> fuse_vec(idx.num_nodes(), FuseRule::kUknown);
+  // Master node id of fusion segment (-1 while no master is known).
+  std::vector<int> master_vec(idx.num_nodes(), -1);
+  // Registered operator pattern; ops without an entry are treated as kExtern below.
+  static auto& op_pattern = nnvm::Op::GetAttr<TOpPattern>("TOpPattern");
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {  // NOTE(review): assumes inputs precede consumers
+    const auto& inode = idx[nid];
+    if (inode.source->is_variable()) {
+      fuse_vec[nid] = FuseRule::kRealize; continue;  // variables are never fused
+    }
+    TOpPattern pt = op_pattern.get(inode.source->op(), kExtern);
+    if (pt <= kBroadcast) {  // pattern no higher than kBroadcast: may absorb its inputs
+      int chosen_master = -1;
+      bool ewise = inode.source->num_outputs() == 1;  // kElemWise candidate; cleared below
+      for (const auto& e : inode.inputs) {
+        if (fuse_vec[e.node_id] == FuseRule::kUknown) {  // only undecided inputs are classified
+          TOpPattern ipt = pattern_vec[e.node_id];
+          if (ipt != kElemWise) ewise = false;
+          if (ipt <= kBroadcast) {
+            fuse_vec[e.node_id] = FuseRule::kFuseToMaster;
+          } else if (ipt == kComplex && chosen_master == -1 &&
+            shape_vec[idx.entry_id(nid, 0)] == shape_vec[idx.entry_id(e)]) {
+            chosen_master = master_vec[e.node_id];  // at most one same-shape kComplex input
+            fuse_vec[e.node_id] = FuseRule::kFuseToMaster;
+          } else {
+            fuse_vec[e.node_id] = FuseRule::kRealize;
+          }
+        }
+        if (ewise) {
+          if (shape_vec[idx.entry_id(nid, 0)] != shape_vec[idx.entry_id(e)]) {
+            ewise = false;  // an input shape differs from output 0
+          }
+        }
+      }
+      master_vec[nid] = chosen_master;  // stays -1 when no kComplex input was absorbed
+      if (chosen_master != -1) {
+        pt = kComplex;
+      } else {
+        pt = ewise ? kElemWise : kBroadcast;
+      }
+    } else {  // pattern above kBroadcast: the node is its own master
+      master_vec[nid] = nid;
+      for (const auto& e : inode.inputs) {
+        if (fuse_vec[e.node_id] == FuseRule::kUknown) {
+          fuse_vec[e.node_id] = FuseRule::kRealize;  // undecided inputs are not fused in
+          if (master_vec[e.node_id] == -1) {
+            master_vec[e.node_id] = e.node_id;
+          }
+        }
+      }
+    }
+    pattern_vec[nid] = pt;
+    if (ref_count[nid] > 1) {  // referenced more than once, or a graph output
+      fuse_vec[nid] = FuseRule::kRealize;
+      if (master_vec[nid] == -1) {
+        master_vec[nid] = nid;
+      }
+    }
+  }
+  // Group root id of each node (-1 until assigned)
+  std::vector<int> group_vec(idx.num_nodes(), -1);
+  for (uint32_t i = idx.num_nodes(); i != 0; --i) {  // walks node ids from last to first
+    uint32_t nid = i - 1;
+    const auto& inode = idx[nid];
+    if (group_vec[nid] == -1) {
+      group_vec[nid] = nid;  // not claimed by any consumer: the node roots its own group
+    }
+    // propagate the group id to every input marked kFuseToMaster.
+    for (const auto& e : inode.inputs) {
+      if (fuse_vec[e.node_id] == FuseRule::kFuseToMaster) {
+        CHECK(group_vec[e.node_id] == -1||
+              group_vec[e.node_id] == group_vec[nid]);
+        group_vec[e.node_id] = group_vec[nid];
+      }
+    }
+  }
+  g.attrs["group_root"] = std::make_shared<any>(std::move(group_vec));
+  g.attrs["group_master"] = std::make_shared<any>(std::move(master_vec));
+  g.attrs["pattern"] = std::make_shared<any>(std::move(pattern_vec));
+  g.attrs["dltype"] = std::make_shared<any>(std::move(dltype_vec));
+  return g;
+}
+NNVM_REGISTER_PASS(GraphFusePartition)
+.set_body(GraphFusePartition)
+.depend_graph_attr("shape")
+.depend_graph_attr("dtype")
+.provide_graph_attr("dltype");  // group_root/group_master/pattern are also set but not declared
+// Hash functor for IndexedGraph::NodeEntry keys.
+// Only the node id is hashed, so entries of one node that differ in their
+// output index share a hash value.
+struct NodeEntryHash {
+  size_t operator()(const IndexedGraph::NodeEntry& entry) const {
+    return static_cast<size_t>(entry.node_id);
+  }
+};
+// Equality predicate for IndexedGraph::NodeEntry keys.
+// Two entries are equal when they name the same node and the same output
+// index; the version field is not compared.
+struct NodeEntryEqual {
+  // Returns bool (the original declared size_t for a predicate result).
+  bool operator()(const IndexedGraph::NodeEntry& a,
+                  const IndexedGraph::NodeEntry& b) const {
+    return a.node_id == b.node_id && a.index == b.index;
+  }
+};
+// Per-group state collected while building a fused op; GraphFuse keeps one per node id.
+struct FuseEntry {
+  // Entries produced outside the group and consumed inside it, in order of first use
+  std::vector<IndexedGraph::NodeEntry> inputs;
+  // Maps each external input entry to the placeholder tensor that stands for it
+  std::unordered_map<IndexedGraph::NodeEntry, Tensor,
+                     NodeEntryHash, NodeEntryEqual> imap;
+  // Output tensors of the group's root node
+  Array<Tensor> outputs;
+  // Placeholder tensors, in the same order as `inputs`
+  Array<Tensor> placeholder;
+  // Computing schedule; GraphFuse only lowers entries whose schedule is defined
+  Schedule schedule;
+  // Name of the lowered function generated for this group
+  std::string func_name;
+};
+// Fuse the partitioned graph: build one TVM function per fusion group, compile all of them
+// into a module and return a new graph with one "tvm_op" node per group root.
+// The result carries "shape", "dtype", "dltype" and "module" attrs and has PlanMemory applied.
+nnvm::Graph GraphFuse(nnvm::Graph g) {
+  // Indexed view of the partitioned input graph
+  const IndexedGraph& idx = g.indexed_graph();
+  // Get attributes from the graph (group_root/group_master/pattern/dltype: see GraphFusePartition)
+  const ShapeVector& shape_vec = g.GetAttr<ShapeVector>("shape");
+  const DLTypeVector& dltype_vec = g.GetAttr<DLTypeVector>("dltype");
+  const DTypeVector& dtype_vec = g.GetAttr<DTypeVector>("dtype");
+  const std::vector<int>& group_vec = g.GetAttr<std::vector<int> >("group_root");
+  const std::vector<int>& master_vec = g.GetAttr<std::vector<int> >("group_master");
+  const std::vector<TOpPattern>& pattern_vec =
+      g.GetAttr<std::vector<TOpPattern> >("pattern");
+  std::string target = g.GetAttr<std::string>("target");
+  std::vector<FuseEntry> fuse_vec(idx.num_nodes());  // indexed by node id; filled at group roots
+  // Step 1: for every group, create one placeholder per distinct external input entry.
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const auto& inode = idx[nid];
+    if (inode.source->is_variable()) continue;
+    CHECK_GE(group_vec[nid], 0);
+    int root_id = group_vec[nid];
+    FuseEntry& fe = fuse_vec[root_id];
+    TOpPattern pt = pattern_vec[root_id];
+    for (const auto& e : inode.inputs) {
+      if (group_vec[e.node_id] != root_id && fe.imap.count(e) == 0) {
+        Array<Expr> shape;  // shape of the placeholder standing for this input
+        if (pt == kElemWise) {
+          // elementwise groups take flattened inputs: the element count is the only dimension
+          int64_t prod = 1;
+          for (int64_t x : shape_vec[idx.entry_id(e)]) {
+            prod *= x;
+          }
+          CHECK_LE(prod, static_cast<int64_t>(std::numeric_limits<int>::max()));
+          shape.push_back(make_const(Int(32), prod));
+        } else {
+          for (int64_t x : shape_vec[idx.entry_id(e)]) {
+            CHECK_LE(x, static_cast<int64_t>(std::numeric_limits<int>::max()));
+            shape.push_back(make_const(Int(32), x));
+          }
+        }
+        std::ostringstream os_name;
+        os_name << "input" << fe.inputs.size();  // "input<k>", k = position in fe.inputs
+        Tensor data = placeholder(
+            shape, TVMType2Type(dltype_vec[idx.entry_id(e)]),
+            os_name.str());
+        fe.imap[e] = data;
+        fe.inputs.push_back(e);
+        fe.placeholder.push_back(data);
+      }
+    }
+  }
+  // Step 2: build the compute of every op node; the schedule is created once, at the group root.
+  std::vector<Tensor> tensor_vec(idx.num_node_entries());  // outputs of non-root nodes, by entry id
+  static auto& fcompute =
+      nnvm::Op::GetAttr<FTVMCompute>("FTVMCompute");
+  static auto& fschedule =
+      nnvm::Op::GetAttr<FTVMSchedule>("FTVMSchedule");
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const auto& inode = idx[nid];
+    if (inode.source->is_variable()) continue;
+    int root_id = group_vec[nid];
+    FuseEntry& fe = fuse_vec[root_id];
+    Array<Tensor> inputs;
+    // external inputs use the group's placeholders, internal ones the producer's tensor
+    for (const auto& e : inode.inputs) {
+      if (group_vec[e.node_id] != root_id) {
+        auto it = fe.imap.find(e);
+        CHECK(it != fe.imap.end());
+        inputs.push_back(it->second);
+      } else {
+        Tensor t = tensor_vec[idx.entry_id(e)];
+        CHECK(t.defined());
+        inputs.push_back(t);
+      }
+    }
+    // build the op's output tensors with its registered FTVMCompute
+    Array<Tensor> out = fcompute[inode.source->op()](
+        inode.source->attrs, inputs);
+    CHECK_EQ(out.size(), inode.source->num_outputs());
+    // schedule on root node, and use master's schedule
+    if (nid != root_id) {
+      for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
+        uint32_t eid = idx.entry_id(nid, index);
+        tensor_vec[eid] = out[index];  // kept for consumers inside the same group
+      }
+    } else {
+      fe.outputs = out;
+      int master = master_vec[root_id];
+      CHECK_GE(master, 0);
+      fe.schedule = fschedule[idx[master].source->op()](
+          idx[master].source->attrs, fe.outputs, target);
+      std::ostringstream os;
+      os << idx[master].source->attrs.name + "_id" << nid;  // "<master name>_id<root node id>"
+      fe.func_name = os.str();
+    }
+  }
+  static const PackedFunc& flower = GetPackedFunc("nnvm.compiler.lower");
+  static const PackedFunc& fbuild = GetPackedFunc("nnvm.compiler.build_target");
+  Array<tvm::LoweredFunc> funcs;  // lowered functions of all groups, built into one module
+  for (const FuseEntry& fe : fuse_vec) {
+    if (fe.schedule.defined()) {  // slots that never got a schedule are skipped
+      Array<tvm::Tensor> args = fe.placeholder;  // argument order: placeholders, then outputs
+      for (tvm::Tensor x : fe.outputs) {
+        args.push_back(x);
+      }
+      Array<tvm::LoweredFunc> ret = flower(fe.schedule, args, fe.func_name);
+      for (LoweredFunc x : ret) {
+        funcs.push_back(x);
+      }
+    }
+  }
+  tvm::runtime::Module module = fbuild(funcs, target);
+  // Final step: create the new nodes; variables are copied, each group root becomes a tvm_op.
+  const nnvm::Op* tvm_op = nnvm::Op::Get("tvm_op");
+  std::unordered_map<uint32_t, nnvm::NodePtr> old_new;  // old node id -> node in the new graph
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const auto& inode = idx[nid];
+    if (inode.source->is_variable()) {
+      nnvm::NodePtr np = nnvm::Node::Create();
+      np->attrs = inode.source->attrs;
+      old_new[nid] = np;
+    } else {
+      int root_id = group_vec[nid];
+      if (nid != root_id) continue;  // non-root nodes get no node of their own
+      FuseEntry& fe = fuse_vec[root_id];
+      nnvm::NodePtr np = nnvm::Node::Create();
+      np->attrs.op = tvm_op;
+      np->attrs.name = inode.source->attrs.name;
+      runtime::TVMOpParam param;
+      param.func_name = fuse_vec[nid].func_name;
+      param.num_inputs = static_cast<uint32_t>(fe.inputs.size());
+      param.num_outputs = static_cast<uint32_t>(fe.outputs.size());
+      param.flatten_data = pattern_vec[nid] == kElemWise;  // same test as the flattened placeholders
+      param.UpdateDict(&(np->attrs.dict));
+      np->attrs.parsed = std::move(param);
+      for (const auto& e : fe.inputs) {
+        auto it = old_new.find(e.node_id);
+        CHECK(it != old_new.end())
+            << "cannot find node_id=" << e.node_id;
+        np->inputs.emplace_back(
+            nnvm::NodeEntry{it->second, e.index, e.version});
+      }
+      for (const uint32_t node_id : inode.control_deps) {
+        auto it = old_new.find(node_id);
+        CHECK(it != old_new.end());
+        np->control_deps.emplace_back(it->second);
+      }
+      old_new[nid] = np;
+    }
+  }
+  nnvm::Graph ret;
+  for (const auto& e : idx.outputs()) {
+    auto it = old_new.find(group_vec[e.node_id]);  // outputs are looked up through their group root
+    CHECK(it != old_new.end())
+        << "cannot find node_id=" << e.node_id;
+    ret.outputs.emplace_back(
+        nnvm::NodeEntry{it->second, e.index, e.version});
+  }
+  const IndexedGraph& new_idx = ret.indexed_graph();
+  ShapeVector new_shape_vec = ShapeVector(new_idx.num_node_entries(), TShape());
+  DTypeVector new_dtype_vec = DTypeVector(new_idx.num_node_entries());
+  std::vector<std::string> new_dltype_vec(new_idx.num_node_entries());  // dltype stored as strings
+  for (const auto& kv : old_new) {  // copy shape/dtype of every kept node to its new entry id
+    uint32_t nid = kv.first;
+    const auto& inode = idx[nid];
+    for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
+      uint32_t new_eid = new_idx.entry_id(new_idx.node_id(kv.second.get()), i);
+      uint32_t old_eid = idx.entry_id(nid, i);
+      new_shape_vec[new_eid] = shape_vec[old_eid];
+      new_dtype_vec[new_eid] = dtype_vec[old_eid];
+      new_dltype_vec[new_eid] = tvm::runtime::TVMType2String(dltype_vec[old_eid]);
+    }
+  }
+  ret.attrs["shape"] = std::make_shared<any>(std::move(new_shape_vec));
+  ret.attrs["dtype"] = std::make_shared<any>(std::move(new_dtype_vec));
+  ret.attrs["dltype"] = std::make_shared<any>(std::move(new_dltype_vec));
+  ret.attrs["module"] = std::make_shared<any>(std::move(module));
+  ret = nnvm::ApplyPass(ret, "PlanMemory");
+  return ret;
+}
+NNVM_REGISTER_PASS(GraphFuse)  // no attr dependencies declared; the body reads them via GetAttr
+.set_body(GraphFuse);
+TVM_REGISTER_GLOBAL("nnvm.compiler._move_module")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    // args: (graph). The graph is obtained as a const extension object, so
+    // constness is cast away before MoveCopyAttr is called on it.
+    nnvm::Graph* graph =
+        const_cast<nnvm::Graph*>(&(args[0].AsExtension<Graph>()));
+    *rv = graph->MoveCopyAttr<tvm::runtime::Module>("module");
+  });
+}  // namespace compiler
+}  // namespace nnvm
--- a/nnvm/src/compiler/pass/layout_transform.cc
+++ b/nnvm/src/compiler/pass/layout_transform.cc
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file layout_transform.cc
+ * \brief Transforms layout.
+ */
+#include <nnvm/graph.h>
+#include <nnvm/op_attr_types.h>
+#include <nnvm/graph_attr_types.h>
+#include <nnvm/pass.h>
+#include <nnvm/compiler/op_attr_types.h>
+#include <nnvm/compiler/contrib_op_param.h>
+namespace nnvm {
+namespace compiler {
+// Returns the layout name "default", held in a function-local static.
+const TLayoutInfo& GetDefaultLayout() {
+  static const TLayoutInfo kDefaultLayout("default");
+  return kDefaultLayout;
+}
+// Create a "layout_transform" node converting layout `src` into layout `dst`.
+// The node is named "<src>_to_<dst><n>", where n is a counter that grows with
+// every call; its inputs are left for the caller to fill in.
+nnvm::NodePtr CreateLayoutTransformNode(const std::string& src,
+                                        const std::string& dst) {
+  static const nnvm::Op* trans_op = nnvm::Op::Get("layout_transform");
+  static int count = 0;
+  nnvm::NodePtr node = nnvm::Node::Create();
+  nnvm::NodeAttrs& node_attrs = node->attrs;
+  node_attrs.op = trans_op;
+  const int serial = count++;
+  node_attrs.name = src + "_to_" + dst + std::to_string(serial);
+  node_attrs.dict["src_layout"] = src;
+  node_attrs.dict["dst_layout"] = dst;
+  // Run the operator's attribute parser on the dictionary filled in above.
+  node->op()->attr_parser(&node_attrs);
+  return node;
+}
+/*!
+ * \brief A simple layout transform pass that will
+ *  insert layout transform nodes automatically.
+ *
+ *  Every node of src is mirrored into a new graph. For each input whose
+ *  produced layout differs from the layout requested by the consumer, a
+ *  layout_transform node is inserted, unless the consumer is a map op whose
+ *  inputs all share one layout, in which case that layout is passed through.
+ *  Graph outputs that are not in the default layout are converted back to it.
+ *
+ * \param src The source graph; must carry the "shape" and "layout" attributes,
+ *  where "layout" is indexed by position in the graph's input node list.
+ * \return A new graph holding only the transformed outputs (no attributes).
+ */
+nnvm::Graph LayoutTransform(nnvm::Graph src) {
+  static auto& op_layout_request =
+    nnvm::Op::GetAttr<FTVMLayoutRequest>("FTVMLayoutRequest");
+  static auto& op_vecop =
+    nnvm::Op::GetAttr<FTVMVectorizedOp>("FTVMVectorizedOp");
+  static auto& op_pattern = nnvm::Op::GetAttr<TOpPattern>("TOpPattern");
+  const ShapeVector& shape_vec = src.GetAttr<ShapeVector>("shape");
+  const std::vector<TLayoutInfo>& input_layouts =
+    src.GetAttr<std::vector<TLayoutInfo> >("layout");
+  const IndexedGraph& idx = src.indexed_graph();
+  // Layout produced for every entry of the source graph.
+  std::vector<TLayoutInfo> produce_vec(idx.num_node_entries(), GetDefaultLayout());
+  // Node of the new graph that mirrors each source node.
+  std::vector<nnvm::NodePtr> mirror_vec(idx.num_nodes(), nullptr);
+  // use op pattern to decide whether an op is map:
+  // a kBroadcast op only counts when all inputs have the shape of output 0.
+  auto is_map_op = [&](size_t nid) {
+    TOpPattern pt = op_pattern.get(idx[nid].source->op(), kExtern);
+    bool is_map = (pt <= kBroadcast);
+    if (pt == kBroadcast) {
+      for (const auto& e : idx[nid].inputs) {
+        if (shape_vec[idx.entry_id(nid, 0)] != shape_vec[idx.entry_id(e)]) {
+          is_map = false;
+          break;
+        }
+      }
+    }
+    return is_map;
+  };
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const auto& inode = idx[nid];
+    nnvm::NodePtr new_node = nnvm::Node::Create();
+    *new_node = *(inode.source);
+    if (new_node->is_variable()) {
+      // A variable produces the layout given for it in the "layout" attribute.
+      auto input_iter = std::find(
+        idx.input_nodes().cbegin(), idx.input_nodes().cend(), nid);
+      CHECK(input_iter != idx.input_nodes().cend());
+      size_t input_id = std::distance(idx.input_nodes().cbegin(), input_iter);
+      produce_vec[idx.entry_id(nid, 0)] = input_layouts[input_id];
+      mirror_vec[nid] = new_node;
+      continue;
+    }
+    // Replace the node by its vectorized counterpart when one is registered.
+    if (op_vecop.count(inode.source->op())) {
+      new_node = op_vecop[inode.source->op()](inode.source);
+      new_node->inputs.resize(new_node->num_inputs());
+    }
+    // set up output and input layouts
+    std::vector<TLayoutInfo> request_ilayouts(new_node->num_inputs(), GetDefaultLayout());
+    if (op_layout_request.count(new_node->op())) {
+      std::vector<TLayoutInfo> produce_olayouts(new_node->num_outputs(), GetDefaultLayout());
+      CHECK(op_layout_request[new_node->op()](
+          new_node->attrs, &request_ilayouts, &produce_olayouts))
+          << "Layout request fail";
+      CHECK_EQ(request_ilayouts.size(), new_node->num_inputs());
+      CHECK_EQ(produce_olayouts.size(), new_node->num_outputs());
+      for (size_t i = 0; i < new_node->num_outputs(); ++i) {
+        produce_vec[idx.entry_id(nid, i)] = produce_olayouts[i];
+      }
+    }
+    // A map op passes the common layout of its inputs through to its outputs.
+    // An op without inputs has no layout to pass through; it is excluded here
+    // because the code below reads inode.inputs[0].
+    bool map_layout = !inode.inputs.empty() && is_map_op(nid);
+    if (map_layout) {
+      const TLayoutInfo& layout = produce_vec[idx.entry_id(inode.inputs[0])];
+      for (const auto& e : inode.inputs) {
+        if (produce_vec[idx.entry_id(e)] != layout) {
+          map_layout = false;
+          break;
+        }
+      }
+      if (map_layout) {
+        for (size_t i = 0; i < inode.source->num_outputs(); ++i) {
+          produce_vec[idx.entry_id(nid, i)] = layout;
+        }
+      }
+    }
+    // Wire up the inputs, inserting a transform wherever the layouts disagree.
+    for (size_t i = 0; i < inode.inputs.size(); ++i) {
+      const auto& e = inode.inputs[i];
+      const nnvm::NodePtr& in = mirror_vec[e.node_id];
+      new_node->inputs[i] =
+        nnvm::NodeEntry{in, e.index, e.version};
+      const TLayoutInfo& produce = produce_vec[idx.entry_id(e)];
+      const TLayoutInfo& request = request_ilayouts[i];
+      if (!map_layout && (produce != request)) {
+        nnvm::NodePtr tnode = CreateLayoutTransformNode(produce, request);
+        // The generated name is replaced by "<producer name>_<requested layout>".
+        tnode->attrs.name =
+          idx[e.node_id].source->attrs.name + "_" + request;
+        tnode->inputs.emplace_back(new_node->inputs[i]);
+        new_node->inputs[i] = nnvm::NodeEntry{tnode, 0, 0};
+      }
+    }
+    mirror_vec[nid] = new_node;
+  }
+  // Graph outputs are always handed back in the default layout.
+  std::vector<nnvm::NodeEntry> outputs;
+  for (const auto& e : idx.outputs()) {
+    const TLayoutInfo& produce = produce_vec[idx.entry_id(e)];
+    if (produce != GetDefaultLayout()) {
+      nnvm::NodePtr tnode = CreateLayoutTransformNode(produce, GetDefaultLayout());
+      tnode->attrs.name =
+        idx[e.node_id].source->attrs.name + "_default";
+      tnode->inputs.emplace_back(
+        nnvm::NodeEntry{mirror_vec[e.node_id], e.index, e.version});
+      outputs.emplace_back(nnvm::NodeEntry{tnode, 0, 0});
+    } else {
+      outputs.emplace_back(
+        nnvm::NodeEntry{mirror_vec[e.node_id], e.index, e.version});
+    }
+  }
+  nnvm::Graph ret;
+  ret.outputs = std::move(outputs);
+  return ret;
+}
+}  // namespace compiler
+}  // namespace nnvm
--- a/nnvm/src/compiler/pass/prune_graph.cc
+++ b/nnvm/src/compiler/pass/prune_graph.cc
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file prune_graph.cc
+ * \brief Prune the graph to do constant folding.
+ *
+ *  This pass breaks the graph into pre-compute graph
+ *  and the execution graph.
+ */
+#include <nnvm/graph.h>
+#include <nnvm/op_attr_types.h>
+#include <nnvm/graph_attr_types.h>
+#include <nnvm/pass.h>
+#include <nnvm/compiler/op_attr_types.h>
+#include <unordered_set>
+namespace nnvm {
+namespace compiler {
+// Split the graph into a pre-compute part and an execution part.
+// Variables named in the "params" attribute, and every op whose inputs are all
+// pruned, are marked as pruned. Where an unpruned node reads the output of a
+// pruned op, that input is rewired in place to a fresh variable named
+// "<op name>_output<index>". The pruned entries become the outputs of a new
+// graph stored in src.attrs["pre_graph"]; that graph's "pruned_params"
+// attribute lists the names of the replacement variables.
+nnvm::Graph PruneGraph(nnvm::Graph src) {
+  const auto& params = src.GetAttr<std::unordered_set<std::string> >("params");
+  std::unordered_set<nnvm::Node*> pruned;
+  nnvm::NodeEntryMap<nnvm::NodePtr> entry_var;
+  DFSVisit(src.outputs, [&](const nnvm::NodePtr& n) {
+    const bool is_var = n->is_variable();
+    if (is_var && params.count(n->attrs.name) != 0) {
+      pruned.emplace(n.get());
+    }
+    // An op can be pruned only when every one of its inputs is pruned;
+    // a variable never takes this path.
+    bool can_be_pruned = !is_var;
+    for (const auto& in : n->inputs) {
+      can_be_pruned = can_be_pruned && (pruned.count(in.node.get()) != 0);
+    }
+    if (can_be_pruned) {
+      pruned.emplace(n.get());
+      return;
+    }
+    // The node stays: replace each input that comes from a pruned op
+    // (variables are skipped) by the variable standing for that entry.
+    for (auto& in : n->inputs) {
+      if (in.node->is_variable() || pruned.count(in.node.get()) == 0) continue;
+      auto found = entry_var.find(in);
+      if (found == entry_var.end()) {
+        nnvm::NodePtr var = nnvm::Node::Create();
+        var->attrs.name = in.node->attrs.name + "_output" + std::to_string(in.index);
+        found = entry_var.emplace(in, var).first;
+      }
+      in = nnvm::NodeEntry{found->second, 0, 0};
+    }
+  });
+  nnvm::Graph pre_graph;
+  std::vector<std::string> output_names;
+  pre_graph.outputs.reserve(entry_var.size());
+  output_names.reserve(entry_var.size());
+  for (const auto& kv : entry_var) {
+    const nnvm::NodeEntry& pruned_entry = kv.first;
+    if (pruned_entry.node->is_variable()) continue;
+    pre_graph.outputs.emplace_back(pruned_entry);
+    output_names.emplace_back(kv.second->attrs.name);
+  }
+  pre_graph.attrs["pruned_params"] =
+    std::make_shared<dmlc::any>(std::move(output_names));
+  src.attrs["pre_graph"] =
+    std::make_shared<dmlc::any>(std::move(pre_graph));
+  return src;
+}
+NNVM_REGISTER_PASS(PruneGraph)
+.set_body(PruneGraph);
+}  // namespace compiler
+}  // namespace nnvm
--- a/nnvm/src/runtime/graph_executor.cc
+++ b/nnvm/src/runtime/graph_executor.cc
@@ -312,23 +312,34 @@ NNVM_REGISTER_OP(tvm_op)
    return param.num_outputs;
  });
-TVM_REGISTER_GLOBAL("nnvm.tvm.create_executor")
+tvm::runtime::Module RuntimeCreate(std::string sym_json,  // graph as JSON, fed to LoadJSON
-.set_body([](TVMArgs args, TVMRetValue *rv) {
+                                   tvm::runtime::Module m,  // module handed to exec->Init
-    std::string sym_json = args[0];
+                                   int device_type,  // cast to DLDeviceType below
-    std::string param_blob = args[1];
+                                   int device_id) {
-    tvm::runtime::Module m = args[2];
  TVMContext ctx;
-    ctx.device_type = static_cast<DLDeviceType>(args[3].operator int());
+  ctx.device_type = static_cast<DLDeviceType>(device_type);
-    ctx.device_id   = args[4];
+  ctx.device_id   = device_id;
  // load graph from json string
  nnvm::Graph g;
  g.attrs["json"] = std::make_shared<nnvm::any>(sym_json);
  g = nnvm::ApplyPass(std::move(g), "LoadJSON");
  std::shared_ptr<GraphExecutor> exec = std::make_shared<GraphExecutor>();
  exec->Init(g, m, ctx);
-    // load params form stream of string
+  return tvm::runtime::Module(exec);  // the executor is returned wrapped as a Module
-    exec->LoadParams(std::move(param_blob));
+}
-    *rv = tvm::runtime::Module(exec);
+TVM_REGISTER_GLOBAL("nnvm.runtime.create")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    *rv = RuntimeCreate(args[0], args[1], args[2], args[3]);  // (json, module, dev type, dev id)
  });
+TVM_REGISTER_GLOBAL("nnvm.runtime.remote_create")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    // args: (graph json, raw pointer to a Module, device_type, device_id)
+    void* raw_handle = args[1];
+    tvm::runtime::Module* mod = static_cast<tvm::runtime::Module*>(raw_handle);
+    *rv = RuntimeCreate(args[0], *mod, args[2], args[3]);
+  });
 }  // namespace runtime
 }  // namespace nnvm
--- a/nnvm/src/top/nn/convolution.cc
+++ b/nnvm/src/top/nn/convolution.cc
@@ -114,11 +114,12 @@ a bias vector is created and added to the outputs.
 .add_argument("bias", "1D Tensor", "Bias parameter.")
 .add_arguments(Conv2DParam::__FIELDS__())
 .set_attr_parser(ParamParser<Conv2DParam>)
-.set_num_outputs(1)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DParam>)
-.set_num_inputs(UseBiasNumInputs<Conv2DParam>)
 .set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DParam>)
 .set_attr<FInferShape>("FInferShape", Conv2DInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<-1, 1>)
+.set_num_outputs(1)
+.set_num_inputs(UseBiasNumInputs<Conv2DParam>)
 .set_support_level(2);
@@ -203,11 +204,12 @@ said convolution.
 .add_argument("bias", "1D Tensor", "Bias parameter.")
 .add_arguments(Conv2DTransposeParam::__FIELDS__())
 .set_attr_parser(ParamParser<Conv2DTransposeParam>)
-.set_num_outputs(1)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DTransposeParam>)
-.set_num_inputs(UseBiasNumInputs<Conv2DTransposeParam>)
 .set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DTransposeParam>)
 .set_attr<FInferShape>("FInferShape", Conv2DTransposeInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<-1, 1>)
+.set_num_outputs(1)
+.set_num_inputs(UseBiasNumInputs<Conv2DTransposeParam>)
 .set_support_level(2);
 }  // namespace top

--- a/nnvm/src/top/nn/nn.cc
+++ b/nnvm/src/top/nn/nn.cc
@@ -66,6 +66,7 @@ If ``use_bias`` is set to be false, then the ``bias`` term is ignored.
 .add_argument("bias", "1D Tensor", "Bias parameter.")
 .add_arguments(DenseParam::__FIELDS__())
 .set_attr_parser(ParamParser<DenseParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<DenseParam>)
 .set_num_outputs(1)
 .set_num_inputs(UseBiasNumInputs<DenseParam>)
 .set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<DenseParam>)
@@ -95,10 +96,11 @@ NNVM_REGISTER_OP(dropout)
 )" NNVM_ADD_FILELINE)
 .add_argument("data", "Tensor", "Input to which dropout will be applied")
+.add_arguments(DropoutParam::__FIELDS__())
+.set_attr_parser(ParamParser<DropoutParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<DropoutParam>)
 .set_num_inputs(1)
 .set_num_outputs(2)
-.set_attr_parser(ParamParser<DropoutParam>)
-.add_arguments(DropoutParam::__FIELDS__())
 .set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 2>)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 2>)
 .set_attr<FNumVisibleOutputs>("FNumVisibleOutputs", [](const NodeAttrs& attrs) {
@@ -172,10 +174,11 @@ axis to be the last item in the input shape.
 .add_argument("beta", "Tensor", "The beta offset factor")
 .add_argument("moving_mean", "Tensor", "running mean of input")
 .add_argument("moving_var", "Tensor", "running variance of input")
+.add_arguments(BatchNormParam::__FIELDS__())
+.set_attr_parser(ParamParser<BatchNormParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<BatchNormParam>)
 .set_num_inputs(5)
 .set_num_outputs(3)
-.set_attr_parser(ParamParser<BatchNormParam>)
-.add_arguments(BatchNormParam::__FIELDS__())
 .set_attr<FInferShape>("FInferShape", BatchNormInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<5, 3>)
 .set_attr<FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
@@ -203,10 +206,12 @@ NNVM_REGISTER_OP(softmax)
 .. note::
    This operator can be optimized away for inference.
 )code" NNVM_ADD_FILELINE)
+.add_argument("data", "Tensor", "Input data.")
+.add_arguments(SoftmaxParam::__FIELDS__())
+.set_attr_parser(ParamParser<SoftmaxParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<SoftmaxParam>)
 .set_num_inputs(1)
 .set_num_outputs(1)
-.set_attr_parser(ParamParser<SoftmaxParam>)
-.add_arguments(SoftmaxParam::__FIELDS__())
 .set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
 .set_support_level(1);
@@ -220,10 +225,12 @@ NNVM_REGISTER_OP(log_softmax)
 .. note::
    This operator can be optimized away for inference.
 )code" NNVM_ADD_FILELINE)
+.add_argument("data", "Tensor", "Input data.")
+.add_arguments(SoftmaxParam::__FIELDS__())
+.set_attr_parser(ParamParser<SoftmaxParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<SoftmaxParam>)
 .set_num_inputs(1)
 .set_num_outputs(1)
-.set_attr_parser(ParamParser<SoftmaxParam>)
-.add_arguments(SoftmaxParam::__FIELDS__())
 .set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
 .set_support_level(1);
@@ -237,10 +244,12 @@ NNVM_REGISTER_OP(leaky_relu)
 `y = x > 0 ? x : alpha * x`
 )code" NNVM_ADD_FILELINE)
+.add_argument("data", "Tensor", "Input data.")
+.add_arguments(LeakyReLUParam::__FIELDS__())
+.set_attr_parser(ParamParser<LeakyReLUParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<LeakyReLUParam>)
 .set_num_inputs(1)
 .set_num_outputs(1)
-.set_attr_parser(ParamParser<LeakyReLUParam>)
-.add_arguments(LeakyReLUParam::__FIELDS__())
 .set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
 .set_support_level(1);

--- a/nnvm/src/top/nn/pooling.cc
+++ b/nnvm/src/top/nn/pooling.cc
@@ -72,6 +72,7 @@ NNVM_REGISTER_OP(max_pool2d)
 .add_argument("data", "4D Tensor", "Input data.")
 .add_arguments(Pool2DParam::__FIELDS__())
 .set_attr_parser(ParamParser<Pool2DParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Pool2DParam>)
 .set_num_outputs(1)
 .set_num_inputs(1)
 .set_attr<FInferShape>("FInferShape", Pool2DInferShape)
@@ -98,10 +99,11 @@ NNVM_REGISTER_OP(avg_pool2d)
 .add_argument("data", "4D Tensor", "Input data.")
 .add_arguments(Pool2DParam::__FIELDS__())
 .set_attr_parser(ParamParser<Pool2DParam>)
-.set_num_outputs(1)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Pool2DParam>)
-.set_num_inputs(1)
 .set_attr<FInferShape>("FInferShape", Pool2DInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_num_outputs(1)
+.set_num_inputs(1)
 .set_support_level(2);
@@ -135,10 +137,11 @@ NNVM_REGISTER_OP(global_max_pool2d)
 .add_argument("data", "4D Tensor", "Input data.")
 .add_arguments(GlobalPool2DParam::__FIELDS__())
 .set_attr_parser(ParamParser<GlobalPool2DParam>)
-.set_num_outputs(1)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<GlobalPool2DParam>)
-.set_num_inputs(1)
 .set_attr<FInferShape>("FInferShape", GlobalPool2DInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_num_outputs(1)
+.set_num_inputs(1)
 .set_support_level(2);
@@ -154,10 +157,11 @@ NNVM_REGISTER_OP(global_avg_pool2d)
 .add_argument("data", "4D Tensor", "Input data.")
 .add_arguments(GlobalPool2DParam::__FIELDS__())
 .set_attr_parser(ParamParser<GlobalPool2DParam>)
-.set_num_outputs(1)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<GlobalPool2DParam>)
-.set_num_inputs(1)
 .set_attr<FInferShape>("FInferShape", GlobalPool2DInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_num_outputs(1)
+.set_num_inputs(1)
 .set_support_level(2);
 }  // namespace top

--- a/nnvm/src/top/op_common.h
+++ b/nnvm/src/top/op_common.h
@@ -37,6 +37,19 @@ inline void ParamParser(nnvm::NodeAttrs* attrs) {
  attrs->parsed = std::move(param);
 }
+/*!
+ * \brief Build the attribute dictionary of a node: a copy of attrs.dict
+ *  on which the parsed PType parameter's UpdateDict has been applied.
+ * \tparam PType the parameter type held in attrs.parsed.
+ * \param attrs The attributes; they are not modified.
+ * \return The resulting dictionary.
+ */
+template<typename PType>
+inline std::unordered_map<std::string, std::string>
+ParamGetAttrDict(const nnvm::NodeAttrs& attrs) {
+  std::unordered_map<std::string, std::string> merged(attrs.dict);
+  const PType& param = nnvm::get<PType>(attrs.parsed);
+  param.UpdateDict(&merged);
+  return merged;
+}
 /*! \brief check if shape is empty or contains unkown (0) dim. */
 inline bool shape_is_none(const TShape& x) {
  return x.ndim() == 0 || x.Size() == 0;

--- a/nnvm/src/top/tensor/broadcast.cc
+++ b/nnvm/src/top/tensor/broadcast.cc
@@ -61,13 +61,14 @@ The dimension which you do not want to change can also be kept as `0` which mean
 So with `shape=(2,0)`, we will obtain the same result as in the above example.
 )code" NNVM_ADD_FILELINE)
-.set_attr_parser(ParamParser<BroadcastToParam>)
+.add_argument("data", "Tensor", "Input data.")
 .add_arguments(BroadcastToParam::__FIELDS__())
-.set_num_inputs(1)
+.set_attr_parser(ParamParser<BroadcastToParam>)
-.set_num_outputs(1)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<BroadcastToParam>)
 .set_attr<FInferShape>("FInferShape", BroadcastToInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
-.add_argument("data", "Tensor", "Input data.")
+.set_num_inputs(1)
+.set_num_outputs(1)
 .set_support_level(4);
 // binary broadcast op

--- a/nnvm/src/top/tensor/elemwise.cc
+++ b/nnvm/src/top/tensor/elemwise.cc
@@ -95,70 +95,60 @@ NNVM_REGISTER_ELEMWISE_UNARY_OP(copy)
 // unary scalar op
 DMLC_REGISTER_PARAMETER(ScalarParam);
-NNVM_REGISTER_ELEMWISE_UNARY_OP(__add_scalar__)
+#define NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(op)                        \
+  NNVM_REGISTER_ELEMWISE_UNARY_OP(op)                                   \
+  .add_arguments(ScalarParam::__FIELDS__())                             \
+  .set_attr_parser(ParamParser<ScalarParam>)                            \
+  .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<ScalarParam>)
+NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__add_scalar__)
 .describe(R"code(Tensor add scalar
 )code"  NNVM_ADD_FILELINE)
-.set_attr_parser(ParamParser<ScalarParam>)
-.add_arguments(ScalarParam::__FIELDS__())
 .set_support_level(3);
-NNVM_REGISTER_ELEMWISE_UNARY_OP(__sub_scalar__)
+NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__sub_scalar__)
 .describe(R"code(Tensor substract scalar
 )code"  NNVM_ADD_FILELINE)
-.set_attr_parser(ParamParser<ScalarParam>)
-.add_arguments(ScalarParam::__FIELDS__())
 .set_support_level(3);
-NNVM_REGISTER_ELEMWISE_UNARY_OP(__rsub_scalar__)
+NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__rsub_scalar__)
 .describe(R"code(scalar substract Tensor
 )code"  NNVM_ADD_FILELINE)
-.set_attr_parser(ParamParser<ScalarParam>)
-.add_arguments(ScalarParam::__FIELDS__())
 .set_support_level(3);
-NNVM_REGISTER_ELEMWISE_UNARY_OP(__mul_scalar__)
+NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__mul_scalar__)
 .describe(R"code(Tensor multiplies scalar
 )code"  NNVM_ADD_FILELINE)
-.set_attr_parser(ParamParser<ScalarParam>)
-.add_arguments(ScalarParam::__FIELDS__())
 .set_support_level(3);
-NNVM_REGISTER_ELEMWISE_UNARY_OP(__div_scalar__)
+NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__div_scalar__)
 .describe(R"code(Tensor divides scalar
 )code"  NNVM_ADD_FILELINE)
-.set_attr_parser(ParamParser<ScalarParam>)
-.add_arguments(ScalarParam::__FIELDS__())
 .set_support_level(3);
-NNVM_REGISTER_ELEMWISE_UNARY_OP(__rdiv_scalar__)
+NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__rdiv_scalar__)
 .describe(R"code(scalar divides Tensor
 )code"  NNVM_ADD_FILELINE)
-.set_attr_parser(ParamParser<ScalarParam>)
-.add_arguments(ScalarParam::__FIELDS__())
 .set_support_level(3);
-NNVM_REGISTER_ELEMWISE_UNARY_OP(__pow_scalar__)
+NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__pow_scalar__)
 .describe(R"code(Tensor power scalar
 )code"  NNVM_ADD_FILELINE)
-.set_attr_parser(ParamParser<ScalarParam>)
-.add_arguments(ScalarParam::__FIELDS__())
 .set_support_level(3);
-NNVM_REGISTER_ELEMWISE_UNARY_OP(__rpow_scalar__)
+NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__rpow_scalar__)
 .describe(R"code(scalar power Tensor
 )code"  NNVM_ADD_FILELINE)
-.set_attr_parser(ParamParser<ScalarParam>)
-.add_arguments(ScalarParam::__FIELDS__())
 .set_support_level(3);
 }  // namespace top
 }  // namespace nnvm
--- a/nnvm/src/top/tensor/reduce.cc
+++ b/nnvm/src/top/tensor/reduce.cc
@@ -92,13 +92,15 @@ inline void AxesParamParser(nnvm::NodeAttrs* attrs) {
 #define NNVM_REGISTER_REDUCE_OP(op)                                     \
  NNVM_REGISTER_OP(op)                                                  \
-  .set_num_inputs(1)                                            \
+  .add_argument("data", "Tensor", "The input")                          \
-  .set_num_outputs(1)                                           \
+  .add_arguments(ReduceParam::__FIELDS__())                             \
  .set_attr_parser(AxesParamParser<ReduceParam>)                        \
+  .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<ReduceParam>) \
  .set_attr<FInferShape>("FInferShape", ReduceShape)                    \
  .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)               \
-  .add_argument("data", "Tensor", "The input")                  \
+  .set_num_inputs(1)                                                    \
-  .add_arguments(ReduceParam::__FIELDS__())
+  .set_num_outputs(1)
 NNVM_REGISTER_REDUCE_OP(sum)

--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -132,13 +132,14 @@ Example::
                             [ 5.,  5.,  8.,  8.]]
 )code" NNVM_ADD_FILELINE)
-.set_num_outputs(1)
-.set_num_inputs(kVarg)
-.set_attr_parser(ParamParser<ConcatenateParam>)
-.add_arguments(ConcatenateParam::__FIELDS__())
 .add_argument("data", "Tensor-or-Tensor[]", "List of arrays to concatenate")
+.add_arguments(ConcatenateParam::__FIELDS__())
+.set_attr_parser(ParamParser<ConcatenateParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<ConcatenateParam>)
 .set_attr<FInferShape>("FInferShape", ConcatenateInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<-1, 1>)
+.set_num_outputs(1)
+.set_num_inputs(kVarg)
 .set_support_level(1);
@@ -204,6 +205,7 @@ inline uint32_t SplitNumOutputs(const NodeAttrs& attrs) {
  }
 }
+// Intentionally do not add ParamGetAttrDict for indices_or_sections.
 NNVM_REGISTER_OP(split)
 .describe(R"code(Splits an array along a particular axis into multiple sub-arrays.
@@ -211,13 +213,13 @@ NNVM_REGISTER_OP(split)
 along which to split the array.
 )code" NNVM_ADD_FILELINE)
-.set_num_inputs(1)
-.set_attr_parser(SplitParamParser)
-.set_num_outputs(SplitNumOutputs)
-.add_arguments(SplitParam::__FIELDS__())
 .add_argument("data", "Tensor", "List of arrays to concatenate")
+.add_arguments(SplitParam::__FIELDS__())
+.set_attr_parser(SplitParamParser)
 .set_attr<FInferShape>("FInferShape", SplitInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<-1, 1>)
+.set_num_inputs(1)
+.set_num_outputs(SplitNumOutputs)
 .set_support_level(1);
 // cast
@@ -237,8 +239,9 @@ NNVM_REGISTER_OP(cast)
 )code" NNVM_ADD_FILELINE)
 .add_argument("data", "Tensor", "Input data array")
-.set_attr_parser(ParamParser<CastParam>)
 .add_arguments(CastParam::__FIELDS__())
+.set_attr_parser(ParamParser<CastParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<CastParam>)
 .set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<FInferType>("FInferType", CastInferType)
 .set_num_inputs(1)
@@ -387,13 +390,14 @@ The significance of each is explained below:
  - input shape = (2,3,4), shape = (2,-4,-1,3,-2), output shape = (2,1,3,4)
 )code" NNVM_ADD_FILELINE)
-.set_num_inputs(1)
+.add_argument("data", "Tensor", "Input data.")
-.set_num_outputs(1)
-.set_attr_parser(ParamParser<ReshapeParam>)
 .add_arguments(ReshapeParam::__FIELDS__())
+.set_attr_parser(ParamParser<ReshapeParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<ReshapeParam>)
 .set_attr<FInferShape>("FInferShape", ReshapeInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
-.add_argument("data", "Tensor", "Input data.")
+.set_num_inputs(1)
+.set_num_outputs(1)
 .set_support_level(3);
 // tranpose
@@ -453,13 +457,14 @@ Examples::
                                [[ 3.,  4.],
                                 [ 7.,  8.]]]
 )code" NNVM_ADD_FILELINE)
-.set_num_inputs(1)
+.add_argument("data", "Tensor", "Source input")
-.set_num_outputs(1)
-.set_attr_parser(ParamParser<TransposeParam>)
 .add_arguments(TransposeParam::__FIELDS__())
+.set_attr_parser(ParamParser<TransposeParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<TransposeParam>)
 .set_attr<nnvm::FInferShape>("FInferShape", TransposeShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
-.add_argument("data", "Tensor", "Source input")
+.set_num_inputs(1)
+.set_num_outputs(1)
 .set_support_level(4);
 }  // namespace top

--- a/nnvm/tests/lint/pylintrc
+++ b/nnvm/tests/lint/pylintrc
+[MASTER]
+# Specify a configuration file.
+#rcfile=
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+#init-hook=
+# Add files or directories to the blacklist. They should be base names, not
+# paths.
+ignore=CVS, _cy2, _cy3
+# Add files or directories matching the regex patterns to the blacklist. The
+# regex matches against base names, not paths.
+ignore-patterns=
+# Pickle collected data for later comparisons.
+persistent=yes
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+# Use multiple processes to speed up Pylint.
+jobs=8
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+# A comma-separated list of package or module names from which C extensions may
+# be loaded. Extensions are loaded into the active Python interpreter and may
+# run arbitrary code.
+extension-pkg-whitelist=numpy,opencv
+# Allow optimization of some AST trees. This will activate a peephole AST
+# optimizer, which will apply various small optimizations. For instance, it can
+# be used to obtain the result of joining multiple strings with the addition
+# operator. Joining a lot of strings can lead to a maximum recursion error in
+# Pylint and this flag can prevent that. It has one side effect, the resulting
+# AST will be different than the one from reality. This option is deprecated
+# and it will be removed in Pylint 2.0.
+optimize-ast=no
+[MESSAGES CONTROL]
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
+confidence=
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifiers separated by comma (,) or put this option
+# multiple times (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+enable=indexing-exception,old-raise-syntax
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then reenable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,protected-access
+[REPORTS]
+# Set the output format. Available formats are text, parseable, colorized, msvs
+# (visual studio) and html. You can also give a reporter class, eg
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+# Put messages in a separate file for each module / package specified on the
+# command line instead of printing them on stdout. Reports (if any) will be
+# written in a file name "pylint_global.[txt|html]". This option is deprecated
+# and it will be removed in Pylint 2.0.
+files-output=no
+# Tells whether to display a full report or only the messages
+reports=no
+# Python expression which should return a note less than 10 (10 is the highest
+# note). You have access to the variables errors warning, statement which
+# respectively contain the number of errors / warnings messages and the total
+# number of statements analyzed. This is used by the global evaluation report
+# (RP0004).
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details
+#msg-template=
+[FORMAT]
+# Maximum number of characters on a single line.
+max-line-length=100
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+# List of optional constructs for which whitespace checking is disabled. `dict-
+# separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
+# `trailing-comma` allows a space between comma and closing bracket: (a, ).
+# `empty-line` allows space-only lines.
+no-space-check=trailing-comma,dict-separator
+# Maximum number of lines in a module
+max-module-lines=1000
+# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+# tab).
+indent-string='    '
+# Number of spaces of indent required inside a hanging  or continued line.
+indent-after-paren=4
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+[SPELLING]
+# Spelling dictionary name. Available dictionaries: none. To make it work,
+# install the python-enchant package.
+spelling-dict=
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+[MISCELLANEOUS]
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,XXX,TODO
+[TYPECHECK]
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis. It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager
+[LOGGING]
+# Logging modules to check that the string format arguments are in logging
+# function parameter format
+logging-modules=logging
+[SIMILARITIES]
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+# Ignore comments when computing similarities.
+ignore-comments=yes
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+# Ignore imports when computing similarities.
+ignore-imports=no
+[VARIABLES]
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+# A regular expression matching the name of dummy variables (i.e. expectedly
+# not used).
+dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid defining new builtins when possible.
+additional-builtins=
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,_cb
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six.moves,future.builtins
+[BASIC]
+# Good variable names which should always be accepted, separated by a comma
+good-names=i,j,_,a,b,op,x,y,wd,lr,kv,k,v,s,p,h,c,m,n,X,t,g,f
+# Bad variable names which should always be refused, separated by a comma
+bad-names=
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+# Include a hint for the correct naming format with invalid-name
+include-naming-hint=no
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+property-classes=abc.abstractproperty
+# Regular expression matching correct module names
+module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
+# Naming hint for module names
+module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
+# Regular expression matching correct constant names
+const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
+# Naming hint for constant names
+const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
+# Regular expression matching correct inline iteration names
+inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
+# Naming hint for inline iteration names
+inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
+# Regular expression matching correct method names
+method-rgx=[a-z_][a-z0-9_]{2,30}$
+# Naming hint for method names
+method-name-hint=[a-z_][a-z0-9_]{2,30}$
+# Regular expression matching correct class attribute names
+class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
+# Naming hint for class attribute names
+class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
+# Regular expression matching correct argument names
+argument-rgx=[a-z_][a-z0-9_]{2,30}$
+# Naming hint for argument names
+argument-name-hint=[a-z_][a-z0-9_]{2,30}$
+# Regular expression matching correct attribute names
+attr-rgx=[a-z_][a-z0-9_]{2,30}$
+# Naming hint for attribute names
+attr-name-hint=[a-z_][a-z0-9_]{2,30}$
+# Regular expression matching correct variable names
+variable-rgx=[a-z_][a-z0-9_]{2,30}$
+# Naming hint for variable names
+variable-name-hint=[a-z_][a-z0-9_]{2,30}$
+# Regular expression matching correct function names
+function-rgx=[a-z_][a-z0-9_]{2,30}$
+# Naming hint for function names
+function-name-hint=[a-z_][a-z0-9_]{2,30}$
+# Regular expression matching correct class names
+class-rgx=[A-Z_][a-zA-Z0-9]+$
+# Naming hint for class names
+class-name-hint=[A-Z_][a-zA-Z0-9]+$
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=^_
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=10
+[ELIF]
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
+[CLASSES]
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,__new__,setUp
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,_fields,_replace,_source,_make
+[IMPORTS]
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=optparse
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled)
+import-graph=
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled)
+ext-import-graph=
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled)
+int-import-graph=
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+[DESIGN]
+# Maximum number of arguments for function / method
+max-args=5
+# Argument names that match this expression will be ignored. Default to name
+# with leading underscore
+ignored-argument-names=_.*
+# Maximum number of locals for function / method body
+max-locals=15
+# Maximum number of return / yield for function / method body
+max-returns=6
+# Maximum number of branch for function / method body
+max-branches=12
+# Maximum number of statements in function / method body
+max-statements=50
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=0
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+# Maximum number of boolean expressions in a if statement
+max-bool-expr=5
+[EXCEPTIONS]
+# Exceptions that will emit a warning when being caught. Defaults to
+# "Exception"
+overgeneral-exceptions=Exception
--- a/nnvm/tests/python/compiler/test_build.py
+++ b/nnvm/tests/python/compiler/test_build.py
+import numpy as np
+import tvm
+import nnvm.symbol as sym
+import nnvm.compiler
+import nnvm.runtime
+def test_compile():
+    x = sym.Variable("x")
+    y = sym.Variable("y")
+    z = sym.exp(y + x)
+    shape = (10, 128)
+    dtype = tvm.float32
+    shape_dict = {"x": shape, "y": shape}
+    graph, lib = nnvm.compiler.build(z, "llvm", shape_dict)
+    m = nnvm.runtime.create(graph, lib, tvm.cpu(0))
+    # get member functions
+    set_input, run, get_output = m["set_input"], m["run"], m["get_output"]
+    na = tvm.nd.array(np.ones(shape).astype(dtype))
+    nb = tvm.nd.array(np.ones(shape).astype(dtype))
+    # set inputs
+    set_input("x", na)
+    set_input("y", nb)
+    # execute
+    run()
+    # get outputs
+    out = tvm.nd.empty(shape, dtype)
+    get_output(0, out)
+    np.testing.assert_allclose(
+        out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
+if __name__ == "__main__":
+    test_compile()
--- a/nnvm/tests/python/compiler/test_rpc_exec.py
+++ b/nnvm/tests/python/compiler/test_rpc_exec.py
+from tvm.contrib import util, rpc
+import tvm
+import nnvm.symbol as sym
+import nnvm.compiler
+import nnvm.runtime
+import numpy as np
+def test_rpc_executor():
+    host = "localhost"
+    port = 9091
+    server = rpc.Server(host, port)
+    x = sym.Variable("x")
+    y = sym.Variable("y")
+    z = sym.exp(y + x)
+    shape = (10, 128)
+    dtype = tvm.float32
+    shape_dict = {"x": shape, "y": shape}
+    tmp = util.tempdir()
+    lib_name  = tmp.relpath("net.o")
+    graph, lib = nnvm.compiler.build(z, "llvm", shape_dict)
+    # save module
+    lib.save(lib_name)
+    remote = rpc.connect(host, port)
+    remote.upload(lib_name)
+    ctx = remote.cpu(0)
+    # load remote
+    rlib = remote.load_module("net.o")
+    # Create remotemodule
+    m = nnvm.runtime.create(graph, rlib, remote.cpu(0))
+    # get member functions
+    set_input, run, get_output = m["set_input"], m["run"], m["get_output"]
+    na = tvm.nd.array(np.ones(shape).astype(dtype), ctx)
+    nb = tvm.nd.array(np.ones(shape).astype(dtype), ctx)
+    # set inputs
+    set_input("x", na)
+    set_input("y", nb)
+    # execute
+    run()
+    # get outputs
+    out = tvm.nd.empty(shape, dtype, ctx)
+    get_output(0, out)
+    np.testing.assert_allclose(
+        out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
+    server.terminate()
+if __name__ == "__main__":
+    test_rpc_executor()
--- a/nnvm/tests/python/compiler/test_top_level2.py
+++ b/nnvm/tests/python/compiler/test_top_level2.py
+import numpy as np
+import tvm
+import topi
+import nnvm.symbol as sym
+import nnvm.compiler
+import nnvm.runtime
+def test_conv2d():
+    x = sym.Variable("x")
+    y = sym.conv2d(x, channels=10, kernel_size=(3, 3),
+                   name="y", use_bias=False, padding=(1,1))
+    dtype = "float32"
+    dshape = (1, 3, 18, 18)
+    kshape = (10, 3, 3, 3)
+    oshape = (1, 10, 18, 18)
+    shape_dict = {"x": dshape}
+    graph, lib = nnvm.compiler.build(y, "llvm", shape_dict)
+    m = nnvm.runtime.create(graph, lib, tvm.cpu(0))
+    # get member functions
+    set_input, run, get_output = m["set_input"], m["run"], m["get_output"]
+    # execute
+    run()
+    data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
+    kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
+    set_input("x", data)
+    set_input("y_weight", kernel)
+    # execute
+    run()
+    # get outputs
+    out = tvm.nd.empty(oshape, dtype)
+    get_output(0, out)
+    c_np = topi.testing.conv2d_nchw_python(
+        data.asnumpy(), kernel.asnumpy(), 1, 1)
+    np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+if __name__ == "__main__":
+    test_conv2d()
--- a/nnvm/tests/python/unittest/test_infer_shape.py
+++ b/nnvm/tests/python/unittest/test_infer_shape.py
@@ -6,13 +6,11 @@ def infer_shape(sym):
    g = graph.create(sym)
    g._set_json_attr("shape_attr_key", "shape")
    g = g.apply("InferShape")
-    jgraph = json.loads(g.apply("SaveJSON").json_attr("json"))
-    jnodes = jgraph["nodes"]
-    jnode_row_ptr = jgraph["node_row_ptr"]
    sdict = {}
    vshape = g.json_attr("shape")
-    for i, n in enumerate(jnodes):
+    entry_ptr = g.index.entry_ptr
-        begin, end = jnode_row_ptr[i], jnode_row_ptr[i + 1]
+    for i, n in enumerate(g.index.nodes):
+        begin, end = entry_ptr[i], entry_ptr[i + 1]
        sdict[n["name"]] = vshape[begin:end]
    return sdict