General Layout Support (#447)

9f8fcfc9 · Yizhi Liu · Tianqi Chen · fc7e9cd2 · fc7e9cd2 · 9f8fcfc9
Commit 9f8fcfc9 authored Apr 25, 2018 by Yizhi Liu Committed by Tianqi Chen May 29, 2018
39 changed files
--- a/nnvm/include/nnvm/compiler/contrib_op_param.h
+++ b/nnvm/include/nnvm/compiler/contrib_op_param.h
-/*!
- *  Copyright (c) 2017 by Contributors
- * \file contrib_op_param.h
- * \brief Additional parameters for compiler optimized operators.
- */
-#ifndef NNVM_COMPILER_CONTRIB_OP_PARAM_H_
-#define NNVM_COMPILER_CONTRIB_OP_PARAM_H_
-#include <dmlc/parameter.h>
-#include <string>
-namespace nnvm {
-namespace compiler {
-/*! \brief Parameters of layout transform operator */
-struct LayoutTransformParam : public dmlc::Parameter<LayoutTransformParam> {
-  std::string src_layout;
-  std::string dst_layout;
-  DMLC_DECLARE_PARAMETER(LayoutTransformParam) {
-    DMLC_DECLARE_FIELD(src_layout);
-    DMLC_DECLARE_FIELD(dst_layout);
-  }
-};
-}  // namespace compiler
-}  // namespace nnvm
-#endif  // NNVM_COMPILER_CONTRIB_OP_PARAM_H_
--- a/nnvm/include/nnvm/compiler/op_attr_types.h
+++ b/nnvm/include/nnvm/compiler/op_attr_types.h
@@ -16,6 +16,7 @@
 #include <nnvm/graph.h>
 #include <vector>
 #include <string>
+#include "packed_func_ext.h"
 namespace nnvm {
 namespace compiler {
@@ -73,19 +74,17 @@ using FTVMSchedule = std::function<
           const Array<Tensor>& outs,
           const std::string& target)>;
-/*! \brief Layout Information about an entry */
-using TLayoutInfo = std::string;
 /*!
- * \brief The producer consumer function of node layout
+ * \brief Modify the op node to alter its input layout.
- * \param attrs The attribute of the node.
+ *  it is invoked in AlterOpLayout pass.
- * \param ilayouts The input layouts that the node request.
+ * \param attrs The attribute of the original node.
- * \param olayouts The output layouts that the node produce.
+ * \param inputs The input symbols of the original node.
- * \return bool The success flag.
+ * \param tinfos The inferred shape and dtype of the inputs.
 */
-using FTVMLayoutRequest = std::function<bool (const NodeAttrs& attrs,
+using FTVMAlterOpLayout = std::function<
-                                              std::vector<TLayoutInfo> *ilayouts,
+  Symbol(const NodeAttrs& attrs,
-                                              std::vector<TLayoutInfo> *olayouts)>;
+         const Symbol& inputs,
+         const Array<Tensor>& tinfos)>;
 /*!
 * \brief Transform from normal operator to vectorized operator

--- a/nnvm/include/nnvm/compiler/packed_func_ext.h
+++ b/nnvm/include/nnvm/compiler/packed_func_ext.h
@@ -11,6 +11,7 @@
 #include <nnvm/graph.h>
 #include <nnvm/symbolic.h>
 #include <string>
+#include <vector>
 #include <unordered_map>
 namespace nnvm {
@@ -52,6 +53,7 @@ template<>
 struct extension_class_info<nnvm::compiler::AttrDict> {
  static const int code = 18;
 };
 }  // namespace runtime
 }  // namespace tvm
 #endif  // NNVM_COMPILER_PACKED_FUNC_EXT_H_
--- a/nnvm/include/nnvm/graph_attr_types.h
+++ b/nnvm/include/nnvm/graph_attr_types.h
@@ -9,6 +9,7 @@
 #include <vector>
 #include <string>
 #include "./tuple.h"
+#include "./layout.h"
 namespace nnvm {
@@ -46,7 +47,7 @@ using ShapeVector = std::vector<TShape>;
 * \code
 *  Graph g = ApplyPass(src_graph, "InferType");
 *  const DTypeVector& types = g.GetAttr<DTypeVector>("dtype");
- *  // get shape by entry id
+ *  // get type by entry id
 *  int entry_type = dtypes[g.indexed_graph().entry_id(my_entry)];
 * \endcode
 *
@@ -55,6 +56,21 @@ using ShapeVector = std::vector<TShape>;
 using DTypeVector = std::vector<int>;
 /*!
+ * \brief The result holder of layout of each NodeEntry in the graph.
+ * \note Stored under graph.attrs["layout"], provided by Pass "CorrectLayout"
+ *
+ * \code
+ *  Graph g = ApplyPass(src_graph, "CorrectLayout");
+ *  const LayoutVector& layouts = g.GetAttr<LayoutVector>("layout");
+ *  // get layout by entry id
+ *  const Layout& entry_layout = layouts[g.indexed_graph().entry_id(my_entry)];
+ * \endcode
+ *
+ * \sa FInferLayout
+ */
+using LayoutVector = std::vector<Layout>;
+/*!
 * \brief The result holder of device of each operator in the graph.
 * \note Stored under graph.attrs["device"], provided by Pass "PlaceDevice"
 *

--- a/nnvm/include/nnvm/layout.h
+++ b/nnvm/include/nnvm/layout.h
--- a/nnvm/include/nnvm/op_attr_types.h
+++ b/nnvm/include/nnvm/op_attr_types.h
@@ -13,6 +13,7 @@
 #include "./base.h"
 #include "./node.h"
 #include "./tuple.h"
+#include "./layout.h"
 namespace nnvm {
@@ -176,6 +177,31 @@ using FSetInputVarAttrOnCompose = std::function<void(
    NodePtr var,
    const int index)>;
+/*!
+ * \brief Inference function of node layout. See \p Layout for layout convention
+ * \param attrs The attribute of the node.
+ * \param ilayouts Given the input layouts produced by ancestor nodes,
+ *                 it should be filled by layouts that the node requests.
+ *                 If the requested layout is different from what ancestor produces,
+ *                 a __layout_transform__ operator will be inserted automatically.
+ * \param last_ilayouts The input layouts requested by the node
+ *                      at the last infer pass (if any).
+ *                      This can be useful when an operator wants to keep
+ *                      the input layout the same as the original one.
+ *                      For example, after the pass of AlterOpLayout,
+ *                      transpose(input, axis=[1, 2, 3, 0]) may receive an input of NCHW16c layout,
+ *                      for which it cannot compute with axis=[1, 2, 3, 0].
+ *                      The last input layouts tell it which layout it originally inferred,
+ *                      i.e., the layout in the imported model.
+ * \param olayouts Inferred output layouts.
+ * \return success flag.
+ */
+using FInferLayout = std::function<bool(
+    const NodeAttrs& attrs,
+    std::vector<Layout> *ilayouts,
+    const std::vector<Layout> *last_ilayouts,
+    std::vector<Layout> *olayouts)>;
 }  // namespace nnvm
 #endif  // NNVM_OP_ATTR_TYPES_H_
--- a/nnvm/include/nnvm/top/nn.h
+++ b/nnvm/include/nnvm/top/nn.h
@@ -9,23 +9,12 @@
 #include <dmlc/base.h>
 #include <dmlc/parameter.h>
 #include <nnvm/tuple.h>
+#include <nnvm/layout.h>
+#include <string>
 namespace nnvm {
 namespace top {
-// Layout flag in spatial conv and pooling.
-enum LayoutFlag {
-  kNCHW,
-  kNHWC,
-  kCHWN,
-  kNCW,
-  kNWC,
-  kCWN,
-  kNCDHW,
-  kNDHWC,
-  kCDHWN
-};
 struct DenseParam : public dmlc::Parameter<DenseParam> {
  int units;
  bool use_bias;
@@ -130,7 +119,9 @@ struct Conv2DParam : public dmlc::Parameter<Conv2DParam> {
  TShape padding;
  TShape dilation;
  int groups;
-  int layout;
+  std::string layout;
+  std::string kernel_layout;
+  std::string out_layout;
  bool use_bias;
  DMLC_DECLARE_PARAMETER(Conv2DParam) {
@@ -152,14 +143,19 @@ struct Conv2DParam : public dmlc::Parameter<Conv2DParam> {
                "At groups=2, the operation becomes equivalent to having two convolution"
                "layers side by side, each seeing half the input channels, and producing"
                "half the output channels, and both subsequently concatenated.");
-    DMLC_DECLARE_FIELD(layout)
+    DMLC_DECLARE_FIELD(layout).set_default("NCHW")
-      .add_enum("NCHW", kNCHW)
+      .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
-      .add_enum("NHWC", kNHWC)
-      .set_default(kNCHW)
-      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc."
                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
                "dimensions respectively. Convolution is applied on the 'H' and"
                "'W' dimensions.");
+    DMLC_DECLARE_FIELD(out_layout).set_default("__undef__")
+      .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Default to be same as input layout.");
+    DMLC_DECLARE_FIELD(kernel_layout).set_default("OIHW")
+      .describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc."
+                "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
+                "dimensions respectively.");
    DMLC_DECLARE_FIELD(use_bias).set_default(true)
      .describe("Whether the layer uses a bias vector.");
  }
@@ -178,7 +174,8 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
  TShape output_padding;
  TShape dilation;
  int groups;
-  int layout;
+  std::string layout;
+  std::string kernel_layout;
  bool use_bias;
  DMLC_DECLARE_PARAMETER(Conv2DTransposeParam) {
@@ -202,14 +199,15 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
                "At groups=2, the operation becomes equivalent to having two convolution"
                "layers side by side, each seeing half the input channels, and producing"
                "half the output channels, and both subsequently concatenated.");
-    DMLC_DECLARE_FIELD(layout)
+    DMLC_DECLARE_FIELD(layout).set_default("NCHW")
-      .add_enum("NCHW", kNCHW)
+      .describe("Dimension ordering of data. Can be 'NCHW', 'NHWC', etc."
-      .add_enum("NHWC", kNHWC)
-      .set_default(kNCHW)
-      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc."
                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
                "dimensions respectively. Convolution is applied on the 'H' and"
                "'W' dimensions.");
+    DMLC_DECLARE_FIELD(kernel_layout).set_default("OIHW")
+      .describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc."
+                "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
+                "dimensions respectively.");
    DMLC_DECLARE_FIELD(use_bias).set_default(true)
      .describe("Whether the layer uses a bias vector.");
  }
@@ -224,7 +222,7 @@ struct Pool2DParam : public dmlc::Parameter<Pool2DParam> {
  TShape pool_size;
  TShape strides;
  TShape padding;
-  int layout;
+  std::string layout;
  bool ceil_mode;
  DMLC_DECLARE_PARAMETER(Pool2DParam) {
@@ -235,10 +233,7 @@ struct Pool2DParam : public dmlc::Parameter<Pool2DParam> {
    DMLC_DECLARE_FIELD(padding).set_default(TShape({0, 0}))
      .describe("If padding is non-zero, then the input is implicitly zero-padded"
                "on both sides for padding number of points");
-    DMLC_DECLARE_FIELD(layout)
+    DMLC_DECLARE_FIELD(layout).set_default("NCHW")
-      .add_enum("NCHW", kNCHW)
-      .add_enum("NHWC", kNHWC)
-      .set_default(kNCHW)
      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc."
                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
                "dimensions respectively. Convolution is applied on the 'H' and"
@@ -250,13 +245,10 @@ struct Pool2DParam : public dmlc::Parameter<Pool2DParam> {
 struct GlobalPool2DParam : public dmlc::Parameter<GlobalPool2DParam> {
-  int layout;
+  std::string layout;
  DMLC_DECLARE_PARAMETER(GlobalPool2DParam) {
-    DMLC_DECLARE_FIELD(layout)
+    DMLC_DECLARE_FIELD(layout).set_default("NCHW")
-      .add_enum("NCHW", kNCHW)
-      .add_enum("NHWC", kNHWC)
-      .set_default(kNCHW)
      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc."
                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
                "dimensions respectively. Convolution is applied on the 'H' and"
@@ -266,15 +258,13 @@ struct GlobalPool2DParam : public dmlc::Parameter<GlobalPool2DParam> {
 struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
  int scale;
-  int layout;
+  std::string layout;
  DMLC_DECLARE_PARAMETER(UpSamplingParam) {
    DMLC_DECLARE_FIELD(scale)
      .describe("upsampling scaling factor");
    DMLC_DECLARE_FIELD(layout)
-      .add_enum("NCHW", kNCHW)
+      .set_default("NCHW")
-      .add_enum("NHWC", kNHWC)
-      .set_default(kNCHW)
      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc."
                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
                "dimensions respectively. Convolution is applied on the 'H' and"
@@ -282,6 +272,18 @@ struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
  }
 };
+/*! \brief Parameters of the layout transform operator. */
+struct LayoutTransformParam : public dmlc::Parameter<LayoutTransformParam> {
+  /*! \brief Layout string of the source data; defaults to "__undef__". */
+  std::string src_layout;
+  /*! \brief Layout string of the destination data; defaults to "__undef__". */
+  std::string dst_layout;
+  DMLC_DECLARE_PARAMETER(LayoutTransformParam) {
+    DMLC_DECLARE_FIELD(src_layout).set_default("__undef__")
+    .describe("Dimension ordering of data");
+    DMLC_DECLARE_FIELD(dst_layout).set_default("__undef__")
+    .describe("Dimension ordering of data.");
+  }
+};
 }  // namespace top
 }  // namespace nnvm

--- a/nnvm/python/nnvm/_ctypes/symbol.py
+++ b/nnvm/python/nnvm/_ctypes/symbol.py
@@ -211,12 +211,15 @@ def _init_symbol_module(symbol_class, root_namespace):
        op_names.append(py_str(plist[i]))
    module_obj = sys.modules["%s.symbol" % root_namespace]
+    module_obj_contrib = sys.modules["%s.contrib" % root_namespace]
    module_internal = sys.modules["%s._symbol_internal" % root_namespace]
    for name in op_names:
        hdl = OpHandle()
        check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl)))
        function = _make_atomic_symbol_function(hdl, name)
-        if function.__name__.startswith('_'):
+        if function.__name__.startswith('_contrib_'):
+            setattr(module_obj_contrib, function.__name__.split('_contrib_')[1], function)
+        elif function.__name__.startswith('_'):
            setattr(module_internal, function.__name__, function)
            setattr(module_obj, function.__name__, function)
        else:

--- a/nnvm/python/nnvm/compiler/build_module.py
+++ b/nnvm/python/nnvm/compiler/build_module.py
@@ -15,7 +15,8 @@ OPT_PASS_LEVEL = {
    "SimplifyInference": 0,
    "PrecomputePrune": 2,
    "OpFusion": 1,
-    "FoldScaleAxis": 3
+    "FoldScaleAxis": 3,
+    "AlterOpLayout": 3,
 }
 # List of optimization pass and level when switch on
@@ -139,7 +140,7 @@ def _update_shape_dtype(shape, dtype, params):
    return shape, dtype
-def optimize(graph, shape, dtype="float32"):
+def optimize(graph, shape, dtype="float32", layout=None):
    """Perform target and parameter invariant graph optimization.
    This is an advanced function that usually do not need to be called.
@@ -157,6 +158,18 @@ def optimize(graph, shape, dtype="float32"):
    """
    # pylint: disable=unused-argument
    cfg = BuildConfig.current
+    if cfg.pass_enabled("AlterOpLayout"):
+        layout = layout if layout else {}
+        graph = graph_attr.set_layout_inputs(graph, layout)
+        graph = graph.apply(["CorrectLayout"])
+        graph = graph_attr.set_shape_inputs(graph, shape)
+        graph = graph_attr.set_dtype_inputs(graph, dtype)
+        graph = graph.apply(["InferShape", "InferType", "AlterOpLayout"])
+        graph = graph_attr.set_layout_inputs(graph, layout)
+        graph = graph.apply(["CorrectLayout"])
    if cfg.pass_enabled("SimplifyInference"):
        graph = graph_attr.set_shape_inputs(graph, shape)
        graph = graph.apply(["InferShape", "SimplifyInference"])
@@ -167,7 +180,8 @@ def optimize(graph, shape, dtype="float32"):
    return graph
-def build(graph, target=None, shape=None, dtype="float32", params=None, target_host=None):
+def build(graph, target=None, shape=None, dtype="float32",
+          params=None, target_host=None, layout=None):
    """Build graph into runtime library.
    The build function will optimize the graph and do the compilation.
@@ -204,8 +218,8 @@ def build(graph, target=None, shape=None, dtype="float32", params=None, target_h
        By default, llvm is used if it is enabled,
        otherwise a stackvm intepreter is used.
-    initialize : bool, optional
+    layout : dict of str to str or str, optional
-        Whether to initialize variables in global dict _all_var_init.
+        The input layout
    Returns
    -------
@@ -230,6 +244,15 @@ def build(graph, target=None, shape=None, dtype="float32", params=None, target_h
    cfg = BuildConfig.current
    graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)
    shape, dtype = _update_shape_dtype(shape, dtype, params)
+    # correct layout if necessary
+    layout = layout if layout else {}
+    graph = graph_attr.set_layout_inputs(graph, layout)
+    graph = graph.apply("CorrectLayout")
+    index = graph.index
+    layouts = graph.json_attr("layout")
+    layout = {x : layouts[index.entry_id(x)] for x in index.input_names}
    # Initial pass do shape type inference
    ishape, _ = graph_util.infer_shape(graph, **shape)
    shape.update(zip(graph.index.input_names, ishape))
@@ -241,13 +264,14 @@ def build(graph, target=None, shape=None, dtype="float32", params=None, target_h
    if _all_var_init:
        init_var = initialize_variables(shape, dtype)
    # Apply optimization
-    graph = optimize(graph, shape, dtype)
+    graph = optimize(graph, shape, dtype, layout)
    # Precompute prune
    if params and cfg.pass_enabled("PrecomputePrune"):
        graph, params = precompute_prune(graph, params)
        shape, dtype = _update_shape_dtype(shape, dtype, params)
    # Operator Fusion and generation
    graph = graph_attr.set_shape_inputs(graph, shape)
+    graph = graph.apply("InferShape")
    graph = graph_attr.set_dtype_inputs(graph, dtype)
    graph._set_json_attr("target", str(target), "str")
    if target_host is not None:

--- a/nnvm/python/nnvm/compiler/graph_attr.py
+++ b/nnvm/python/nnvm/compiler/graph_attr.py
@@ -96,11 +96,22 @@ def set_layout_inputs(g, layout):
    Returns
    -------
    g : Graph
-        The updated graph with updated dtype.
+        The updated graph with updated layout.
    """
-    list_shape = [
+    if isinstance(layout, dict):
-        layout.get(name, "default") for name in g.index.input_names]
+        list_layout = [
-    g._set_json_attr("layout_inputs", list_shape, 'list_str')
+            layout.get(name, "__undef__") for name in g.index.input_names]
+    elif isinstance(layout, str):
+        list_layout = ["__undef__"] * len(g.index.input_names)
+        list_layout[0] = layout
+    else:
+        raise ValueError("Input layout must be str or dict")
+    last_inferred_layouts = g.json_attr("layout")
+    if last_inferred_layouts:
+        input_layout = [last_inferred_layouts[g.index.entry_id(x)] for x in g.index.input_names]
+        for i, layout_stored in enumerate(input_layout):
+            list_layout[i] = list_layout[i] if list_layout[i] != '__undef__' else layout_stored
+    g._set_json_attr("layout_inputs", list_layout, 'list_layout')
    return g
 _move_out_module = tvm.get_global_func("nnvm.graph._move_module")

--- a/nnvm/python/nnvm/contrib.py
+++ b/nnvm/python/nnvm/contrib.py
+"""Module space to register contrib functions. Leave empty"""
--- a/nnvm/python/nnvm/frontend/mxnet.py
+++ b/nnvm/python/nnvm/frontend/mxnet.py
@@ -86,6 +86,10 @@ def _conv2d(inputs, attrs):
    layout = attrs.get('layout', 'NCHW')
    if layout not in ['NCHW', 'NHWC']:
        _raise_not_supported('layout: ' + layout, 'conv2d')
+    if 'kernel_layout' in attrs:
+        kernel_layout = attrs['kernel_layout']
+    else:
+        kernel_layout = 'HWIO' if layout == 'NHWC' else 'OIHW'
    op_name, new_attrs = 'conv2d', {}
    new_attrs['channels'] = _required_attr(attrs, 'num_filter')
    new_attrs['kernel_size'] = kernel
@@ -94,6 +98,7 @@ def _conv2d(inputs, attrs):
    new_attrs['dilation'] = attrs.get('dilate', (1, 1))
    new_attrs['groups'] = attrs.get('num_group', 1)
    new_attrs['layout'] = layout
+    new_attrs['kernel_layout'] = kernel_layout
    new_attrs['use_bias'] = attrs.get('no_bias', 'False').strip() == 'False'
    return _get_nnvm_op(op_name)(*inputs, **new_attrs)
@@ -106,6 +111,10 @@ def _conv2d_transpose(inputs, attrs):
    layout = attrs.get('layout', 'NCHW')
    if layout not in ['NCHW', 'NHWC']:
        _raise_not_supported('layout: ' + layout, 'conv2d_transpose')
+    if 'kernel_layout' in attrs:
+        kernel_layout = attrs['kernel_layout']
+    else:
+        kernel_layout = 'HWIO' if layout == 'NHWC' else 'OIHW'
    op_name, new_attrs = 'conv2d_transpose', {}
    new_attrs['channels'] = _required_attr(attrs, 'num_filter')
    new_attrs['kernel_size'] = kernel
@@ -115,6 +124,7 @@ def _conv2d_transpose(inputs, attrs):
    new_attrs['dilation'] = attrs.get('dilate', (1, 1))
    new_attrs['groups'] = attrs.get('num_group', 1)
    new_attrs['layout'] = layout
+    new_attrs['kernel_layout'] = kernel_layout
    new_attrs['use_bias'] = not _parse_bool_str(attrs, 'no_bias')
    return _get_nnvm_op(op_name)(*inputs, **new_attrs)
@@ -237,7 +247,7 @@ _convert_map = {
    'min_axis'      : _rename('min'),
    'reshape'       : _reshape,
    'sum_axis'      : _rename('sum'),
-    'UpSampling'    : _upsampling
+    'UpSampling'    : _upsampling,
 }
 def _convert_symbol(op_name, inputs, attrs,

--- a/nnvm/python/nnvm/symbol.py
+++ b/nnvm/python/nnvm/symbol.py
@@ -16,6 +16,7 @@ from . import _base
 from ._base import _LIB, check_call as _check_call, _FFI_MODE, _all_var_init
 from .attribute import AttrScope
 from . import _symbol_internal as _internal
+from . import contrib
 # Use different verison of SymbolBase
 # When possible, use cython to speedup part of computation.

--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -5,7 +5,7 @@ from __future__ import absolute_import
 import tvm
 import topi
 from topi.util import get_const_int
-from .tensor import _fschedule_broadcast
+from .tensor import _fschedule_broadcast, _fschedule_injective
 from . import registry as reg
 from .registry import OpPattern
@@ -32,6 +32,11 @@ reg.register_schedule("pad", _fschedule_broadcast)
 reg.register_pattern("pad", OpPattern.INJECTIVE)
+# layout transform
+reg.register_schedule("__layout_transform__", _fschedule_injective)
+reg.register_pattern("__layout_transform__", OpPattern.INJECTIVE)
 @reg.register_schedule("softmax")
 def schedule_softmax(_, outs, target):
    """Schedule definition of softmax"""
@@ -108,6 +113,42 @@ def schedule_conv2d(attrs, outs, target):
 reg.register_pattern("conv2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+# convolution NCHWc
+@reg.register_compute("_contrib_conv2d_NCHWc")
+def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
+    """Compute definition of conv2d NCHWc.
+    Reads padding, strides, dilation, kernel_size, groups and channels from
+    attrs, builds the convolution from inputs[0] (data) and inputs[1]
+    (kernel), and adds inputs[2] as bias when use_bias is set.
+    Only dilation (1, 1) and groups == 1 are accepted.
+    """
+    pad = attrs.get_int_tuple("padding")
+    stride = attrs.get_int_tuple("strides")
+    dilate = attrs.get_int_tuple("dilation")
+    kernel_h, kernel_w = attrs.get_int_tuple('kernel_size')
+    num_group = attrs.get_int("groups")
+    out_channels = attrs.get_int("channels")
+    assert dilate == (1, 1), "not support dilate now"
+    # Grouped convolution is rejected before any compute is built.
+    if num_group != 1:
+        raise ValueError("not support arbitrary group number > 1 for now")
+    result = topi.nn.conv2d_NCHWc(
+        inputs[0], inputs[1], out_channels, (kernel_h, kernel_w), stride, pad)
+    if not attrs.get_bool("use_bias"):
+        return result
+    # Insert two new axes after axis 0 of the bias so it broadcasts against the output.
+    expanded_bias = topi.expand_dims(inputs[2], axis=1, num_newaxis=2)
+    return topi.broadcast_add(result, expanded_bias)
+@reg.register_schedule("_contrib_conv2d_NCHWc")
+def schedule_contrib_conv2d_NCHWc(attrs, outs, target):
+    """Schedule definition of conv2d NCHWc.
+    Creates the target context from the given target string and delegates to
+    the generic NCHWc conv2d schedule. Only groups == 1 is accepted.
+    """
+    num_group = attrs.get_int("groups")
+    kernel_h, kernel_w = attrs.get_int_tuple('kernel_size')
+    out_channels = attrs.get_int("channels")
+    pad = attrs.get_int_tuple("padding")
+    stride = attrs.get_int_tuple("strides")
+    with tvm.target.create(target):
+        # The group check stays inside the target scope, as in the original flow.
+        if num_group != 1:
+            raise ValueError("not support group number > 1 for now")
+        return topi.generic.schedule_conv2d_NCHWc(
+            out_channels, (kernel_h, kernel_w), stride, pad, outs)
+reg.register_pattern("_contrib_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE)
 # conv2d_transpose
 @reg.register_compute("conv2d_transpose")

--- a/nnvm/python/nnvm/top/registry.py
+++ b/nnvm/python/nnvm/top/registry.py
@@ -25,6 +25,7 @@ class OpPattern(object):
 _register_compute = tvm.get_global_func("nnvm._register_compute")
 _register_schedule = tvm.get_global_func("nnvm._register_schedule")
 _register_pattern = tvm.get_global_func("nnvm._register_pattern")
+_register_alter_op_layout = tvm.get_global_func("nnvm.compiler._register_alter_op_layout")
 def register_compute(op_name, f=None, level=10):
    """Register compute function for operator
@@ -93,3 +94,29 @@ def register_pattern(op_name, pattern, level=10):
        The priority level
    """
    _register_pattern(op_name, pattern, level)
+def register_alter_op_layout(op_name, f=None, level=10):
+    """Register alter layout function for operator
+    Can be called directly with ``f`` or used as a decorator when ``f`` is omitted.
+    Parameters
+    ----------
+    op_name : str
+        The name of operator
+    f : function, optional
+        The alter-layout function for the operator.
+        NOTE(review): presumably called with (attrs, inputs, tinfos), mirroring
+        the C++ FTVMAlterOpLayout signature - confirm against the C++ registration.
+    level : int
+        The priority level
+    Returns
+    -------
+    fregister : function
+        Register function if f is not specified; otherwise f itself,
+        returned after it has been registered.
+    """
+    def register(myf):
+        """internal register function"""
+        _register_alter_op_layout(op_name, myf, level)
+        return myf
+    return register(f) if f else register
--- a/nnvm/src/c_api/c_api_symbolic.cc
+++ b/nnvm/src/c_api/c_api_symbolic.cc
@@ -294,7 +294,7 @@ int NNSymbolGetNumOutputs(SymbolHandle symbol,
                           nn_uint *output_count) {
  Symbol *s = static_cast<Symbol*>(symbol);
  API_BEGIN();
-    *output_count = static_cast<nn_uint>(s->outputs.size());
+  *output_count = static_cast<nn_uint>(s->outputs.size());
  API_END();
 }

--- a/nnvm/src/compiler/alter_op_layout.cc
+++ b/nnvm/src/compiler/alter_op_layout.cc
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file alter_op_layout.cc
+ * \brief Alter the operator layouts. Keep inferred layouts (if any) from previous stages.
+ *        e.g., convolution may calculates faster with NCHW16c layout.
+ */
+#include <nnvm/pass.h>
+#include <nnvm/op_attr_types.h>
+#include <nnvm/layout.h>
+#include <nnvm/compiler/op_attr_types.h>
+#include <nnvm/pass_functions.h>
+#include <tvm/tvm.h>
+#include <algorithm>
+#include <functional>
+#include <limits>
+#include <unordered_map>
+#include <vector>
+#include "./compile_engine.h"
+#include "./graph_transform.h"
+namespace nnvm {
+namespace compiler {
+namespace {
+/*!
+ * \brief Create one placeholder tensor per output of a node.
+ * \param idx_graph The indexed graph containing the node.
+ * \param nid The id of the node in idx_graph.
+ * \param shape_vec Shapes, looked up by entry id.
+ * \param dtype_vec Dtype codes, looked up by entry id.
+ * \return Placeholders whose dimensions are int32 constants taken from shape_vec.
+ */
+tvm::Array<tvm::Tensor> GetTensorInfo(const IndexedGraph& idx_graph,
+                                      const uint32_t nid,
+                                      const ShapeVector& shape_vec,
+                                      const DTypeVector& dtype_vec) {
+  tvm::Array<tvm::Tensor> tensors;
+  const uint32_t num_outputs = idx_graph[nid].source->num_outputs();
+  for (uint32_t out_idx = 0; out_idx < num_outputs; ++out_idx) {
+    const auto eid = idx_graph.entry_id(nid, out_idx);
+    tvm::Array<tvm::Expr> dims;
+    for (int64_t x : shape_vec[eid]) {
+      // every dimension is emitted as an int32 constant, so it must fit in int.
+      CHECK_LE(x, static_cast<int64_t>(std::numeric_limits<int>::max()));
+      dims.push_back(tvm::make_const(tvm::Int(32), x));
+    }
+    tensors.push_back(tvm::placeholder(dims, GetTVMType(dtype_vec[eid])));
+  }
+  return tensors;
+}
+/*!
+ * \brief Replace operators by the symbols returned from their registered
+ *        FTVMAlterOpLayout function.
+ *
+ *  Requires the graph attributes "shape" and "dtype". If the source graph also
+ *  carries a "layout" attribute, the layouts recorded for nodes that are NOT
+ *  replaced are copied to the returned graph; every other entry is left as
+ *  Layout::Undef().
+ *
+ * \param src The source graph.
+ * \return The transformed graph.
+ */
+Graph AlterOpLayout(const Graph& src) {
+  static auto& falter_op_layout =
+    Op::GetAttr<nnvm::compiler::FTVMAlterOpLayout >("FTVMAlterOpLayout");
+  const ShapeVector& shape_vec = src.GetAttr<ShapeVector>("shape");
+  const DTypeVector& dtype_vec = src.GetAttr<DTypeVector>("dtype");
+  const IndexedGraph& idx_graph = src.indexed_graph();
+  // per-node layouts of the source graph, indexed by source node id.
+  // in_layouts_of_node[nid][k] is the layout of the k-th input of node nid.
+  std::vector<std::vector<Layout> > in_layouts_of_node(idx_graph.num_nodes());
+  std::vector<std::vector<Layout> > out_layouts_of_node(idx_graph.num_nodes());
+  // maps a kept (not replaced) node of the returned graph to its source node id.
+  std::unordered_map<const Node*, uint32_t> new_nodes;
+  if (src.HasAttr("layout")) {
+    // record layouts so that a later layout pass can fix layouts correctly,
+    // e.g., conv2d can be replaced by some contrib implement
+    // whose layout is different from the original one
+    // (which was imported from a model file).
+    const auto& layouts = src.GetAttr<std::vector<Layout> >("layout");
+    for (uint32_t nid = 0; nid < idx_graph.num_nodes(); ++nid) {
+      const auto &inode = idx_graph[nid];
+      if (falter_op_layout.count(inode.source->op())) {
+        // do not record input layouts of nodes that will be replaced.
+        continue;
+      }
+      std::vector<Layout> in_layout;
+      for (const auto& e : inode.inputs) {
+        in_layout.emplace_back(layouts[idx_graph.entry_id(e)]);
+      }
+      in_layouts_of_node[nid] = in_layout;
+      std::vector<Layout> out_layout;
+      for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
+        out_layout.emplace_back(layouts[idx_graph.entry_id(nid, i)]);
+      }
+      out_layouts_of_node[nid] = out_layout;
+    }
+  }
+  auto transform = [&](uint32_t nid,
+                       const NodePtr& n,
+                       std::vector<NodeEntry>* ret) {
+    nnvm::compiler::FTVMAlterOpLayout fn_alter_op_layout =
+      falter_op_layout.get(n->op(), nullptr);
+    if (fn_alter_op_layout == nullptr) {
+      // node is kept as is; remember which source node it came from.
+      new_nodes[n.get()] = nid;
+      return false;
+    }
+    // construct parameters for registered function
+    std::vector<Symbol> op_inputs;
+    tvm::Array<tvm::Tensor> tensor_infos;
+    CHECK_EQ(n->num_inputs(), idx_graph[nid].inputs.size());
+    for (uint32_t i = 0; i < n->num_inputs(); ++i) {
+      const nnvm::NodeEntry& input = n->inputs[i];
+      // input operator
+      Symbol op_input;
+      op_input.outputs.push_back(input);
+      op_inputs.push_back(op_input);
+      // input tinfo, extract from the original graph
+      // because it was where infer_shape & infer_type applied.
+      tvm::Array<tvm::Tensor> op_output_tinfos =
+        GetTensorInfo(idx_graph, idx_graph[nid].inputs[i].node_id,
+                      shape_vec, dtype_vec);
+      tensor_infos.push_back(op_output_tinfos[input.index]);
+    }
+    // callback registered function to get a new operator.
+    auto op = fn_alter_op_layout(n->attrs, Symbol::CreateGroup(op_inputs), tensor_infos);
+    *ret = op.outputs;
+    return true;
+  };
+  Graph ret = nnvm::compiler::GraphTransform(src, transform);
+  if (src.HasAttr("layout")) {
+    // restore the layouts to return graph
+    const auto& ret_idx = ret.indexed_graph();
+    std::vector<Layout> ret_layouts(ret_idx.num_node_entries(), Layout::Undef());
+    for (uint32_t nid = 0; nid < ret_idx.num_nodes(); ++nid) {
+      const auto& inode = ret_idx[nid];
+      const auto it = new_nodes.find(inode.source);
+      if (it == new_nodes.end()) continue;
+      const uint32_t src_nid = it->second;
+      const std::vector<Layout>& in_layouts = in_layouts_of_node[src_nid];
+      // input layouts were recorded by input position, so they must be
+      // restored by input position as well (NOT by the producer's output
+      // index stored in the entry).
+      CHECK_EQ(in_layouts.size(), inode.inputs.size());
+      for (uint32_t i = 0; i < inode.inputs.size(); ++i) {
+        ret_layouts[ret_idx.entry_id(inode.inputs[i])] = in_layouts[i];
+      }
+      const std::vector<Layout>& out_layouts = out_layouts_of_node[src_nid];
+      for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
+        ret_layouts[ret_idx.entry_id(nid, i)] = out_layouts[i];
+      }
+    }
+    // cannot call indexed_graph() before return the origin Graph,
+    // thus create a new one.
+    nnvm::Graph new_ret;
+    new_ret.outputs = ret.outputs;
+    new_ret.attrs["layout"] = std::make_shared<any>(std::move(ret_layouts));
+    return new_ret;
+  }
+  return ret;
+}
+// register pass
+NNVM_REGISTER_PASS(AlterOpLayout)
+.set_body(AlterOpLayout)
+.set_change_graph(true);
+}  // namespace
+}  // namespace compiler
+}  // namespace nnvm
--- a/nnvm/src/compiler/fold_scale_axis.cc
+++ b/nnvm/src/compiler/fold_scale_axis.cc
@@ -362,7 +362,7 @@ bool Pool2DBackward(
    std::vector<FoldChainInfo>* in_axis) {
  using top::Pool2DParam;
  const Pool2DParam& param = nnvm::get<Pool2DParam>(attrs.parsed);
-  if (out_info.axis == 1 && param.layout == top::kNCHW) {
+  if (out_info.axis == 1 && param.layout == "NCHW") {
    (*in_axis)[0] = out_info;
  }
  return false;
@@ -376,7 +376,7 @@ bool Pool2DForward(
    FoldChainInfo* out_info) {
  using top::Pool2DParam;
  const Pool2DParam& param = nnvm::get<Pool2DParam>(attrs.parsed);
-  if ((*in_info)[0].axis == 1 && param.layout == top::kNCHW) {
+  if ((*in_info)[0].axis == 1 && param.layout == "NCHW") {
    *out_info = (*in_info)[0];
  }
  return false;
@@ -467,7 +467,7 @@ bool Conv2DScaleAxisBackward(
  const Conv2DParam& param = nnvm::get<Conv2DParam>(attrs.parsed);
  if (out_info.kind != kPending) return false;
  // only optimize for nchw for now
-  if (param.layout == top::kNCHW && out_info.axis == 1) {
+  if (param.layout == "NCHW" && out_info.axis == 1) {
    (*in_axis)[1].kind = kMulConsumer;
    (*in_axis)[1].axis = 0;
    (*in_axis)[1].source = out_info.source;
@@ -492,7 +492,7 @@ bool Conv2DScaleAxisForward(
  const Conv2DParam& param = nnvm::get<Conv2DParam>(attrs.parsed);
  if ((*in_info)[0].kind != kPending) return false;
  // only optimize for nchw for now
-  if (param.layout == top::kNCHW && (*in_info)[0].axis == 1) {
+  if (param.layout == "NCHW" && (*in_info)[0].axis == 1) {
    (*in_info)[1].kind = kMulConsumer;
    (*in_info)[1].axis = 1;
    (*in_info)[1].source = (*in_info)[0].source;

--- a/nnvm/src/compiler/layout_transform.cc
+++ b/nnvm/src/compiler/layout_transform.cc
-/*!
- *  Copyright (c) 2017 by Contributors
- * \file layout_transform.cc
- * \brief Transforms layout.
- */
-#include <nnvm/graph.h>
-#include <nnvm/op_attr_types.h>
-#include <nnvm/graph_attr_types.h>
-#include <nnvm/pass.h>
-#include <nnvm/compiler/op_attr_types.h>
-#include <nnvm/compiler/contrib_op_param.h>
-namespace nnvm {
-namespace compiler {
-const TLayoutInfo& GetDefaultLayout() {
-  static TLayoutInfo default_layout = "default";
-  return default_layout;
-}
-nnvm::NodePtr CreateLayoutTransformNode(const std::string& src,
-                                        const std::string& dst) {
-  static const nnvm::Op* trans_op = nnvm::Op::Get("layout_transform");
-  static int count = 0;
-  nnvm::NodePtr n = nnvm::Node::Create();
-  n->attrs.op = trans_op;
-  n->attrs.name = src + "_to_" + dst + std::to_string(count++);
-  n->attrs.dict["src_layout"] = src;
-  n->attrs.dict["dst_layout"] = dst;
-  n->op()->attr_parser(&(n->attrs));
-  return n;
-}
-/*!
- * \brief A simple layout transform pass that will
- *  insert layout transform nodes automatically.
- */
-nnvm::Graph LayoutTransform(nnvm::Graph src) {
-  static auto& op_layout_request =
-    nnvm::Op::GetAttr<FTVMLayoutRequest>("FTVMLayoutRequest");
-  static auto& op_vecop =
-    nnvm::Op::GetAttr<FTVMVectorizedOp>("FTVMVectorizedOp");
-  static auto& op_pattern = nnvm::Op::GetAttr<TOpPattern>("TOpPattern");
-  const ShapeVector& shape_vec = src.GetAttr<ShapeVector>("shape");
-  const std::vector<TLayoutInfo>& input_layouts =
-      src.GetAttr<std::vector<TLayoutInfo> >("layout_inputs");
-  const IndexedGraph& idx = src.indexed_graph();
-  std::vector<TLayoutInfo> produce_vec(idx.num_node_entries(), GetDefaultLayout());
-  std::vector<nnvm::NodePtr> mirror_vec(idx.num_nodes(), nullptr);
-  // use op pattern to decide whether an op is map
-  auto is_map_op = [&](size_t nid) {
-    TOpPattern pt = op_pattern.get(idx[nid].source->op(), kOpaque);
-    bool is_map = (pt <= kBroadcast);
-    if (pt == kBroadcast) {
-      for (const auto& e : idx[nid].inputs) {
-        if (shape_vec[idx.entry_id(nid, 0)] != shape_vec[idx.entry_id(e)]) {
-          is_map = false;
-          break;
-        }
-      }
-    }
-    return is_map;
-  };
-  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
-    const auto& inode = idx[nid];
-    nnvm::NodePtr new_node = nnvm::Node::Create();
-    *new_node = *(inode.source);
-    if (new_node->is_variable()) {
-      auto input_iter = std::find(
-        idx.input_nodes().cbegin(), idx.input_nodes().cend(), nid);
-      CHECK(input_iter != idx.input_nodes().cend());
-      size_t input_id = std::distance(idx.input_nodes().cbegin(), input_iter);
-      produce_vec[idx.entry_id(nid, 0)] = input_layouts[input_id];
-      mirror_vec[nid] = new_node;
-      continue;
-    }
-    if (op_vecop.count(inode.source->op())) {
-      new_node = op_vecop[inode.source->op()](inode.source);
-      new_node->inputs.resize(new_node->num_inputs());
-    }
-    // set up output and input layouts
-    std::vector<TLayoutInfo> request_ilayouts(new_node->num_inputs(), GetDefaultLayout());
-    if (op_layout_request.count(new_node->op())) {
-      std::vector<TLayoutInfo> produce_olayouts(new_node->num_outputs(), GetDefaultLayout());
-      CHECK(op_layout_request[new_node->op()](
-          new_node->attrs, &request_ilayouts, &produce_olayouts))
-          << "Layout request fail";
-      CHECK_EQ(request_ilayouts.size(), new_node->num_inputs());
-      CHECK_EQ(produce_olayouts.size(), new_node->num_outputs());
-      for (size_t i = 0; i < new_node->num_outputs(); ++i) {
-        produce_vec[idx.entry_id(nid, i)] = produce_olayouts[i];
-      }
-    }
-    bool map_layout = is_map_op(nid);
-    if (map_layout) {
-      const TLayoutInfo& layout = produce_vec[idx.entry_id(inode.inputs[0])];
-      for (const auto& e : inode.inputs) {
-        if (produce_vec[idx.entry_id(e)] != layout) {
-          map_layout = false;
-          break;
-        }
-      }
-      if (map_layout) {
-        for (size_t i = 0; i < inode.source->num_outputs(); ++i) {
-          produce_vec[idx.entry_id(nid, i)] = layout;
-        }
-      }
-    }
-    for (size_t i = 0; i < inode.inputs.size(); ++i) {
-      const auto& e = inode.inputs[i];
-      const nnvm::NodePtr& in = mirror_vec[e.node_id];
-      new_node->inputs[i] =
-        nnvm::NodeEntry{in, e.index, e.version};
-      TLayoutInfo produce = produce_vec[idx.entry_id(e)];
-      TLayoutInfo request = request_ilayouts[i];
-      if (!map_layout && (produce != request)) {
-        nnvm::NodePtr tnode = CreateLayoutTransformNode(produce, request);
-        tnode->attrs.name =
-          idx[e.node_id].source->attrs.name + "_" + request;
-        tnode->inputs.emplace_back(new_node->inputs[i]);
-        new_node->inputs[i] = nnvm::NodeEntry{tnode, 0, 0};
-      }
-    }
-    mirror_vec[nid] = new_node;
-  }
-  std::vector<nnvm::NodeEntry> outputs;
-  for (const auto& e : idx.outputs()) {
-    TLayoutInfo produce = produce_vec[idx.entry_id(e)];
-    if (produce != GetDefaultLayout()) {
-      nnvm::NodePtr tnode = CreateLayoutTransformNode(produce, GetDefaultLayout());
-      tnode->attrs.name =
-        idx[e.node_id].source->attrs.name + "_default";
-      tnode->inputs.emplace_back(
-        nnvm::NodeEntry{mirror_vec[e.node_id], e.index, e.version});
-      outputs.emplace_back(nnvm::NodeEntry{tnode, 0, 0});
-    } else {
-      outputs.emplace_back(
-        nnvm::NodeEntry{mirror_vec[e.node_id], e.index, e.version});
-    }
-  }
-  nnvm::Graph ret;
-  ret.outputs = std::move(outputs);
-  return ret;
-}
-}  // namespace compiler
-}  // namespace nnvm
--- a/nnvm/src/compiler/packed_func_ext.cc
+++ b/nnvm/src/compiler/packed_func_ext.cc
@@ -8,6 +8,7 @@
 #include <nnvm/op.h>
 #include <nnvm/compiler/packed_func_ext.h>
 #include <nnvm/compiler/op_attr_types.h>
+#include <tvm/runtime/c_runtime_api.h>
 #include "./node_attr.h"
 #include "compile_engine.h"
@@ -62,6 +63,23 @@ TVM_REGISTER_GLOBAL("nnvm.compiler._dict_keys")
    *rv = keys;
  });
+TVM_REGISTER_GLOBAL("nnvm.compiler._register_alter_op_layout")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  // Deliberately leak a heap copy of the callback so the pyobject is not freed during shutdown
+  PackedFunc* f = new PackedFunc(args[1].operator PackedFunc());
+  Op& op = ::dmlc::Registry<nnvm::Op>::Get()->__REGISTER_OR_GET__(args[0]);  // args[0] names the op
+  auto fpack = [f](const NodeAttrs& attrs,
+                   const Symbol& inputs,
+                   const Array<Tensor>& tinfos) {
+    TVMRetValue ret = (*f)(GetAttrDict(attrs), inputs, tinfos);  // attrs are passed as a dict
+    CHECK_EQ(ret.type_code(), tvm::runtime::extension_class_info<Symbol>::code)
+      << " expected " << "Symbol (code = " << tvm::runtime::extension_class_info<Symbol>::code
+      << ") but get code = " << ret.type_code();
+    return *(static_cast<Symbol*>(ret.value().v_handle));  // type code checked above
+  };
+  op.set_attr<FTVMAlterOpLayout>("FTVMAlterOpLayout", fpack, args[2]);  // args[2]: presumably plevel
+});
 // custom version of TVM compute
 TVM_REGISTER_GLOBAL("nnvm._register_compute")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
@@ -84,7 +102,7 @@ TVM_REGISTER_GLOBAL("nnvm._register_compute")
 TVM_REGISTER_GLOBAL("nnvm._register_schedule")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
-        // Intentionally copy and not de-allocate it, to avoid free pyobject during shutdown
+    // Intentionally copy and not de-allocate it, to avoid free pyobject during shutdown
    PackedFunc* f = new PackedFunc(args[1].operator PackedFunc());
    Op& op = ::dmlc::Registry<nnvm::Op>::Get()->__REGISTER_OR_GET__(args[0]);
    auto fschedule = [f](const NodeAttrs& attrs,

--- a/nnvm/src/compiler/simplify_inference.cc
+++ b/nnvm/src/compiler/simplify_inference.cc
@@ -22,7 +22,8 @@ BatchNormToInferUnpack(const nnvm::NodeAttrs& attrs,
                       nnvm::NodeEntry beta,
                       nnvm::NodeEntry moving_mean,
                       nnvm::NodeEntry moving_var,
-                       TShape dshape) {
+                       TShape dshape,
+                       TShape bshape) {
  CHECK_NE(dshape.ndim(), 0);
  CHECK(attrs.op);
  static const  Op* bn_op = Op::Get("batch_norm");
@@ -60,13 +61,14 @@ BatchNormToInferUnpack(const nnvm::NodeAttrs& attrs,
        "elemwise_add", bn_name + "_add_beta", {shift, beta});
  }
  int axis = param.axis;
-  scale = ExpandBiasToMatchAxis(scale, dshape.ndim(), 1, axis);
+  scale = ExpandBiasToMatchAxis(scale, dshape.ndim()-bshape.ndim()+1, 1, axis);
-  shift = ExpandBiasToMatchAxis(shift, dshape.ndim(), 1, axis);
+  shift = ExpandBiasToMatchAxis(shift, dshape.ndim()-bshape.ndim()+1, 1, axis);
  NodeEntry out = MakeNode("broadcast_mul", bn_name + "_a_mul_data",
                           {data, scale});
  out = MakeNode("broadcast_add", bn_name + "_out",
                 {out, shift});
-  // It is invalid to ref the other values of BN after infernece transform.
+  // It is invalid to ref the other values of BN after inference transform.
  NodeEntry undef = MakeNode("__undef__", "undef", {});
  return {out, undef, undef};
 }
@@ -87,7 +89,8 @@ Graph SimplifyInference(nnvm::Graph src) {
          n->inputs[2],
          n->inputs[3],
          n->inputs[4],
-          shape_vec[idx.entry_id(nid, 0)]);
+          shape_vec[idx.entry_id(nid, 0)],
+          shape_vec[idx.entry_id(nid, 1)]);
      return true;
    } else if (n->op() == dropout_op) {
      NodeEntry undef = MakeNode("__undef__", "undef", {});
@@ -101,7 +104,8 @@ Graph SimplifyInference(nnvm::Graph src) {
 }
 NNVM_REGISTER_PASS(SimplifyInference)
-.set_body(SimplifyInference);
+.set_body(SimplifyInference)
+.set_change_graph(true);
 }  // namespace compiler
 }  // namespace nnvm
--- a/nnvm/src/pass/correct_layout.cc
+++ b/nnvm/src/pass/correct_layout.cc
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file correct_layout.cc
+ * \brief Infer and correct layout.
+ */
+#include <nnvm/graph.h>
+#include <nnvm/op_attr_types.h>
+#include <nnvm/graph_attr_types.h>
+#include <nnvm/pass.h>
+#include <nnvm/layout.h>
+namespace nnvm {
+namespace pass {
+/*!
+ * \brief Build a fresh `__layout_transform__` node that converts `src` into `dst`.
+ * \param src Layout of the data fed into the node.
+ * \param dst Layout the node is asked to produce.
+ * \return The new node with its attributes parsed; no inputs are attached here.
+ */
+nnvm::NodePtr CreateLayoutTransformNode(const Layout& src,
+                                        const Layout& dst) {
+  static const nnvm::Op* trans_op = nnvm::Op::Get("__layout_transform__");
+  // function-local counter appended to the generated name
+  static int count = 0;
+  const int serial = count++;
+  nnvm::NodePtr node = nnvm::Node::Create();
+  auto& node_attrs = node->attrs;
+  node_attrs.op = trans_op;
+  node_attrs.name = src.name() + "_to_" + dst.name() + std::to_string(serial);
+  node_attrs.dict["src_layout"] = src.name();
+  node_attrs.dict["dst_layout"] = dst.name();
+  // let the operator parse the string dictionary into its parameter struct
+  node->op()->attr_parser(&node_attrs);
+  return node;
+}
+using LayoutAttrDict = std::unordered_map<const Node*, std::vector<Layout> >;  // node -> layouts
+/*!
+ * \brief A simple layout inference pass that
+ *        inserts layout transform nodes automatically.
+ *
+ *  Nodes are visited in index order and copied into a new graph. A variable
+ *  takes its layout from the graph attribute "layout_inputs" when that
+ *  attribute exists and is left undefined otherwise. For every operator the
+ *  registered FInferLayout function receives the layouts produced for its
+ *  inputs, the layouts stored in the graph attribute "layout" (if present),
+ *  and may rewrite the requested input layouts and the output layouts.
+ *  A transform node is inserted on each input whose produced layout is defined
+ *  and differs from the requested one; an undefined produced layout is simply
+ *  overwritten with the request.
+ *
+ * \param src The graph to process.
+ * \return A new graph with mirrored nodes whose "layout" attribute holds one
+ *         Layout per node entry.
+ */
+nnvm::Graph CorrectLayout(nnvm::Graph src) {
+  static auto& op_infer_layout =
+    nnvm::Op::GetAttr<FInferLayout>("FInferLayout");
+  const IndexedGraph& idx = src.indexed_graph();
+  std::vector<nnvm::NodePtr> mirror_vec(idx.num_nodes(), nullptr);
+  // maps every newly created node to the layouts of its outputs
+  LayoutAttrDict new_layouts;
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const auto& inode = idx[nid];
+    nnvm::NodePtr new_node = nnvm::Node::Create();
+    *new_node = *(inode.source);
+    if (new_node->is_variable()) {
+      // Variable node: no operator and only one output entry.
+      auto input_iter = std::find(
+        idx.input_nodes().cbegin(), idx.input_nodes().cend(), nid);
+      CHECK(input_iter != idx.input_nodes().cend());
+      int64_t input_id = std::distance(idx.input_nodes().cbegin(), input_iter);
+      if (src.HasAttr("layout_inputs")) {
+        new_layouts[new_node.get()] =
+          {src.GetAttr<std::vector<Layout> >("layout_inputs")[input_id]};
+      } else {
+        new_layouts[new_node.get()] = {Layout::Undef()};
+      }
+      mirror_vec[nid] = new_node;
+      continue;
+    }
+    const uint32_t num_inputs = inode.inputs.size();
+    const uint32_t num_outputs = inode.source->num_outputs();
+    // set up the input layouts handed to the infer function
+    std::vector<Layout> request_ilayouts(num_inputs, Layout::Undef());
+    for (size_t i = 0; i < num_inputs; ++i) {
+      const IndexedGraph::NodeEntry& input_entry = inode.inputs[i];
+      const NodePtr& new_input_node = mirror_vec[input_entry.node_id];
+      CHECK(new_input_node != nullptr);
+      // start from the layout already inferred for the producing node, which was visited earlier
+      const auto& layouts_iter = new_layouts.find(new_input_node.get());
+      CHECK(layouts_iter != new_layouts.end());
+      request_ilayouts[i] = layouts_iter->second[input_entry.index];
+    }
+    // snapshot of what the producers deliver, taken before flayout may rewrite request_ilayouts
+    std::vector<Layout> produce_ilayouts(request_ilayouts);
+    // input layouts stored in the graph's "layout" attribute by an earlier pass (if present)
+    std::vector<Layout> last_request_ilayouts(num_inputs, Layout::Undef());
+    // output layouts stored in the graph's "layout" attribute by an earlier pass (if present)
+    std::vector<Layout> produce_olayouts(num_outputs, Layout::Undef());
+    if (src.HasAttr("layout")) {
+      const auto& layouts = src.GetAttr<std::vector<Layout> >("layout");
+      for (uint32_t i = 0; i < num_outputs; ++i) {
+        produce_olayouts[i] = layouts[idx.entry_id(nid, i)];
+      }
+      for (uint32_t i = 0; i < num_inputs; ++i) {
+        last_request_ilayouts[i] = layouts[idx.entry_id(inode.inputs[i])];
+      }
+    }
+    const auto& flayout = op_infer_layout[new_node->op()];
+    CHECK(flayout != nullptr) << "Attribute FInferLayout"
+                              << " is not registered by op " << inode.source->op()->name
+                              << " we are not able to complete layout transform.";
+    CHECK(flayout(new_node->attrs, &request_ilayouts, &last_request_ilayouts, &produce_olayouts))
+        << "Layout infer fail";
+    CHECK_EQ(request_ilayouts.size(), num_inputs);
+    CHECK_EQ(produce_olayouts.size(), num_outputs);
+    // record the layouts this node produces
+    new_layouts[new_node.get()] = std::move(produce_olayouts);
+    for (uint32_t i = 0; i < inode.inputs.size(); ++i) {
+      const auto& e = inode.inputs[i];
+      const nnvm::NodePtr& in = mirror_vec[e.node_id];
+      new_node->inputs[i] = nnvm::NodeEntry{in, e.index, e.version};
+      // insert a layout transform node if the produced and requested layouts disagree
+      const Layout& produce = produce_ilayouts[i];
+      const Layout& request = request_ilayouts[i];
+      if (produce != request && produce.defined()) {
+        nnvm::NodePtr tnode = CreateLayoutTransformNode(produce, request);
+        tnode->attrs.name = idx[e.node_id].source->attrs.name + "_" + request.name();
+        tnode->inputs.emplace_back(new_node->inputs[i]);
+        nnvm::NodeEntry tnode_output{tnode, 0, 0};
+        new_node->inputs[i] = tnode_output;
+        // the transform node itself produces the requested layout
+        new_layouts[tnode.get()] = {request};
+      } else if (!produce.defined()) {
+        // reverse inference: the producer had no layout, so it adopts what this node requests
+        new_layouts[in.get()][e.index] = request;
+      }
+    }
+    mirror_vec[nid] = new_node;
+  }
+  std::vector<nnvm::NodeEntry> outputs;
+  for (const auto& e : idx.outputs()) {
+    outputs.emplace_back(nnvm::NodeEntry{mirror_vec[e.node_id], e.index, e.version});
+  }
+  nnvm::Graph ret;
+  ret.outputs = outputs;
+  // flatten the per-node layouts into a vector indexed by entry id of the new graph
+  const auto& ret_idx = ret.indexed_graph();
+  std::vector<Layout> ret_layouts(ret_idx.num_node_entries(), Layout::Undef());
+  for (uint32_t nid = 0; nid < ret_idx.num_nodes(); ++nid) {
+    const auto& inode = ret_idx[nid];
+    const auto& layout_iter = new_layouts.find(inode.source);
+    if (layout_iter != new_layouts.end()) {
+      for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
+        ret_layouts[ret_idx.entry_id(nid, i)] = std::move(layout_iter->second[i]);
+      }
+    }
+  }
+  // indexed_graph() has already been called on ret, which must not happen before a
+  // Graph is returned; hand back a fresh Graph with the same outputs instead.
+  nnvm::Graph new_ret;
+  new_ret.outputs = std::move(outputs);
+  new_ret.attrs["layout"] = std::make_shared<any>(std::move(ret_layouts));
+  return new_ret;
+}
+// Register CorrectLayout as a graph-changing pass that provides the "layout" graph attribute.
+NNVM_REGISTER_PASS(CorrectLayout)
+.describe("Return a layout-transformed graph of src.")
+.set_body(CorrectLayout)
+.provide_graph_attr("layout")
+.set_change_graph(true);
+DMLC_JSON_ENABLE_ANY(LayoutVector, list_layout);  // presumably enables JSON I/O of LayoutVector
+}  // namespace pass
+}  // namespace nnvm
--- a/nnvm/src/pass/infer_shape_type.cc
+++ b/nnvm/src/pass/infer_shape_type.cc
@@ -158,7 +158,7 @@ Graph InferAttr(Graph &&ret,
        } else {
          CHECK(!last_iter)
              << "Attribute " << infer_name
-              << " is not registed by op " << inode.source->op()->name
+              << " is not registered by op " << inode.source->op()->name
              << " we are not able to complete the inference because of this";
        }
      }

--- a/nnvm/src/top/elemwise_op_common.h
+++ b/nnvm/src/top/elemwise_op_common.h
@@ -6,9 +6,12 @@
 #ifndef NNVM_TOP_ELEMWISE_OP_COMMON_H_
 #define NNVM_TOP_ELEMWISE_OP_COMMON_H_
+#include <nnvm/layout.h>
+#include <nnvm/top/nn.h>
 #include <string>
 #include <vector>
 #include <utility>
+#include <functional>
 #include "./op_common.h"
 namespace nnvm {
@@ -100,12 +103,176 @@ inline bool ElementWiseReduceType(const NodeAttrs& attrs,
    attrs, in_attrs, out_attrs, -1);
 }
+/*!
+ * \brief Make all inputs share one layout, preferring the one fixed by the last
+ *        infer pass, and derive the output layout from it.
+ * \tparam n_in  Number of inputs to process, or -1 to use in_layouts->size().
+ * \tparam n_out Number of outputs to process, or -1 to use out_layouts->size().
+ * \param attrs Node attributes; only the name is read, for error messages.
+ * \param in_layouts Current input layouts; overwritten when the chosen layout is defined.
+ * \param last_in_layouts Input layouts of the last infer pass; they win when defined.
+ * \param out_layouts Output layouts; overwritten when finfer yields a defined layout.
+ * \param finfer Maps the chosen input layout to the output layout.
+ * \return Always true; disagreeing layouts abort through CHECK_EQ.
+ */
+template<int n_in, int n_out>
+inline bool ElemwiseFixedLayout(const NodeAttrs& attrs,
+                                std::vector<Layout> *in_layouts,
+                                const std::vector<Layout> *last_in_layouts,
+                                std::vector<Layout> *out_layouts,
+                                const std::function<Layout(const Layout& in)>& finfer) {
+  const size_t num_in = (n_in != -1) ? static_cast<size_t>(n_in) : in_layouts->size();
+  const size_t num_out = (n_out != -1) ? static_cast<size_t>(n_out) : out_layouts->size();
+  // Store in *target the first defined layout among the leading `size` entries of *vec
+  // and insist that every other defined entry equals it.
+  auto deduce = [&attrs](Layout *target, const std::vector<Layout> *vec,
+                         size_t size, const char *name) {
+    for (size_t i = 0; i < size; ++i) {
+      if (!vec->at(i).defined()) continue;
+      if (!target->defined()) {
+        *target = vec->at(i);
+      }
+      CHECK_EQ(*target, vec->at(i))
+        << "Incompatible attr in node " << attrs.name << " at " << i << "-th "
+        << name << ": " << "expected " << *target
+        << ", got " << vec->at(i);
+    }
+  };
+  Layout current, previous, declared_out;
+  deduce(&current, in_layouts, num_in, "input");
+  deduce(&previous, last_in_layouts, num_in, "input (last infer pass)");
+  // the deduced output layout is not used below; this call only validates consistency
+  deduce(&declared_out, out_layouts, num_out, "output");
+  // A layout produced by the last infer pass takes precedence over the current one;
+  // the LayoutTransform pass is then left to insert a layout_transform node
+  // that fixes the actual input layout.
+  const Layout chosen_in = previous.defined() ? previous : current;
+  const Layout chosen_out = finfer(chosen_in);
+  if (chosen_in.defined()) {
+    for (size_t k = 0; k < num_in; ++k) {
+      in_layouts->at(k) = chosen_in;
+    }
+  }
+  if (chosen_out.defined()) {
+    for (size_t k = 0; k < num_out; ++k) {
+      out_layouts->at(k) = chosen_out;
+    }
+  }
+  return true;
+}
+/*!
+ * \brief Pin the inputs to the previously inferred layout (if any) and
+ *        forward that same layout to the outputs.
+ */
+template<int n_in, int n_out>
+inline bool ElemwiseFixedLayoutCopyToOut(const NodeAttrs& attrs,
+                                         std::vector<Layout> *in_layouts,
+                                         const std::vector<Layout> *last_in_layouts,
+                                         std::vector<Layout> *out_layouts) {
+  // the output layout is the chosen input layout, unchanged
+  const auto forward_input = [](const Layout& chosen) { return chosen; };
+  return ElemwiseFixedLayout<n_in, n_out>(
+    attrs, in_layouts, last_in_layouts, out_layouts, forward_input);
+}
+/*!
+ * \brief Pin the inputs to the previously inferred layout (if any) and
+ *        leave the output layout undefined.
+ */
+template<int n_in, int n_out>
+inline bool ElemwiseFixedLayoutUnknownOut(const NodeAttrs& attrs,
+                                          std::vector<Layout> *in_layouts,
+                                          const std::vector<Layout> *last_in_layouts,
+                                          std::vector<Layout> *out_layouts) {
+  // whatever the input layout is, no output layout is reported
+  const auto no_output_layout = [](const Layout&) { return Layout::Undef(); };
+  return ElemwiseFixedLayout<n_in, n_out>(
+    attrs, in_layouts, last_in_layouts, out_layouts, no_output_layout);
+}
+/*!
+ * \brief Accept whatever layout the inputs carry and copy it to the outputs.
+ * \tparam n_in  Number of inputs to process, or -1 to use in_layouts->size().
+ * \tparam n_out Number of outputs to process, or -1 to use out_layouts->size().
+ * \param attrs Node attributes; only the name is read, for error messages.
+ * \param in_layouts Input layouts; read but never modified here.
+ * \param last_in_layouts Not used by this function.
+ * \param out_layouts Output layouts; overwritten when a defined input layout was found.
+ * \return Always true; inputs that do not compare equal abort through CHECK_EQ.
+ */
+template<int n_in, int n_out>
+inline bool ElemwiseArbitraryLayout(const NodeAttrs& attrs,
+                                    std::vector<Layout> *in_layouts,
+                                    const std::vector<Layout> *last_in_layouts,
+                                    std::vector<Layout> *out_layouts) {
+  const size_t num_in = (n_in != -1) ? static_cast<size_t>(n_in) : in_layouts->size();
+  const size_t num_out = (n_out != -1) ? static_cast<size_t>(n_out) : out_layouts->size();
+  Layout in;
+  for (size_t i = 0; i != num_in; ++i) {
+    // keep adopting the i-th input layout for as long as none has been defined
+    if (!in.defined()) {
+      in = in_layouts->at(i);
+    }
+    CHECK_EQ(in, in_layouts->at(i))
+      << "Incompatible attr in node " << attrs.name << " at " << i
+      << "-th input: expected " << in
+      << ", got " << in_layouts->at(i);
+  }
+  if (!in.defined()) {
+    // no input layout is known: leave the outputs untouched
+    return true;
+  }
+  for (size_t k = 0; k != num_out; ++k) {
+    out_layouts->at(k) = in;
+  }
+  return true;
+}
+/*!
+ * \brief Try to convert the right-hand layout to the left-hand layout when they differ.
+ *        If that conversion is not possible, fall back to the last inferred layouts.
+ *
+ *  Expects exactly two inputs and one output, and the two last-inferred input
+ *  layouts must be either both defined or both undefined. When only one current
+ *  input layout is defined, the other input and the output adopt it.
+ *
+ * \param attrs Node attributes; only the name is read, for error messages.
+ * \param in_layouts Current input layouts (lhs, rhs); may be rewritten.
+ * \param last_in_layouts Input layouts from the last infer pass.
+ * \param out_layouts The single output layout; set whenever an input layout is known.
+ * \return Always true; violated expectations abort through CHECK.
+ */
+inline bool ElemwiseBinaryKeepLeftLayout(const NodeAttrs& attrs,
+                                         std::vector<Layout> *in_layouts,
+                                         const std::vector<Layout> *last_in_layouts,
+                                         std::vector<Layout> *out_layouts) {
+  CHECK_EQ(in_layouts->size(), 2U);
+  CHECK_EQ(last_in_layouts->size(), 2U);
+  CHECK_EQ(out_layouts->size(), 1U);
+  const Layout& lhs_last = (*last_in_layouts)[0];
+  const Layout& rhs_last = (*last_in_layouts)[1];
+  CHECK((lhs_last.defined() && rhs_last.defined()) ||
+        (!lhs_last.defined() && !rhs_last.defined()));
+  const Layout& lhs = (*in_layouts)[0];  // reference into in_layouts, not a copy
+  const Layout& rhs = (*in_layouts)[1];  // reference into in_layouts, not a copy
+  if (!lhs.defined() && !rhs.defined()) {
+    CHECK(!lhs_last.defined() && !rhs_last.defined())
+      << "Lost input layouts in node " << attrs.name
+      << ": last inferred lhs=" << lhs_last << ", rhs=" << rhs_last;
+    return true;
+  } else if (!lhs.defined()) {
+    CHECK(!lhs_last.defined() && !rhs_last.defined());
+    in_layouts->at(0) = rhs;  // lhs adopts the only known layout
+    out_layouts->at(0) = rhs;
+    return true;
+  } else if (!rhs.defined()) {
+    CHECK(!lhs_last.defined() && !rhs_last.defined());
+    in_layouts->at(1) = lhs;  // rhs adopts the only known layout
+    out_layouts->at(0) = lhs;
+    return true;
+  }
+  if (lhs == rhs) {
+    // with identical layouts the binary operation can always be computed,
+    // and the layout is passed on to the next layer
+    out_layouts->at(0) = lhs;
+    return true;
+  }
+  if (rhs.convertible(lhs)) {
+    in_layouts->at(1) = lhs;  // request rhs in the lhs layout
+    out_layouts->at(0) = lhs;
+  } else {
+    CHECK(lhs_last.defined() && rhs_last.defined())
+      << "Incompatible input layouts in node " << attrs.name
+      << ". lhs: " << lhs << ", rhs: " << rhs;
+    CHECK(lhs_last == rhs_last);
+    in_layouts->at(0) = lhs_last;  // fall back to the layouts of the last infer pass
+    in_layouts->at(1) = rhs_last;
+    out_layouts->at(0) = lhs_last;
+  }
+  return true;
+}
 #define NNVM_REGISTER_ELEMWISE_UNARY_OP(name)                       \
  NNVM_REGISTER_OP(name)                                            \
  .set_num_inputs(1)                                                \
  .set_num_outputs(1)                                               \
  .set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)        \
  .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)           \
+  .set_attr<FInferLayout>("FInferLayout",                           \
+    ElemwiseArbitraryLayout<1, 1>)                                  \
  .set_attr<FInplaceOption>("FInplaceOption",                       \
    [](const NodeAttrs& attrs){                                     \
      return std::vector<std::pair<int, int> >{{0, 0}};             \
@@ -131,6 +298,8 @@ inline bool ElementWiseReduceType(const NodeAttrs& attrs,
  .set_num_outputs(1)                                               \
  .set_attr<FInferShape>("FInferShape", ElemwiseShape<2, 1>)        \
  .set_attr<FInferType>("FInferType", ElemwiseType<2, 1>)           \
+  .set_attr<FInferLayout>("FInferLayout",                           \
+    ElemwiseBinaryKeepLeftLayout)                                   \
  .set_attr<FInplaceOption>("FInplaceOption",                       \
    [](const NodeAttrs& attrs) {                                    \
      return std::vector<std::pair<int, int> >{{0, 0}, {1, 0}};     \
@@ -150,6 +319,8 @@ inline bool ElementWiseReduceType(const NodeAttrs& attrs,
    ParamGetAttrDict<ElementWiseReduceParam>)                       \
  .set_attr<nnvm::FInferShape>("FInferShape",                       \
    ElementWiseReduceShape)                                         \
+  .set_attr<FInferLayout>("FInferLayout",                           \
+    ElemwiseFixedLayoutCopyToOut<1, 1>)                             \
  .set_attr<nnvm::FInferType>("FInferType", ElementWiseReduceType)  \
  .add_argument("args", "Symbol[]", "Positional input arguments")
@@ -166,6 +337,8 @@ inline bool ElementWiseReduceType(const NodeAttrs& attrs,
        static_cast<int>(kFloat32));                                \
      return true;                                                  \
  })                                                                \
+  .set_attr<FInferLayout>("FInferLayout",                           \
+    ElemwiseFixedLayoutUnknownOut<1, 1>)                            \
  .set_attr<FGradient>(                                             \
    "FGradient", [](const NodePtr& n,                               \
                    const std::vector<NodeEntry>& ograds) {         \

--- a/nnvm/src/top/nn/convolution.cc
+++ b/nnvm/src/top/nn/convolution.cc
--- a/nnvm/src/top/nn/nn.cc
+++ b/nnvm/src/top/nn/nn.cc
--- a/nnvm/src/top/nn/nn_common.h
+++ b/nnvm/src/top/nn/nn_common.h
@@ -8,6 +8,7 @@
 #include <dmlc/logging.h>
 #include <dmlc/parameter.h>
+#include <nnvm/layout.h>
 #include <nnvm/top/nn.h>
 #include <string>
 #include <vector>
@@ -40,100 +41,47 @@ inline std::vector<std::string> UseBiasListInputNames(const NodeAttrs& attrs) {
 * \param dst_layout target layout
 * \return shape in target layout
 */
-inline TShape ConvertLayout(TShape src, int src_layout, int dst_layout, bool is_weight = false) {
+inline TShape ConvertLayout(TShape src, const Layout& src_layout, const Layout& dst_layout) {
-  if (src_layout == dst_layout) return src;
+  if (src_layout == dst_layout) {
-  TShape dst = src;
+    return src;
-  if (src.ndim() == 3) {
+  } else if (!src_layout.defined()) {
-    switch (src_layout) {
+    LOG(FATAL) << "cannot convert undefined layout to " << dst_layout;
-      case kNCW: break;
+  } else if (!dst_layout.defined()) {
-      case kNWC: {
+    LOG(FATAL) << "cannot convert " << src_layout << " to undefined layout";
-        std::swap(dst[1], dst[2]);
+  }
-        break;
-      }
+  CHECK(src_layout.convertible(dst_layout)) << "cannot convert from "
-      default: {
+                                            << src_layout << " to " << dst_layout;
-        LOG(FATAL) << "inavlid layout for 3d shape" << src_layout;
-      }
+  TShape dst(dst_layout.ndim());
-    }
+  for (size_t i = 0; i < src_layout.ndim(); ++i) {
-    switch (dst_layout) {
+    Layout::LayoutDim src_dim = src_layout[i];
-      case kNCW: break;
+    if (Layout::is_superdim(src_dim)) {
-      case kNWC: {
+      int dst_major_pos = dst_layout.indexof(Layout::to_superdim(src_dim));
-        std::swap(dst[1], dst[2]);
+      int dst_minor_pos = dst_layout.indexof(Layout::to_subdim(src_dim));
-        break;
+      int src_minor_pos = src_layout.indexof(Layout::to_subdim(src_dim));
-      }
+      int src_factor = src_layout.subsizeof(src_dim);
-      default: {
+      int dst_factor = dst_layout.subsizeof(src_dim);
-        LOG(FATAL) << "inavlid layout for 3d shape" << dst_layout;
-      }
+      uint32_t src_dim_size = src[i];
-    }
+      if (src_minor_pos >= 0) {
-  } else if (src.ndim() == 4) {
+        CHECK_EQ(src_factor, src[src_minor_pos]) << "src shape " << src
-    switch (src_layout) {
+                                                 << " does not agree with layout " << src_layout;
-      case kNCHW: break;
+        src_dim_size *= src_factor;  // fold the sub-dimension back into the full extent
-      case kNHWC: {
-        if (is_weight) {
-           dst[2] = src[0];
-           dst[3] = src[1];
-           dst[1] = src[2];
-           dst[0] = src[3];
-        } else {
-           dst[2] = src[1];
-           dst[3] = src[2];
-           dst[1] = src[3];
-        }
-        break;
-      }
-      default: {
-        LOG(FATAL) << "inavlid layout for 4d shape" << src_layout;
-      }
-    }
-    src = dst;
-    switch (dst_layout) {
-      case kNCHW: break;
-      case kNHWC: {
-        if (is_weight) {
-            dst[0] = src[2];
-            dst[1] = src[3];
-            dst[2] = src[1];
-            dst[3] = src[0];
-        } else {
-            dst[1] = src[2];
-            dst[2] = src[3];
-            dst[3] = src[1];
-        }
-        break;
-      }
-      default: {
-        LOG(FATAL) << "inavlid layout for 4d shape" << dst_layout;
-      }
-    }
-  } else if (src.ndim() == 5) {
-    switch (src_layout) {
-      case kNCDHW: break;
-      case kNDHWC: {
-        dst[2] = src[1];
-        dst[3] = src[2];
-        dst[4] = src[3];
-        dst[1] = src[4];
-        break;
-      }
-      default: {
-        LOG(FATAL) << "inavlid layout for 5d shape" << src_layout;
-      }
-    }
-    src = dst;
-    switch (dst_layout) {
-      case kNCDHW: break;
-      case kNDHWC: {
-        dst[1] = src[2];
-        dst[2] = src[3];
-        dst[3] = src[4];
-        dst[4] = src[1];
-        break;
      }
-      default: {
-        LOG(FATAL) << "inavlid layout for 5d shape" << dst_layout;
+      dst[dst_major_pos] = src_dim_size;
+      if (dst_minor_pos >= 0) {
+        CHECK_GT(dst_factor, 0);
+        // the dimension must be at least as large as the factor it is split by
+        CHECK_LE(dst_factor, src_dim_size) << "Converting " << src
+                                           << " from " << src_layout
+                                           << " to " << dst_layout
+                                           << ": cannot split dimension size of "
+                                           << src_dim_size << " by " << dst_factor;
+        dst[dst_major_pos] /= dst_factor;  // outer part stays at the super-dimension
+        dst[dst_minor_pos] = dst_factor;  // inner part has exactly the split factor
      }
    }
-  } else {
-    LOG(FATAL) << "no layout option for " << dst.ndim() << " dimensions";
  }
  return dst;
 }

--- a/nnvm/src/top/nn/pooling.cc
+++ b/nnvm/src/top/nn/pooling.cc
--- a/nnvm/src/top/nn/upsampling.cc
+++ b/nnvm/src/top/nn/upsampling.cc
@@ -19,6 +19,7 @@ DMLC_REGISTER_PARAMETER(UpSamplingParam);
 inline bool UpSamplingInferShape(const nnvm::NodeAttrs& attrs,
                                   std::vector<TShape>* in_shape,
                                   std::vector<TShape>* out_shape) {
+  static const Layout kNCHW("NCHW");
  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
  CHECK_EQ(in_shape->size(), 1U);
  CHECK_EQ(out_shape->size(), 1U);
@@ -33,6 +34,19 @@ inline bool UpSamplingInferShape(const nnvm::NodeAttrs& attrs,
  return true;
 }
+// FInferLayout for upsampling (one input, one output): when the layout named
+// by the operator's `layout` parameter is defined, both the input slot and the
+// output slot are set to it. `last_in_layouts` is not consulted.
+inline bool UpsamplingLayout(const NodeAttrs& attrs,
+                             std::vector<Layout> *in_layouts,
+                             const std::vector<Layout> *last_in_layouts,
+                             std::vector<Layout> *out_layouts) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
+  CHECK_EQ(in_layouts->size(), 1U);
+  CHECK_EQ(out_layouts->size(), 1U);
+  const Layout requested(param.layout);
+  // An undefined layout leaves both slots exactly as they were.
+  if (requested.defined()) {
+    (*in_layouts)[0] = requested;
+    (*out_layouts)[0] = requested;
+  }
+  return true;
+}
 NNVM_REGISTER_OP(upsampling)
 .describe(R"(Perform nearest neighbor upsampling to input array.
@@ -46,6 +60,7 @@ NNVM_REGISTER_OP(upsampling)
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<UpSamplingParam>)
 .set_attr<FInferShape>("FInferShape", UpSamplingInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FInferLayout>("FInferLayout", UpsamplingLayout)
 .set_num_outputs(1)
 .set_num_inputs(1)
 .set_support_level(2);

--- a/nnvm/src/top/op_common.h
+++ b/nnvm/src/top/op_common.h
@@ -203,6 +203,13 @@ inline std::string attr_assign_error_msg(const NodeAttrs& attrs,
    }                                                                    \
  }
+/*!
+ * \brief Assign layout to (outputs)[index] only when layout is defined;
+ *  an undefined layout leaves the slot untouched.
+ *  The layout argument is parenthesized so compound expressions
+ *  (e.g. a conditional expression) expand correctly.
+ *  Note: layout is evaluated twice, so do not pass an expression with side effects.
+ */
+#define NNVM_ASSIGN_LAYOUT(outputs, index, layout)                       \
+  {                                                                      \
+    if ((layout).defined()) {                                            \
+      (outputs)[index] = (layout);                                       \
+    }                                                                    \
+  }
 /*!
 * \brief macro assign rhs shape to lhs
 *  Use macro so we can see the error file more clearly
@@ -253,6 +260,14 @@ inline bool ZeroShape(const NodeAttrs& attrs,
  }
 }
+// Layout-inference no-op: touches none of the layout vectors and always reports success.
+inline bool ZeroLayout(const NodeAttrs& attrs,
+                       std::vector<Layout> *in_layouts,
+                       const std::vector<Layout> *last_in_layouts,
+                       std::vector<Layout> *out_layouts) {
+  return true;
+}
 // simply assign output shape or type from input
 template<typename AttrType, int in_index, int out_index>
 inline bool AssignOutputAttr(const NodeAttrs& attrs,

--- a/nnvm/src/top/tensor/broadcast.cc
+++ b/nnvm/src/top/tensor/broadcast.cc
@@ -11,6 +11,7 @@
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/compiler/util.h>
 #include <nnvm/top/tensor.h>
+#include <nnvm/top/nn.h>
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/broadcast.h"
@@ -74,6 +75,7 @@ So with `shape=(2,0)`, we will obtain the same result as in the above example.
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<BroadcastToParam>)
 .set_attr<FInferShape>("FInferShape", BroadcastToInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FInferLayout>("FInferLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_attr<FTVMCompute>(
  "FTVMCompute", [](const NodeAttrs& attrs,
    const Array<Tensor>& inputs,
@@ -115,7 +117,7 @@ inline bool BinaryBroadcastShape(const nnvm::NodeAttrs& attrs,
      } else {
        CHECK(l == 1 || r == 1)
          << "operands could not be broadcast together with shapes "
-          << lhs << " " << rhs;
+          << lhs << " " << rhs << ", l=" << l << ", r=" << r;
        out[i] = std::max(l, r);
      }
    } else {
@@ -126,6 +128,77 @@ inline bool BinaryBroadcastShape(const nnvm::NodeAttrs& attrs,
  return true;
 }
+// Layout inference for binary broadcast operators (two inputs, one output).
+//  - Both inputs defined and equal: the output takes that layout and the
+//    function returns immediately.
+//  - Both defined but different: the two layouts are scanned in parallel; see
+//    the comments in the loop below. CHECK / LOG(FATAL) fire when the layouts
+//    are not broadcast-convertible.
+//  - Only one input defined: if its previously recorded layout (last_ilayouts)
+//    is defined, the input is reset to it after a convertibility CHECK; the
+//    output layout stays undefined.
+// Finally, every layout that ended up defined is written back.
+inline bool BinaryBroadcastInferLayout(const NodeAttrs& attrs,
+                                       std::vector<Layout> *ilayouts,
+                                       const std::vector<Layout> *last_ilayouts,
+                                       std::vector<Layout> *olayouts) {
+  CHECK_EQ(ilayouts->size(), 2U);
+  CHECK_EQ(olayouts->size(), 1U);
+  Layout lhs = (*ilayouts)[0];
+  Layout rhs = (*ilayouts)[1];
+  Layout out(Layout::Undef());
+  if (lhs.defined() && rhs.defined()) {
+    if (lhs == rhs) {
+      NNVM_ASSIGN_LAYOUT(*olayouts, 0, lhs);
+      return true;
+    }
+    // For example, NCHW <-> CHW, N16nCH16cW <-> HCW16c, etc., are broadcast-convertible
+    // because, by definition, CHW can broadcast with NCHW.
+    // For the second case, we can convert HCW16c to CH16cW; then it can broadcast with N16nCH16cW.
+    // But CNHW <-> CHW and NCHW16n <-> CHW are not,
+    // because no matter how we adjust the layout of 'CHW',
+    // we can never have an 'N' between 'C' and "HW".
+    // l_start / r_start record how many leading dimensions of each side were
+    // skipped because the other side does not contain them.
+    size_t l_start = 0, r_start = 0;
+    size_t l = 0, r = 0;
+    bool find_first_match = false;
+    while (l < lhs.ndim() && r < rhs.ndim()) {
+      if (!rhs.contains(Layout::to_superdim(lhs[l]))) {
+        // A dimension missing from rhs is only tolerated before the first match.
+        CHECK(!find_first_match) << lhs << " and " << rhs << " are not broadcast-convertible";
+        l_start = ++l;
+      } else if (!lhs.contains(Layout::to_superdim(rhs[r]))) {
+        // Likewise for a dimension missing from lhs.
+        CHECK(!find_first_match) << lhs << " and " << rhs << " are not broadcast-convertible";
+        r_start = ++r;
+      } else {
+        find_first_match = true;
+        ++l; ++r;
+      }
+    }
+    if (l_start > 0 && r_start > 0) {
+      // Both sides have leading dimensions the other one lacks.
+      LOG(FATAL) << lhs << " and " << rhs << " are not broadcast-convertible";
+    } else if (l_start > 0) {
+      // lhs has extra leading dimensions: rhs follows the remainder of lhs.
+      rhs = lhs.sublayout(l_start, lhs.ndim()-l_start);
+      out = lhs;
+    } else if (r_start > 0) {
+      // rhs has extra leading dimensions: lhs follows the remainder of rhs.
+      lhs = rhs.sublayout(r_start, rhs.ndim()-r_start);
+      out = rhs;
+    } else {
+      // prefer to keep the left layout
+      rhs = lhs;
+      out = lhs;
+    }
+  } else if (lhs.defined()) {
+    const Layout& last_lhs = last_ilayouts->at(0);
+    if (last_lhs.defined()) {
+      CHECK(lhs.convertible(last_lhs)) << "current lhs layout " << lhs
+                                       << " cannot be converted to the original one " << last_lhs;
+      lhs = last_lhs;
+      // cannot decide output layout
+    }
+  } else if (rhs.defined()) {
+    const Layout& last_rhs = last_ilayouts->at(1);
+    if (last_rhs.defined()) {
+      CHECK(rhs.convertible(last_rhs)) << "current rhs layout " << rhs
+                                       << " cannot be converted to the original one " << last_rhs;
+      rhs = last_rhs;
+      // cannot decide output layout
+    }
+  }
+  // NNVM_ASSIGN_LAYOUT skips undefined layouts, so undecided slots are left as they were.
+  NNVM_ASSIGN_LAYOUT(*ilayouts, 0, lhs);
+  NNVM_ASSIGN_LAYOUT(*ilayouts, 1, rhs);
+  NNVM_ASSIGN_LAYOUT(*olayouts, 0, out);
+  return true;
+}
 #define NNVM_REGISTER_BINARY_BROADCAST_OP(name)                     \
  NNVM_REGISTER_OP(name)                                            \
@@ -133,6 +206,8 @@ inline bool BinaryBroadcastShape(const nnvm::NodeAttrs& attrs,
  .set_num_outputs(1)                                               \
  .set_attr<FInferShape>("FInferShape", BinaryBroadcastShape)       \
  .set_attr<FInferType>("FInferType", ElemwiseType<2, 1>)           \
+  .set_attr<FInferLayout>("FInferLayout",                           \
+    BinaryBroadcastInferLayout)                                     \
  .set_attr<FInplaceOption>("FInplaceOption",                       \
    [](const NodeAttrs& attrs) {                                    \
      return std::vector<std::pair<int, int> >{{0, 0}, {1, 0}};     \

--- a/nnvm/src/top/tensor/elemwise.cc
+++ b/nnvm/src/top/tensor/elemwise.cc
@@ -333,6 +333,7 @@ NNVM_REGISTER_INIT_OP(full)
 .add_arguments(InitOpWithScalarParam::__FIELDS__())
 .set_attr<FInferShape>("FInferShape", ZeroShape<InitOpWithScalarParam>)
 .set_attr<FInferType>("FInferType", ZeroType<InitOpWithScalarParam>)
+.set_attr<FInferLayout>("FInferLayout", ZeroLayout)
 .set_support_level(4);
 NNVM_REGISTER_INIT_OP(zeros)
@@ -345,6 +346,7 @@ NNVM_REGISTER_INIT_OP(zeros)
 .add_arguments(InitOpParam::__FIELDS__())
 .set_attr<FInferShape>("FInferShape", ZeroShape<InitOpParam>)
 .set_attr<FInferType>("FInferType", ZeroType<InitOpParam>)
+.set_attr<FInferLayout>("FInferLayout", ZeroLayout)
 .set_support_level(4);
 NNVM_REGISTER_INIT_OP(ones)
@@ -357,6 +359,7 @@ NNVM_REGISTER_INIT_OP(ones)
 .add_arguments(InitOpParam::__FIELDS__())
 .set_attr<FInferShape>("FInferShape", ZeroShape<InitOpParam>)
 .set_attr<FInferType>("FInferType", ZeroType<InitOpParam>)
+.set_attr<FInferLayout>("FInferLayout", ZeroLayout)
 .set_support_level(4);
 // full_like
@@ -693,6 +696,7 @@ Example::
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<ClipParam>)
 .set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<nnvm::FInferLayout>("FInferLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_attr<FTVMCompute>(
  "FTVMCompute", [](const NodeAttrs& attrs,
                    const Array<Tensor>& inputs,

--- a/nnvm/src/top/tensor/matrix_op.cc
+++ b/nnvm/src/top/tensor/matrix_op.cc
@@ -41,6 +41,31 @@ inline bool DotShape(const nnvm::NodeAttrs& attrs,
  return true;
 }
+// Layout inference for matmul (two inputs, one output).
+// Each input keeps its previously recorded layout (last_ilayouts) when that is
+// defined, otherwise its current layout. When both chosen layouts have more than
+// one dimension, the output layout is built by concatenating
+// lhs_out.sublayout(0, ndim-1) and rhs_out.sublayout(1, ndim-1), where lhs_out /
+// rhs_out are the chosen layouts, reversed if transpose_a / transpose_b is set.
+// NOTE(review): presumably sublayout takes (pos, len), i.e. lhs without its last
+// dimension followed by rhs without its first -- confirm against Layout.
+inline bool DotInferLayout(const NodeAttrs& attrs,
+                           std::vector<Layout> *ilayouts,
+                           const std::vector<Layout> *last_ilayouts,
+                           std::vector<Layout> *olayouts) {
+  const MatMulParam& param = nnvm::get<MatMulParam>(attrs.parsed);
+  CHECK_EQ(ilayouts->size(), 2U);
+  CHECK_EQ(olayouts->size(), 1U);
+  // Prefer the previously recorded layout of each input; fall back to the current one.
+  const Layout& lhs = last_ilayouts->at(0).defined() ? last_ilayouts->at(0)
+                                                     : ilayouts->at(0);
+  const Layout& rhs = last_ilayouts->at(1).defined() ? last_ilayouts->at(1)
+                                                     : ilayouts->at(1);
+  NNVM_ASSIGN_LAYOUT(*ilayouts, 0, lhs);
+  NNVM_ASSIGN_LAYOUT(*ilayouts, 1, rhs);
+  if (lhs.ndim() > 1 && rhs.ndim() > 1) {
+    // concat lhs and rhs layout
+    const Layout& lhs_out = param.transpose_a ? lhs.reverse() : lhs;
+    const Layout& rhs_out = param.transpose_b ? rhs.reverse() : rhs;
+    // Initialize directly from the temporary: wrapping it in std::move is
+    // redundant and only prevents copy elision.
+    Layout out = lhs_out.sublayout(0, lhs_out.ndim()-1) +
+                 rhs_out.sublayout(1, rhs_out.ndim()-1);
+    NNVM_ASSIGN_LAYOUT(*olayouts, 0, out);
+  }
+  return true;
+}
 NNVM_REGISTER_OP(matmul)
  .describe(R"doc(Matrix multiplication of two arrays.
@@ -67,6 +92,7 @@ NNVM_REGISTER_OP(matmul)
 .add_argument("rhs", "NDArray-or-Symbol", "The second input")
 .set_attr<FInferShape>("FInferShape", DotShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<2, 1>)
+.set_attr<FInferLayout>("FInferLayout", DotInferLayout)
 .set_attr<FGradient>(
  "FGradient", [](const NodePtr& n,
                  const std::vector<NodeEntry>& ograds) {

--- a/nnvm/src/top/tensor/reduce.cc
+++ b/nnvm/src/top/tensor/reduce.cc
@@ -111,6 +111,8 @@ inline void AxesParamParser(nnvm::NodeAttrs* attrs) {
  .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<ReduceParam>) \
  .set_attr<FInferShape>("FInferShape", ReduceShape)                    \
  .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)               \
+  .set_attr<FInferLayout>("FInferLayout",                               \
+    ElemwiseFixedLayoutUnknownOut<1, 1>)                                \
  .set_num_inputs(1)                                                    \
  .set_num_outputs(1)

--- a/nnvm/src/top/tensor/state_op.cc
+++ b/nnvm/src/top/tensor/state_op.cc
@@ -45,6 +45,15 @@ This is an experimental operator.
    return Array<Tensor>{ topi::identity(inputs[1]) };
 })
 .set_attr<FInferShape>("FInferShape", SameShape)
+.set_attr<FInferLayout>(
+  "FInferLayout", [](const NodeAttrs& attrs,
+                     std::vector<Layout> *in_layouts,
+                     const std::vector<Layout> *last_in_layouts,
+                     std::vector<Layout> *out_layouts) {
+  NNVM_ASSIGN_LAYOUT(*in_layouts, 1, (*in_layouts)[0]);
+  NNVM_ASSIGN_LAYOUT(*out_layouts, 0, (*in_layouts)[0]);
+  return true;
+})
 .set_attr<FInplaceOption>(
  "FInplaceOption", [](const NodeAttrs& attrs) {
    return std::vector<std::pair<int, int> >{{1, 0}};

--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -10,6 +10,7 @@
 #include <nnvm/compiler/util.h>
 #include <nnvm/top/tensor.h>
 #include <cctype>
+#include <sstream>
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/nn/flatten.h"
@@ -63,6 +64,7 @@ Example::
 .set_num_outputs(1)
 .set_attr<FInferShape>("FInferShape", FlattenInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FInferLayout>("FInferLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .add_argument("data", "Tensor", "Input data.")
 .set_attr<FTVMCompute>(
  "FTVMCompute", [](const NodeAttrs& attrs,
@@ -119,6 +121,22 @@ inline bool ConcatenateInferShape(const NodeAttrs& attrs,
  return dshape.Size() != 0;
 }
+// Layout inference for concatenate (any number of inputs, one output).
+// Every input keeps its previously recorded layout when that one is defined,
+// otherwise its current layout. The output layout is not assigned here.
+inline bool ConcatenateInferLayout(const NodeAttrs& attrs,
+                                   std::vector<Layout> *ilayouts,
+                                   const std::vector<Layout> *last_ilayouts,
+                                   std::vector<Layout> *olayouts) {
+  CHECK_EQ(ilayouts->size(), last_ilayouts->size());
+  CHECK_EQ(olayouts->size(), 1U);
+  const size_t num_inputs = ilayouts->size();
+  for (size_t idx = 0; idx < num_inputs; ++idx) {
+    const Layout& previous = last_ilayouts->at(idx);
+    const Layout& chosen = previous.defined() ? previous : ilayouts->at(idx);
+    NNVM_ASSIGN_LAYOUT(*ilayouts, idx, chosen);
+  }
+  return true;
+}
 NNVM_REGISTER_OP(concatenate)
 .describe(R"code(Joins input arrays along a given axis.
@@ -156,6 +174,7 @@ Example::
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<ConcatenateParam>)
 .set_attr<FInferShape>("FInferShape", ConcatenateInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<-1, 1>)
+.set_attr<FInferLayout>("FInferLayout", ConcatenateInferLayout)
 .set_attr<FTVMCompute>(
  "FTVMCompute", [](const NodeAttrs& attrs,
                    const Array<Tensor>& inputs,
@@ -177,7 +196,8 @@ inline bool ExpandDimsInferShape(const NodeAttrs& attrs,
  CHECK_EQ(in_shape->size(), 1U);
  const TShape& dshape = in_shape->at(0);
  int ndim = static_cast<int>(dshape.ndim());
-  CHECK(param.axis >= -ndim - 1 && param.axis <= ndim);
+  CHECK(param.axis >= -ndim - 1 && param.axis <= ndim)
+    << "with axis = " << param.axis << " ndim = " << ndim;
  int axis = param.axis < 0 ? ndim + param.axis + 1 : param.axis;
  std::vector<dim_t> oshape;
  for (int i = 0; i < axis; ++i) {
@@ -198,7 +218,7 @@ NNVM_REGISTER_OP(expand_dims)
 .describe(R"code(Inserts a new axis of size 1 into the array shape
 For example, given ``x`` with shape ``(2,3,4)``, then ``expand_dims(x, axis=1, num_newaxis=5)``
-will return a new array with shape ``(2,5,3,4)``.
+will return a new array with shape ``(2,1,1,1,1,1,3,4)``.
 )code" NNVM_ADD_FILELINE)
 .add_argument("data", "Tensor", "Input tensor")
@@ -207,6 +227,7 @@ will return a new array with shape ``(2,5,3,4)``.
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<ExpandDimsParam>)
 .set_attr<FInferShape>("FInferShape", ExpandDimsInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FInferLayout>("FInferLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_num_inputs(1)
 .set_num_outputs(1)
 .set_attr<FTVMCompute>(
@@ -249,6 +270,8 @@ Examples::
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<IndicatorParam>)
 .set_attr<nnvm::FInferShape>("FInferShape", AssignOutputAttr<TShape, 1, 0>)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 1>)
+// never transform layout of the second input array.
+.set_attr<FInferLayout>("FInferLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_num_inputs(2)
 .set_num_outputs(1)
 .set_attr<FGradient>(
@@ -345,6 +368,7 @@ along which to split the array.
 .set_attr_parser(SplitParamParser)
 .set_attr<FInferShape>("FInferShape", SplitInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, -1>)
+.set_attr<FInferLayout>("FInferLayout", ElemwiseFixedLayoutUnknownOut<1, -1>)
 .set_num_inputs(1)
 .set_num_outputs(SplitNumOutputs)
 .set_attr<FTVMCompute>(
@@ -387,6 +411,7 @@ NNVM_REGISTER_OP(cast)
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<CastParam>)
 .set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<FInferType>("FInferType", CastInferType)
+.set_attr<FInferLayout>("FInferLayout", ElemwiseArbitraryLayout<1, 1>)
 .set_num_inputs(1)
 .set_num_outputs(1)
 .set_support_level(1);
@@ -539,6 +564,7 @@ The significance of each is explained below:
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<ReshapeParam>)
 .set_attr<FInferShape>("FInferShape", ReshapeInferShape)
 .set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FInferLayout>("FInferLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_num_inputs(1)
 .set_num_outputs(1)
 .set_attr<FTVMCompute>(
@@ -578,6 +604,8 @@ the input array into an output array with the same shape as the second input arr
    return true;
 })
 .set_attr<FInferType>("FInferType", ElemwiseType<2, 1>)
+// never transform layout of the second input array.
+.set_attr<FInferLayout>("FInferLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_attr<FGradient>(
  "FGradient", [](const NodePtr& n,
                  const std::vector<NodeEntry>& ograds) {
@@ -660,6 +688,7 @@ Examples::
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<SqueezeParam>)
 .set_attr<nnvm::FInferShape>("FInferShape", SqueezeShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FInferLayout>("FInferLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_num_inputs(1)
 .set_num_outputs(1)
 .set_attr<FTVMCompute>(
@@ -680,7 +709,7 @@ Examples::
 })
 .set_support_level(1);
-// tranpose
+// transpose
 DMLC_REGISTER_PARAMETER(TransposeParam);
 inline bool TransposeShape(const nnvm::NodeAttrs& attrs,
@@ -708,6 +737,39 @@ inline bool TransposeShape(const nnvm::NodeAttrs& attrs,
  return true;
 }
+// Layout inference for transpose (one input, one output).
+// The input keeps its previously recorded layout when that one is defined,
+// otherwise its current layout. If the chosen input layout is defined, the
+// output layout is assembled from the input's dimensions: in reverse order when
+// no axes are given, otherwise in the order listed by param.axes.
+inline bool TransposeInferLayout(const NodeAttrs& attrs,
+                                 std::vector<Layout> *ilayouts,
+                                 const std::vector<Layout> *last_ilayouts,
+                                 std::vector<Layout> *olayouts) {
+  const TransposeParam& param = nnvm::get<TransposeParam>(attrs.parsed);
+  CHECK_EQ(ilayouts->size(), 1U);
+  CHECK_EQ(olayouts->size(), 1U);
+
+  const Layout& previous = last_ilayouts->at(0);
+  const Layout& input = previous.defined() ? previous : ilayouts->at(0);
+  NNVM_ASSIGN_LAYOUT(*ilayouts, 0, input);
+  if (!input.defined()) {
+    // Nothing known about the input, so the output layout is left alone.
+    return true;
+  }
+
+  std::ostringstream permuted;
+  if (param.axes.ndim() == 0) {
+    // No explicit axes: emit the dimensions from last to first.
+    for (size_t remaining = input.ndim(); remaining > 0; --remaining) {
+      permuted << input.at(remaining - 1);
+    }
+  } else {
+    CHECK_EQ(input.ndim(), param.axes.ndim());
+    for (size_t i = 0; i < input.ndim(); ++i) {
+      CHECK(param.axes[i] < input.ndim());
+      permuted << input.at(param.axes[i]);
+    }
+  }
+  NNVM_ASSIGN_LAYOUT(*olayouts, 0, Layout(permuted.str()));
+  return true;
+}
 NNVM_REGISTER_OP(transpose)
 .describe(R"code(Permutes the dimensions of an array.
@@ -743,6 +805,7 @@ Examples::
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<TransposeParam>)
 .set_attr<nnvm::FInferShape>("FInferShape", TransposeShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FInferLayout>("FInferLayout", TransposeInferLayout)
 .set_num_inputs(1)
 .set_num_outputs(1)
 .set_support_level(4)

--- a/nnvm/tests/python/compiler/test_alter_op_layout.py
+++ b/nnvm/tests/python/compiler/test_alter_op_layout.py
+"""Unittest cases for AlterOpLayout pass"""
+from nnvm import symbol as sym
+from nnvm.compiler import graph_attr
+from nnvm.top import registry as reg
+import nnvm.graph as graph
+def get_layouts(g):
+    ldict = {}
+    vlayout = g.json_attr("layout")
+    entry_ptr = g.index.entry_ptr
+    for i, n in enumerate(g.index.nodes):
+        begin, end = entry_ptr[i], entry_ptr[i + 1]
+        ldict[n["name"]] = vlayout[begin:end]
+    return ldict
+def test_alter_conv2d_layout():
+    data = sym.Variable("data", shape=(1, 32, 512, 512))
+    conv = sym.conv2d(data, name="conv", channels=16,
+                      kernel_size=(3,3), padding=(1,1),
+                      use_bias=False, layout="NCHW")
+    relu = sym.relu(conv, name="relu")
+    flatten = sym.flatten(relu, name="flatten")
+    softmax = sym.softmax(flatten, name="softmax")
+    g = graph.create(softmax)
+    g = g.apply("CorrectLayout")
+    g = graph_attr.set_dtype_inputs(g, "float32")
+    g = g.apply(["InferShape", "InferType"])
+    layouts_origin = get_layouts(g)
+    @reg.register_alter_op_layout("conv2d")
+    def alter_conv2d_layout(attrs, inputs, tinfos):
+        new_attrs = {k : attrs[k] for k in attrs.keys()}
+        new_attrs["layout"] = "NCHW16c"
+        new_attrs["kernel_layout"] = "NCHW16c"
+        new_attrs["name"] = "conv_alter"
+        return sym.conv2d(inputs[0], inputs[1], **new_attrs)
+    g = g.apply("AlterOpLayout")
+    layouts = get_layouts(g)
+    # check copy layouts
+    for node in ["data", "relu", "flatten", "softmax", "conv_weight"]:
+        assert(layouts[node] == layouts_origin[node])
+    assert(layouts["conv_alter"] == layouts_origin["conv"])
+# Run the test directly when this file is executed as a script.
+if __name__ == "__main__":
+    test_alter_conv2d_layout()
--- a/nnvm/tests/python/compiler/test_nhwc_layout.py
+++ b/nnvm/tests/python/compiler/test_nhwc_layout.py
@@ -5,9 +5,10 @@ import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
-def get_sym(layout, channels):
+def get_sym(layout, kernel_layout, channels):
    data = sym.Variable(name="data")
-    data = sym.conv2d(data=data, kernel_size=(3,3), channels=channels, padding=(1, 1), layout=layout, use_bias=True)
+    data = sym.conv2d(data=data, kernel_size=(3,3), channels=channels, padding=(1, 1),
+                      layout=layout, kernel_layout=kernel_layout, use_bias=True)
    data = sym.max_pool2d(data=data, pool_size=(2, 2), strides=(2, 2), layout=layout)
    data = sym.upsampling(data=data, scale=2, layout=layout)
    softmax_axis = 1
@@ -31,8 +32,8 @@ def build_and_run(sym, params, data, out_shape):
 def test_nhwc():
    data_shape = (1, 3, 224, 224)
    out_channel = 8
-    nchw_sym = get_sym("NCHW", out_channel)
+    nchw_sym = get_sym("NCHW", "OIHW", out_channel)
-    nhwc_sym = get_sym("NHWC", out_channel)
+    nhwc_sym = get_sym("NHWC", "HWIO", out_channel)
    conv_weight = np.random.uniform(-1, 1, (out_channel, 3, 3, 3)).astype(np.float32)
    conv_bias = np.random.uniform(-1, 1, (out_channel)).astype(np.float32)
    nchw_params = {

--- a/nnvm/tests/python/unittest/test_correct_layout.py
+++ b/nnvm/tests/python/unittest/test_correct_layout.py