Commit f34e1744 by yuruofeifei Committed by Tianqi Chen

[GRADIENT] Register more gradient operators (#300)

* Add conv2d max_pool backward op

* Added tests

* Fix testing

* Address comments

* Change dot to matmul

* Address comments

* Break down indicator function

* Make greater, less numpy compatible
parent 4bd92a4a
......@@ -28,6 +28,7 @@ This level enables fully connected multi-layer perceptron.
:nosignatures:
nnvm.symbol.dense
nnvm.symbol.matmul
nnvm.symbol.relu
nnvm.symbol.tanh
nnvm.symbol.sigmoid
......@@ -38,6 +39,7 @@ This level enables fully connected multi-layer perceptron.
nnvm.symbol.elemwise_sub
nnvm.symbol.elemwise_mul
nnvm.symbol.elemwise_div
nnvm.symbol.elemwise_sum
nnvm.symbol.full
nnvm.symbol.full_like
nnvm.symbol.ones
......@@ -54,6 +56,8 @@ This level enables fully connected multi-layer perceptron.
nnvm.symbol.softmax
nnvm.symbol.log_softmax
nnvm.symbol.pad
nnvm.symbol.block_grad
nnvm.symbol.indicator
**Level 2: Convolutions**
......@@ -77,6 +81,8 @@ This level enables typical convnet models.
:nosignatures:
nnvm.symbol.reshape
nnvm.symbol.reshape_like
nnvm.symbol.expand_like
nnvm.symbol.copy
nnvm.symbol.negative
nnvm.symbol.leaky_relu
......@@ -107,6 +113,7 @@ This level enables typical convnet models.
Detailed Definitions
--------------------
.. autofunction:: nnvm.symbol.dense
.. autofunction:: nnvm.symbol.matmul
.. autofunction:: nnvm.symbol.relu
.. autofunction:: nnvm.symbol.tanh
.. autofunction:: nnvm.symbol.sigmoid
......@@ -117,6 +124,7 @@ Detailed Definitions
.. autofunction:: nnvm.symbol.elemwise_sub
.. autofunction:: nnvm.symbol.elemwise_mul
.. autofunction:: nnvm.symbol.elemwise_div
.. autofunction:: nnvm.symbol.elemwise_sum
.. autofunction:: nnvm.symbol.full
.. autofunction:: nnvm.symbol.full_like
.. autofunction:: nnvm.symbol.ones
......@@ -133,6 +141,8 @@ Detailed Definitions
.. autofunction:: nnvm.symbol.softmax
.. autofunction:: nnvm.symbol.log_softmax
.. autofunction:: nnvm.symbol.pad
.. autofunction:: nnvm.symbol.block_grad
.. autofunction:: nnvm.symbol.indicator
.. autofunction:: nnvm.symbol.conv2d
.. autofunction:: nnvm.symbol.conv2d_transpose
......@@ -142,6 +152,8 @@ Detailed Definitions
.. autofunction:: nnvm.symbol.global_avg_pool2d
.. autofunction:: nnvm.symbol.reshape
.. autofunction:: nnvm.symbol.reshape_like
.. autofunction:: nnvm.symbol.expand_like
.. autofunction:: nnvm.symbol.copy
.. autofunction:: nnvm.symbol.negative
.. autofunction:: nnvm.symbol.leaky_relu
......
......@@ -62,6 +62,13 @@ enum TypeFlag {
kUint64 = 10,
};
enum IndicatorRuleFlag {
kGT0 = 0,
kLT0 = 1,
kMax = 2,
kMin = 3,
};
#define DMLC_DECLARE_DTYPE_FIELD(name) \
DMLC_DECLARE_FIELD(name) \
.add_enum("float16", kFloat16) \
......@@ -84,6 +91,28 @@ struct CastParam : public dmlc::Parameter<CastParam> {
}
};
struct IndicatorParam : public dmlc::Parameter<IndicatorParam> {
TShape axis;
bool exclude;
DMLC_DECLARE_PARAMETER(IndicatorParam) {
DMLC_DECLARE_FIELD(axis).set_default(TShape())
.describe(R"code(The axis or axes along which to perform the indicator rule.
The default, `axis=()`, will compute over all elements into a
scalar array with shape `(1,)`.
If `axis` is an int, the rule is applied to that particular axis.
If `axis` is a tuple of ints, the rule is applied to all the axes
specified in the tuple.
If `exclude` is true, the rule is applied to the axes that are
NOT in `axis` instead.)code");
DMLC_DECLARE_FIELD(exclude).set_default(false)
.describe("Whether to apply the rule to the axes that are NOT in `axis` instead.");
}
};
struct ReshapeParam : public dmlc::Parameter<ReshapeParam> {
Tuple<int64_t> shape;
......@@ -97,8 +126,7 @@ struct SqueezeParam : public dmlc::Parameter<SqueezeParam> {
DMLC_DECLARE_PARAMETER(SqueezeParam) {
DMLC_DECLARE_FIELD(axis).set_default(TShape())
.describe("The axis to squeeze in the input tensor."
" If set to None, all size=1 axes will be squeezed");
.describe("The axis to squeeze in the input tensor.");
}
};
......@@ -110,6 +138,15 @@ struct ScalarParam : public dmlc::Parameter<ScalarParam> {
}
};
struct FillValueParam : public dmlc::Parameter<FillValueParam> {
double fill_value;
DMLC_DECLARE_PARAMETER(FillValueParam) {
DMLC_DECLARE_FIELD(fill_value)
.describe("Scalar value to be filled");
}
};
struct TransposeParam : public dmlc::Parameter<TransposeParam> {
TShape axes;
......@@ -158,16 +195,49 @@ struct ReduceParam : public dmlc::Parameter<ReduceParam> {
}
};
struct InitOpWithScalarParam : public dmlc::Parameter<InitOpWithScalarParam> {
TShape shape;
int dtype;
double fill_value;
DMLC_DECLARE_PARAMETER(InitOpWithScalarParam) {
DMLC_DECLARE_FIELD(shape).set_default(TShape());
DMLC_DECLARE_DTYPE_FIELD(dtype).set_default(kFloat32)
.describe("Target data type.");
DMLC_DECLARE_FIELD(fill_value).describe("Scalar value to fill");
}
};
struct InitOpParam : public dmlc::Parameter<InitOpParam> {
TShape shape;
int dtype;
double value;
DMLC_DECLARE_PARAMETER(InitOpParam) {
DMLC_DECLARE_FIELD(shape).set_default(TShape());
DMLC_DECLARE_DTYPE_FIELD(dtype).set_default(kFloat32)
.describe("Target data type.");
DMLC_DECLARE_FIELD(value).describe("Value to fill");
}
};
struct ElementWiseReduceParam : public dmlc::Parameter<ElementWiseReduceParam> {
int num_args;
DMLC_DECLARE_PARAMETER(ElementWiseReduceParam) {
DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
.describe("Number of inputs to be reduced.");
}
};
struct MatMulParam : public dmlc::Parameter<MatMulParam> {
bool transpose_a;
bool transpose_b;
DMLC_DECLARE_PARAMETER(MatMulParam) {
DMLC_DECLARE_FIELD(transpose_a)
.describe("If true then transpose the first input before dot.")
.set_default(false);
DMLC_DECLARE_FIELD(transpose_b)
.describe("If true then transpose the second input before dot.")
.set_default(false);
}
};
......
......@@ -188,7 +188,7 @@ def build(graph, target=None, shape=None, dtype="float32", params=None, target_h
The input types to the graph
params : dict of str to NDArray
Input parameetrs to the graph that do not change
Input parameters to the graph that do not change
during inference time. Used for pre-compute
folding optimization.
......
......@@ -5,6 +5,9 @@ from __future__ import absolute_import as _abs
import tvm
from . import graph_attr
from ..graph import create
from ..symbol import Group, ones_like
def infer_shape(graph, **shape):
"""Infer the shape given the shape of inputs.
......@@ -89,3 +92,57 @@ def check_graph_equal(grapha, graphb, compare_variable_attrs=False):
err = _deep_compare(grapha, graphb, compare_variable_attrs)
if err:
raise ValueError("Graph compare error: " + err)
def get_gradient_graph(ys, xs, grad_ys=None):
"""Create gradient graph of ys with respect to xs.
Parameters
----------
ys : Symbol or list of Symbol
Symbols from which the gradient is calculated.
xs : Symbol or list of Symbol
Symbols the gradient is computed with respect to.
For group symbol, gradients for all outputs will be calculated.
grad_ys : Symbol or list of Symbol
Head gradients for ys.
Returns
-------
ret : Graph
Generated gradient graph.
"""
if isinstance(ys, list):
ys = Group(ys)
g = create(ys)
g._set_symbol_list_attr('grad_ys', ys)
g._set_symbol_list_attr('grad_xs', xs)
ny = len(ys.list_output_names())
if grad_ys is None:
grad_ys = [ones_like(ys[i]) for i in range(ny)]
g._set_symbol_list_attr('grad_ys_out_grad', grad_ys)
return g.apply('Gradient')
def gradients(ys, xs, grad_ys=None):
"""Create gradient symbol of ys respect to xs.
Parameters
----------
ys : Symbol or list of Symbol
Symbols from which the gradient is calculated.
xs : Symbol or list of Symbol
Symbols the gradient is computed with respect to.
For group symbol, gradients for all outputs will be calculated.
grad_ys : Symbol or list of Symbol
Head gradients for ys.
Returns
-------
ret : list of Symbol
Generated gradient symbols. For each symbol in xs,
all gradients from ys are merged into a single symbol.
"""
grad_g = get_gradient_graph(ys, xs, grad_ys)
nx = len(Group(xs).list_output_names()) \
if isinstance(xs, list) else len(xs.list_output_names())
ret = [grad_g.symbol[i] for i in range(nx)]
return ret
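A minimal usage sketch of the two helpers above, mirroring the symbols used in the tests (names and shapes here are illustrative):

import nnvm.symbol as sym
from nnvm.compiler import graph_util

x = sym.Variable("x", shape=(10, 20))
y = sym.Variable("y", shape=(10, 20))
z = sym.elemwise_add(x, sym.sqrt(y))

# Full gradient graph of z with respect to both inputs ...
grad_graph = graph_util.get_gradient_graph(z, [x, y])
# ... or just the per-input gradient symbols, one per entry in xs.
dx, dy = graph_util.gradients(z, [x, y])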
......@@ -13,7 +13,6 @@ from ._base import c_array, c_str, nn_uint, py_str, string_types
from ._base import GraphHandle, SymbolHandle
from ._base import check_call
from .symbol import Variable, Symbol, Group as _Group
from .symbol import ones_like
class GraphIndex(object):
"""Index for quickly accessing graph attributes.
......@@ -271,38 +270,3 @@ def create(symbol):
check_call(_LIB.NNGraphCreate(
symbol.handle, ctypes.byref(ghandle)))
return Graph(ghandle)
def gradients(ys, xs, grad_ys=None):
"""Create gradient symbol of ys respect to xs.
Parameters
----------
ys : Symbol or list of Symbol
Symbols from which the gradient is calculated.
xs : Symbol or list of Symbol
Symbols the gradient respect to.
For group symbol, gradients for all outputs will be calculated.
grad_ys : Symbol or list of Symbol
Head gradients for ys.
Returns
-------
ret : list of Symbol
Generated gradient symbol. For each xs,
all gradients from ys are merged into a single symbol.
"""
if isinstance(ys, list):
ys = _Group(ys)
g = create(ys)
g._set_symbol_list_attr('grad_ys', ys)
g._set_symbol_list_attr('grad_xs', xs)
ny = len(ys.list_output_names())
if grad_ys is None:
grad_ys = [ones_like(ys[i]) for i in range(ny)]
g._set_symbol_list_attr('grad_ys_out_grad', grad_ys)
sym = g.apply('Gradient').symbol
nx = len(_Group(xs).list_output_names()) \
if isinstance(xs, list) else len(xs.list_output_names())
ret = [sym[i] for i in range(nx)]
return ret
......@@ -14,18 +14,23 @@ namespace pass {
namespace {
// default aggregate gradient function
// require operator __zero__ and __ewise_sum__ to be presented.
// require operators zeros and elemwise_sum to be present.
NodeEntry DefaultAggregateGradient(std::vector<NodeEntry>&& v) {
if (v.size() == 1) {
return std::move(v[0]);
} else if (v.size() == 0) {
NodePtr zero_node = Node::Create();
zero_node->attrs.op = Op::Get("_zeros");
zero_node->attrs.op = Op::Get("zeros");
zero_node->attrs.name = "zero_grad";
zero_node->attrs.op->attr_parser(&(zero_node->attrs));
return NodeEntry{zero_node, 0, 0};
} else {
NodePtr sum_node = Node::Create();
sum_node->attrs.op = Op::Get("elemwise_sum");
sum_node->inputs = std::move(v);
sum_node->attrs.name = "grad_sum";
sum_node->attrs.dict["num_args"] = std::to_string(sum_node->inputs.size());
sum_node->attrs.op->attr_parser(&(sum_node->attrs));
return NodeEntry{sum_node, 0, 0};
}
}
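Stated in Python, the aggregation rule above is simply the following (an illustrative sketch; the real pass emits zeros/elemwise_sum nodes rather than NumPy calls, and infers the shape itself):

import numpy as np

def aggregate_gradient(grads, shape):
    # No contributions -> zeros, one -> passed through unchanged,
    # several -> element-wise sum over all of them.
    if len(grads) == 0:
        return np.zeros(shape)
    if len(grads) == 1:
        return grads[0]
    return np.sum(grads, axis=0)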
......
......@@ -84,6 +84,22 @@ inline bool ElemwiseType(const NodeAttrs& attrs,
attrs, in_attrs, out_attrs, -1);
}
inline bool ElementWiseReduceShape(const NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
CHECK_EQ(out_attrs->size(), 1);
return ElemwiseAttr<TShape, shape_is_none, shape_assign, true, shape_string>(
attrs, in_attrs, out_attrs, TShape());
}
inline bool ElementWiseReduceType(const NodeAttrs& attrs,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
CHECK_EQ(out_attrs->size(), 1);
return ElemwiseAttr<int, type_is_none, type_assign, true, type_string>(
attrs, in_attrs, out_attrs, -1);
}
#define NNVM_REGISTER_ELEMWISE_UNARY_OP(name) \
NNVM_REGISTER_OP(name) \
.set_num_inputs(1) \
......@@ -100,11 +116,13 @@ inline bool ElemwiseType(const NodeAttrs& attrs,
#define NNVM_REGISTER_INIT_OP(name) \
NNVM_REGISTER_OP(name) \
.set_num_inputs(0) \
.set_num_outputs(1) \
.set_attr_parser(ParamParser<InitOpParam>) \
.add_arguments(InitOpParam::__FIELDS__()) \
.set_attr<FInferShape>("FInferShape", ZeroShape) \
.set_attr<FInferType>("FInferType", ZeroType)
.set_num_outputs(1)
#define NNVM_REGISTER_INIT_LIKE_OP(name) \
NNVM_REGISTER_ELEMWISE_UNARY_OP(name) \
.set_attr<FGradient>("FGradient", MakeZeroGradNodes) \
.add_argument("data", "Symbol", "The input")
#define NNVM_REGISTER_ELEMWISE_BINARY_OP(name) \
......@@ -120,6 +138,41 @@ inline bool ElemwiseType(const NodeAttrs& attrs,
.add_argument("lhs", "Tensor", "first input") \
.add_argument("rhs", "Tensor", "second input")
#define NNVM_REGISTER_ELEMWISE_REDUCE_OP(name) \
NNVM_REGISTER_OP(name) \
.set_num_inputs([](const NodeAttrs& attrs) { \
return static_cast<uint32_t>( \
dmlc::get<ElementWiseReduceParam>(attrs.parsed).num_args); \
}) \
.set_attr_parser(ParamParser<ElementWiseReduceParam>) \
.set_attr<FGetAttrDict>("FGetAttrDict", \
ParamGetAttrDict<ElementWiseReduceParam>) \
.set_attr<nnvm::FInferShape>("FInferShape", \
ElementWiseReduceShape) \
.set_attr<nnvm::FInferType>("FInferType", ElementWiseReduceType) \
.add_argument("args", "Symbol[]", "Positional input arguments")
#define NNVM_REGISTER_INDICATOR_OP(name) \
NNVM_REGISTER_OP(name) \
.set_num_outputs(1) \
.set_attr<FInferType>( \
"FInferType", [](const NodeAttrs& attrs, \
std::vector<int>* in_attrs, \
std::vector<int>* out_attrs) { \
CHECK_EQ(out_attrs->size(), 1U); \
NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_attrs, 0, \
static_cast<int>(kFloat32)); \
return true; \
}) \
.set_attr<FGradient>( \
"FGradient", [](const NodePtr& n, \
const std::vector<NodeEntry>& ograds) { \
return MakeZeroGradNodes(n, ograds); \
})
} // namespace top
} // namespace nnvm
#endif // NNVM_TOP_ELEMWISE_OP_COMMON_H_
......@@ -120,7 +120,42 @@ a bias vector is created and added to the outputs.
.set_attr<FInferType>("FInferType", ElemwiseType<-1, 1>)
.set_num_outputs(1)
.set_num_inputs(UseBiasNumInputs<Conv2DParam>)
.set_support_level(2);
.set_support_level(2)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
return MakeGradNode("_conv2d_grad", n,
{ograds[0], n->inputs[Conv2DParam::kData],
n->inputs[Conv2DParam::kWeight]},
n->attrs.dict);
});
NNVM_REGISTER_OP(_conv2d_grad)
.describe(R"code(2D convolution grad.
)code" NNVM_ADD_FILELINE)
.add_argument("ograd", "4D Tensor", "Output grad.")
.add_argument("data", "4D Tensor", "Input data of conv2d.")
.add_argument("weight", "4D Tensor", "Input weight.")
.set_num_inputs(3)
.set_num_outputs(UseBiasNumInputs<Conv2DParam>)
.set_attr<FListOutputNames>("FListOutputNames", UseBiasListInputNames<Conv2DParam>)
.set_attr_parser(ParamParser<Conv2DParam>)
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DParam>)
.set_attr<FInferShape>(
"FInferShape", [](const nnvm::NodeAttrs& attrs,
std::vector<TShape>* in_attrs,
std::vector<TShape>* out_attrs) {
const Conv2DParam& param = nnvm::get<Conv2DParam>(attrs.parsed);
NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, Conv2DParam::kData, in_attrs->at(1));
NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, Conv2DParam::kWeight, in_attrs->at(2));
if (param.use_bias) {
NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, Conv2DParam::kBias, TShape({param.channels}));
}
return true;
})
.set_attr<FInferType>("FInferType", ElemwiseType<3, -1>)
.set_attr<TIsBackward>("TIsBackward", true);
DMLC_REGISTER_PARAMETER(Conv2DTransposeParam);
......
......@@ -54,7 +54,7 @@ NNVM_REGISTER_OP(dense)
- **data**: `(x1, x2, ..., xn, input_dim)`
- **weight**: `(units, input_dim)`
- **bias**: `(units,)`
- **out**: `(x1, x2, ..., xn, num_hidden)`
- **out**: `(x1, x2, ..., xn, units)`
The learnable parameters include both ``weight`` and ``bias``.
......@@ -72,6 +72,34 @@ If ``use_bias`` is set to be false, then the ``bias`` term is ignored.
.set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<DenseParam>)
.set_attr<FInferShape>("FInferShape", DenseInferShape)
.set_attr<FInferType>("FInferType", ElemwiseType<-1, 1>)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
const DenseParam& param = nnvm::get<DenseParam>(n->attrs.parsed);
NodeEntry data_grad = MakeNode("matmul",
n->attrs.name + "_data_grad",
{ograds[0], n->inputs[DenseParam::kWeight]});
NodeEntry w_grad_sub = MakeNode("matmul",
n->attrs.name + "_weight_grad_sub0",
{ograds[0], n->inputs[DenseParam::kData]},
{{"transpose_a", "true"}});
TShape w_reduce_axis = {0, -1};
std::ostringstream w_oss; w_oss << w_reduce_axis;
NodeEntry w_grad = MakeNode("sum", n->attrs.name + "_weight_grad",
{w_grad_sub},
{{"axis", w_oss.str()}, {"exclude", "true"}});
std::vector<NodeEntry> grads = {data_grad, w_grad};
if (param.use_bias) {
TShape axis = {-1};
std::ostringstream b_oss; b_oss << axis;
grads.push_back(MakeNode("sum", n->attrs.name + "_bias_grad",
{ograds[0]},
{{"axis", b_oss.str()}, {"exclude", "true"}}));
}
return grads;
})
.set_support_level(1);
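As a sanity check on the composition registered above, here is the dense backward pass for the plain 2-D case (y = x · Wᵀ + b) written directly in NumPy; a reference sketch, not the registered kernel:

import numpy as np

def dense_backward(ograd, x, w):
    # ograd: (batch, units), x: (batch, input_dim), w: (units, input_dim)
    dx = ograd.dot(w)        # matmul(ograd, W)
    dw = ograd.T.dot(x)      # matmul(ograd, x) with transpose_a
    db = ograd.sum(axis=0)   # sum over every axis except the last
    return dx, dw, db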
// relu
......@@ -82,6 +110,18 @@ NNVM_REGISTER_ELEMWISE_UNARY_OP(relu)
max(input, 0)
)code" NNVM_ADD_FILELINE)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
// y = relu(x)
// grad = indicator(x > 0)
NodeEntry zero = MakeNode("zeros_like", n->attrs.name + "_grad_zero",
{n->inputs[0]});
return std::vector<NodeEntry>{
MakeNode("greater", n->attrs.name + "_grad",
{n->inputs[0], zero}, {{"exclude", "true"}})
};
})
.set_support_level(1);
// dropout
......@@ -217,7 +257,37 @@ NNVM_REGISTER_OP(softmax)
.set_num_outputs(1)
.set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)
.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
.set_support_level(1);
.set_support_level(1)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
// grad_x = grad_y dot jacobian of softmax
//
// jacobian of softmax
// [-y1y1 + y1, -y1y2, ... ]
// [ ... , -y2y2 + y2, ... ]
// [ ... ... ]
// [ ... ,-ynyn + yn]
//
// grad_x_i = -y_i * (sum_j ograd_j * y_j - 1)
//          = output - output * sum(ograd * output, axis, keepdims)
// grad_x = ograd elemwise_mul output
// grad_x = sum(grad_x, keepdim, axis)
// grad_x = grad_x broadcast_mul output
// grad_x = neg grad_x
// grad_x = grad_x + output
const SoftmaxParam& param = nnvm::get<SoftmaxParam>(n->attrs.parsed);
NodeEntry output = NodeEntry{n, 0, 0};
NodeEntry sub0 = MakeNode("elemwise_mul", n->attrs.name + "_grad_sub0", {ograds[0], output});
NodeEntry sub1 = MakeNode("sum", n->attrs.name + "_grad_sub1", {sub0},
{{"axis", std::to_string(param.axis)}, {"keepdims", "true"}});
NodeEntry sub2 = MakeNode("broadcast_mul", n->attrs.name + "_grad_sub2", {sub1, output});
NodeEntry sub3 = MakeNode("negative", n->attrs.name + "_grad_sub3", {sub2});
return std::vector<NodeEntry> {
MakeNode("elemwise_add", n->attrs.name + "_grad", {sub3, output})
};
});
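Spelled out in NumPy for the last-axis case, the chain of nodes above computes the following (a sketch of the composition as registered, assuming axis=-1):

import numpy as np

def softmax_grad_composition(ograd, y, axis=-1):
    # y is the softmax output; mirrors elemwise_mul -> sum -> broadcast_mul -> negative -> elemwise_add
    s = np.sum(ograd * y, axis=axis, keepdims=True)
    return y - y * s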
// log_softmax
NNVM_REGISTER_OP(log_softmax)
......@@ -236,6 +306,38 @@ NNVM_REGISTER_OP(log_softmax)
.set_num_outputs(1)
.set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)
.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
// grad_x = grad_y dot jacobian of softmax
//
// jacobian of softmax
// [-y1 + 1, -y2, ... ]
// [ ... , -y2 + 1, ... ]
// [ ... ... ]
// [ ... ,-yn + 1]
//
// grad_x_i = -(sum_j ograd_j * y_j - 1)
// i.e. grad_x = expand_like(1 - sum(ograd * output, axis, keepdims), output)
// grad_x = ograd elemwise_mul output
// grad_x = sum(grad_x, keepdim, axis)
// grad_x = neg grad_x
// grad_x = grad_x + ones_like(grad_x)
// grad_x = expand_dims(grad_x, axis)
const SoftmaxParam& param = nnvm::get<SoftmaxParam>(n->attrs.parsed);
NodeEntry output = NodeEntry{n, 0, 0};
NodeEntry sub0 = MakeNode("elemwise_mul", n->attrs.name + "_grad_sub0", {ograds[0], output});
NodeEntry sub1 = MakeNode("sum", n->attrs.name + "_grad_sub1", {sub0},
{{"axis", std::to_string(param.axis)}, {"keepdims", "true"}});
NodeEntry sub2 = MakeNode("negative", n->attrs.name + "_grad_sub2", {sub1});
NodeEntry sub3 = MakeNode("ones_like", n->attrs.name + "_grad_sub3", {sub2});
NodeEntry sub4 = MakeNode("elemwise_add", n->attrs.name + "_grad_sub4", {sub2, sub3});
return std::vector<NodeEntry> {
MakeNode("expand_like", n->attrs.name + "_grad", {sub4, output},
{{"axis", std::to_string(param.axis)}})
};
})
.set_support_level(1);
// leaky_relu
......@@ -255,6 +357,25 @@ NNVM_REGISTER_OP(leaky_relu)
.set_num_outputs(1)
.set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)
.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
// y = leaky_relu(x)
// grad = indicator(x > 0) + alpha * indicator(x < 0)
const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(n->attrs.parsed);
NodeEntry zero = MakeNode("zeros_like", n->attrs.name + "_grad_zero",
{n->inputs[0]});
NodeEntry sub0 = MakeNode("greater", n->attrs.name + "_pos_grad",
{n->inputs[0], zero}, {{"exclude", "true"}});
NodeEntry sub1 = MakeNode("less", n->attrs.name + "_neg_grad",
{n->inputs[0], zero}, {{"exclude", "true"}});
NodeEntry sub2 = MakeNode("__mul_scalar__", n->attrs.name + "_neg_mul_2",
{sub1},
{{"scalar", std::to_string(param.alpha)}});
return std::vector<NodeEntry>{
MakeNode("elemwise_add", n->attrs.name + "_add_grad", {sub0, sub2})
};
})
.set_support_level(1);
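The indicator decomposition used here (and, with the alpha term dropped, in the relu gradient above) written directly in NumPy; a reference sketch of the local derivative only:

import numpy as np

def leaky_relu_local_grad(x, alpha):
    # indicator(x > 0) + alpha * indicator(x < 0), built from greater/less style masks
    return (x > 0).astype(x.dtype) + alpha * (x < 0).astype(x.dtype)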
......
......@@ -77,8 +77,30 @@ NNVM_REGISTER_OP(max_pool2d)
.set_num_inputs(1)
.set_attr<FInferShape>("FInferShape", Pool2DInferShape)
.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
return MakeGradNode("_max_pool2d_grad", n,
{ograds[0], n->inputs[0], NodeEntry{n, 0, 0}},
n->attrs.dict);
})
.set_support_level(2);
NNVM_REGISTER_OP(_max_pool2d_grad)
.describe(R"code(Max pooling 2D grad.
)code" NNVM_ADD_FILELINE)
.add_argument("ograd", "4D Tensor", "Output grad.")
.add_argument("input", "4D Tensor", "Input data of max_pool2d grad.")
.add_argument("output", "4D Tensor", "Output data of max_pool2d grad.")
.set_num_inputs(3)
.set_num_outputs(1)
.set_attr_parser(ParamParser<Pool2DParam>)
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Pool2DParam>)
.set_attr<FInferShape>("FInferShape", AssignOutputAttr<TShape, 1, 0>)
.set_attr<FInferType>("FInferType", ElemwiseType<3, 1>)
.set_attr<TIsBackward>("TIsBackward", true);
NNVM_REGISTER_OP(avg_pool2d)
.describe(R"code(Average pooling operation for one dimensional data.
......
......@@ -144,7 +144,7 @@ inline std::string attr_assign_error_msg(const NodeAttrs& attrs,
}
/*!
* \brief macro assign shape to out if out is unknown otherwise check consistency
* \brief macro assign shape to input if out is unknown otherwise check consistency
* Use macro so we can see the error file more clearly
* \param inputs the shape array to store the result
* \param index the index of in the array
......@@ -240,10 +240,11 @@ inline bool SameShape(const NodeAttrs& attrs,
}
// return shape from node attrs
template<typename PType>
inline bool ZeroShape(const NodeAttrs& attrs,
std::vector<TShape> *ishape,
std::vector<TShape> *oshape) {
const TShape& ts = dmlc::get<InitOpParam>(attrs.parsed).shape;
const TShape& ts = dmlc::get<PType>(attrs.parsed).shape;
if (ts.ndim() != 0) {
SHAPE_ASSIGN(oshape->at(0), ts);
return true;
......@@ -252,15 +253,63 @@ inline bool ZeroShape(const NodeAttrs& attrs,
}
}
// simply assign output shape or type from input
template<typename AttrType, int in_index, int out_index>
inline bool AssignOutputAttr(const NodeAttrs& attrs,
std::vector<AttrType> *in_attrs,
std::vector<AttrType> *out_attrs) {
CHECK_LT(in_index, in_attrs->size());
CHECK_LT(out_index, out_attrs->size());
const TShape &dshape = in_attrs->at(in_index);
NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, out_index, dshape);
return true;
}
// return type from node attrs
template<typename PType>
inline bool ZeroType(const NodeAttrs& attrs,
std::vector<int> *iattr,
std::vector<int> *oattr) {
int dtype = dmlc::get<InitOpParam>(attrs.parsed).dtype;
int dtype = dmlc::get<PType>(attrs.parsed).dtype;
DTYPE_ASSIGN(oattr->at(0), dtype);
return true;
}
// Make zero grad node
inline std::vector<NodeEntry> MakeZeroGradNodes(
const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
std::vector<NodeEntry> ret;
for (uint32_t i = 0; i < n->num_inputs(); ++i) {
std::ostringstream os;
ret.push_back(MakeNode("zeros_like", n->attrs.name + "_zero_grad",
{n->inputs[i]}));
}
return ret;
}
// Helper to make gradient node
inline std::vector<NodeEntry> MakeGradNode(
const char* op_name,
const NodePtr& n,
std::vector<NodeEntry> inputs,
std::unordered_map<std::string, std::string> attr = {}) {
NodePtr p = Node::Create();
p->attrs.op = nnvm::Op::Get(op_name);
p->attrs.name = n->attrs.name + "_grad";
p->inputs = std::move(inputs);
p->attrs.dict = std::move(attr);
if (p->attrs.op->attr_parser) {
p->attrs.op->attr_parser(&p->attrs);
}
std::vector<NodeEntry> ret;
for (uint32_t i = 0; i < p->num_outputs(); ++i) {
ret.emplace_back(NodeEntry{p, i, 0});
}
return ret;
}
} // namespace top
} // namespace nnvm
......
......@@ -241,73 +241,70 @@ NNVM_REGISTER_ELEMWISE_UNARY_OP(copy)
});
DMLC_REGISTER_PARAMETER(InitOpParam);
DMLC_REGISTER_PARAMETER(InitOpWithScalarParam);
DMLC_REGISTER_PARAMETER(FillValueParam);
// full
NNVM_REGISTER_INIT_OP(full)
.describe(R"code(Fill array with scalar value
)code" NNVM_ADD_FILELINE)
.set_attr_parser(ParamParser<InitOpWithScalarParam>)
.set_attr<FGetAttrDict>(
"FGetAttrDict", ParamGetAttrDict<InitOpWithScalarParam>)
.add_arguments(InitOpWithScalarParam::__FIELDS__())
.set_attr<FInferShape>("FInferShape", ZeroShape<InitOpWithScalarParam>)
.set_attr<FInferType>("FInferType", ZeroType<InitOpWithScalarParam>)
.set_support_level(1);
NNVM_REGISTER_INIT_OP(zeros)
.describe(R"code(Fill target with zeros
)code" NNVM_ADD_FILELINE)
.set_attr_parser(ParamParser<InitOpParam>)
.set_attr<FGetAttrDict>(
"FGetAttrDict", ParamGetAttrDict<InitOpParam>)
.add_arguments(InitOpParam::__FIELDS__())
.set_attr<FInferShape>("FInferShape", ZeroShape<InitOpParam>)
.set_attr<FInferType>("FInferType", ZeroType<InitOpParam>)
.set_support_level(1);
NNVM_REGISTER_INIT_OP(ones)
.describe(R"code(Fill target with ones
)code" NNVM_ADD_FILELINE)
.set_attr_parser(ParamParser<InitOpParam>)
.set_attr<FGetAttrDict>(
"FGetAttrDict", ParamGetAttrDict<InitOpParam>)
.add_arguments(InitOpParam::__FIELDS__())
.set_attr<FInferShape>("FInferShape", ZeroShape<InitOpParam>)
.set_attr<FInferType>("FInferType", ZeroType<InitOpParam>)
.set_support_level(1);
// full_like
NNVM_REGISTER_ELEMWISE_UNARY_OP(full_like)
.describe(R"code(Return an scalar value array with the same shape and type
NNVM_REGISTER_INIT_LIKE_OP(full_like)
.describe(R"code(Return an scalar value array with the same shape and type
as the input array
)code" NNVM_ADD_FILELINE)
.set_support_level(1)
.add_arguments(InitOpParam::__FIELDS__())
.set_attr_parser(ParamParser<InitOpParam>)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds){
return std::vector<NodeEntry>{
MakeNode("zeros_like", n->attrs.name + "_grad",
{n->inputs[0]})
};
});
.add_arguments(FillValueParam::__FIELDS__())
.set_attr_parser(ParamParser<FillValueParam>)
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<FillValueParam>)
.set_support_level(1);
NNVM_REGISTER_ELEMWISE_UNARY_OP(zeros_like)
NNVM_REGISTER_INIT_LIKE_OP(zeros_like)
.describe(R"code(Return an array of zeros with the same shape and type
as the input array.
)code")
.add_argument("data", "Symbol", "The input")
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds){
return std::vector<NodeEntry>{
MakeNode("zeros_like", n->attrs.name + "_grad",
{n->inputs[0]})
};
});
.set_support_level(1);
NNVM_REGISTER_ELEMWISE_UNARY_OP(ones_like)
NNVM_REGISTER_INIT_LIKE_OP(ones_like)
.describe(R"code(Return an array of ones with the same shape and type
as the input array.
)code")
.add_argument("data", "Symbol", "The input")
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds){
return std::vector<NodeEntry>{
MakeNode("zeros_like", n->attrs.name + "_grad",
{n->inputs[0]})
};
});
.set_support_level(1);
// unary scalar op
DMLC_REGISTER_PARAMETER(ScalarParam);
......@@ -452,64 +449,84 @@ NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__rpow_scalar__)
};
});
DMLC_REGISTER_PARAMETER(ElementWiseReduceParam);
struct ElementWiseSumParam : public dmlc::Parameter<ElementWiseSumParam> {
int num_args;
DMLC_DECLARE_PARAMETER(ElementWiseSumParam) {
DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
.describe("Number of inputs to be summed.");
}
};
DMLC_REGISTER_PARAMETER(ElementWiseSumParam);
bool ElementWiseSumShape(const NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
CHECK_EQ(out_attrs->size(), 1);
return ElemwiseAttr<TShape, shape_is_none, shape_assign, true, shape_string>(
attrs, in_attrs, out_attrs, TShape());
}
bool ElementWiseSumType(const NodeAttrs& attrs,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
CHECK_EQ(out_attrs->size(), 1);
return ElemwiseAttr<int, type_is_none, type_assign, true, type_string>(
attrs, in_attrs, out_attrs, -1);
}
std::vector<NodeEntry> ElementWiseSumGrad(
const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
// identity constraints in the beginning for easier shape inference.
const Op* copy_op = Op::Get("identity");
CHECK_EQ(ograds.size(), 1);
std::vector<NodeEntry> ret;
NodeEntry n_out{n, 0, 0};
for (size_t i = 0; i < n->inputs.size(); i++) {
NodePtr id_node = Node::Create();
id_node->attrs.op = copy_op;
id_node->inputs = {ograds[0]};
ret.push_back(NodeEntry{id_node, 0, 0});
}
return ret;
}
NNVM_REGISTER_OP(elemwise_sum)
NNVM_REGISTER_ELEMWISE_REDUCE_OP(elemwise_sum)
.describe(R"code(Adds all input arguments element-wise.
)code" NNVM_ADD_FILELINE)
.set_attr_parser(ParamParser<ElementWiseSumParam>)
.set_num_inputs([](const NodeAttrs& attrs) {
uint32_t ret = dmlc::get<ElementWiseSumParam>(attrs.parsed).num_args;
return ret;
.set_attr<nnvm::FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds){
CHECK_EQ(ograds.size(), 1);
std::vector<NodeEntry> ret;
for (size_t i = 0; i < n->inputs.size(); i++) {
ret.push_back(ograds[0]);
}
return ret;
});
NNVM_REGISTER_ELEMWISE_UNARY_OP(block_grad)
.describe(R"code(Blocks gradient computation for input.
)code" NNVM_ADD_FILELINE)
.set_attr<nnvm::FInplaceIdentity>(
"FInplaceIdentity", [](const NodeAttrs& attrs){
return std::vector<bool>{true};
})
.set_attr<nnvm::FInferShape>("FInferShape", ElementWiseSumShape)
.set_attr<nnvm::FInferType>("FInferType", ElementWiseSumType)
.set_attr<nnvm::FGradient>("FGradient", ElementWiseSumGrad)
.add_argument("args", "Symbol[]", "Positional input arguments");
.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
DMLC_REGISTER_PARAMETER(IndicatorParam);
// indicator function
NNVM_REGISTER_INDICATOR_OP(greater)
.describe(R"code(Greater function that returns a mask tensor
with 1.0 if (left > right), otherwise 0.0 element-wise.
)code" NNVM_ADD_FILELINE)
.add_argument("lhs", "Tensor", "First input")
.add_argument("rhs", "Tensor", "Second input")
.set_num_inputs(2)
.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<2, 1>)
.set_support_level(1);
NNVM_REGISTER_INDICATOR_OP(less)
.describe(R"code(Less function that returns a mask tensor
with 1.0 if (left < right), otherwise 0.0 element-wise.
)code" NNVM_ADD_FILELINE)
.add_argument("lhs", "Tensor", "First input")
.add_argument("rhs", "Tensor", "Second input")
.set_num_inputs(2)
.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<2, 1>)
.set_support_level(1);
NNVM_REGISTER_INDICATOR_OP(_max_mask)
.describe(R"code(Function that returns a mask tensor
with 1.0 if the value is maximum over given axes, otherwise 0.0 element-wise.
)code" NNVM_ADD_FILELINE)
.add_argument("data", "Tensor", "Input")
.set_num_inputs(1)
.add_arguments(IndicatorParam::__FIELDS__())
.set_attr_parser(ParamParser<IndicatorParam>)
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<IndicatorParam>)
.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
.set_support_level(1);
NNVM_REGISTER_INDICATOR_OP(_min_mask)
.describe(R"code(Function that returns a mask tensor
with 1.0 if the value is minimum over given axes, otherwise 0.0 element-wise.
)code" NNVM_ADD_FILELINE)
.add_argument("data", "Tensor", "Input")
.set_num_inputs(1)
.add_arguments(IndicatorParam::__FIELDS__())
.set_attr_parser(ParamParser<IndicatorParam>)
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<IndicatorParam>)
.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
.set_support_level(1);
} // namespace top
} // namespace nnvm
/*!
* Copyright (c) 2017 by Contributors
* \file matrix_op.cc
* \brief Matrix operators
*/
#include <nnvm/op.h>
#include <nnvm/node.h>
#include <nnvm/op_attr_types.h>
#include <nnvm/top/tensor.h>
#include "../op_common.h"
#include "../elemwise_op_common.h"
namespace nnvm {
namespace top {
DMLC_REGISTER_PARAMETER(MatMulParam);
inline bool DotShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
const MatMulParam& param = nnvm::get<MatMulParam>(attrs.parsed);
CHECK_EQ(in_attrs->size(), 2U);
CHECK_EQ(out_attrs->size(), 1U);
TShape lshape = (*in_attrs)[0];
TShape rshape = (*in_attrs)[1];
if (lshape.ndim() == 1) lshape = TShape{1, lshape[0]};
if (rshape.ndim() == 1) rshape = TShape{1, rshape[0]};
if (param.transpose_a) std::reverse(lshape.begin(), lshape.end());
if (param.transpose_b) std::reverse(rshape.begin(), rshape.end());
CHECK_EQ(lshape[lshape.ndim() - 1], rshape[0])
<< "dot shape inconsistent: " << lshape << " X " << rshape;
TShape oshape(lshape.ndim() + rshape.ndim() - 1);
for (int i = 0; i < lshape.ndim() - 1; i++) oshape[i] = lshape[i];
for (int i = 1; i < rshape.ndim(); i++) oshape[i + lshape.ndim() - 1] = rshape[i];
NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, oshape);
return true;
}
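The rule implemented by DotShape is "drop the contracted axis and concatenate the remaining ones"; the same rule as a small Python sketch (the helper name is illustrative):

def matmul_out_shape(lshape, rshape, transpose_a=False, transpose_b=False):
    # 1-D operands are promoted to (1, n), matching DotShape above.
    lshape = (1,) + tuple(lshape) if len(lshape) == 1 else tuple(lshape)
    rshape = (1,) + tuple(rshape) if len(rshape) == 1 else tuple(rshape)
    if transpose_a:
        lshape = lshape[::-1]
    if transpose_b:
        rshape = rshape[::-1]
    assert lshape[-1] == rshape[0], "dot shape inconsistent"
    return lshape[:-1] + rshape[1:]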
NNVM_REGISTER_OP(matmul)
.describe(R"doc(Matrix multiplication of two arrays.
``matmul``'s behavior depends on the input array dimensions:
- 1-D arrays: inner product of vectors
- 2-D arrays: matrix multiplication
- N-D arrays: a sum product over the last axis of the first input and the first
axis of the second input
For example, given 3-D ``x`` with shape `(n,m,k)` and ``y`` with shape `(k,r,s)`, the
result array will have shape `(n,m,r,s)`. It is computed by::
matmul(x,y)[i,j,a,b] = sum(x[i,j,:]*y[:,a,b])
)doc" NNVM_ADD_FILELINE)
.set_support_level(1)
.set_num_inputs(2)
.set_num_outputs(1)
.set_attr_parser(ParamParser<MatMulParam>)
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<MatMulParam>)
.add_arguments(MatMulParam::__FIELDS__())
.add_argument("lhs", "NDArray-or-Symbol", "The first input")
.add_argument("rhs", "NDArray-or-Symbol", "The second input")
.set_attr<FInferShape>("FInferShape", DotShape)
.set_attr<FInferType>("FInferType", ElemwiseType<2, 1>)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
// z = x dot y
// xshape (n,m,k), yshape (k,r,s)
const MatMulParam& param = nnvm::get<MatMulParam>(n->attrs.parsed);
bool Ta = param.transpose_a;
bool Tb = param.transpose_b;
// Ta = false, Tb = false
// grad_x = grad_z dot y.T
// grad_y = x.T dot grad_z
if (!Ta && !Tb) {
return std::vector<NodeEntry>{
MakeNode("matmul", n->attrs.name + "_grad_0",
{ograds[0], n->inputs[1]},
{{"transpose_a", "false"},
{"transpose_b", "true"}}),
MakeNode("matmul", n->attrs.name + "_grad_1",
{n->inputs[0], ograds[0]},
{{"transpose_a", "true"},
{"transpose_b", "false"}})
};
} else if (Ta && !Tb) {
// Ta = true, Tb = false
// grad_x = y dot grad_z.T
// grad_y = x dot grad_z
return std::vector<NodeEntry>{
MakeNode("matmul", n->attrs.name + "_grad_0",
{n->inputs[1], ograds[0]},
{{"transpose_a", "false"},
{"transpose_b", "true"}}),
MakeNode("matmul", n->attrs.name + "_grad_1",
{n->inputs[0], ograds[0]},
{{"transpose_a", "false"},
{"transpose_b", "false"}})
};
} else if (!Ta && Tb) {
// Ta = false, Tb = true
// grad_x = grad_z dot y
// grad_y = grad_z.T dot x
return std::vector<NodeEntry>{
MakeNode("matmul", n->attrs.name + "_grad_0",
{ograds[0], n->inputs[1]},
{{"transpose_a", "false"},
{"transpose_b", "false"}}),
MakeNode("matmul", n->attrs.name + "_grad_1",
{ograds[0], n->inputs[0]},
{{"transpose_a", "true"},
{"transpose_b", "false"}})
};
} else {
// Ta = true, Tb = true
// grad_x = y.T dot grad_z.T
// grad_y = grad_z.T dot x.T
return std::vector<NodeEntry>{
MakeNode("matmul", n->attrs.name + "_grad_0",
{n->inputs[1], ograds[0]},
{{"transpose_a", "true"},
{"transpose_b", "true"}}),
MakeNode("matmul", n->attrs.name + "_grad_1",
{ograds[0], n->inputs[0]},
{{"transpose_a", "true"},
{"transpose_b", "true"}})
};
}
});
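For the 2-D, no-transpose case the rules above reduce to the familiar pair below; a NumPy reference sketch (the remaining three cases follow by substituting the transposed operands exactly as in the comments above):

import numpy as np

def matmul_backward(ograd, x, y):
    # z = x @ y  =>  grad_x = ograd @ y.T,  grad_y = x.T @ ograd
    return ograd.dot(y.T), x.T.dot(ograd)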
} // namespace top
} // namespace nnvm
......@@ -31,11 +31,19 @@ inline TShape ReduceShapeImpl(const TShape& ishape,
<< "Reduction axis " << axis[axis.ndim() - 1]
<< " Exceeds input dimensions " << ishape;
TShape in_axis = axis;
for (auto& i : in_axis) {
i = i < 0 ? i + ishape.ndim(): i;
CHECK_GE(i, 0) << "axis out of bounds in reduce operator";
CHECK_LT(i, ishape.ndim()) << "axis out of bounds in reduce operator";
}
std::sort(in_axis.begin(), in_axis.end());
if (keepdims) {
TShape oshape(ishape);
if (exclude) {
for (dim_t i = 0, j = 0; i < ishape.ndim(); ++i) {
if (j < axis.ndim() && i == axis[j]) {
if (j < in_axis.ndim() && i == in_axis[j]) {
++j;
continue;
}
......@@ -44,22 +52,22 @@ inline TShape ReduceShapeImpl(const TShape& ishape,
return oshape;
}
for (dim_t i = 0; i < axis.ndim(); ++i) {
oshape[axis[i]] = 1;
for (dim_t i = 0; i < in_axis.ndim(); ++i) {
oshape[in_axis[i]] = 1;
}
return oshape;
}
if (exclude) {
TShape oshape = TShape(axis.ndim());
for (dim_t i = 0; i < axis.ndim(); ++i) {
oshape[i] = ishape[axis[i]];
TShape oshape = TShape(in_axis.ndim());
for (dim_t i = 0; i < in_axis.ndim(); ++i) {
oshape[i] = ishape[in_axis[i]];
}
return oshape;
}
TShape oshape = TShape(std::max<dim_t>(1, ishape.ndim() - axis.ndim()));
TShape oshape = TShape(std::max<dim_t>(1, ishape.ndim() - in_axis.ndim()));
for (dim_t i = 0, j = 0, k = 0; i < ishape.ndim(); ++i) {
if (j < axis.ndim() && i == axis[j]) {
if (j < in_axis.ndim() && i == in_axis[j]) {
++j;
continue;
}
......@@ -99,9 +107,7 @@ inline void AxesParamParser(nnvm::NodeAttrs* attrs) {
.set_attr<FInferShape>("FInferShape", ReduceShape) \
.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>) \
.set_num_inputs(1) \
.set_num_outputs(1) \
.set_num_outputs(1)
NNVM_REGISTER_REDUCE_OP(sum)
.describe(R"code(Computes the sum of array elements over given axes.
......@@ -120,17 +126,66 @@ Example::
sum(data, axis=[1,2])
[ 12. 19. 27.]
)code" NNVM_ADD_FILELINE);
)code" NNVM_ADD_FILELINE)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds){
const ReduceParam& param = nnvm::get<ReduceParam>(n->attrs.parsed);
std::ostringstream axis; axis << param.axis;
return std::vector<NodeEntry>{
MakeNode("expand_like", n->attrs.name + "_grad",
{ograds[0], n->inputs[0]},
{{"axis", axis.str()},
{"keepdims", std::to_string(param.keepdims)},
{"exclude", std::to_string(param.exclude)}})
};
});
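The sum gradient is just the head gradient broadcast back to the input shape, which is what expand_like provides; a NumPy reference sketch for the keepdims=True case:

import numpy as np

def sum_backward_keepdims(ograd, x):
    # ograd comes from sum(..., keepdims=True), so the reduced axes are
    # still present with size 1 and broadcasting restores the input shape.
    return np.broadcast_to(ograd, x.shape)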
NNVM_REGISTER_REDUCE_OP(max)
.describe(R"code(Computes the max of array elements over given axes.
)code" NNVM_ADD_FILELINE);
)code" NNVM_ADD_FILELINE)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds){
const ReduceParam& param = nnvm::get<ReduceParam>(n->attrs.parsed);
std::ostringstream axis; axis << param.axis;
NodeEntry sub0 = MakeNode("expand_like", n->attrs.name + "_grad_sub0",
{ograds[0], n->inputs[0]},
{{"axis", axis.str()},
{"keepdims", std::to_string(param.keepdims)},
{"exclude", std::to_string(param.exclude)}});
NodeEntry sub1 = MakeNode("_max_mask", n->attrs.name + "_grad_sub1",
{ograds[0]},
{{"axis", axis.str()},
{"exclude", std::to_string(param.exclude)}});
return std::vector<NodeEntry>{
MakeNode("elemwise_mul", n->attrs.name + "_grad", {sub0, sub1})
};
});
NNVM_REGISTER_REDUCE_OP(min)
.describe(R"code(Computes the min of array elements over given axes.
)code" NNVM_ADD_FILELINE);
)code" NNVM_ADD_FILELINE)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds){
const ReduceParam& param = nnvm::get<ReduceParam>(n->attrs.parsed);
std::ostringstream axis; axis << param.axis;
NodeEntry sub0 = MakeNode("expand_like", n->attrs.name + "_grad_sub0",
{ograds[0], n->inputs[0]},
{{"axis", axis.str()},
{"keepdims", std::to_string(param.keepdims)},
{"exclude", std::to_string(param.exclude)}});
NodeEntry sub1 = MakeNode("_min_mask", n->attrs.name + "_grad_sub1",
{ograds[0]},
{{"axis", axis.str()},
{"exclude", std::to_string(param.exclude)}});
return std::vector<NodeEntry>{
MakeNode("elemwise_mul", n->attrs.name + "_grad", {sub0, sub1})
};
});
} // namespace top
......
......@@ -44,7 +44,7 @@ Example::
[4,5,6],
[7,8,9]
],
[ [1,2,3],
[ [1,2,3],
[4,5,6],
[7,8,9]
]],
......@@ -58,6 +58,12 @@ Example::
.set_attr<FInferShape>("FInferShape", FlattenInferShape)
.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
.add_argument("data", "Tensor", "Input data.")
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds){
return MakeGradNode("reshape_like", n,
{ograds[0], n->inputs[0]});
})
.set_support_level(1);
// concatenate
......@@ -172,8 +178,8 @@ inline bool ExpandDimsInferShape(const NodeAttrs& attrs,
NNVM_REGISTER_OP(expand_dims)
.describe(R"code(Inserts a new axis of size 1 into the array shape
For example, given ``x`` with shape ``(2,3,4)``, then ``expand_dims(x, axis=1)``
will return a new array with shape ``(2,1,3,4)``.
For example, given ``x`` with shape ``(2,3,4)``, then ``expand_dims(x, axis=1, num_newaxis=5)``
will return a new array with shape ``(2,1,1,1,1,1,3,4)``.
)code" NNVM_ADD_FILELINE)
.add_argument("data", "Tensor", "Input tensor")
......@@ -184,6 +190,61 @@ will return a new array with shape ``(2,1,3,4)``.
.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
.set_num_inputs(1)
.set_num_outputs(1)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds){
const ExpandDimsParam& param = nnvm::get<ExpandDimsParam>(n->attrs.parsed);
return std::vector<NodeEntry> {
MakeNode("sum", n->attrs.name + "_grad", {ograds[0]},
{{"axis", std::to_string(param.axis)}})
};
})
.set_support_level(1);
NNVM_REGISTER_OP(expand_like)
.describe(R"code(Expand an input array with the shape of second array.
This operation can always be composed of unsqueezing and expanding dims.
Examples::
input = [ 12. 19. 27.]
input.shape = (3,)
new_shape_array = [[[1,2],[2,3],[1,3]],
[[1,4],[4,3],[5,2]],
[[7,1],[7,2],[7,3]]]
new_shape_array.shape = (3, 3, 2)
expand_like(input, [1,2], new_shape_array) =
[[[12,12],[12,12],[12,12]],
[[19,19],[19,19],[19,19]],
[[27,27],[27,27],[27,27]]]
)code" NNVM_ADD_FILELINE)
.add_argument("input", "Tensor", "Source input")
.add_argument("shape_like", "Tensor", "Input with new shape")
.add_arguments(ReduceParam::__FIELDS__())
.set_attr_parser(ParamParser<ReduceParam>)
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<ReduceParam>)
.set_attr<nnvm::FInferShape>("FInferShape", AssignOutputAttr<TShape, 1, 0>)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 1>)
.set_num_inputs(2)
.set_num_outputs(1)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
const ReduceParam& param = nnvm::get<ReduceParam>(n->attrs.parsed);
std::ostringstream axis;
axis << param.axis;
return std::vector<NodeEntry>{
MakeNode("sum", n->attrs.name + "_grad",
{ograds[0]},
{{"axis", axis.str()},
{"keepdims", std::to_string(param.keepdims)},
{"exclude", std::to_string(param.exclude)}})
};
})
.set_support_level(1);
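A NumPy sketch of the expand_like semantics used in the docstring example above (insert the listed axes as size-1 dims, then broadcast to the target shape); the helper is illustrative, not the registered kernel:

import numpy as np

def expand_like_np(data, axes, shape_like):
    for ax in sorted(axes):
        data = np.expand_dims(data, ax)
    return np.broadcast_to(data, shape_like.shape)

inp = np.array([12., 19., 27.])
target = np.zeros((3, 3, 2))
print(expand_like_np(inp, [1, 2], target))  # rows of 12s, 19s and 27s, as in the example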
// split
......@@ -383,7 +444,7 @@ NNVM_REGISTER_OP(reshape)
.describe(R"code(Reshapes the input array.
Given an array and a shape, this function returns a copy of the array in the new shape.
The shape is a tuple of integers such as (2,3,4).The size of the new shape should be same as the size of the input array.
The shape is a tuple of integers such as (2,3,4). The size of the new shape should be same as the size of the input array.
Example::
......@@ -443,6 +504,46 @@ The significance of each is explained below:
.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
.set_num_inputs(1)
.set_num_outputs(1)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
return std::vector<NodeEntry>{
MakeNode("reshape_like", n->attrs.name + "_grad",
{ograds[0], n->inputs[0]})
};
})
.set_support_level(3);
NNVM_REGISTER_OP(reshape_like)
.describe(R"code(Reshapes the input array by the size of another array.
For an input array with shape ``(d1, d2, ..., dk)``, `reshape_like` operation reshapes
the input array into an output array with the same shape as the second input array.
.. note::
Sizes of both arrays should be compatible (i.e. the same total number of elements).
)code" NNVM_ADD_FILELINE)
.add_argument("data", "Tensor", "Input data.")
.add_argument("shape_like", "Tensor", "Input data.")
.set_num_inputs(1)
.set_num_outputs(1)
.set_attr<FInferShape>(
"FInferShape", [](const NodeAttrs& attrs,
std::vector<TShape>* in_attrs,
std::vector<TShape>* out_attrs) {
CHECK_EQ(in_attrs->at(0).Size(), in_attrs->at(1).Size())
<< "Reshape inputs size should be compatible";
NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, in_attrs->at(1));
return true;
})
.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
return MakeGradNode("reshape_like", n,
{ograds[0], n->inputs[0]});
})
.set_support_level(3);
// squeeze
......@@ -502,12 +603,14 @@ NNVM_REGISTER_OP(squeeze)
Examples::
x = [[[0], [1], [2]]]
x.shape = (1, 3, 1)
squeeze(x) = [0, 1, 2]
squeeze(x, 0) = [[0], [1], [2]]
squeeze(x, (0, 2)) = [0, 1, 2]
)code" NNVM_ADD_FILELINE)
.add_argument("data", "Tensor", "Source input")
.add_arguments(SqueezeParam::__FIELDS__())
......@@ -517,6 +620,13 @@ Examples::
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
.set_num_inputs(1)
.set_num_outputs(1)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
return std::vector<NodeEntry>{
MakeNode("reshape_like", n->attrs.name + "_grad", {n->inputs[0]})
};
})
.set_support_level(1);
// transpose
......@@ -584,7 +694,16 @@ Examples::
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
.set_num_inputs(1)
.set_num_outputs(1)
.set_support_level(4);
.set_support_level(4)
.set_attr<FGradient>(
"FGradient", [](const NodePtr& n,
const std::vector<NodeEntry>& ograds) {
const TransposeParam& param = nnvm::get<TransposeParam>(n->attrs.parsed);
std::ostringstream oss; oss << param.axes;
return std::vector<NodeEntry>{
MakeNode("transpose", n->attrs.name + "_t", {ograds[0]}, {{"axes", oss.str()}})
};
});
} // namespace top
} // namespace nnvm
......@@ -13,7 +13,7 @@ TEST(Tuple, Basic) {
Tuple<int> z{1, 2, 3, 5, 6};
std::ostringstream os;
os << z;
CHECK_EQ(os.str(), "(1,2,3,5,6)");
CHECK_EQ(os.str(), "[1,2,3,5,6]");
std::istringstream is(os.str());
is >> y;
CHECK_EQ(x, y);
......
import json
import nnvm.symbol as sym
import nnvm.graph as graph
import nnvm.compiler.graph_util as graph_util
def test_json_pass():
x = sym.Variable('x')
......@@ -117,13 +118,13 @@ def test_gradient():
y = sym.Variable("y")
z1 = sym.elemwise_add(x, sym.sqrt(y))
z2 = sym.log(x)
gradient = graph.gradients([z1, z2], [x, y])
gradient = graph_util.gradients([z1, z2], [x, y])
assert len(gradient) == 2
g1 = sym.Variable("g1")
g2 = sym.Variable("g2")
grad_ys = [g1, g2]
gradient = graph.gradients(sym.Group([z1, z2]),
gradient = graph_util.gradients(sym.Group([z1, z2]),
sym.Group([x, y]), grad_ys=grad_ys)
g_graph = graph.create(sym.Group(gradient)).ir()
assert len(gradient) == 2
......
import nnvm.symbol as sym
from nnvm.compiler import graph_util
def test_cnn_gradients():
# input data
h = 128
w = 128
data_shape = (1000, 3, h, w)
data = sym.Variable('data', shape=data_shape, dtype=0)
# conv2d
num_channels = 64
kernel_size = 32
conv_w_shape = (num_channels, 3, kernel_size, kernel_size)
conv_b_shape = (num_channels,)
conv_w = sym.Variable('conv_w', shape=conv_w_shape)
conv_b = sym.Variable('conv_b', shape=conv_b_shape)
conv1 = sym.conv2d(data=data, weight=conv_w, bias=conv_b,
channels=num_channels, kernel_size=(kernel_size, kernel_size),
name='conv1')
# relu1
relu1 = sym.relu(data=conv1, name='relu1')
# max pooling
max_pooling1 = sym.max_pool2d(data=relu1, pool_size=(2, 2), name='max_pooling1')
# flatten
flatten1 = sym.flatten(data=max_pooling1)
# shape after flatten
flatten_out_shape = (h - kernel_size) * (w - kernel_size) * num_channels
# dense1
dense1_hidden_units = 100
dense1 = sym.dense(data=flatten1, name='dense1', units=dense1_hidden_units)
# relu2
relu2 = sym.relu(data=dense1, name='relu2')
# dense2
dense2_hidden_units = 10
dense2 = sym.dense(data=relu2, name='dense2', units=dense2_hidden_units)
# softmax
mlp = sym.softmax(data=dense2, name='softmax')
# fake non-sparse label
label = sym.full_like(mlp, fill_value=1)
# cross entropy loss
ce_loss = sym.sum(
sym.elemwise_mul(sym.log_softmax(dense2), label),
axis=1,
keepdims=True,
name="ce_loss")
# input variables:
# print grad_g.symbol.list_input_names()
# >> ['data', 'conv_w', 'conv_b',
# 'dense1_weight', 'dense1_bias',
# 'dense2_weight', 'dense2_bias']
# output gradient variables:
# print grad_g.symbol.list_output_names()
# >> ['conv1_grad_data', 'conv1_grad_weight', 'conv1_grad_bias',
# 'dense1_grad_weight', 'dense1_grad_bias',
# 'dense2_grad_weight', 'dense2_grad_bias']
grad_g = graph_util.get_gradient_graph(ce_loss, ce_loss.list_input_variables())
# infer shape
in_shapes, out_shapes = graph_util.infer_shape(grad_g)
# forward graph shape
assert in_shapes == [list(data_shape), list(conv_w_shape), list(conv_b_shape),
[dense1_hidden_units, flatten_out_shape], [dense1_hidden_units],
[dense2_hidden_units, dense1_hidden_units], [dense2_hidden_units]]
# input grads shape should be equal with input shape
assert in_shapes == out_shapes
# output grads w.r.t input variables
grads = graph_util.gradients(ce_loss, ce_loss.list_input_variables())
# gradients number should be equal with grad_input number
assert len(grads) == len(ce_loss.list_input_variables())
# infer type
in_dtypes, out_dtypes = graph_util.infer_dtype(grad_g)
assert out_dtypes == ['float32', 'float32', 'float32', 'float32', 'float32', 'float32', 'float32']
def test_multi_loss_graph_gradients():
# input data
shape1 = (1000, 100)
data1 = sym.Variable('data1', shape=(1000, 100), dtype=0)
# fake non-sparse label
label = sym.full(fill_value=3)
# square loss
sub1 = sym.elemwise_sub(data1, label, name="sub1")
square_loss = sym.sum(data=sub1**2, axis=1, name="square_loss")
# fake loss1
shape2 = (1000, )
data2 = sym.Variable('data2', shape=shape2, dtype=0)
loss1 = sym.sqrt(data2, name="loss1")
# fake loss2
loss2 = sym.relu(data1, name='loss2')
# block loss1
total_loss = sym.elemwise_sum(
sym.block_grad(loss1),
square_loss,
num_args=2,
name="total_loss")
# grad_g.symbol.list_output_names()
# >> ['loss1_grad_0_output', 'grad_sum_output']
grad_g = graph_util.get_gradient_graph([total_loss, loss2], total_loss.list_input_variables())
# infer shape
in_shapes, out_shapes = graph_util.infer_shape(grad_g)
assert out_shapes == [list(shape2), list(shape1)]
# grad_data1 is elemwise_sum of grad_loss2, grad_square_loss
grad_data1 = grad_g.symbol[1]
assert grad_data1.list_attr()['num_args'] == '2'
# block grad should return zero grad
grad_data2 = grad_g.symbol[0]
assert 'zeros_like' in grad_g.ir()
# test reverse infer shape for label
assert grad_g.apply('InferShape').json_attr('shape_num_unknown_nodes') == 0
# infer type
in_dtypes, out_dtypes = graph_util.infer_dtype(grad_g)
assert out_dtypes == ['float32', 'float32']
# test reverse infer type for label
assert grad_g.apply('InferType').json_attr('dtype_num_unknown_nodes') == 0
if __name__ == "__main__":
test_cnn_gradients()
test_multi_loss_graph_gradients()