Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
tic
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
wenyuanbo
tic
Commits
0538a9fc
Commit
0538a9fc
authored
Jul 29, 2016
by
Tianqi Chen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[PASS] add plan memory (#19)
parent
204c4442
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
411 additions
and
3 deletions
+411
-3
nnvm/include/nnvm/graph.h
+6
-0
nnvm/include/nnvm/graph_attr_types.h
+16
-1
nnvm/include/nnvm/op_attr_types.h
+15
-0
nnvm/src/core/graph.cc
+5
-0
nnvm/src/example/operator.cc
+9
-2
nnvm/src/pass/graph_algorithm.h
+112
-0
nnvm/src/pass/infer_shape_type.cc
+1
-0
nnvm/src/pass/plan_memory.cc
+227
-0
nnvm/tests/python/test_graph.py
+20
-0
No files found.
nnvm/include/nnvm/graph.h
View file @
0538a9fc
...
@@ -147,6 +147,10 @@ class IndexedGraph {
...
@@ -147,6 +147,10 @@ class IndexedGraph {
inline
const
std
::
vector
<
uint32_t
>&
arg_nodes
()
const
{
inline
const
std
::
vector
<
uint32_t
>&
arg_nodes
()
const
{
return
arg_nodes_
;
return
arg_nodes_
;
}
}
/*! \return list of output entries */
inline
const
std
::
vector
<
NodeEntry
>&
outputs
()
const
{
return
outputs_
;
}
private
:
private
:
friend
class
Graph
;
friend
class
Graph
;
...
@@ -159,6 +163,8 @@ class IndexedGraph {
...
@@ -159,6 +163,8 @@ class IndexedGraph {
std
::
vector
<
Node
>
nodes_
;
std
::
vector
<
Node
>
nodes_
;
// index to argument nodes
// index to argument nodes
std
::
vector
<
uint32_t
>
arg_nodes_
;
std
::
vector
<
uint32_t
>
arg_nodes_
;
// space to store the outputs entries
std
::
vector
<
NodeEntry
>
outputs_
;
// mapping from node to index.
// mapping from node to index.
std
::
unordered_map
<
const
nnvm
::
Node
*
,
uint32_t
>
node2index_
;
std
::
unordered_map
<
const
nnvm
::
Node
*
,
uint32_t
>
node2index_
;
// CSR pointer of node entries
// CSR pointer of node entries
...
...
nnvm/include/nnvm/graph_attr_types.h
View file @
0538a9fc
...
@@ -60,7 +60,7 @@ using DTypeVector = std::vector<int>;
...
@@ -60,7 +60,7 @@ using DTypeVector = std::vector<int>;
*
*
* \code
* \code
* Graph g = ApplyPass(src_graph, {"PlaceDevice"});
* Graph g = ApplyPass(src_graph, {"PlaceDevice"});
* const &device = g.GetAttr<DeviceVector>("d
typ
e");
* const &device = g.GetAttr<DeviceVector>("d
evic
e");
* // get device by node_id
* // get device by node_id
* int device_type = device[g.indexed_graph().node_id(my_node)];
* int device_type = device[g.indexed_graph().node_id(my_node)];
* \endcode
* \endcode
...
@@ -75,6 +75,21 @@ using DeviceVector = std::vector<int>;
...
@@ -75,6 +75,21 @@ using DeviceVector = std::vector<int>;
*/
*/
using
DeviceAssignMap
=
std
::
unordered_map
<
std
::
string
,
int
>
;
using
DeviceAssignMap
=
std
::
unordered_map
<
std
::
string
,
int
>
;
/*!
* \brief The result holder of storage id of each NodeEntry in the graph.
*
* \note Stored under graph.attrs["storage"], provided by Pass "PlanMemory"
* Storage id is a continuous integer.
* If the storage id is -1 then the storage is not assigned.
*
* \code
* Graph g = ApplyPass(src_graph, {"PlanMemory"});
* const &storage = g.GetAttr<StorageVector>("storage");
* // get storage id by entry
* int storage_id = storage[g.indexed_graph().entry_id(my_entry)];
* \endcode
*/
using
StorageVector
=
std
::
vector
<
int
>
;
}
// namespace nnvm
}
// namespace nnvm
...
...
nnvm/include/nnvm/op_attr_types.h
View file @
0538a9fc
...
@@ -8,6 +8,7 @@
...
@@ -8,6 +8,7 @@
#include <vector>
#include <vector>
#include <string>
#include <string>
#include <utility>
#include <functional>
#include <functional>
#include "./base.h"
#include "./base.h"
#include "./tuple.h"
#include "./tuple.h"
...
@@ -93,6 +94,20 @@ using FInferType = FInferNodeEntryAttr<int>;
...
@@ -93,6 +94,20 @@ using FInferType = FInferNodeEntryAttr<int>;
*/
*/
using
TIsBackwardOp
=
bool
;
using
TIsBackwardOp
=
bool
;
/*!
* \brief Get possible inplace options.
* This function enables optimization to reuse memory of inputs in output.
* \param attrs The attributes of the node
* \param in_data The input data.
* \param out_data The output data.
* \return list of pair of that maps input->output,
* indicating possible in place operations.
*
* \note Register under "FInplaceOption", by default no inplace can happen.
*/
using
FInplaceOption
=
std
::
function
<
std
::
vector
<
std
::
pair
<
int
,
int
>
>
(
const
NodeAttrs
&
attrs
)
>
;
}
// namespace nnvm
}
// namespace nnvm
#endif // NNVM_OP_ATTR_TYPES_H_
#endif // NNVM_OP_ATTR_TYPES_H_
nnvm/src/core/graph.cc
View file @
0538a9fc
...
@@ -52,6 +52,11 @@ IndexedGraph::IndexedGraph(const Graph &g) {
...
@@ -52,6 +52,11 @@ IndexedGraph::IndexedGraph(const Graph &g) {
control_rptr
.
push_back
(
control_deps_
.
size
());
control_rptr
.
push_back
(
control_deps_
.
size
());
});
});
for
(
const
auto
&
e
:
g
.
outputs
)
{
outputs_
.
emplace_back
(
NodeEntry
{
node2index_
.
at
(
e
.
node
.
get
()),
e
.
index
,
e
.
version
});
}
// setup array view
// setup array view
// input_entries_ and control_rptr must not change after this step.
// input_entries_ and control_rptr must not change after this step.
const
NodeEntry
*
iptr
=
dmlc
::
BeginPtr
(
input_entries_
);
const
NodeEntry
*
iptr
=
dmlc
::
BeginPtr
(
input_entries_
);
...
...
nnvm/src/example/operator.cc
View file @
0538a9fc
...
@@ -14,6 +14,7 @@ using nnvm::FListInputNames;
...
@@ -14,6 +14,7 @@ using nnvm::FListInputNames;
using
nnvm
::
FMutateInput
;
using
nnvm
::
FMutateInput
;
using
nnvm
::
FInferShape
;
using
nnvm
::
FInferShape
;
using
nnvm
::
FInferType
;
using
nnvm
::
FInferType
;
using
nnvm
::
FInplaceOption
;
using
nnvm
::
NodeAttrs
;
using
nnvm
::
NodeAttrs
;
using
nnvm
::
TShape
;
using
nnvm
::
TShape
;
using
nnvm
::
array_view
;
using
nnvm
::
array_view
;
...
@@ -32,6 +33,10 @@ inline bool SameShape(const NodeAttrs& attrs,
...
@@ -32,6 +33,10 @@ inline bool SameShape(const NodeAttrs& attrs,
return
true
;
return
true
;
}
}
// FInplaceOption implementation shared by elementwise-style ops:
// declares that output 0 may reuse the memory of input 0.
// The attrs argument is unused; the option is unconditional.
inline std::vector<std::pair<int, int> > InplaceIn0Out0(const NodeAttrs& attrs) {
  // single pair {input 0 -> output 0}
  return {{0, 0}};
}
// simple demonstration of reshape.
// simple demonstration of reshape.
NNVM_REGISTER_OP
(
reshape
)
NNVM_REGISTER_OP
(
reshape
)
.
describe
(
"reshape source to target shape"
)
.
describe
(
"reshape source to target shape"
)
...
@@ -55,7 +60,8 @@ NNVM_REGISTER_OP(reshape)
...
@@ -55,7 +60,8 @@ NNVM_REGISTER_OP(reshape)
CHECK_EQ
(
ishape
[
0
]
->
Size
(),
target
.
Size
())
CHECK_EQ
(
ishape
[
0
]
->
Size
(),
target
.
Size
())
<<
"Reshape op: source target shape mismatch"
;
<<
"Reshape op: source target shape mismatch"
;
return
true
;
return
true
;
});
})
.
attr
<
FInplaceOption
>
(
"FInplaceOption"
,
InplaceIn0Out0
);
NNVM_REGISTER_OP
(
cast
)
NNVM_REGISTER_OP
(
cast
)
...
@@ -82,7 +88,8 @@ NNVM_REGISTER_OP(cast)
...
@@ -82,7 +88,8 @@ NNVM_REGISTER_OP(cast)
NNVM_REGISTER_OP
(
add
)
NNVM_REGISTER_OP
(
add
)
.
describe
(
"add two data together"
)
.
describe
(
"add two data together"
)
.
set_num_inputs
(
2
)
.
set_num_inputs
(
2
)
.
attr
<
FInferShape
>
(
"FInferShape"
,
SameShape
);
.
attr
<
FInferShape
>
(
"FInferShape"
,
SameShape
)
.
attr
<
FInplaceOption
>
(
"FInplaceOption"
,
InplaceIn0Out0
);
NNVM_REGISTER_OP
(
__add_symbol__
)
NNVM_REGISTER_OP
(
__add_symbol__
)
.
describe
(
"Alias of add"
)
.
describe
(
"Alias of add"
)
...
...
nnvm/src/pass/graph_algorithm.h
0 → 100644
View file @
0538a9fc
/*!
* Copyright (c) 2016 by Contributors
* \file graph_algorithm.h
* \brief This header contains graph algorithms on StaticGraph.
 *  It is used to compute information such as whether two
 *  operations can run in parallel, and helps allocation.
*/
#ifndef NNVM_PASS_GRAPH_ALGORITHM_H_
#define NNVM_PASS_GRAPH_ALGORITHM_H_
#include <nnvm/graph.h>
#include <vector>
namespace
nnvm
{
namespace
pass
{
/*!
* \brief Find best path in the DAG, with reward defined
* by sum of reward of each node along the path.
* \param graph the original static graph.
* \param topo_order topo order of the nodes in the graph.
* \param node_reward the reward of each node.
* \param path the output path of nodes.
* \return the total reward of best path.
*/
inline
uint32_t
FindBestPath
(
const
IndexedGraph
&
graph
,
const
std
::
vector
<
uint32_t
>&
node_reward
,
std
::
vector
<
uint32_t
>*
path
)
{
const
uint32_t
num_nodes
=
static_cast
<
uint32_t
>
(
graph
.
num_nodes
());
CHECK_EQ
(
num_nodes
,
node_reward
.
size
());
std
::
vector
<
uint32_t
>
best_reward
(
node_reward
.
size
(),
0
);
std
::
vector
<
uint32_t
>
next_node
(
node_reward
.
size
(),
num_nodes
);
uint32_t
best_solution
=
0
,
best_start_node
=
0
;
// traverse in reverse topo order
for
(
uint32_t
i
=
static_cast
<
uint32_t
>
(
graph
.
num_nodes
());
i
!=
0
;
--
i
)
{
const
uint32_t
nid
=
i
-
1
;
best_reward
[
nid
]
+=
node_reward
[
nid
];
if
(
best_reward
[
nid
]
>
best_solution
)
{
best_solution
=
best_reward
[
nid
];
best_start_node
=
nid
;
}
for
(
const
auto
&
e
:
graph
[
nid
].
inputs
)
{
const
uint32_t
prev
=
e
.
node_id
;
if
(
best_reward
[
nid
]
>
best_reward
[
prev
])
{
best_reward
[
prev
]
=
best_reward
[
nid
];
next_node
[
prev
]
=
nid
;
}
}
}
path
->
clear
();
uint32_t
reward
=
0
;
for
(
uint32_t
nid
=
best_start_node
;
nid
<
num_nodes
;
nid
=
next_node
[
nid
])
{
path
->
push_back
(
nid
);
reward
+=
node_reward
[
nid
];
}
CHECK_EQ
(
reward
,
best_solution
);
return
best_solution
;
}
/*!
 * \brief Color the nodes in the graph into groups.
 *  Nodes sharing a color are placed on one best-reward path, so they
 *  cannot run in parallel with each other.
 *
 * \param graph the original indexed graph.
 * \param node_importance the importance (reward) of each node; taken by
 *  value because it is consumed (zeroed) as nodes are colored.
 * \param max_ncolor maximum number of colors allowed.
 * \param color output: the color index assigned to each node.
 * \return the total number of colors used.
 */
inline uint32_t ColorNodeGroup(
    const IndexedGraph &graph,
    std::vector<uint32_t> node_importance,
    uint32_t max_ncolor,
    std::vector<uint32_t> *color) {
  CHECK_NE(max_ncolor, 0);
  CHECK_EQ(graph.num_nodes(), node_importance.size());
  // Initialize every node with the sentinel color max_ncolor ("unassigned").
  color->clear();
  color->resize(graph.num_nodes(), max_ncolor);

  // Greedy scheme: repeatedly extract the highest-reward path and give
  // all of its still-important nodes a fresh color.
  uint32_t cindex = 0;
  while (cindex < max_ncolor - 1) {
    std::vector<uint32_t> chain;
    const uint32_t gain = FindBestPath(graph, node_importance, &chain);
    if (gain == 0) break;  // nothing important left to color
    for (uint32_t nid : chain) {
      if (node_importance[nid] == 0) continue;
      CHECK_EQ(color->at(nid), max_ncolor);
      (*color)[nid] = cindex;
      // Once colored, a node no longer contributes reward.
      node_importance[nid] = 0;
    }
    ++cindex;
  }

  // Everything still carrying the sentinel joins the last group.
  for (uint32_t nid = 0; nid < graph.num_nodes(); ++nid) {
    if ((*color)[nid] == max_ncolor) {
      (*color)[nid] = cindex;
    }
  }
  return cindex + 1;
}
}
// namespace pass
}
// namespace nnvm
#endif // NNVM_PASS_GRAPH_ALGORITHM_H_
nnvm/src/pass/infer_shape_type.cc
View file @
0538a9fc
...
@@ -121,6 +121,7 @@ NNVM_REGISTER_PASS(InferType)
...
@@ -121,6 +121,7 @@ NNVM_REGISTER_PASS(InferType)
DMLC_JSON_ENABLE_ANY
(
ShapeVector
,
list_shape
);
DMLC_JSON_ENABLE_ANY
(
ShapeVector
,
list_shape
);
DMLC_JSON_ENABLE_ANY
(
DTypeVector
,
list_int
);
DMLC_JSON_ENABLE_ANY
(
DTypeVector
,
list_int
);
DMLC_JSON_ENABLE_ANY
(
size_t
,
size_t
);
}
// namespace pass
}
// namespace pass
}
// namespace nnvm
}
// namespace nnvm
nnvm/src/pass/plan_memory.cc
0 → 100644
View file @
0538a9fc
/*!
* Copyright (c) 2016 by Contributors
* \file plan_memory.cc
* \brief Assign memory tag to each of the data entries.
*/
#include <nnvm/graph.h>
#include <nnvm/pass.h>
#include <nnvm/graph_attr_types.h>
#include <nnvm/op_attr_types.h>
#include <memory>
#include "./graph_algorithm.h"
namespace
nnvm
{
namespace
pass
{
// Simple graph-based allocator: hands out integer storage ids and keeps a
// size-keyed free list so released storage can be matched to later requests.
class GraphAllocator {
 public:
  // storage id equals integer.
  using StorageID = int;
  // bad storage id: marks an entry that could not be assigned storage.
  static const StorageID kBadStorageID = -1;
  // Request a free storage id for a tensor of the given shape on dev_id.
  // node_id is the requesting node (used for color-based reuse filtering).
  // Returns kBadStorageID when the shape is unknown (ndim == 0).
  StorageID Request(int dev_id, int dtype, TShape shape, uint32_t node_id) {
    if (shape.ndim() == 0) return kBadStorageID;
    // search memory block in [size / match_range_, size * match_range_)
    // TODO(tqchen) add size of the dtype, assume 4 bytes for now
    size_t size = shape.Size() * 4;
    if (match_range_ == 0) return this->Alloc(dev_id, size);
    auto begin = free_.lower_bound(size / match_range_);
    auto mid = free_.lower_bound(size);
    auto end = free_.upper_bound(size * match_range_);
    // First search free blocks at least as large as the request.
    for (auto it = mid; it != end; ++it) {
      StorageEntry *e = it->second;
      // Only reuse storage from the same device.
      if (e->device_id != dev_id) continue;
      // With coloring enabled, only reuse storage released by a node in the
      // same color group (nodes in different groups may run in parallel).
      if (node_color_.size() != 0 &&
          node_color_[e->released_by_node] != node_color_[node_id]) continue;
      // Use exact matching strategy
      e->max_bytes = std::max(size, e->max_bytes);
      // Found a match: erase from the free list and return its id.
      free_.erase(it);
      return e->id;
    }
    // then search for memory blocks smaller than requested space
    for (auto it = mid; it != begin;) {
      --it;
      StorageEntry *e = it->second;
      if (e->device_id != dev_id) continue;
      if (node_color_.size() != 0 &&
          node_color_[e->released_by_node] != node_color_[node_id]) continue;
      // Grow the block's recorded size to cover this (larger) request.
      e->max_bytes = std::max(size, e->max_bytes);
      // Found a match: erase from the free list and return its id.
      free_.erase(it);
      return e->id;
    }
    // cannot find anything: allocate a brand new storage entry.
    return this->Alloc(dev_id, size);
  }
  // Release a storage id back to the free list.
  // node_id is the releasing node, remembered for color-based matching.
  void Release(StorageID id, uint32_t node_id) {
    CHECK_NE(id, kBadStorageID);
    StorageEntry *e = data_[id].get();
    e->released_by_node = node_id;
    free_.insert({e->max_bytes, e});
  }
  // Total number of bytes allocated across all storage entries.
  size_t TotalAllocBytes() const {
    size_t total = 0;
    for (auto &p : data_) {
      total += p->max_bytes;
    }
    return total;
  }
  // Constructor: match range and number of colors are tunable via
  // environment variables (defaults: range 16, 1 color = no coloring).
  explicit GraphAllocator(const IndexedGraph* idx) : idx_(idx) {
    this->Init(dmlc::GetEnv("NNVM_EXEC_MATCH_RANGE", 16),
               dmlc::GetEnv("NNVM_EXEC_NUM_TEMP", 1));
  }

 private:
  // Initialize the allocator; when more than one match color is requested,
  // color the graph so parallel-capable nodes never share storage.
  void Init(size_t match_range, uint32_t num_match_color) {
    match_range_ = match_range;
    num_match_color_ = num_match_color;
    if (num_match_color_ > 1) {
      // All non-variable nodes are equally important for path coloring.
      std::vector<uint32_t> importance(idx_->num_nodes(), 0);
      for (uint32_t nid = 0; nid < idx_->num_nodes(); ++nid) {
        if ((*idx_)[nid].source->is_variable()) continue;
        importance[nid] = 1;
      }
      num_match_color_ = ColorNodeGroup(
          *idx_, importance, num_match_color_, &node_color_);
    }
  }
  // Create a brand new storage entry of the given size on dev_id.
  StorageID Alloc(int dev_id, size_t size) {
    StorageID id = static_cast<StorageID>(data_.size());
    std::unique_ptr<StorageEntry> ptr(new StorageEntry());
    ptr->id = id;
    ptr->device_id = dev_id;
    ptr->max_bytes = size;
    data_.emplace_back(std::move(ptr));
    return id;
  }
  // internal storage entry
  struct StorageEntry {
    // the id of the entry.
    StorageID id;
    // the device id of the storage.
    int device_id;
    // maximum size of storage requested so far.
    size_t max_bytes{0};
    // node index that released it last time
    uint32_t released_by_node{0};
  };
  // scale used for rough match: reuse blocks within a factor of the size
  size_t match_range_;
  // whether use color based match algorithm (1 means disabled)
  uint32_t num_match_color_{1};
  // the size of each dtype
  // NOTE(review): currently unused — Request() assumes 4 bytes per element.
  std::vector<size_t> dtype_size_dict_;
  // free list of storage entries, keyed by their current max_bytes
  std::multimap<size_t, StorageEntry*> free_;
  // all the storage resources available
  std::vector<std::unique_ptr<StorageEntry> > data_;
  // color of nodes in the graph, used for auxiliary policy making.
  std::vector<uint32_t> node_color_;
  // internal indexed graph (not owned)
  const IndexedGraph* idx_;
};
// Pass body: plan memory by assigning a storage id to every NodeEntry.
// Writes graph attributes:
//   "storage_id"                - StorageVector, -1 for unassigned entries
//   "storage_allocated_bytes"   - total bytes the allocator handed out
//   "storage_num_not_allocated" - count of entries left unassigned
// Requires "shape" and "dtype" attributes to be present.
Graph PlanMemory(Graph ret) {
  // setup ref counter
  const IndexedGraph& idx = ret.indexed_graph();
  // reference counter of each node entry (indexed by entry id)
  std::vector<uint32_t> ref_count(idx.num_node_entries(), 0);
  // step 1: initialize reference count.
  // FIX: count by entry id, not e.node_id — ref_count is read and
  // decremented via idx.entry_id(...) below, and node_id only coincides
  // with the entry id when every node has exactly one output.
  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
    for (const auto& e : idx[nid].inputs) {
      ++ref_count[idx.entry_id(e)];
    }
  }
  // graph outputs keep their entries alive until the end
  for (const auto& e : idx.outputs()) {
    ++ref_count[idx.entry_id(e)];
  }
  // step 2: allocate memory.
  StorageVector storage(idx.num_node_entries(), -1);
  const ShapeVector& shape_vec = ret.GetAttr<ShapeVector>("shape");
  const DTypeVector& dtype_vec = ret.GetAttr<DTypeVector>("dtype");
  const DeviceVector* device_vec = nullptr;
  static auto& finplace_option = Op::GetAttr<FInplaceOption>("FInplaceOption");
  if (ret.attrs.count("device") != 0) {
    device_vec = &(ret.GetAttr<DeviceVector>("device"));
  }
  // the allocator.
  GraphAllocator allocator(&idx);
  // number of entries that are not statically allocated.
  size_t num_not_allocated = 0;

  // Walk nodes in topological order, allocating outputs and releasing
  // inputs whose last consumer has been reached.
  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
    const auto& inode = idx[nid];
    if (inode.source->is_variable()) continue;
    // check inplace option: an output may share the storage of an input
    // that has no other remaining consumers.
    if (finplace_option.count(inode.source->op) != 0) {
      auto inplace_pairs = finplace_option[inode.source->op](inode.source->attrs);
      for (auto& kv : inplace_pairs) {
        uint32_t eid_out = idx.entry_id(nid, kv.second);
        uint32_t eid_in = idx.entry_id(inode.inputs[kv.first]);
        if (ref_count[eid_in] == 1 &&
            storage[eid_in] != GraphAllocator::kBadStorageID) {
          // Reuse the input's storage; zero refcount marks it as taken.
          storage[eid_out] = storage[eid_in];
          ref_count[eid_in] = 0;
        }
      }
    }
    // normal allocation
    const int dev_id = (device_vec != nullptr) ? device_vec->at(nid) : 0;
    // allocate outputs that were not satisfied by inplace reuse
    for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
      uint32_t eid = idx.entry_id(nid, index);
      if (storage[eid] == GraphAllocator::kBadStorageID) {
        storage[eid] = allocator.Request(dev_id, dtype_vec[eid], shape_vec[eid], nid);
      }
    }
    // then free inputs
    for (const auto& e : inode.inputs) {
      uint32_t eid = idx.entry_id(e);
      // ref_count == 0 means the entry was taken by an inplace op
      if (ref_count[eid] == 0) continue;
      // if we decrease it to zero, we are ready to release it
      --ref_count[eid];
      if (ref_count[eid] == 0 && storage[eid] != GraphAllocator::kBadStorageID) {
        allocator.Release(storage[eid], nid);
      }
    }
    // check if there are outputs that can be freed immediately
    // (outputs nobody consumes and that are not graph outputs)
    for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
      uint32_t eid = idx.entry_id(nid, index);
      if (ref_count[eid] == 0 && storage[eid] != GraphAllocator::kBadStorageID) {
        allocator.Release(storage[eid], nid);
      }
      if (storage[eid] == GraphAllocator::kBadStorageID) {
        ++num_not_allocated;
      }
    }
  }
  ret.attrs["storage_id"] = std::make_shared<any>(std::move(storage));
  ret.attrs["storage_allocated_bytes"] = std::make_shared<any>(allocator.TotalAllocBytes());
  ret.attrs["storage_num_not_allocated"] = std::make_shared<any>(num_not_allocated);
  return ret;
}

NNVM_REGISTER_PASS(PlanMemory)
.describe("Plan the memory allocation of each node entries.")
.set_body(PlanMemory)
.set_change_graph(false)
.depend_graph_attr("dtype")
.depend_graph_attr("shape")
.provide_graph_attr("storage_id");
}
// namespace pass
}
// namespace nnvm
nnvm/tests/python/test_graph.py
View file @
0538a9fc
...
@@ -95,6 +95,25 @@ def test_place_device():
...
@@ -95,6 +95,25 @@ def test_place_device():
assert
g
.
json_attr
(
'device'
)[
jnode_row_ptr
[
nindex
[
"add3"
]]]
==
1
assert
g
.
json_attr
(
'device'
)[
jnode_row_ptr
[
nindex
[
"add3"
]]]
==
1
assert
g
.
json_attr
(
'device'
)[
jnode_row_ptr
[
nindex
[
"cast1"
]]]
==
0
assert
g
.
json_attr
(
'device'
)[
jnode_row_ptr
[
nindex
[
"cast1"
]]]
==
0
def test_plan_memory():
    # Build a small graph with an inplace-capable chain:
    #   x -> addk -> reshapek -> add2 -> (final add)
    x = sym.Variable('x', shape=(4, 2))
    x2 = sym.add(x, x, name='addk')
    y = sym.reshape(x2, target=(2, 4), name="reshapek")
    y = sym.add(y, x2, name="add2")
    y = sym.add(y, y)
    g = graph.create(y)
    # Tell InferShape which node attribute carries the shape hints.
    g._set_json_attr("shape_attr_key", "shape")
    # PlanMemory requires shape and dtype information to be inferred first.
    g = g.apply(["InferShape", "InferType", "PlanMemory"])
    jgraph = json.loads(g.apply('SaveJSON').json_attr('json'))
    jnodes = jgraph['nodes']
    # node_row_ptr maps node index -> first entry id of that node.
    jnode_row_ptr = jgraph['node_row_ptr']
    storage_id = g.json_attr('storage_id')
    nindex = {n['name']: i for i, n in enumerate(jnodes)}
    # addk's output is consumed twice (by reshapek and add2), so reshapek
    # must not reuse its storage inplace.
    assert (storage_id[jnode_row_ptr[nindex["addk"]]] !=
            storage_id[jnode_row_ptr[nindex["reshapek"]]])
    # reshapek's output has a single consumer, so add2 can reuse it inplace.
    assert (storage_id[jnode_row_ptr[nindex["add2"]]] ==
            storage_id[jnode_row_ptr[nindex["reshapek"]]])
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
test_order_mutation_pass
()
test_order_mutation_pass
()
...
@@ -103,3 +122,4 @@ if __name__ == "__main__":
...
@@ -103,3 +122,4 @@ if __name__ == "__main__":
test_infer_shape
()
test_infer_shape
()
test_infer_type
()
test_infer_type
()
test_place_device
()
test_place_device
()
test_plan_memory
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment