Commit 0538a9fc by Tianqi Chen

[PASS] add plan memory (#19)

parent 204c4442
...@@ -147,6 +147,10 @@ class IndexedGraph { ...@@ -147,6 +147,10 @@ class IndexedGraph {
inline const std::vector<uint32_t>& arg_nodes() const { inline const std::vector<uint32_t>& arg_nodes() const {
return arg_nodes_; return arg_nodes_;
} }
/*!
 * \return list of output entries of the graph; each entry refers to
 *  its producing node by the node's index in this IndexedGraph.
 */
inline const std::vector<NodeEntry>& outputs() const {
return outputs_;
}
private: private:
friend class Graph; friend class Graph;
...@@ -159,6 +163,8 @@ class IndexedGraph { ...@@ -159,6 +163,8 @@ class IndexedGraph {
std::vector<Node> nodes_; std::vector<Node> nodes_;
// index to argument nodes // index to argument nodes
std::vector<uint32_t> arg_nodes_; std::vector<uint32_t> arg_nodes_;
// space to store the outputs entries
std::vector<NodeEntry> outputs_;
// mapping from node to index. // mapping from node to index.
std::unordered_map<const nnvm::Node*, uint32_t> node2index_; std::unordered_map<const nnvm::Node*, uint32_t> node2index_;
// CSR pointer of node entries // CSR pointer of node entries
......
...@@ -60,7 +60,7 @@ using DTypeVector = std::vector<int>; ...@@ -60,7 +60,7 @@ using DTypeVector = std::vector<int>;
* *
* \code * \code
* Graph g = ApplyPass(src_graph, {"PlaceDevice"}); * Graph g = ApplyPass(src_graph, {"PlaceDevice"});
* const &device = g.GetAttr<DeviceVector>("dtype"); * const &device = g.GetAttr<DeviceVector>("device");
* // get device by node_id * // get device by node_id
* int device_type = device[g.indexed_graph().node_id(my_node)]; * int device_type = device[g.indexed_graph().node_id(my_node)];
* \endcode * \endcode
...@@ -75,6 +75,21 @@ using DeviceVector = std::vector<int>; ...@@ -75,6 +75,21 @@ using DeviceVector = std::vector<int>;
*/ */
using DeviceAssignMap = std::unordered_map<std::string, int>; using DeviceAssignMap = std::unordered_map<std::string, int>;
/*!
 * \brief The result holder of storage id of each NodeEntry in the graph.
 *
 * \note Stored under graph.attrs["storage_id"], provided by Pass "PlanMemory"
 *  Storage ids are consecutive integers starting from 0.
 *  If the storage id is -1 then the storage is not assigned.
 *
 * \code
 *  Graph g = ApplyPass(src_graph, {"PlanMemory"});
 *  const auto& storage = g.GetAttr<StorageVector>("storage_id");
 *  // get storage id by entry
 *  int storage_id = storage[g.indexed_graph().entry_id(my_entry)];
 * \endcode
 */
using StorageVector = std::vector<int>;
} // namespace nnvm } // namespace nnvm
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <utility>
#include <functional> #include <functional>
#include "./base.h" #include "./base.h"
#include "./tuple.h" #include "./tuple.h"
...@@ -93,6 +94,20 @@ using FInferType = FInferNodeEntryAttr<int>; ...@@ -93,6 +94,20 @@ using FInferType = FInferNodeEntryAttr<int>;
*/ */
using TIsBackwardOp = bool; using TIsBackwardOp = bool;
/*!
 * \brief Get possible inplace options.
 *  This function enables optimization to reuse memory of inputs in output.
 * \param attrs The attributes of the node
 * \return list of (input index, output index) pairs,
 *  each indicating that the output may reuse the memory of the input.
 *
 * \note Register under "FInplaceOption", by default no inplace can happen.
 */
using FInplaceOption = std::function<
std::vector<std::pair<int, int> > (const NodeAttrs& attrs)>;
} // namespace nnvm } // namespace nnvm
#endif // NNVM_OP_ATTR_TYPES_H_ #endif // NNVM_OP_ATTR_TYPES_H_
...@@ -52,6 +52,11 @@ IndexedGraph::IndexedGraph(const Graph &g) { ...@@ -52,6 +52,11 @@ IndexedGraph::IndexedGraph(const Graph &g) {
control_rptr.push_back(control_deps_.size()); control_rptr.push_back(control_deps_.size());
}); });
for (const auto& e : g.outputs) {
outputs_.emplace_back(NodeEntry{
node2index_.at(e.node.get()), e.index, e.version});
}
// setup array view // setup array view
// input_entries_ and control_rptr must not change after this step. // input_entries_ and control_rptr must not change after this step.
const NodeEntry* iptr = dmlc::BeginPtr(input_entries_); const NodeEntry* iptr = dmlc::BeginPtr(input_entries_);
......
...@@ -14,6 +14,7 @@ using nnvm::FListInputNames; ...@@ -14,6 +14,7 @@ using nnvm::FListInputNames;
using nnvm::FMutateInput; using nnvm::FMutateInput;
using nnvm::FInferShape; using nnvm::FInferShape;
using nnvm::FInferType; using nnvm::FInferType;
using nnvm::FInplaceOption;
using nnvm::NodeAttrs; using nnvm::NodeAttrs;
using nnvm::TShape; using nnvm::TShape;
using nnvm::array_view; using nnvm::array_view;
...@@ -32,6 +33,10 @@ inline bool SameShape(const NodeAttrs& attrs, ...@@ -32,6 +33,10 @@ inline bool SameShape(const NodeAttrs& attrs,
return true; return true;
} }
// Inplace option for operators whose output 0 may reuse the memory of input 0.
// The node attributes are not consulted.
inline std::vector<std::pair<int, int> > InplaceIn0Out0(const NodeAttrs& attrs) {
  std::vector<std::pair<int, int> > options;
  options.emplace_back(0, 0);
  return options;
}
// simple demonstration of reshape. // simple demonstration of reshape.
NNVM_REGISTER_OP(reshape) NNVM_REGISTER_OP(reshape)
.describe("reshape source to target shape") .describe("reshape source to target shape")
...@@ -55,7 +60,8 @@ NNVM_REGISTER_OP(reshape) ...@@ -55,7 +60,8 @@ NNVM_REGISTER_OP(reshape)
CHECK_EQ(ishape[0]->Size(), target.Size()) CHECK_EQ(ishape[0]->Size(), target.Size())
<< "Reshape op: source target shape mismatch"; << "Reshape op: source target shape mismatch";
return true; return true;
}); })
.attr<FInplaceOption>("FInplaceOption", InplaceIn0Out0);
NNVM_REGISTER_OP(cast) NNVM_REGISTER_OP(cast)
...@@ -82,7 +88,8 @@ NNVM_REGISTER_OP(cast) ...@@ -82,7 +88,8 @@ NNVM_REGISTER_OP(cast)
NNVM_REGISTER_OP(add) NNVM_REGISTER_OP(add)
.describe("add two data together") .describe("add two data together")
.set_num_inputs(2) .set_num_inputs(2)
.attr<FInferShape>("FInferShape", SameShape); .attr<FInferShape>("FInferShape", SameShape)
.attr<FInplaceOption>("FInplaceOption", InplaceIn0Out0);
NNVM_REGISTER_OP(__add_symbol__) NNVM_REGISTER_OP(__add_symbol__)
.describe("Alias of add") .describe("Alias of add")
......
/*!
* Copyright (c) 2016 by Contributors
* \file graph_algorithm.h
* \brief This header contains graph algorithms on IndexedGraph.
* It is used to compute information such as whether two
* operations can run in parallel, which helps memory allocation.
*/
#ifndef NNVM_PASS_GRAPH_ALGORITHM_H_
#define NNVM_PASS_GRAPH_ALGORITHM_H_
#include <nnvm/graph.h>
#include <vector>
namespace nnvm {
namespace pass {
/*!
 * \brief Find best path in the DAG, with reward defined
 *  by sum of reward of each node along the path.
 *
 *  Nodes are visited from the highest node id down to the lowest, which is
 *  treated as reverse topological order (the inputs of a node are expected
 *  to have smaller ids than the node itself).
 *
 * \param graph the indexed graph to search.
 * \param node_reward the reward of each node, indexed by node id;
 *  must contain exactly one value per node of the graph.
 * \param path the output path of nodes, overwritten with the node ids
 *  of the best path, listed from its first node to its last.
 * \return the total reward of best path.
 */
inline uint32_t FindBestPath(
    const IndexedGraph& graph,
    const std::vector<uint32_t>& node_reward,
    std::vector<uint32_t>* path) {
  const uint32_t node_count = static_cast<uint32_t>(graph.num_nodes());
  CHECK_EQ(node_count, node_reward.size());
  // accumulated[i]: best total reward of a path that starts at node i.
  std::vector<uint32_t> accumulated(node_reward.size(), 0);
  // successor[i]: node that follows i on that path; node_count means "none".
  std::vector<uint32_t> successor(node_reward.size(), node_count);
  uint32_t top_reward = 0;
  uint32_t top_head = 0;

  for (uint32_t remaining = node_count; remaining > 0; --remaining) {
    const uint32_t cur = remaining - 1;
    // every consumer of cur was already visited, so the value is final
    // once the node's own reward is added.
    accumulated[cur] += node_reward[cur];
    const uint32_t cur_reward = accumulated[cur];
    if (top_reward < cur_reward) {
      top_reward = cur_reward;
      top_head = cur;
    }
    // offer the path through cur to each node that feeds it.
    for (const auto& in : graph[cur].inputs) {
      const uint32_t producer = in.node_id;
      if (accumulated[producer] < cur_reward) {
        accumulated[producer] = cur_reward;
        successor[producer] = cur;
      }
    }
  }

  // follow the successor links from the best starting node.
  path->clear();
  uint32_t walked_reward = 0;
  uint32_t cursor = top_head;
  while (cursor < node_count) {
    path->push_back(cursor);
    walked_reward += node_reward[cursor];
    cursor = successor[cursor];
  }
  CHECK_EQ(walked_reward, top_reward);
  return top_reward;
}
/*!
 * \brief Color the nodes in the graph into index.
 *  The coloring algorithm tries to assign node group
 *  such that node in the same group cannot run in parallel.
 *
 *  Greedy: repeatedly take the path with the best total importance, give
 *  its still-uncolored nodes a fresh color and zero their importance.
 *  At most max_ncolor - 1 paths are colored this way; every node left over
 *  shares one final color.
 *
 * \param graph the original indexed graph.
 * \param node_importance The importance of the node
 * \param max_ncolor maximum number of colors allowed, must not be 0.
 * \param color the color index of each of the node.
 * \return the total number of colors.
 */
inline uint32_t ColorNodeGroup(
    const IndexedGraph &graph,
    std::vector<uint32_t> node_importance,
    uint32_t max_ncolor,
    std::vector<uint32_t> *color) {
  CHECK_NE(max_ncolor, 0);
  CHECK_EQ(graph.num_nodes(), node_importance.size());
  // max_ncolor is never handed out as a color, so it marks "not colored yet".
  color->assign(graph.num_nodes(), max_ncolor);

  uint32_t next_color = 0;
  while (next_color < max_ncolor - 1) {
    std::vector<uint32_t> chain;
    // a zero reward means no node with importance is left.
    if (FindBestPath(graph, node_importance, &chain) == 0) break;
    // all the nodes on one path cannot run in parallel.
    for (const uint32_t nid : chain) {
      if (node_importance[nid] == 0) continue;
      CHECK_EQ(color->at(nid), max_ncolor);
      color->at(nid) = next_color;
      // make the importance 0 after color is decided.
      node_importance[nid] = 0;
    }
    ++next_color;
  }

  // the rest of the nodes share the last color.
  for (uint32_t& node_color : *color) {
    if (node_color == max_ncolor) {
      node_color = next_color;
    }
  }
  return next_color + 1;
}
} // namespace pass
} // namespace nnvm
#endif // NNVM_PASS_GRAPH_ALGORITHM_H_
...@@ -121,6 +121,7 @@ NNVM_REGISTER_PASS(InferType) ...@@ -121,6 +121,7 @@ NNVM_REGISTER_PASS(InferType)
DMLC_JSON_ENABLE_ANY(ShapeVector, list_shape); DMLC_JSON_ENABLE_ANY(ShapeVector, list_shape);
DMLC_JSON_ENABLE_ANY(DTypeVector, list_int); DMLC_JSON_ENABLE_ANY(DTypeVector, list_int);
DMLC_JSON_ENABLE_ANY(size_t, size_t);
} // namespace pass } // namespace pass
} // namespace nnvm } // namespace nnvm
/*!
* Copyright (c) 2016 by Contributors
* \file plan_memory.cc
* \brief Assign memory tag to each of the data entries.
*/
#include <nnvm/graph.h>
#include <nnvm/pass.h>
#include <nnvm/graph_attr_types.h>
#include <nnvm/op_attr_types.h>
#include <memory>
#include "./graph_algorithm.h"
namespace nnvm {
namespace pass {
// simple graph based allocator.
class GraphAllocator {
public:
// storage id equals integer.
using StorageID = int;
// bad storage id
static const StorageID kBadStorageID = -1;
// request a free storage
StorageID Request(int dev_id, int dtype, TShape shape, uint32_t node_id) {
if (shape.ndim() == 0) return kBadStorageID;
// search memory block in [size / match_range_, size * match_range_)
// TODO(tqchen) add size of the dtype, assume 4 bytes for now
size_t size = shape.Size() * 4;
if (match_range_ == 0) return this->Alloc(dev_id, size);
auto begin = free_.lower_bound(size / match_range_);
auto mid = free_.lower_bound(size);
auto end = free_.upper_bound(size * match_range_);
// search for memory blocks larger than requested
for (auto it = mid; it != end; ++it) {
StorageEntry *e = it->second;
if (e->device_id != dev_id) continue;
if (node_color_.size() != 0 &&
node_color_[e->released_by_node] != node_color_[node_id]) continue;
// Use exect matching strategy
e->max_bytes = std::max(size, e->max_bytes);
// find a exact match, erase from map and return
free_.erase(it);
return e->id;
}
// then search for memory blocks smaller than requested space
for (auto it = mid; it != begin;) {
--it;
StorageEntry *e = it->second;
if (e->device_id != dev_id) continue;
if (node_color_.size() != 0 &&
node_color_[e->released_by_node] != node_color_[node_id]) continue;
// Use exect matching strategy
e->max_bytes = std::max(size, e->max_bytes);
// find a exact match, erase from map and return
free_.erase(it);
return e->id;
}
// cannot find anything return a new one.
return this->Alloc(dev_id, size);
}
// release a memory space.
void Release(StorageID id, uint32_t node_id) {
CHECK_NE(id, kBadStorageID);
StorageEntry *e = data_[id].get();
e->released_by_node = node_id;
free_.insert({e->max_bytes, e});
}
// totoal number of bytes allocated
size_t TotalAllocBytes() const {
size_t total = 0;
for (auto &p : data_) {
total += p->max_bytes;
}
return total;
}
// constructor
explicit GraphAllocator(const IndexedGraph* idx) : idx_(idx) {
this->Init(dmlc::GetEnv("NNVM_EXEC_MATCH_RANGE", 16),
dmlc::GetEnv("NNVM_EXEC_NUM_TEMP", 1));
}
private:
// initialize the graph allocator
void Init(size_t match_range, uint32_t num_match_color) {
match_range_ = match_range;
num_match_color_ = num_match_color;
if (num_match_color_ > 1) {
std::vector<uint32_t> importance(idx_->num_nodes(), 0);
for (uint32_t nid = 0; nid < idx_->num_nodes(); ++nid) {
if ((*idx_)[nid].source->is_variable()) continue;
importance[nid] = 1;
}
num_match_color_ = ColorNodeGroup(
*idx_, importance, num_match_color_, &node_color_);
}
}
StorageID Alloc(int dev_id, size_t size) {
StorageID id = static_cast<StorageID>(data_.size());
std::unique_ptr<StorageEntry> ptr(new StorageEntry());
ptr->id = id;
ptr->device_id = dev_id;
ptr->max_bytes = size;
data_.emplace_back(std::move(ptr));
return id;
}
// internal storage entry
struct StorageEntry {
// the id of the entry.
StorageID id;
// the device id of the storage.
int device_id;
// maximum size of storage requested.
size_t max_bytes{0};
// node index that released it last time
uint32_t released_by_node{0};
};
// scale used for rough match
size_t match_range_;
// whether use color based match algorithm
uint32_t num_match_color_{1};
// the size of each dtype
std::vector<size_t> dtype_size_dict_;
// free list of storage entry
std::multimap<size_t, StorageEntry*> free_;
// all the storage resources available
std::vector<std::unique_ptr<StorageEntry> > data_;
// color of nodes in the graph, used for auxiliary policy making.
std::vector<uint32_t> node_color_;
// internal indexed graph
const IndexedGraph* idx_;
};
// Function to plan memory.
// Assigns a storage id to every node entry of the graph, reusing storage
// whose content is no longer referenced and sharing storage between an
// input and an output when the operator registers an FInplaceOption.
// Reads graph attributes "shape" and "dtype" (and "device" when present).
// Writes "storage_id", "storage_allocated_bytes" and
// "storage_num_not_allocated" into the returned graph.
Graph PlanMemory(Graph ret) {
  // setup ref counter
  const IndexedGraph& idx = ret.indexed_graph();
  // reference counter of each node entry
  std::vector<uint32_t> ref_count(idx.num_node_entries(), 0);
  // step 1: initialize reference count
  // The counter is indexed by entry id, not node id: a node with several
  // outputs owns several entries, each with its own set of readers.
  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
    for (const auto& e : idx[nid].inputs) {
      ++ref_count[idx.entry_id(e)];
    }
  }
  // graph outputs hold an extra reference so they are never released.
  for (const auto& e : idx.outputs()) {
    ++ref_count[idx.entry_id(e)];
  }
  // step 2: allocate memory.
  StorageVector storage(idx.num_node_entries(), -1);
  const ShapeVector& shape_vec = ret.GetAttr<ShapeVector>("shape");
  const DTypeVector& dtype_vec = ret.GetAttr<DTypeVector>("dtype");
  const DeviceVector* device_vec = nullptr;
  static auto& finplace_option = Op::GetAttr<FInplaceOption>("FInplaceOption");

  // device placement is optional, everything goes to device 0 without it.
  if (ret.attrs.count("device") != 0) {
    device_vec = &(ret.GetAttr<DeviceVector>("device"));
  }
  // the allocator.
  GraphAllocator allocator(&idx);
  // number of entries that are not statically allocated.
  size_t num_not_allocated = 0;

  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
    const auto& inode = idx[nid];
    if (inode.source->is_variable()) continue;
    // check inplace option
    if (finplace_option.count(inode.source->op) != 0) {
      auto inplace_pairs = finplace_option[inode.source->op](inode.source->attrs);
      for (auto& kv : inplace_pairs) {
        uint32_t eid_out = idx.entry_id(nid, kv.second);
        uint32_t eid_in = idx.entry_id(inode.inputs[kv.first]);
        // the output may take over the input storage only when this node
        // is the last reader of the input.
        if (ref_count[eid_in] == 1 && storage[eid_in] != GraphAllocator::kBadStorageID) {
          storage[eid_out] = storage[eid_in];
          // mark the input as taken so it is not released below.
          ref_count[eid_in] = 0;
        }
      }
    }
    // normal allocation
    const int dev_id = (device_vec != nullptr) ? device_vec->at(nid) : 0;
    // allocate output
    for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
      uint32_t eid = idx.entry_id(nid, index);
      if (storage[eid] == GraphAllocator::kBadStorageID) {
        storage[eid] = allocator.Request(dev_id, dtype_vec[eid], shape_vec[eid], nid);
      }
    }
    // then free inputs
    for (const auto& e : inode.inputs) {
      uint32_t eid = idx.entry_id(e);
      // ref_count == 0 means it is taken by inplace op
      if (ref_count[eid] == 0) continue;
      // if we decrease it to zero, means we are ready to release
      --ref_count[eid];
      if (ref_count[eid] == 0 && storage[eid] != GraphAllocator::kBadStorageID) {
        allocator.Release(storage[eid], nid);
      }
    }
    // check if there are outputs that can be freed immediately
    for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
      uint32_t eid = idx.entry_id(nid, index);
      if (ref_count[eid] == 0 && storage[eid] != GraphAllocator::kBadStorageID) {
        allocator.Release(storage[eid], nid);
      }
      if (storage[eid] == GraphAllocator::kBadStorageID) {
        ++num_not_allocated;
      }
    }
  }
  ret.attrs["storage_id"] = std::make_shared<any>(std::move(storage));
  ret.attrs["storage_allocated_bytes"] = std::make_shared<any>(allocator.TotalAllocBytes());
  ret.attrs["storage_num_not_allocated"] = std::make_shared<any>(num_not_allocated);
  return ret;
}
// Register the PlanMemory function as the pass "PlanMemory".
// It is declared as not changing the graph, depends on the graph
// attributes "dtype" and "shape", and provides "storage_id".
NNVM_REGISTER_PASS(PlanMemory)
.describe("Plan the memory allocation of each node entries.")
.set_body(PlanMemory)
.set_change_graph(false)
.depend_graph_attr("dtype")
.depend_graph_attr("shape")
.provide_graph_attr("storage_id");
} // namespace pass
} // namespace nnvm
...@@ -95,6 +95,25 @@ def test_place_device(): ...@@ -95,6 +95,25 @@ def test_place_device():
assert g.json_attr('device')[jnode_row_ptr[nindex["add3"]]] == 1 assert g.json_attr('device')[jnode_row_ptr[nindex["add3"]]] == 1
assert g.json_attr('device')[jnode_row_ptr[nindex["cast1"]]] == 0 assert g.json_attr('device')[jnode_row_ptr[nindex["cast1"]]] == 0
def test_plan_memory():
    """Check the storage ids assigned by the PlanMemory pass on a small graph."""
    x = sym.Variable('x', shape=(4, 2))
    x2 = sym.add(x, x, name='addk')
    y = sym.reshape(x2, target=(2, 4), name="reshapek")
    y = sym.add(y, x2, name="add2")
    y = sym.add(y, y)
    g = graph.create(y)
    g._set_json_attr("shape_attr_key", "shape")
    g = g.apply(["InferShape", "InferType", "PlanMemory"])
    jgraph = json.loads(g.apply('SaveJSON').json_attr('json'))
    jnodes = jgraph['nodes']
    jnode_row_ptr = jgraph['node_row_ptr']
    storage_id = g.json_attr('storage_id')
    nindex = {n['name']: i for i, n in enumerate(jnodes)}
    # addk is read by both reshapek and add2, so reshapek cannot
    # take over its storage in place.
    assert (storage_id[jnode_row_ptr[nindex["addk"]]] !=
            storage_id[jnode_row_ptr[nindex["reshapek"]]])
    # add2 is the only reader of reshapek (its input 0), so it reuses
    # that storage in place.
    assert (storage_id[jnode_row_ptr[nindex["add2"]]] ==
            storage_id[jnode_row_ptr[nindex["reshapek"]]])
if __name__ == "__main__": if __name__ == "__main__":
test_order_mutation_pass() test_order_mutation_pass()
...@@ -103,3 +122,4 @@ if __name__ == "__main__": ...@@ -103,3 +122,4 @@ if __name__ == "__main__":
test_infer_shape() test_infer_shape()
test_infer_type() test_infer_type()
test_place_device() test_place_device()
test_plan_memory()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment