Commit 96db41db by Haibin Lin Committed by Tianqi Chen

LSTM Memory Allocator Fix #5035 (#105)

* Imbalance version of shared pool during plan memory

* Bug fix for no shared_pool case

* Auto search and updated shared mem pool

* Cleanup unused code

* Cleanup logging code

* Add unit test for shared storage

* Remove shared pool in PlanMemory. Fix lint warnings

* Fix lint warnings

* Use reference instead of ptrs
parent 04edd05d
......@@ -19,10 +19,12 @@ class GraphAllocator {
public:
// storage id equals integer.
using StorageID = int;
// bad storage id
static const StorageID kBadStorageID = -1;
// external storage id
static const StorageID kExternalStorageID = -2;
// request a free storage
StorageID Request(int dev_id, int dtype, TShape shape, uint32_t node_id) {
if (shape.ndim() == 0) return kBadStorageID;
......@@ -54,7 +56,7 @@ class GraphAllocator {
node_color_[e->released_by_node] != node_color_[node_id]) continue;
// Use exact matching strategy
e->max_bytes = std::max(size, e->max_bytes);
// find a exact match, erase from map and return
// erase from map and return
free_.erase(it);
return e->id;
}
......@@ -69,6 +71,7 @@ class GraphAllocator {
e->released_by_node = node_id;
free_.insert({e->max_bytes, e});
}
// total number of bytes allocated
size_t TotalAllocBytes() const {
size_t total = 0;
......@@ -79,14 +82,13 @@ class GraphAllocator {
}
// constructor
explicit GraphAllocator(const IndexedGraph* idx) : idx_(idx) {
this->Init(dmlc::GetEnv("NNVM_EXEC_MATCH_RANGE", 16),
dmlc::GetEnv("NNVM_EXEC_NUM_TEMP", 1));
explicit GraphAllocator(const IndexedGraph* idx, const size_t match_range) : idx_(idx) {
this->Init(match_range, dmlc::GetEnv("NNVM_EXEC_NUM_TEMP", 1));
}
private:
// initialize the graph allocator
void Init(size_t match_range, uint32_t num_match_color) {
void Init(const size_t match_range, const uint32_t num_match_color) {
match_range_ = match_range;
num_match_color_ = num_match_color;
if (num_match_color_ > 1) {
......@@ -136,43 +138,17 @@ class GraphAllocator {
const IndexedGraph* idx_;
};
// function to plan memory
Graph PlanMemory(Graph ret) {
// setup ref counter
const IndexedGraph& idx = ret.indexed_graph();
/*
* Internal method to perform the memory allocation for a graph
* */
size_t AllocMemory(const Graph& ret, const IndexedGraph& idx, StorageVector* storage_ptr,
std::vector<int>* storage_inplace_index_ptr, std::vector<uint32_t> ref_count,
GraphAllocator* allocator) {
// Get reference
auto &storage = *storage_ptr;
auto &storage_inplace_index = *storage_inplace_index_ptr;
static auto& fignore_inputs = Op::GetAttr<FIgnoreInputs>("FIgnoreInputs");
// reference counter of each node
std::vector<uint32_t> ref_count(idx.num_node_entries(), 0);
// step 1: initialize reference count
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
const auto& inode = idx[nid];
if (inode.source->is_variable()) continue;
for (const auto& e : inode.inputs) {
++ref_count[idx.entry_id(e)];
}
// no dataflow dependency is needed for those are ignored.
// revoke the dependency counter.
if (fignore_inputs.count(inode.source->op()) != 0) {
auto ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs);
for (uint32_t i : ignore_inputs) {
--ref_count[idx.entry_id(inode.inputs[i])];
}
}
}
for (const auto& e : idx.outputs()) {
++ref_count[idx.entry_id(e)];
}
// step 2: allocate memory.
StorageVector storage;
if (ret.attrs.count("storage") != 0) {
storage = ret.MoveCopyAttr<StorageVector>("storage");
} else {
storage.resize(idx.num_node_entries(), -1);
}
std::vector<int> storage_inplace_index(idx.num_node_entries(), -1);
// Get attributes from the graph
const ShapeVector& shape_vec = ret.GetAttr<ShapeVector>("shape");
const DTypeVector& dtype_vec = ret.GetAttr<DTypeVector>("dtype");
const DeviceVector* device_vec = nullptr;
......@@ -181,9 +157,6 @@ Graph PlanMemory(Graph ret) {
if (ret.attrs.count("device") != 0) {
device_vec = &(ret.GetAttr<DeviceVector>("device"));
}
// the allocator.
GraphAllocator allocator(&idx);
// number of entries that are not statically allocated.
size_t num_not_allocated = 0;
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
......@@ -210,15 +183,24 @@ Graph PlanMemory(Graph ret) {
}
// normal allocation
const int dev_id = (device_vec != nullptr) ? device_vec->at(nid) : 0;
// allocate output
// sort output nodes based on size before allocating output
std::multimap<size_t, uint32_t> eids;
for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
uint32_t eid = idx.entry_id(nid, index);
if (storage[eid] == GraphAllocator::kBadStorageID) {
storage[eid] = allocator.Request(dev_id, dtype_vec[eid], shape_vec[eid], nid);
auto &eshape = shape_vec[eid];
size_t esize = 0;
if (eshape.ndim() != 0) esize = eshape.Size();
eids.insert(std::make_pair(esize, eid));
}
}
for (auto rit = eids.rbegin(); rit != eids.rend(); ++rit) {
uint32_t eid = rit->second;
storage[eid] = allocator->Request(dev_id, dtype_vec[eid], shape_vec[eid], nid);
}
// check if certain inputs are ignored.
static auto& fignore_inputs = Op::GetAttr<FIgnoreInputs>("FIgnoreInputs");
std::vector<uint32_t> ignore_inputs;
if (fignore_inputs.count(inode.source->op()) != 0) {
ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs);
......@@ -235,7 +217,7 @@ Graph PlanMemory(Graph ret) {
// if we decrease it to zero, it means we are ready to release
--ref_count[eid];
if (ref_count[eid] == 0 && storage[eid] != GraphAllocator::kBadStorageID) {
allocator.Release(storage[eid], nid);
allocator->Release(storage[eid], nid);
}
}
// check if there are outputs that can be freed immediately
......@@ -243,7 +225,7 @@ Graph PlanMemory(Graph ret) {
for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
uint32_t eid = idx.entry_id(nid, index);
if (ref_count[eid] == 0 && storage[eid] != GraphAllocator::kBadStorageID) {
allocator.Release(storage[eid], nid);
allocator->Release(storage[eid], nid);
// use -2 to indicate that the node was never touched.
storage_inplace_index[eid] = -2;
}
......@@ -252,10 +234,70 @@ Graph PlanMemory(Graph ret) {
}
}
}
ret.attrs["storage_id"] = std::make_shared<any>(std::move(storage));
ret.attrs["storage_inplace_index"] = std::make_shared<any>(std::move(storage_inplace_index));
ret.attrs["storage_allocated_bytes"] = std::make_shared<any>(allocator.TotalAllocBytes());
ret.attrs["storage_num_not_allocated"] = std::make_shared<any>(num_not_allocated);
return num_not_allocated;
}
/*
 * Function to plan memory.
 *
 * Computes the reference count of every node entry, then runs AllocMemory
 * once per candidate match range and keeps the plan with the smallest
 * allocator.TotalAllocBytes(). The chosen plan is attached to the graph as the
 * attributes "storage_id", "storage_inplace_index", "storage_allocated_bytes"
 * and "storage_num_not_allocated".
 *
 * Inputs read from the graph / environment:
 *  - attribute "storage" (optional): initial storage assignment; when absent
 *    every entry starts at -1.
 *  - NNVM_EXEC_MATCH_RANGE (default 16): the match range handed to
 *    GraphAllocator; also the upper bound of the search.
 *  - NNVM_AUTO_SEARCH_MATCH_RANGE (default false): when set, candidates
 *    1, 2, 4, ... are tried up to the configured range, and the configured
 *    range itself is always the last candidate.
 *
 * Returns the input graph with the plan attributes attached.
 * */
Graph PlanMemory(Graph ret) {
  // setup ref counter
  const IndexedGraph& idx = ret.indexed_graph();
  static auto& fignore_inputs = Op::GetAttr<FIgnoreInputs>("FIgnoreInputs");
  // reference counter of each node entry
  std::vector<uint32_t> ref_count(idx.num_node_entries(), 0);
  // step 1: initialize reference count
  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
    const auto& inode = idx[nid];
    if (inode.source->is_variable()) continue;
    for (const auto& e : inode.inputs) {
      ++ref_count[idx.entry_id(e)];
    }
    // no dataflow dependency is needed for inputs that are ignored,
    // so revoke the dependency counter for them.
    if (fignore_inputs.count(inode.source->op()) != 0) {
      auto ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs);
      for (uint32_t i : ignore_inputs) {
        --ref_count[idx.entry_id(inode.inputs[i])];
      }
    }
  }
  for (const auto& e : idx.outputs()) {
    ++ref_count[idx.entry_id(e)];
  }
  // step 2: allocate memory.
  StorageVector storage;
  if (ret.attrs.count("storage") != 0) {
    storage = ret.MoveCopyAttr<StorageVector>("storage");
  } else {
    // -1 is the value of GraphAllocator::kBadStorageID
    storage.resize(idx.num_node_entries(), -1);
  }

  // Search the best NNVM_EXEC_MATCH_RANGE parameter. This is turned off by default
  const size_t max_match_range = dmlc::GetEnv("NNVM_EXEC_MATCH_RANGE", 16);
  const bool auto_search = dmlc::GetEnv("NNVM_AUTO_SEARCH_MATCH_RANGE", false);
  // Start from 1 only when there is actually something to search below the
  // configured range; otherwise evaluate the configured range exactly once.
  size_t match_range = (auto_search && max_match_range > 1) ? 1 : max_match_range;
  // An explicit flag instead of a (size_t)-1 sentinel: the first plan is
  // always accepted, whatever its size.
  bool has_plan = false;
  size_t min_allocated_bytes = 0;
  while (true) {
    // Make a copy of related fields, every candidate starts from the same state
    StorageVector storage_vec(storage);
    std::vector<int> storage_inplace_index(idx.num_node_entries(), -1);
    // the allocator
    GraphAllocator allocator(&idx, match_range);
    // number of entries that are not statically allocated.
    // ref_count is passed by value, so each run consumes its own copy.
    size_t storage_num_not_allocated =
        AllocMemory(ret, idx, &storage_vec, &storage_inplace_index, ref_count, &allocator);
    size_t storage_allocated_bytes = allocator.TotalAllocBytes();
    // Choose the plan which leads to minimal memory usage
    if (!has_plan || storage_allocated_bytes < min_allocated_bytes) {
      ret.attrs["storage_id"] = std::make_shared<any>(std::move(storage_vec));
      ret.attrs["storage_inplace_index"] = std::make_shared<any>(std::move(storage_inplace_index));
      ret.attrs["storage_allocated_bytes"] = std::make_shared<any>(storage_allocated_bytes);
      ret.attrs["storage_num_not_allocated"] = std::make_shared<any>(storage_num_not_allocated);
      min_allocated_bytes = storage_allocated_bytes;
      has_plan = true;
    }
    if (match_range >= max_match_range) break;
    // Double the candidate, clamped to the configured range. The clamp
    // guarantees termination: a plain `match_range *= 2` never advances from 0
    // and wraps around to 0 when the configured range is close to SIZE_MAX.
    // It also makes sure a range that is not a power of two is still tried.
    match_range = (match_range > max_match_range / 2) ? max_match_range : match_range * 2;
  }
  return ret;
}
......
......@@ -88,7 +88,6 @@ def test_infer_shape_known_partial():
assert g.json_attr('shape')[jnode_row_ptr[nindex["reshape1"]]] == [2, 4]
assert g.json_attr('shape')[jnode_row_ptr[nindex["add1"]]] == [4, 2]
def test_infer_type():
x = sym.Variable('x', dtype=0)
y = sym.add(x, x, name='add1')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment