Commit 9d880bd3 by Luis Vega Committed by Thierry Moreau

[VTA][TSIM] add virtual memory support to tsim example (#3868)

* [VTA][TSIM] add virtual memory support to tsim example

* fix indentation

* remove USE_TSIM macro and use 32-bit addr instead
parent a711f38e
...@@ -35,8 +35,11 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND ...@@ -35,8 +35,11 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
endif() endif()
file(GLOB TSIM_SW_SRC src/driver.cc) file(GLOB TSIM_SW_SRC src/driver.cc)
list(APPEND TSIM_SW_SRC ${VTA_DIR}/src/vmem/virtual_memory.cc)
list(APPEND TSIM_SW_SRC ${VTA_DIR}/src/dpi/module.cc)
add_library(sw SHARED ${TSIM_SW_SRC}) add_library(sw SHARED ${TSIM_SW_SRC})
target_include_directories(sw PRIVATE ${VTA_DIR}/include) target_include_directories(sw PRIVATE ${VTA_DIR}/include ${VTA_DIR}/src)
if(APPLE) if(APPLE)
set_target_properties(sw PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") set_target_properties(sw PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
......
...@@ -63,13 +63,12 @@ https://www.veripool.org/projects/verilator/wiki/Installing ...@@ -63,13 +63,12 @@ https://www.veripool.org/projects/verilator/wiki/Installing
1. Install `verilator` and `sbt` as described above 1. Install `verilator` and `sbt` as described above
2. Get tvm `git clone https://github.com/dmlc/tvm.git` 2. Get tvm `git clone https://github.com/dmlc/tvm.git`
3. Change VTA target in `tvm/vta/config/vta_config.json` from `sim` to `tsim` 3. Build [tvm](https://docs.tvm.ai/install/from_source.html#build-the-shared-library)
4. Build [tvm](https://docs.tvm.ai/install/from_source.html#build-the-shared-library)
## How to run VTA TSIM examples ## How to run VTA TSIM examples
There are two sample VTA accelerators, add-a-constant, designed in Chisel3 and Verilog to show how *TSIM* works. There are two sample VTA accelerators, add-a-constant, designed in Chisel3 and Verilog to show how *TSIM* works.
The default `TARGET` language for these two implementations is Verilog. The following instructions show The default target language for these two implementations is Verilog. The following instructions show
how to run both of them: how to run both of them:
* Test Verilog backend * Test Verilog backend
......
...@@ -21,15 +21,28 @@ import os.path as osp ...@@ -21,15 +21,28 @@ import os.path as osp
from sys import platform from sys import platform
def get_ext(): def get_ext():
"""Return shared library extension"""
return ".dylib" if platform == "darwin" else ".so" return ".dylib" if platform == "darwin" else ".so"
def load_dll(dll): def load_dll(dll):
"""Load shared library
Parameters
------------
dll : str
Path for shared library
Returns
------------
The shared library
"""
try: try:
return [ctypes.CDLL(dll, ctypes.RTLD_GLOBAL)] return [ctypes.CDLL(dll, ctypes.RTLD_GLOBAL)]
except OSError: except OSError:
return [] return []
def load_sw(): def load_sw():
"""Load all software shared libraries"""
cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__))) cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
sw_libname = "libsw" + get_ext() sw_libname = "libsw" + get_ext()
sw_lib = osp.join(cur_path, "..", "build", sw_libname) sw_lib = osp.join(cur_path, "..", "build", sw_libname)
...@@ -48,11 +61,12 @@ def init(hw_backend): ...@@ -48,11 +61,12 @@ def init(hw_backend):
hw_libname = "libhw" + get_ext() hw_libname = "libhw" + get_ext()
if hw_backend in ("verilog", "chisel"): if hw_backend in ("verilog", "chisel"):
hw_lib = osp.join(cur_path, "..", "hardware", hw_backend, "build", hw_libname) hw_lib = osp.join(cur_path, "..", "hardware", hw_backend, "build", hw_libname)
m = tvm.module.load(hw_lib, "vta-tsim")
load_sw() load_sw()
m = tvm.module.load(hw_lib, "vta-tsim")
f = tvm.get_global_func("tvm.vta.tsim.init") f = tvm.get_global_func("tvm.vta.tsim.init")
f(m) f(m)
def load_module(): def load_module():
"""Return driver function"""
load_sw() load_sw()
return tvm.get_global_func("tvm.vta.driver") return tvm.get_global_func("tvm.vta.driver")
...@@ -21,17 +21,11 @@ ...@@ -21,17 +21,11 @@
#include <tvm/runtime/registry.h> #include <tvm/runtime/registry.h>
#include <vta/dpi/module.h> #include <vta/dpi/module.h>
#include "vmem/virtual_memory.h"
namespace vta { namespace vta {
namespace driver { namespace driver {
uint32_t get_half_addr(void *p, bool upper) {
if (upper) {
return ((uint64_t) ((uint64_t*) p)) >> 32;
} else {
return ((uint64_t) ((uint64_t*) p));
}
}
using vta::dpi::DPIModuleNode; using vta::dpi::DPIModuleNode;
using tvm::runtime::Module; using tvm::runtime::Module;
...@@ -70,11 +64,19 @@ class Device { ...@@ -70,11 +64,19 @@ class Device {
loader_ = DPILoader::Global(); loader_ = DPILoader::Global();
} }
uint32_t Run(uint32_t c, uint32_t length, void* inp, void* out) { uint32_t Run(uint32_t c, DLTensor* a, DLTensor* b) {
uint32_t cycles; uint32_t cycles;
uint32_t len = a->shape[0];
size_t size = (a->dtype.bits >> 3) * len;
a_ = this->MemAlloc(size);
b_ = this->MemAlloc(size);
this->MemCopyFromHost(a_, a->data, size);
this->Init(); this->Init();
this->Launch(c, length, inp, out); this->Launch(c, len);
cycles = this->WaitForCompletion(); cycles = this->WaitForCompletion();
this->MemCopyToHost(b->data, b_, size);
this->MemFree(a_);
this->MemFree(b_);
return cycles; return cycles;
} }
...@@ -84,13 +86,35 @@ class Device { ...@@ -84,13 +86,35 @@ class Device {
dpi_->SimResume(); dpi_->SimResume();
} }
void Launch(uint32_t c, uint32_t length, void* inp, void* out) { void* MemAlloc(size_t size) {
void * addr = vta::vmem::VirtualMemoryManager::Global()->Alloc(size);
return reinterpret_cast<void*>(vta::vmem::VirtualMemoryManager::Global()->GetPhyAddr(addr));
}
void MemFree(void* buf) {
void * addr = vta::vmem::VirtualMemoryManager::Global()->GetAddr(reinterpret_cast<uint64_t>(buf));
vta::vmem::VirtualMemoryManager::Global()->Free(addr);
}
vta_phy_addr_t MemGetPhyAddr(void* buf) {
return reinterpret_cast<uint64_t>(reinterpret_cast<uint64_t*>(buf));
}
void MemCopyFromHost(void* dst, const void* src, size_t size) {
vta::vmem::VirtualMemoryManager::Global()->MemCopyFromHost(dst, src, size);
}
void MemCopyToHost(void* dst, const void* src, size_t size) {
vta::vmem::VirtualMemoryManager::Global()->MemCopyToHost(dst, src, size);
}
void Launch(uint32_t c, uint32_t len) {
dpi_->WriteReg(0x08, c); dpi_->WriteReg(0x08, c);
dpi_->WriteReg(0x0c, length); dpi_->WriteReg(0x0c, len);
dpi_->WriteReg(0x10, get_half_addr(inp, false)); dpi_->WriteReg(0x10, this->MemGetPhyAddr(a_));
dpi_->WriteReg(0x14, get_half_addr(inp, true)); dpi_->WriteReg(0x14, 0);
dpi_->WriteReg(0x18, get_half_addr(out, false)); dpi_->WriteReg(0x18, this->MemGetPhyAddr(b_));
dpi_->WriteReg(0x1c, get_half_addr(out, true)); dpi_->WriteReg(0x1c, 0);
dpi_->WriteReg(0x00, 0x1); // launch dpi_->WriteReg(0x00, 0x1); // launch
} }
...@@ -111,6 +135,10 @@ class Device { ...@@ -111,6 +135,10 @@ class Device {
DPILoader* loader_{nullptr}; DPILoader* loader_{nullptr};
// DPI Module // DPI Module
DPIModuleNode* dpi_{nullptr}; DPIModuleNode* dpi_{nullptr};
// input vm ptr
void* a_{nullptr};
// output vm ptr
void* b_{nullptr};
}; };
using tvm::runtime::TVMRetValue; using tvm::runtime::TVMRetValue;
...@@ -124,10 +152,11 @@ TVM_REGISTER_GLOBAL("tvm.vta.tsim.init") ...@@ -124,10 +152,11 @@ TVM_REGISTER_GLOBAL("tvm.vta.tsim.init")
TVM_REGISTER_GLOBAL("tvm.vta.driver") TVM_REGISTER_GLOBAL("tvm.vta.driver")
.set_body([](TVMArgs args, TVMRetValue* rv) { .set_body([](TVMArgs args, TVMRetValue* rv) {
Device dev_;
DLTensor* A = args[0]; DLTensor* A = args[0];
DLTensor* B = args[1]; DLTensor* B = args[1];
Device dev_; uint32_t c = static_cast<int>(args[2]);
uint32_t cycles = dev_.Run(static_cast<int>(args[2]), A->shape[0], A->data, B->data); uint32_t cycles = dev_.Run(c, A, B);
*rv = static_cast<int>(cycles); *rv = static_cast<int>(cycles);
}); });
......
...@@ -21,11 +21,12 @@ import tsim ...@@ -21,11 +21,12 @@ import tsim
def test_accel(): def test_accel():
rmax = 64 rmax = 64
dtype = "uint64"
n = np.random.randint(1, rmax) n = np.random.randint(1, rmax)
c = np.random.randint(0, rmax) c = np.random.randint(0, rmax)
ctx = tvm.cpu(0) ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.randint(rmax, size=n).astype("uint64"), ctx) a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
b = tvm.nd.array(np.zeros(n).astype("uint64"), ctx) b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
f = tsim.load_module() f = tsim.load_module()
cycles = f(a, b, c) cycles = f(a, b, c)
msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c) msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)
......
...@@ -21,11 +21,12 @@ import tsim ...@@ -21,11 +21,12 @@ import tsim
def test_accel(): def test_accel():
rmax = 64 rmax = 64
dtype = "uint64"
n = np.random.randint(1, rmax) n = np.random.randint(1, rmax)
c = np.random.randint(0, rmax) c = np.random.randint(0, rmax)
ctx = tvm.cpu(0) ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.randint(rmax, size=n).astype("uint64"), ctx) a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
b = tvm.nd.array(np.zeros(n).astype("uint64"), ctx) b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
f = tsim.load_module() f = tsim.load_module()
cycles = f(a, b, c) cycles = f(a, b, c)
msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c) msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment