Commit 9d880bd3 by Luis Vega Committed by Thierry Moreau

[VTA][TSIM] add virtual memory support to tsim example (#3868)

* [VTA][TSIM] add virtual memory support to tsim example

* fix indentation

* remove USE_TSIM macro and use 32-bit addr instead
parent a711f38e
......@@ -35,8 +35,11 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
endif()
# Collect the sources of the software library: the example's driver plus the
# virtual-memory and DPI module sources found under ${VTA_DIR}/src.
file(GLOB TSIM_SW_SRC src/driver.cc)
list(APPEND TSIM_SW_SRC ${VTA_DIR}/src/vmem/virtual_memory.cc)
list(APPEND TSIM_SW_SRC ${VTA_DIR}/src/dpi/module.cc)
# Build the collected sources into the shared library target `sw`.
add_library(sw SHARED ${TSIM_SW_SRC})
# Private header search paths for `sw`.
# NOTE(review): the two calls below look like the old and new revision of the
# same line; the second one additionally exposes ${VTA_DIR}/src — confirm.
target_include_directories(sw PRIVATE ${VTA_DIR}/include)
target_include_directories(sw PRIVATE ${VTA_DIR}/include ${VTA_DIR}/src)
if(APPLE)
set_target_properties(sw PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
......
......@@ -52,7 +52,7 @@ Verilator version check
verilator --version
```
the supported version of Verilator should be at least 4.012,
the supported version of Verilator should be at least 4.012,
if Homebrew (OSX) or your package manager (Linux) does not provide that version,
please install Verilator 4.012 or later from a binary or from source, following
the instructions on the Verilator wiki.
......@@ -63,13 +63,12 @@ https://www.veripool.org/projects/verilator/wiki/Installing
1. Install `verilator` and `sbt` as described above
2. Get tvm `git clone https://github.com/dmlc/tvm.git`
3. Change VTA target in `tvm/vta/config/vta_config.json` from `sim` to `tsim`
4. Build [tvm](https://docs.tvm.ai/install/from_source.html#build-the-shared-library)
3. Build [tvm](https://docs.tvm.ai/install/from_source.html#build-the-shared-library)
## How to run VTA TSIM examples
There are two sample VTA accelerators (add-a-constant), one designed in Chisel3 and one in Verilog, that show how *TSIM* works.
The default `TARGET` language for these two implementations is Verilog. The following instructions show
The default target language for these two implementations is Verilog. The following instructions show
how to run both of them:
* Test Verilog backend
......
......@@ -21,15 +21,28 @@ import os.path as osp
from sys import platform
def get_ext():
    """Return the shared library file extension for this platform

    Returns
    ------------
    ext : str
        ".dylib" when the platform string is "darwin", ".so" otherwise
    """
    if platform == "darwin":
        return ".dylib"
    return ".so"
def load_dll(dll):
    """Try to load a shared library with RTLD_GLOBAL

    Parameters
    ------------
    dll : str
        Path for shared library

    Returns
    ------------
    libs : list
        A one-element list holding the loaded ctypes.CDLL handle,
        or an empty list when loading raised OSError
    """
    loaded = []
    try:
        handle = ctypes.CDLL(dll, ctypes.RTLD_GLOBAL)
    except OSError:
        # Best effort: a library that cannot be loaded yields an empty list.
        return loaded
    loaded.append(handle)
    return loaded
def load_sw():
"""Load all software shared libraries"""
cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
sw_libname = "libsw" + get_ext()
sw_lib = osp.join(cur_path, "..", "build", sw_libname)
......@@ -38,9 +51,9 @@ def load_sw():
def init(hw_backend):
"""Init hardware and software shared library for accelerator
Parameters
------------
hw_backend : str
Parameters
------------
hw_backend : str
Hardware backend can be verilog or chisel
"""
......@@ -48,11 +61,12 @@ def init(hw_backend):
hw_libname = "libhw" + get_ext()
if hw_backend in ("verilog", "chisel"):
hw_lib = osp.join(cur_path, "..", "hardware", hw_backend, "build", hw_libname)
m = tvm.module.load(hw_lib, "vta-tsim")
load_sw()
m = tvm.module.load(hw_lib, "vta-tsim")
f = tvm.get_global_func("tvm.vta.tsim.init")
f(m)
def load_module():
    """Load the software libraries and return the driver function

    Returns
    ------------
    The global function registered under the name "tvm.vta.driver"
    """
    load_sw()
    driver = tvm.get_global_func("tvm.vta.driver")
    return driver
......@@ -21,17 +21,11 @@
#include <tvm/runtime/registry.h>
#include <vta/dpi/module.h>
#include "vmem/virtual_memory.h"
namespace vta {
namespace driver {
// Return one 32-bit half of the pointer's value viewed as a 64-bit integer.
//   p     : pointer whose address value is split
//   upper : true selects bits [63:32], false selects bits [31:0]
uint32_t get_half_addr(void *p, bool upper) {
  const uint64_t addr = reinterpret_cast<uint64_t>(p);
  const uint64_t half = upper ? (addr >> 32) : addr;
  // Narrowing keeps only the low 32 bits of the selected half.
  return static_cast<uint32_t>(half);
}
using vta::dpi::DPIModuleNode;
using tvm::runtime::Module;
......@@ -70,11 +64,19 @@ class Device {
loader_ = DPILoader::Global();
}
// Run one job on the device and return the value produced by
// WaitForCompletion().
// NOTE(review): two signatures and two Launch calls are visible below; this
// looks like an old and a new revision of the method interleaved — confirm
// which lines belong to the current file before relying on this block.
uint32_t Run(uint32_t c, uint32_t length, void* inp, void* out) {
uint32_t Run(uint32_t c, DLTensor* a, DLTensor* b) {
uint32_t cycles;
// Element count is taken from the first dimension of `a`.
uint32_t len = a->shape[0];
// Buffer size in bytes: (bits per element / 8) * element count.
size_t size = (a->dtype.bits >> 3) * len;
// Allocate one buffer for the input and one for the output, both `size` bytes.
a_ = this->MemAlloc(size);
b_ = this->MemAlloc(size);
// Copy the contents of `a` into the input buffer before starting.
this->MemCopyFromHost(a_, a->data, size);
this->Init();
this->Launch(c, length, inp, out);
this->Launch(c, len);
cycles = this->WaitForCompletion();
// Copy the output buffer back into `b`, then release both buffers.
this->MemCopyToHost(b->data, b_, size);
this->MemFree(a_);
this->MemFree(b_);
return cycles;
}
......@@ -84,13 +86,35 @@ class Device {
dpi_->SimResume();
}
void Launch(uint32_t c, uint32_t length, void* inp, void* out) {
void* MemAlloc(size_t size) {
void * addr = vta::vmem::VirtualMemoryManager::Global()->Alloc(size);
return reinterpret_cast<void*>(vta::vmem::VirtualMemoryManager::Global()->GetPhyAddr(addr));
}
void MemFree(void* buf) {
void * addr = vta::vmem::VirtualMemoryManager::Global()->GetAddr(reinterpret_cast<uint64_t>(buf));
vta::vmem::VirtualMemoryManager::Global()->Free(addr);
}
// Return the value of `buf` as a vta_phy_addr_t.
// NOTE(review): the pointer is first viewed as a 64-bit integer; if
// vta_phy_addr_t is narrower than 64 bits the upper bits are dropped by the
// implicit conversion on return — confirm this is intended.
vta_phy_addr_t MemGetPhyAddr(void* buf) {
  const uint64_t addr = reinterpret_cast<uint64_t>(buf);
  return addr;
}
void MemCopyFromHost(void* dst, const void* src, size_t size) {
vta::vmem::VirtualMemoryManager::Global()->MemCopyFromHost(dst, src, size);
}
void MemCopyToHost(void* dst, const void* src, size_t size) {
vta::vmem::VirtualMemoryManager::Global()->MemCopyToHost(dst, src, size);
}
// Program the device registers through dpi_->WriteReg and start the run.
// NOTE(review): registers 0x0c-0x1c are each written twice below, once with
// names (`length`, `inp`, `out`) that are not parameters of this signature;
// this looks like an old and a new revision interleaved — confirm which
// lines belong to the current file.
void Launch(uint32_t c, uint32_t len) {
// 0x08 receives the constant `c`.
dpi_->WriteReg(0x08, c);
// 0x0c receives the length; 0x10/0x14 the lower/upper 32 bits of the input
// address; 0x18/0x1c the lower/upper 32 bits of the output address.
dpi_->WriteReg(0x0c, length);
dpi_->WriteReg(0x10, get_half_addr(inp, false));
dpi_->WriteReg(0x14, get_half_addr(inp, true));
dpi_->WriteReg(0x18, get_half_addr(out, false));
dpi_->WriteReg(0x1c, get_half_addr(out, true));
// Same registers written from the member buffers a_/b_; the upper-half
// registers 0x14 and 0x1c are set to zero here.
dpi_->WriteReg(0x0c, len);
dpi_->WriteReg(0x10, this->MemGetPhyAddr(a_));
dpi_->WriteReg(0x14, 0);
dpi_->WriteReg(0x18, this->MemGetPhyAddr(b_));
dpi_->WriteReg(0x1c, 0);
dpi_->WriteReg(0x00, 0x1); // launch
}
......@@ -111,6 +135,10 @@ class Device {
DPILoader* loader_{nullptr};
// DPI Module
DPIModuleNode* dpi_{nullptr};
// input vm ptr
void* a_{nullptr};
// output vm ptr
void* b_{nullptr};
};
using tvm::runtime::TVMRetValue;
......@@ -124,10 +152,11 @@ TVM_REGISTER_GLOBAL("tvm.vta.tsim.init")
// Registers the packed function "tvm.vta.driver". It reads args[0] and
// args[1] as DLTensor pointers and args[2] as an int, runs a Device with
// them, and stores the value returned by Run() into *rv as an int.
// NOTE(review): `dev_` and `cycles` are each declared twice below; this looks
// like an old and a new revision interleaved — confirm which lines belong to
// the current file.
TVM_REGISTER_GLOBAL("tvm.vta.driver")
.set_body([](TVMArgs args, TVMRetValue* rv) {
Device dev_;
DLTensor* A = args[0];
DLTensor* B = args[1];
Device dev_;
uint32_t cycles = dev_.Run(static_cast<int>(args[2]), A->shape[0], A->data, B->data);
// args[2] is read as an int and stored in an unsigned 32-bit value.
uint32_t c = static_cast<int>(args[2]);
uint32_t cycles = dev_.Run(c, A, B);
*rv = static_cast<int>(cycles);
});
......
......@@ -21,11 +21,12 @@ import tsim
def test_accel():
rmax = 64
dtype = "uint64"
n = np.random.randint(1, rmax)
c = np.random.randint(0, rmax)
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.randint(rmax, size=n).astype("uint64"), ctx)
b = tvm.nd.array(np.zeros(n).astype("uint64"), ctx)
a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
f = tsim.load_module()
cycles = f(a, b, c)
msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)
......
......@@ -21,11 +21,12 @@ import tsim
def test_accel():
rmax = 64
dtype = "uint64"
n = np.random.randint(1, rmax)
c = np.random.randint(0, rmax)
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.randint(rmax, size=n).astype("uint64"), ctx)
b = tvm.nd.array(np.zeros(n).astype("uint64"), ctx)
a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
f = tsim.load_module()
cycles = f(a, b, c)
msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment