[VTA][TSIM] add virtual memory support to tsim example (#3868)

* [VTA][TSIM] add virtual memory support to tsim example * fix indentation * remove USE_TSIM macro and use 32-bit addr instead

[VTA][TSIM] add virtual memory support to tsim example (#3868)
* [VTA][TSIM] add virtual memory support to tsim example * fix indentation * remove USE_TSIM macro and use 32-bit addr instead
9d880bd3 · Luis Vega · Thierry Moreau · a711f38e · 9d880bd3 · 9d880bd3
Commit 9d880bd3 authored Aug 31, 2019 by Luis Vega Committed by Thierry Moreau Aug 31, 2019
6 changed files
--- a/vta/apps/tsim_example/CMakeLists.txt
+++ b/vta/apps/tsim_example/CMakeLists.txt
@@ -35,8 +35,11 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
 endif()

 file(GLOB TSIM_SW_SRC src/driver.cc)
+list(APPEND TSIM_SW_SRC ${VTA_DIR}/src/vmem/virtual_memory.cc)
+list(APPEND TSIM_SW_SRC ${VTA_DIR}/src/dpi/module.cc)
+
 add_library(sw SHARED ${TSIM_SW_SRC})
-target_include_directories(sw PRIVATE ${VTA_DIR}/include)
+target_include_directories(sw PRIVATE ${VTA_DIR}/include ${VTA_DIR}/src)

 if(APPLE)
  set_target_properties(sw PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")

--- a/vta/apps/tsim_example/README.md
+++ b/vta/apps/tsim_example/README.md
@@ -52,7 +52,7 @@ Verilator version check
 verilator --version
 ```

-the supported version of Verilator should be at least 4.012, 
+the supported version of Verilator should be at least 4.012,
 if homebrew (OSX) or package-manager (Linux) does not support that version,
 please install Verilator 4.012 or later from binary or source base on following
 instruction of Verilator wiki.  
@@ -63,13 +63,12 @@ https://www.veripool.org/projects/verilator/wiki/Installing

 1. Install `verilator` and `sbt` as described above
 2. Get tvm `git clone https://github.com/dmlc/tvm.git`
-3. Change VTA target in `tvm/vta/config/vta_config.json` from `sim` to `tsim`
-4. Build [tvm](https://docs.tvm.ai/install/from_source.html#build-the-shared-library)
+3. Build [tvm](https://docs.tvm.ai/install/from_source.html#build-the-shared-library)

 ## How to run VTA TSIM examples

 There are two sample VTA accelerators, add-a-constant, designed in Chisel3 and Verilog to show how *TSIM* works.
-The default `TARGET` language for these two implementations is Verilog. The following instructions show
+The default target language for these two implementations is Verilog. The following instructions show
 how to run both of them:

 * Test Verilog backend

--- a/vta/apps/tsim_example/python/tsim.py
+++ b/vta/apps/tsim_example/python/tsim.py
@@ -21,15 +21,28 @@ import os.path as osp
 from sys import platform

 def get_ext():
+    """Return shared library extension"""
    return ".dylib" if platform == "darwin" else ".so"

 def load_dll(dll):
+    """Load shared library
+
+    Parameters
+    ------------
+    dll : str
+        Path for shared library
+
+    Returns
+    ------------
+    The shared library
+    """
    try:
        return [ctypes.CDLL(dll, ctypes.RTLD_GLOBAL)]
    except OSError:
        return []

 def load_sw():
+    """Load all software shared libraries"""
    cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
    sw_libname = "libsw" + get_ext()
    sw_lib = osp.join(cur_path, "..", "build", sw_libname)
@@ -38,9 +51,9 @@ def load_sw():
 def init(hw_backend):
    """Init hardware and software shared library for accelerator

-     Parameters
-     ------------
-     hw_backend : str
+    Parameters
+    ------------
+    hw_backend : str
        Hardware backend can be verilog or chisel

    """
@@ -48,11 +61,12 @@ def init(hw_backend):
    hw_libname = "libhw" + get_ext()
    if hw_backend in ("verilog", "chisel"):
        hw_lib = osp.join(cur_path, "..", "hardware", hw_backend, "build", hw_libname)
-    m = tvm.module.load(hw_lib, "vta-tsim")
    load_sw()
+    m = tvm.module.load(hw_lib, "vta-tsim")
    f = tvm.get_global_func("tvm.vta.tsim.init")
    f(m)

 def load_module():
+    """Return driver function"""
    load_sw()
    return tvm.get_global_func("tvm.vta.driver")
--- a/vta/apps/tsim_example/src/driver.cc
+++ b/vta/apps/tsim_example/src/driver.cc
@@ -21,17 +21,11 @@
 #include <tvm/runtime/registry.h>
 #include <vta/dpi/module.h>

+#include "vmem/virtual_memory.h"
+
 namespace vta {
 namespace driver {

-uint32_t get_half_addr(void *p, bool upper) {
-  if (upper) {
-    return ((uint64_t) ((uint64_t*) p)) >> 32;
-  } else {
-    return ((uint64_t) ((uint64_t*) p));
-  }
-}
-
 using vta::dpi::DPIModuleNode;
 using tvm::runtime::Module;

@@ -70,11 +64,19 @@ class Device {
    loader_ = DPILoader::Global();
  }

-  uint32_t Run(uint32_t c, uint32_t length, void* inp, void* out) {
+  uint32_t Run(uint32_t c, DLTensor* a, DLTensor* b) {
    uint32_t cycles;
+    uint32_t len = a->shape[0];
+    size_t size = (a->dtype.bits >> 3) * len;
+    a_ = this->MemAlloc(size);
+    b_ = this->MemAlloc(size);
+    this->MemCopyFromHost(a_, a->data, size);
    this->Init();
-    this->Launch(c, length, inp, out);
+    this->Launch(c, len);
    cycles = this->WaitForCompletion();
+    this->MemCopyToHost(b->data, b_, size);
+    this->MemFree(a_);
+    this->MemFree(b_);
    return cycles;
  }

@@ -84,13 +86,35 @@ class Device {
    dpi_->SimResume();
  }

-  void Launch(uint32_t c, uint32_t length, void* inp, void* out) {
+  void* MemAlloc(size_t size) {
+    void * addr = vta::vmem::VirtualMemoryManager::Global()->Alloc(size);
+    return reinterpret_cast<void*>(vta::vmem::VirtualMemoryManager::Global()->GetPhyAddr(addr));
+  }
+
+  void MemFree(void* buf) {
+    void * addr = vta::vmem::VirtualMemoryManager::Global()->GetAddr(reinterpret_cast<uint64_t>(buf));
+    vta::vmem::VirtualMemoryManager::Global()->Free(addr);
+  }
+
+  vta_phy_addr_t MemGetPhyAddr(void* buf) {
+    return reinterpret_cast<uint64_t>(reinterpret_cast<uint64_t*>(buf));
+  }
+
+  void MemCopyFromHost(void* dst, const void* src, size_t size) {
+    vta::vmem::VirtualMemoryManager::Global()->MemCopyFromHost(dst, src, size);
+  }
+
+  void MemCopyToHost(void* dst, const void* src, size_t size) {
+    vta::vmem::VirtualMemoryManager::Global()->MemCopyToHost(dst, src, size);
+  }
+
+  void Launch(uint32_t c, uint32_t len) {
    dpi_->WriteReg(0x08, c);
-    dpi_->WriteReg(0x0c, length);
-    dpi_->WriteReg(0x10, get_half_addr(inp, false));
-    dpi_->WriteReg(0x14, get_half_addr(inp, true));
-    dpi_->WriteReg(0x18, get_half_addr(out, false));
-    dpi_->WriteReg(0x1c, get_half_addr(out, true));
+    dpi_->WriteReg(0x0c, len);
+    dpi_->WriteReg(0x10, this->MemGetPhyAddr(a_));
+    dpi_->WriteReg(0x14, 0);
+    dpi_->WriteReg(0x18, this->MemGetPhyAddr(b_));
+    dpi_->WriteReg(0x1c, 0);
    dpi_->WriteReg(0x00, 0x1); // launch
  }

@@ -111,6 +135,10 @@ class Device {
  DPILoader* loader_{nullptr};
  // DPI Module
  DPIModuleNode* dpi_{nullptr};
+  // input vm ptr
+  void* a_{nullptr};
+  // output vm ptr
+  void* b_{nullptr};
 };

 using tvm::runtime::TVMRetValue;
@@ -124,10 +152,11 @@ TVM_REGISTER_GLOBAL("tvm.vta.tsim.init")

 TVM_REGISTER_GLOBAL("tvm.vta.driver")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
+    Device dev_;
    DLTensor* A = args[0];
    DLTensor* B = args[1];
-    Device dev_;
-    uint32_t cycles = dev_.Run(static_cast<int>(args[2]), A->shape[0], A->data, B->data);
+    uint32_t c = static_cast<int>(args[2]);
+    uint32_t cycles = dev_.Run(c, A, B);
    *rv = static_cast<int>(cycles);
  });


--- a/vta/apps/tsim_example/tests/python/chisel_accel.py
+++ b/vta/apps/tsim_example/tests/python/chisel_accel.py
@@ -21,11 +21,12 @@ import tsim

 def test_accel():
    rmax = 64
+    dtype = "uint64"
    n = np.random.randint(1, rmax)
    c = np.random.randint(0, rmax)
    ctx = tvm.cpu(0)
-    a = tvm.nd.array(np.random.randint(rmax, size=n).astype("uint64"), ctx)
-    b = tvm.nd.array(np.zeros(n).astype("uint64"), ctx)
+    a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
+    b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
    f = tsim.load_module()
    cycles = f(a, b, c)
    msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)

--- a/vta/apps/tsim_example/tests/python/verilog_accel.py
+++ b/vta/apps/tsim_example/tests/python/verilog_accel.py
@@ -21,11 +21,12 @@ import tsim

 def test_accel():
    rmax = 64
+    dtype = "uint64"
    n = np.random.randint(1, rmax)
    c = np.random.randint(0, rmax)
    ctx = tvm.cpu(0)
-    a = tvm.nd.array(np.random.randint(rmax, size=n).astype("uint64"), ctx)
-    b = tvm.nd.array(np.zeros(n).astype("uint64"), ctx)
+    a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
+    b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
    f = tsim.load_module()
    cycles = f(a, b, c)
    msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)