[VTA] VTA hardware/software codebase re-org (#5037)

683ed4a3 · Thierry Moreau · GitHub · 14ba49c6 · 683ed4a3 · 683ed4a3
Unverified Commit 683ed4a3 authored Mar 11, 2020 by Thierry Moreau Committed by GitHub Mar 11, 2020
155 changed files
--- a/Makefile
+++ b/Makefile
@@ -81,7 +81,7 @@ jnilint:
 	python3 3rdparty/dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
 scalalint:
-	make -C vta/hardware/chisel lint
+	make -C vta/vta-hw/hardware/chisel lint
 lint: cpplint pylint jnilint scalalint

--- a/apps/vta_rpc/start_rpc_server_to_tracker.sh
+++ b/apps/vta_rpc/start_rpc_server_to_tracker.sh
@@ -18,7 +18,7 @@
 PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )"
 # Derive target specified by vta_config.json
-VTA_CONFIG=${PROJROOT}/vta/config/vta_config.py
+VTA_CONFIG=${PROJROOT}/vta/vta-hw/config/vta_config.py
 TARGET=$(python ${VTA_CONFIG} --target)
 export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python

--- a/cmake/modules/VTA.cmake
+++ b/cmake/modules/VTA.cmake
@@ -18,14 +18,17 @@
 # CMake Build rules for VTA
 find_program(PYTHON NAMES python python3 python3.6)
+# VTA sources directory
+set(VTA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vta/vta-hw)
 if(MSVC)
  message(STATUS "VTA build is skipped in Windows..")
 elseif(PYTHON)
-  set(VTA_CONFIG ${PYTHON} ${CMAKE_CURRENT_SOURCE_DIR}/vta/config/vta_config.py)
+  set(VTA_CONFIG ${PYTHON} ${VTA_DIR}/config/vta_config.py)
  if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/vta_config.json)
    message(STATUS "Use VTA config " ${CMAKE_CURRENT_BINARY_DIR}/vta_config.json)
-    set(VTA_CONFIG ${PYTHON} ${CMAKE_CURRENT_SOURCE_DIR}/vta/config/vta_config.py
+    set(VTA_CONFIG ${PYTHON} ${VTA_DIR}/config/vta_config.py
      --use-cfg=${CMAKE_CURRENT_BINARY_DIR}/vta_config.json)
  endif()
@@ -40,18 +43,18 @@ elseif(PYTHON)
  # Fast simulator driver build
  if(USE_VTA_FSIM)
    # Add fsim driver sources
-    file(GLOB FSIM_RUNTIME_SRCS vta/src/*.cc)
+    file(GLOB FSIM_RUNTIME_SRCS ${VTA_DIR}/src/*.cc)
-    list(APPEND FSIM_RUNTIME_SRCS vta/src/sim/sim_driver.cc)
+    file(GLOB FSIM_RUNTIME_SRCS vta/runtime/*.cc)
-    list(APPEND FSIM_RUNTIME_SRCS vta/src/vmem/virtual_memory.cc vta/src/vmem/virtual_memory.h)
+    list(APPEND FSIM_RUNTIME_SRCS ${VTA_DIR}/src/sim/sim_driver.cc)
-    list(APPEND FSIM_RUNTIME_SRCS vta/src/sim/sim_tlpp.cc)
+    list(APPEND FSIM_RUNTIME_SRCS ${VTA_DIR}/src/sim/sim_tlpp.cc)
+    list(APPEND FSIM_RUNTIME_SRCS ${VTA_DIR}/src/vmem/virtual_memory.cc)
    # Target lib: vta_fsim
    add_library(vta_fsim SHARED ${FSIM_RUNTIME_SRCS})
-    target_include_directories(vta_fsim PUBLIC vta/include)
+    target_include_directories(vta_fsim PUBLIC ${VTA_DIR}/include)
    foreach(__def ${VTA_DEFINITIONS})
      string(SUBSTRING ${__def} 3 -1 __strip_def)
      target_compile_definitions(vta_fsim PUBLIC ${__strip_def})
    endforeach()
-    include_directories("vta/include")
    if(APPLE)
      set_target_properties(vta_fsim PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
    endif(APPLE)
@@ -61,18 +64,18 @@ elseif(PYTHON)
  # Cycle accurate simulator driver build
  if(USE_VTA_TSIM)
    # Add tsim driver sources
-    file(GLOB TSIM_RUNTIME_SRCS vta/src/*.cc)
+    file(GLOB TSIM_RUNTIME_SRCS ${VTA_DIR}/src/*.cc)
-    list(APPEND TSIM_RUNTIME_SRCS vta/src/tsim/tsim_driver.cc)
+    file(GLOB TSIM_RUNTIME_SRCS vta/runtime/*.cc)
-    list(APPEND TSIM_RUNTIME_SRCS vta/src/dpi/module.cc)
+    list(APPEND TSIM_RUNTIME_SRCS ${VTA_DIR}/src/tsim/tsim_driver.cc)
-    list(APPEND TSIM_RUNTIME_SRCS vta/src/vmem/virtual_memory.cc vta/src/vmem/virtual_memory.h)
+    list(APPEND TSIM_RUNTIME_SRCS ${VTA_DIR}/src/dpi/module.cc)
+    list(APPEND TSIM_RUNTIME_SRCS ${VTA_DIR}/src/vmem/virtual_memory.cc)
    # Target lib: vta_tsim
    add_library(vta_tsim SHARED ${TSIM_RUNTIME_SRCS})
-    target_include_directories(vta_tsim PUBLIC vta/include)
+    target_include_directories(vta_tsim PUBLIC ${VTA_DIR}/include)
    foreach(__def ${VTA_DEFINITIONS})
      string(SUBSTRING ${__def} 3 -1 __strip_def)
      target_compile_definitions(vta_tsim PUBLIC ${__strip_def})
    endforeach()
-    include_directories("vta/include")
    if(APPLE)
      set_target_properties(vta_tsim PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
    endif(APPLE)
@@ -80,15 +83,15 @@ elseif(PYTHON)
  # VTA FPGA driver sources
  if(USE_VTA_FPGA)
-    file(GLOB FPGA_RUNTIME_SRCS vta/src/*.cc)
+    file(GLOB FPGA_RUNTIME_SRCS ${VTA_DIR}/src/*.cc)
    # Rules for Zynq-class FPGAs with pynq OS support (see pynq.io)
    if(${VTA_TARGET} STREQUAL "pynq" OR
       ${VTA_TARGET} STREQUAL "ultra96")
-      list(APPEND FPGA_RUNTIME_SRCS vta/src/pynq/pynq_driver.cc)
+      list(APPEND FPGA_RUNTIME_SRCS ${VTA_DIR}/src/pynq/pynq_driver.cc)
      # Rules for Pynq v2.4
      find_library(__cma_lib NAMES cma PATH /usr/lib)
    elseif(${VTA_TARGET} STREQUAL "de10nano")  # DE10-Nano rules
-      file(GLOB FPGA_RUNTIME_SRCS vta/src/de10nano/*.cc vta/src/*.cc)
+      file(GLOB FPGA_RUNTIME_SRCS ${VTA_DIR}/src/de10nano/*.cc ${VTA_DIR}/src/*.cc)
    endif()
    # Target lib: vta
    add_library(vta SHARED ${FPGA_RUNTIME_SRCS})
@@ -102,7 +105,7 @@ elseif(PYTHON)
      target_link_libraries(vta ${__cma_lib})
    elseif(${VTA_TARGET} STREQUAL "de10nano")  # DE10-Nano rules
     #target_compile_definitions(vta PUBLIC VTA_MAX_XFER=2097152) # (1<<21)
-      target_include_directories(vta PUBLIC vta/src/de10nano)
+      target_include_directories(vta PUBLIC ${VTA_DIR}/src/de10nano)
      target_include_directories(vta PUBLIC 3rdparty)
      target_include_directories(vta PUBLIC
        "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include")

--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -770,7 +770,7 @@ WARN_LOGFILE           =
 # spaces.
 # Note: If this tag is empty the current directory is searched.
-INPUT                  = include/tvm topi/include/topi vta/include/vta
+INPUT                  = include/tvm topi/include/topi vta/vta-hw/include/vta
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses

--- a/docs/vta/dev/config.rst
+++ b/docs/vta/dev/config.rst
@@ -21,7 +21,7 @@ VTA Configuration
 The VTA stack incorporates both a hardware accelerator stack and
 a TVM based software stack.
 VTA incorporates flexibility out of the box: by modifying the
-``vta/config/vta_config.json`` high-level configuration file,
+``vta/vta-hw/config/vta_config.json`` high-level configuration file,
 the user can change the shape of the tensor intrinsic,
 clock frequency, pipelining, data type width, and on-chip buffer sizes.

--- a/docs/vta/dev/hardware.rst
+++ b/docs/vta/dev/hardware.rst
@@ -53,17 +53,17 @@ HLS Hardware Source Organization
 The VTA design is currently specified in Vivado HLS C++, which is only supported
 by Xilinx toolchains.
-The VTA hardware sources are contained under ``vta/hardware/xilinx/sources``:
+The VTA hardware sources are contained under ``vta/vta-hw/hardware/xilinx/sources``:
 - ``vta.cc`` contains the definitions for each VTA module, as well as a top
   level behavioral model for the top-level VTA design.
 - ``vta.h`` contains type definitions using Xilinx ``ap_int`` types, and
   function prototypes declarations.
-In addition preprocessor macros are defined under ``vta/include/vta/hw_spec.h``.
+In addition preprocessor macros are defined under ``vta/vta-hw/include/vta/hw_spec.h``.
 Much of these macro definitions are derived from the parameters listed in the
-``vta/config/vta_config.json`` file.
+``vta/vta-hw/config/vta_config.json`` file.
-The json file is processed by ``vta/config/vta_config.py`` to produce a string of
+The json file is processed by ``vta/vta-hw/config/vta_config.py`` to produce a string of
 compile flags that define the preprocessor macros.
 That string is used by the makefile in order to set those high-level
 parameters in both the HLS hardware synthesis compiler, and the C++
@@ -220,7 +220,7 @@ Microarchitectural Overview
 ---------------------------
 We describe the modules that compose the VTA design.
-The module definitions are contained in ``vta/hardware/xilinx/sources/vta.cc``.
+The module definitions are contained in ``vta/vta-hw/hardware/xilinx/sources/vta.cc``.
 Fetch Module
 ~~~~~~~~~~~~
@@ -234,7 +234,7 @@ The fetch module is the entry point of VTA to the CPU and is programmed via thre
 The CPU prepares the instruction stream in DRAM in a physically-contiguous buffer prepared by the VTA runtime.
 When the instruction stream is ready, the CPU writes the start physical address into the ``insns`` register, the length of the instruction stream into the ``insn_count`` register, and asserts the start signal in the ``control`` register.
-This procedure starts VTA, which reads in the instruction stream from DRAM via DMA. 
+This procedure starts VTA, which reads in the instruction stream from DRAM via DMA.
 Upon accessing the instruction stream, the fetch module partially decodes instructions, and pushes those instructions into command queues that feed into the load, compute, and store modules:

--- a/docs/vta/install.md
+++ b/docs/vta/install.md
@@ -52,7 +52,7 @@ You are invited to try out our [VTA programming tutorials](https://docs.tvm.ai/v
 ### Advanced Configuration (optional)
 VTA is a generic configurable deep learning accelerator.
-The configuration is specified by `vta_config.json` under the TVM root folder.
+The configuration is specified by `vta_config.json` under `vta/vta-hw/config`.
 This file provides an architectural specification of the VTA accelerator to parameterize the TVM compiler stack and the VTA hardware stack.
 The VTA configuration file also specifies the TVM compiler target.
@@ -62,9 +62,9 @@ To do so,
 ```bash
 cd <tvm root>
-vim vta/config/vta_config.json
+vim vta/vta-hw/config/vta_config.json
 # edit vta_config.json
-make vta
+make
 ```
 ## VTA Pynq-Based Test Setup
@@ -122,7 +122,7 @@ echo 'set(USE_VTA_FSIM OFF)' >> build/config.cmake
 echo 'set(USE_VTA_TSIM OFF)' >> build/config.cmake
 echo 'set(USE_VTA_FPGA ON)' >> build/config.cmake
 # Copy pynq specific configuration
-cp vta/config/pynq_sample.json vta/config/vta_config.json
+cp vta/vta-hw/config/pynq_sample.json vta/vta-hw/config/vta_config.json
 cd build
 cmake ..
 make runtime vta -j2
@@ -156,7 +156,7 @@ In addition, you'll need to edit the `vta_config.json` file on the host to indic
 ```bash
 # On the Host-side
 cd <tvm root>
-cp vta/config/pynq_sample.json vta/config/vta_config.json
+cp vta/vta-hw/config/pynq_sample.json vta/vta-hw/config/vta_config.json
 ```
 This time again, we will run the 2D convolution testbench.
@@ -185,7 +185,7 @@ You can also try out our [VTA programming tutorials](https://docs.tvm.ai/vta/tut
 ## VTA Custom Test Setup for Intel FPGA
-Similar to the PYNQ side setup steps, this third guide bring us the details on how can we setup up the Linux environment for Intel FPGA boards like DE10-Nano. 
+Similar to the PYNQ-side setup steps, this third guide details how to set up the Linux environment for Intel FPGA boards like the DE10-Nano.
 In terms of hardware components, you would need the [DE10-Nano Development Kit](https://www.terasic.com.tw/cgi-bin/page/archive.pl?Language=English&No=1046), which can be acquired for $130, or $100 for academics from [Terasic](https://www.terasic.com.tw/). A microSD card would be delivered the kit. Power cables and USB cables would be included as well. However, an additional Ethernet cable would be needed to connect the board to LAN.
@@ -213,7 +213,7 @@ tar xf de10-nano-image-Angstrom-v2016.12.socfpga-sdimg.2017.03.31.tgz
 This would produce a single SD card image named `de10-nano-image-Angstrom-v2016.12.socfpga-sdimg` (approx. 2.4 GB), it contains all the file systems to boot Angstrom Linux.
-Second, plugin a SD card that is ready to flash in your PC, and identify the device id for the disk with `fdisk -l`, or `gparted` if you feel better to use GUI. The typical device id for your disk would likely to be `/dev/sdb`. 
+Second, plug an SD card that is ready to flash into your PC, and identify the device id for the disk with `fdisk -l`, or with `gparted` if you prefer a GUI. The device id for your disk will typically be `/dev/sdb`.
 Then, flash the disk image into your physical SD card with the following command:
@@ -225,8 +225,8 @@ This would take a few minutes for your PC to write the whole file systems into t
 After this process completes, you are ready to unmount the SD card and insert it into your DE10-Nano board.
 Now you can connect the power cable and serial port to boot the Angstrom Linux.
-> Note: When boot up from the microSD card, you might notice the incompatibility of the linux kernel `zImage` in the microSD card. 
+> Note: When boot up from the microSD card, you might notice the incompatibility of the linux kernel `zImage` in the microSD card.
-> In this case, you might need to build the `zImage` file of your own from [socfpga-4.9.78-ltsi](https://github.com/altera-opensource/linux-socfpga/tree/socfpga-4.9.78-ltsi) branch of the [linux-socfpga](https://github.com/altera-opensource/linux-socfpga) repository. 
+> In this case, you might need to build the `zImage` file of your own from [socfpga-4.9.78-ltsi](https://github.com/altera-opensource/linux-socfpga/tree/socfpga-4.9.78-ltsi) branch of the [linux-socfpga](https://github.com/altera-opensource/linux-socfpga) repository.
 > For a quick fix, you can also download a prebuilt version of the `zImage` file [here](https://raw.githubusercontent.com/liangfu/de10-nano-supplement/master/zImage).
 After connecting the usb cables to the DE10-Nano board, power on the board by connecting the power cable. You may then connect to the serial port of the device by using `minicom` on your host PC:
@@ -240,8 +240,8 @@ The default user name for the device would be `root`, and the password is empty 
 You may now start to install supporting Python3 packages (TVM has dropped the support for Python2), specifically, they are `numpy`, `attrs` and `decorator`.
-> Note: You might fail to install `numpy` by using `pip3` on the DE10-Nano device. 
+> Note: You might fail to install `numpy` by using `pip3` on the DE10-Nano device.
-> In that case, you have the option to either build your own filesystem image for the board from [meta-de10-nano](https://github.com/intel/meta-de10-nano) repository; 
+> In that case, you have the option to either build your own filesystem image for the board from [meta-de10-nano](https://github.com/intel/meta-de10-nano) repository;
 > an alternative option is to download prebuilt packages from existing Linux distributions, e.g. Debian.
 > For a quick fix, we have concatenated the supplementary binary files [here](https://raw.githubusercontent.com/liangfu/de10-nano-supplement/master/rootfs_supplement.tgz), and you can extract the files into the root filesystem.
@@ -251,8 +251,8 @@ After accessing bash terminal from the serial port, we need to install required 
 #### Build Additional Components to Use VTA Bitstream
-To use the above built bitstream on DE10-Nano hardware, several additional components need to be compiled for the system. 
+To use the above built bitstream on DE10-Nano hardware, several additional components need to be compiled for the system.
-Specifically, to compile application executables for the system, you need to download and install [SoCEDS](http://fpgasoftware.intel.com/soceds/18.1/?edition=standard&download_manager=dlm3&platform=linux) (recommended), or alternatively install the `g++-arm-linux-gnueabihf` package on your host machine. You would also need a `cma` kernel module to allocate contigous memory, and a driver for communicating with the VTA subsystem. 
+Specifically, to compile application executables for the system, you need to download and install [SoCEDS](http://fpgasoftware.intel.com/soceds/18.1/?edition=standard&download_manager=dlm3&platform=linux) (recommended), or alternatively install the `g++-arm-linux-gnueabihf` package on your host machine. You would also need a `cma` kernel module to allocate contiguous memory, and a driver for communicating with the VTA subsystem.
 ## VTA FPGA Toolchain Installation
@@ -310,7 +310,7 @@ export PATH=${XILINX_VIVADO}/bin:${PATH}
 ### Intel Toolchain Installation
-It is recommended to use `Intel Quartus Prime 18.1`, since the test scripts contained in this document have been tested on this version. 
+It is recommended to use `Intel Quartus Prime 18.1`, since the test scripts contained in this document have been tested on this version.
 You would need to install Intel's FPGA compilation toolchain, [Quartus Prime Lite](http://fpgasoftware.intel.com/?edition=lite), which is a license-free version of the Intel Quartus Prime software.
@@ -347,11 +347,11 @@ For this custom VTA bitstream compilation exercise, we'll change the frequency o
 * Set the `HW_FREQ` field to `142`. The Pynq board supports 100, 142, 167 and 200MHz clocks. Note that the higher the frequency, the harder it will be to close timing. Increasing the frequency can lead to timing violation and thus faulty hardware execution.
 * Set the `HW_CLK_TARGET` to `6`. This parameters refers to the target clock period in nano seconds for HLS - a lower clock period leads to more aggressive pipelining to achieve timing closure at higher frequencies. Technically a 142MHz clock would require a 7ns target, but we intentionally lower the clock target to 6ns to more aggressively pipeline our design.
-Bitstream generation is driven by a top-level `Makefile` under `<tvm root>/vta/hardware/xilinx/`.
+Bitstream generation is driven by a top-level `Makefile` under `<tvm root>/vta/vta-hw/hardware/xilinx/`.
 If you just want to simulate the VTA design in software emulation to make sure that it is functional, enter:
 ```bash
-cd <tvm root>/vta/hardware/xilinx
+cd <tvm root>/vta/vta-hw/hardware/xilinx
 make ip MODE=sim
 ```
@@ -359,7 +359,7 @@ If you just want to generate the HLS-based VTA IP cores without launching the en
 ```bash
 make ip
 ```
-You'll be able to view the HLS synthesis reports under `<tvm root>/vta/build/hardware/xilinx/hls/` `<configuration>/<block>/solution0/syn/report/<block>_csynth.rpt`
+You'll be able to view the HLS synthesis reports under `<tvm root>/vta/vta-hw/build/hardware/xilinx/hls/` `<configuration>/<block>/solution0/syn/report/<block>_csynth.rpt`
 > Note: The `<configuration>` name is a string that summarizes the VTA configuration parameters listed in the `vta_config.json`. The `<block>` name refers to the specific module (or HLS function) that compose the high-level VTA pipeline.
 Finally to run the full hardware compilation and generate the VTA bitstream, run:
@@ -371,20 +371,20 @@ make
 This process is lengthy, and can take around up to an hour to complete depending on your machine's specs.
 We recommend setting the `VTA_HW_COMP_THREADS` variable in the Makefile to take full advantage of all the cores on your development machine.
-Once the compilation completes, the generated bitstream can be found under `<tvm root>/vta/build/hardware/xilinx/vivado/<configuration>/export/vta.bit`.
+Once the compilation completes, the generated bitstream can be found under `<tvm root>/vta/vta-hw/build/hardware/xilinx/vivado/<configuration>/export/vta.bit`.
 ### Chisel-based Custom VTA Bitstream Compilation for DE10-Nano
-Similar to the HLS-based design, high-level hardware parameters in Chisel-based design are listed in the VTA configuration file [Configs.scala](https://github.com/apache/incubator-tvm/blob/master/vta/hardware/chisel/src/main/scala/core/Configs.scala), and they can be customized by the user.
+Similar to the HLS-based design, high-level hardware parameters in Chisel-based design are listed in the VTA configuration file [Configs.scala](https://github.com/apache/incubator-tvm/blob/master/vta/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala), and they can be customized by the user.
-For Intel FPGA, bitstream generation is driven by a top-level `Makefile` under `<tvmroot>/vta/hardware/intel`.
+For Intel FPGA, bitstream generation is driven by a top-level `Makefile` under `<tvmroot>/vta/vta-hw/hardware/intel`.
 If you just want to generate the Chisel-based VTA IP core for the DE10-Nano board without compiling the design for the FPGA hardware, enter:
 ```bash
-cd <tvmroot>/vta/hardware/intel
+cd <tvmroot>/vta/vta-hw/hardware/intel
 make ip
 ```
-Then you'll be able to locate the generated verilog file at `<tvmroot>/vta/build/hardware/intel/chisel/<configuration>/VTA.DefaultDe10Config.v`.
+Then you'll be able to locate the generated verilog file at `<tvmroot>/vta/vta-hw/build/hardware/intel/chisel/<configuration>/VTA.DefaultDe10Config.v`.
 If you would like to run the full hardware compilation for the `de10nano` board:
 ```bash
@@ -393,14 +393,14 @@ make
 This process might be a bit lengthy, and might take up to half an hour to complete depending on the performance of your PC. The Quartus Prime software would automatically detect the number of cores available on your PC and try to utilize all of them to perform such process.
-Once the compilation completes, the generated bistream can be found under `<tvmroot>/vta/build/hardware/intel/quartus/<configuration>/export/vta.rbf`. You can also open the Quartus project file (.qpf) available at `<tvmroot>/vta/build/hardware/intel/quartus/<configuration>/de10_nano_top.qpf` to look around the generated reports.
+Once the compilation completes, the generated bitstream can be found under `<tvmroot>/vta/vta-hw/build/hardware/intel/quartus/<configuration>/export/vta.rbf`. You can also open the Quartus project file (.qpf) available at `<tvmroot>/vta/vta-hw/build/hardware/intel/quartus/<configuration>/de10_nano_top.qpf` to look around the generated reports.
 ### Use the Custom Bitstream
 We can program the new VTA FPGA bitstream by setting the bitstream path of the `vta.program_fpga()` function in the tutorial examples, or in the `test_program_rpc.py` script.
 ```python
-vta.program_fpga(remote, bitstream="<tvm root>/vta/build/hardware/xilinx/vivado/<configuration>/export/vta.bit")
+vta.program_fpga(remote, bitstream="<tvm root>/vta/vta-hw/build/hardware/xilinx/vivado/<configuration>/export/vta.bit")
 ```
 Instead of downloading a pre-built bitstream from the VTA bitstream repository, TVM will instead use the new bitstream you just generated, which is a VTA design clocked at a higher frequency.

--- a/tests/scripts/task_python_vta.sh
+++ b/tests/scripts/task_python_vta.sh
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-set -e
-set -u
-export PYTHONPATH=python:vta/python:topi/python
-# cleanup pycache
-find . -type f -path "*.pyc" | xargs rm -f
-rm -rf ~/.tvm
-# Rebuild cython
-make cython3
-# Reset default fsim simulation
-cp vta/config/fsim_sample.json vta/config/vta_config.json
-# Run unit tests in functional/fast simulator
-echo "Running unittest in fsim..."
-python3 -m pytest -v vta/tests/python/unittest
-# Run unit tests in functional/fast simulator
-echo "Running integration test in fsim..."
-python3 -m pytest -v vta/tests/python/integration
--- a/tests/scripts/task_python_vta_fsim.sh
+++ b/tests/scripts/task_python_vta_fsim.sh
@@ -30,7 +30,7 @@ rm -rf ~/.tvm
 make cython3
 # Reset default fsim simulation
-cp vta/config/fsim_sample.json vta/config/vta_config.json
+cp vta/vta-hw/config/fsim_sample.json vta/vta-hw/config/vta_config.json
 # Run unit tests in functional/fast simulator
 echo "Running unittest in fsim..."

--- a/tests/scripts/task_python_vta_tsim.sh
+++ b/tests/scripts/task_python_vta_tsim.sh
@@ -30,16 +30,22 @@ rm -rf ~/.tvm
 make cython3
 # Set default VTA config to use TSIM cycle accurate sim
-cp vta/config/tsim_sample.json vta/config/vta_config.json
+cp vta/vta-hw/config/tsim_sample.json vta/vta-hw/config/vta_config.json
+# Build and run the TSIM apps (disable until refactor is complete)
+# echo "Test the TSIM apps..."
+# make -C vta/vta-hw/apps/tsim_example/ run_verilog
+# make -C vta/vta-hw/apps/tsim_example/ run_chisel
+# make -C vta/vta-hw/apps/gemm/ default
 # Check style of scala code
 echo "Check style of scala code..."
-make -C vta/hardware/chisel lint
+make -C vta/vta-hw/hardware/chisel lint
 # Build VTA chisel design and verilator simulator
 echo "Building VTA chisel design..."
-make -C vta/hardware/chisel cleanall
+make -C vta/vta-hw/hardware/chisel cleanall
-make -C vta/hardware/chisel USE_THREADS=0 lib
+make -C vta/vta-hw/hardware/chisel USE_THREADS=0 lib
 # Run unit tests in cycle accurate simulator
 echo "Running unittest in tsim..."
@@ -50,4 +56,4 @@ echo "Running integration test in tsim..."
 python3 -m pytest -v vta/tests/python/integration
 # Reset default fsim simulation
-cp vta/config/fsim_sample.json vta/config/vta_config.json
+cp vta/vta-hw/config/fsim_sample.json vta/vta-hw/config/vta_config.json
--- a/vta/python/vta/environment.py
+++ b/vta/python/vta/environment.py
@@ -312,7 +312,7 @@ def _init_env():
        os.path.abspath(os.path.expanduser(__file__)))
    proj_root = os.path.abspath(os.path.join(curr_path, "../../../"))
    path_list = [
-        os.path.join(proj_root, "vta/config/vta_config.json")
+        os.path.join(proj_root, "vta/vta-hw/config/vta_config.json")
    ]
    path_list = [p for p in path_list if os.path.exists(p)]
    if not path_list:

--- a/vta/python/vta/libinfo.py
+++ b/vta/python/vta/libinfo.py
@@ -40,7 +40,7 @@ def _get_lib_name(lib_name):
 def find_libvta(lib_vta, optional=False):
-    """Find VTA library
+    """Find VTA Chisel-based library
    Returns
    -------
@@ -56,10 +56,8 @@ def find_libvta(lib_vta, optional=False):
        Enable error check
    """
    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    lib_search = [curr_path]
+    lib_search = [os.path.join(curr_path, "..", "..", "..", "build",)]
-    lib_search += [os.path.join(curr_path, "..", "..", "build",)]
+    lib_search += [os.path.join(curr_path, "..", "..", "vta-hw", "build")]
-    lib_search += [os.path.join(curr_path, "..", "..", "..", "build",)]
-    lib_search += [os.path.join(curr_path, "..", "..", "..", "build", "Release")]
    lib_name = _get_lib_name(lib_vta)
    lib_path = [os.path.join(x, lib_name) for x in lib_search]
    lib_found = [x for x in lib_path if os.path.exists(x)]

--- a/vta/python/vta/pkg_config.py
+++ b/vta/python/vta/pkg_config.py
@@ -66,21 +66,21 @@ class PkgConfig(object):
        # Include path
        self.include_path = [
            "-I%s/include" % proj_root,
-            "-I%s/vta/include" % proj_root,
+            "-I%s/vta/vta-hw/include" % proj_root,
            "-I%s/3rdparty/dlpack/include" % proj_root,
            "-I%s/3rdparty/dmlc-core/include" % proj_root
        ]
        # List of source files that can be used to build standalone library.
        self.lib_source = []
-        self.lib_source += glob.glob("%s/vta/src/*.cc" % proj_root)
+        self.lib_source += glob.glob("%s/vta/vta-hw/src/*.cc" % proj_root)
        if self.TARGET in ["pynq", "ultra96"]:
            # add pynq drivers for any board that uses pynq driver stack (see pynq.io)
-            self.lib_source += glob.glob("%s/vta/src/pynq/*.cc" % (proj_root))
+            self.lib_source += glob.glob("%s/vta/vta-hw/src/pynq/*.cc" % (proj_root))
        elif self.TARGET in ["de10nano"]:
-            self.lib_source += glob.glob("%s/vta/src/de10nano/*.cc" % (proj_root))
+            self.lib_source += glob.glob("%s/vta/vta-hw/src/de10nano/*.cc" % (proj_root))
            self.include_path += [
-                "-I%s/vta/src/de10nano" % proj_root,
+                "-I%s/vta/vta-hw/src/de10nano" % proj_root,
                "-I%s/3rdparty" % proj_root
            ]

--- a/vta/python/vta/testing/simulator.py
+++ b/vta/python/vta/testing/simulator.py
@@ -37,7 +37,7 @@ def _load_sw():
    if env.TARGET == "tsim":
        lib_hw = find_libvta("libvta_hw", optional=True)
-        assert lib_hw # make sure to build vta/hardware/chisel
+        assert lib_hw # make sure to build vta/vta-hw/hardware/chisel
        try:
            f = tvm.get_global_func("vta.tsim.init")
            m = tvm.runtime.load_module(lib_hw[0], "vta-tsim")

--- a/vta/src/device_api.cc
+++ b/vta/src/device_api.cc
@@ -24,8 +24,8 @@
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include <vta/runtime.h>
+#include "runtime.h"
 #include "../../src/runtime/workspace_pool.h"

--- a/vta/src/runtime.cc
+++ b/vta/src/runtime.cc
@@ -26,15 +26,17 @@
 */
 #include <vta/driver.h>
 #include <vta/hw_spec.h>
-#include <vta/runtime.h>
 #include <dmlc/logging.h>
 #include <tvm/runtime/c_runtime_api.h>
+#include <algorithm>
 #include <cassert>
 #include <cstring>
 #include <vector>
 #include <memory>
+#include "runtime.h"
 namespace vta {
 // Avoid bad configurations.

--- a/vta/include/vta/runtime.h
+++ b/vta/include/vta/runtime.h
@@ -6,9 +6,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
- * 
+ *
 *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -22,15 +22,15 @@
 * \brief VTA runtime library.
 */
-#ifndef VTA_RUNTIME_H_
+#ifndef VTA_RUNTIME_RUNTIME_H_
-#define VTA_RUNTIME_H_
+#define VTA_RUNTIME_RUNTIME_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include <tvm/runtime/c_runtime_api.h>
-#include "driver.h"
+#include <vta/driver.h>
 #define VTA_MEMCPY_H2D 1
 #define VTA_MEMCPY_D2H 2
@@ -291,4 +291,4 @@ TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles);
 #ifdef __cplusplus
 }
 #endif
-#endif  // VTA_RUNTIME_H_
+#endif  // VTA_RUNTIME_RUNTIME_H_
--- a/vta/tutorials/autotvm/tune_relay_vta.py
+++ b/vta/tutorials/autotvm/tune_relay_vta.py
@@ -181,7 +181,7 @@ def compile_network(env, target, model, start_pack, stop_pack):
 tracker_host = os.environ.get("TVM_TRACKER_HOST", '0.0.0.0')
 tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
-# Load VTA parameters from the vta/config/vta_config.json file
+# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
 env = vta.get_env()
 # This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.

--- a/vta/tutorials/frontend/deploy_classification.py
+++ b/vta/tutorials/frontend/deploy_classification.py
@@ -68,7 +68,7 @@ assert tvm.runtime.enabled("rpc")
 # -------------------------------------
 # Execute on CPU vs. VTA, and define the model.
-# Load VTA parameters from the vta/config/vta_config.json file
+# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
 env = vta.get_env()
 # Set ``device=arm_cpu`` to run inference on the CPU

--- a/vta/tutorials/frontend/deploy_detection.py
+++ b/vta/tutorials/frontend/deploy_detection.py
@@ -111,7 +111,7 @@ names = [x.strip() for x in content]
 # --------------------------------------
 # Execute on CPU vs. VTA, and define the model.
-# Load VTA parameters from the vta/config/vta_config.json file
+# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
 env = vta.get_env()
 # Set ``device=arm_cpu`` to run inference on the CPU
 # or ``device=vta`` to run inference on the FPGA.

--- a/vta/tutorials/matrix_multiply.py
+++ b/vta/tutorials/matrix_multiply.py
@@ -43,7 +43,7 @@ from tvm import rpc
 from tvm.contrib import util
 from vta.testing import simulator
-# Load VTA parameters from the vta/config/vta_config.json file
+# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
 env = vta.get_env()
 # We read the Pynq RPC host IP address and port number from the OS environment

--- a/vta/tutorials/optimize/convolution_opt.py
+++ b/vta/tutorials/optimize/convolution_opt.py
@@ -47,7 +47,7 @@ from tvm import rpc
 from tvm.contrib import util
 from vta.testing import simulator
-# Load VTA parameters from the vta/config/vta_config.json file
+# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
 env = vta.get_env()
 # We read the Pynq RPC host IP address and port number from the OS environment

--- a/vta/tutorials/optimize/matrix_multiply_opt.py
+++ b/vta/tutorials/optimize/matrix_multiply_opt.py
@@ -46,7 +46,7 @@ from tvm import rpc
 from tvm.contrib import util
 from vta.testing import simulator
-# Load VTA parameters from the vta/config/vta_config.json file
+# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
 env = vta.get_env()
 # We read the Pynq RPC host IP address and port number from the OS environment

--- a/vta/apps/gemm/CMakeLists.txt
+++ b/vta/apps/gemm/CMakeLists.txt
@@ -18,13 +18,13 @@
 cmake_minimum_required(VERSION 3.2)
 project(tsim C CXX)
-set(TVM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../)
+set(TVM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../)
-set(VTA_DIR ${TVM_DIR}/vta)
+set(VTA_HW_DIR ${TVM_DIR}/vta/vta-hw)  # vta-hw sources live under <tvm-root>/vta/vta-hw in this layout
 include_directories("${TVM_DIR}/include")
 include_directories("${TVM_DIR}/3rdparty/dlpack/include")
 include_directories("${TVM_DIR}/3rdparty/dmlc-core/include")
-include_directories("${TVM_DIR}/vta/src/dpi")
+include_directories("${VTA_HW_DIR}/src/dpi")
 set(CMAKE_C_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden")
 set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden -std=c++11")
@@ -35,11 +35,11 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
 endif()
 file(GLOB TSIM_SW_SRC src/driver.cc)
-list(APPEND TSIM_SW_SRC ${VTA_DIR}/src/vmem/virtual_memory.cc)
+list(APPEND TSIM_SW_SRC ${VTA_HW_DIR}/src/vmem/virtual_memory.cc)
-list(APPEND TSIM_SW_SRC ${VTA_DIR}/src/dpi/module.cc)
+list(APPEND TSIM_SW_SRC ${VTA_HW_DIR}/src/dpi/module.cc)
 add_library(sw SHARED ${TSIM_SW_SRC})
-target_include_directories(sw PRIVATE ${VTA_DIR}/include ${VTA_DIR}/src)
+target_include_directories(sw PRIVATE ${VTA_HW_DIR}/include ${VTA_HW_DIR}/src)
 if(APPLE)
  set_target_properties(sw PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")

--- a/vta/apps/gemm/Makefile
+++ b/vta/apps/gemm/Makefile
@@ -15,15 +15,15 @@
 # specific language governing permissions and limitations
 # under the License.
-export PYTHONPATH:=$(PWD)/python:$(PYTHONPATH)
+export PYTHONPATH:=$(abspath .)/python:$(PYTHONPATH)
+export PYTHONPATH:=$(abspath .)/../../../../python:$(PYTHONPATH)
 BUILD_NAME = build
 build_dir = $(abspath .)/$(BUILD_NAME)
-default: chisel driver
+default: chisel driver serial parallel
-	python3 tests/python/chisel_accel.py serial
-serial:  
+serial:
 	python3 tests/python/chisel_accel.py serial
 parallel:

--- a/vta/apps/gemm/README.md
+++ b/vta/apps/gemm/README.md
@@ -15,9 +15,9 @@
 <!--- specific language governing permissions and limitations -->
 <!--- under the License. -->
-VTA TSIM Application 
+VTA TSIM Application
 ======================
-Prior to this application, please take a look at `<tvm-root>/vta/apps/tsim_example` for installation
+Prior to this application, please take a look at `<tvm-root>/vta/vta-hw/apps/tsim_example` for installation
 This is an application that performs Bit Serial Multiplication for GEMM utilizing TSIM.
 **Bit Serial Multiplication for GEMM:**
@@ -28,23 +28,23 @@ We approach this operation with slicing and shifting, like how basic multiplicat
 We can sufficiently reduce the cycles required to perform a gemm given that the data bit width is small. This GEMM application uses TSIM for future accerlerator prototypes.
 * Test Chisel3 backend with bit serial GEMM
-    * Go to `<tvm-root>/vta/apps/gemm`
+    * Go to `<tvm-root>/vta/vta-hw/apps/gemm`
    * Run `make`
-* If you have already compiled chisel backend (i.e. ran `make`) 
+* If you have already compiled chisel backend (i.e. ran `make`)
    * Bit Serial test with another input set, run `make serial`
    * Bit parallel test with another input set, run `make parallel`
 * Some steps for creating your own custom TSIM application
-    * Go to `<tvm-root>/vta/apps/gemm`
+    * Go to `<tvm-root>/vta/vta-hw/apps/gemm`
    * Create custom circuit within `./hardware/chisel/src/scala.main/accel/Compute.scala`
    * Map the according Registers in `./hardware/chisel/src/scala.main/accel/RegFile.scala`
    * Create your test script
    * Map the registers in `./src/driver.cc` and link it with both `RegFile.scala` and the test script
-    * Understanding of `<tvm-root>/vta/apps/tsim_example`, which performs add by one to a vector, is highly encouraged to create a more complex application
+    * Understanding of `<tvm-root>/vta/vta-hw/apps/tsim_example`, which performs add by one to a vector, is highly encouraged to create a more complex application
 * Some pointers
-    * Chisel3 tests in `<tvm-root>/vta/apps/gemm/tests/python`
+    * Chisel3 tests in `<tvm-root>/vta/vta-hw/apps/gemm/tests/python`
-    * Chisel3 accelerator backend `<tvm-root>/vta/apps/gemm/hardware/chisel`
+    * Chisel3 accelerator backend `<tvm-root>/vta/vta-hw/apps/gemm/hardware/chisel`
-    * Software C++ driver (backend) that handles the accelerator `<tvm-root>/vta/apps/gemm/src/driver.cc`
+    * Software C++ driver (backend) that handles the accelerator `<tvm-root>/vta/vta-hw/apps/gemm/src/driver.cc`
-    * Software Python driver (frontend) that handles the accelerator `<tvm-root>/vta/apps/gemm/python/accel`
+    * Software Python driver (frontend) that handles the accelerator `<tvm-root>/vta/vta-hw/apps/gemm/python/accel`
--- a/vta/apps/gemm/hardware/chisel/Makefile
+++ b/vta/apps/gemm/hardware/chisel/Makefile
@@ -38,7 +38,7 @@ USE_TRACE = 1
 LIBNAME = libhw
 vta_dir = $(abspath ../../../../)
-tvm_dir = $(abspath ../../../../../)
+tvm_dir = $(abspath ../../../../../../)
 build_dir = $(abspath .)/$(BUILD_NAME)
 verilator_build_dir = $(build_dir)/verilator
 chisel_build_dir = $(build_dir)/chisel

--- a/vta/apps/gemm/hardware/chisel/build.sbt
+++ b/vta/apps/gemm/hardware/chisel/build.sbt
--- a/vta/apps/gemm/hardware/chisel/project/build.properties
+++ b/vta/apps/gemm/hardware/chisel/project/build.properties
@@ -17,4 +17,4 @@
 * under the License.
 */
-sbt.version = 1.1.1
+sbt.version = 1.3.2
--- a/vta/apps/gemm/hardware/chisel/project/plugins.sbt
+++ b/vta/apps/gemm/hardware/chisel/project/plugins.sbt
--- a/vta/apps/gemm/hardware/chisel/src/main/scala/accel/Accel.scala
+++ b/vta/apps/gemm/hardware/chisel/src/main/scala/accel/Accel.scala
--- a/vta/apps/gemm/hardware/chisel/src/main/scala/accel/Compute.scala
+++ b/vta/apps/gemm/hardware/chisel/src/main/scala/accel/Compute.scala
@@ -89,7 +89,7 @@ class Compute(implicit config: AccelConfig) extends Module {
    is (sReadAData) {
      when (io.mem.rd.valid) {
        state := sReadADone
-      }   
+      }
    }
    is (sReadADone) {
      when (cntwgt === (length * length) - 1.U) {
@@ -180,8 +180,8 @@ class Compute(implicit config: AccelConfig) extends Module {
  }
  io.mem.rd.ready := state === sReadAData | state === sReadBData
-  mvc.io.inp.data.valid := state === sInpDone // 2 inputs have been processed 
+  mvc.io.inp.data.valid := state === sInpDone // 2 inputs have been processed
-  mvc.io.wgt.data.valid := state === sInpDone // 2 inputs have been processed 
+  mvc.io.wgt.data.valid := state === sInpDone // 2 inputs have been processed
  mvc.io.wgt.data.bits <> reg1
  mvc.io.inp.data.bits <> reg2
@@ -198,7 +198,7 @@ class Compute(implicit config: AccelConfig) extends Module {
  accum.io.valid := mvc.io.acc_o.data.valid
  // write
-  io.mem.wr.valid := state === sWriteData 
+  io.mem.wr.valid := state === sWriteData
  io.mem.wr.bits := accum.io.sum(cntout)
  // count read/write

--- a/vta/apps/gemm/hardware/chisel/src/main/scala/accel/RegFile.scala
+++ b/vta/apps/gemm/hardware/chisel/src/main/scala/accel/RegFile.scala
--- a/vta/apps/gemm/hardware/chisel/src/test/scala/dut/TestAccel.scala
+++ b/vta/apps/gemm/hardware/chisel/src/test/scala/dut/TestAccel.scala
--- a/vta/apps/gemm/python/__init__.py
+++ b/vta/apps/gemm/python/__init__.py
--- a/vta/apps/gemm/python/tsim.py
+++ b/vta/apps/gemm/python/tsim.py
--- a/vta/apps/gemm/src/driver.cc
+++ b/vta/apps/gemm/src/driver.cc
--- a/vta/apps/gemm/tests/python/chisel_accel.py
+++ b/vta/apps/gemm/tests/python/chisel_accel.py
--- a/vta/apps/tsim_example/CMakeLists.txt
+++ b/vta/apps/tsim_example/CMakeLists.txt
@@ -18,13 +18,13 @@
 cmake_minimum_required(VERSION 3.2)
 project(tsim C CXX)
-set(TVM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../)
+set(TVM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../)
-set(VTA_DIR ${TVM_DIR}/vta)
+set(VTA_HW_DIR ${TVM_DIR}/vta/vta-hw)
 include_directories("${TVM_DIR}/include")
 include_directories("${TVM_DIR}/3rdparty/dlpack/include")
 include_directories("${TVM_DIR}/3rdparty/dmlc-core/include")
-include_directories("${TVM_DIR}/vta/src/dpi")
+include_directories("${VTA_HW_DIR}/src/dpi")
 set(CMAKE_C_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden")
 set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden -std=c++11")
@@ -35,11 +35,11 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
 endif()
 file(GLOB TSIM_SW_SRC src/driver.cc)
-list(APPEND TSIM_SW_SRC ${VTA_DIR}/src/vmem/virtual_memory.cc)
+list(APPEND TSIM_SW_SRC ${VTA_HW_DIR}/src/vmem/virtual_memory.cc)
-list(APPEND TSIM_SW_SRC ${VTA_DIR}/src/dpi/module.cc)
+list(APPEND TSIM_SW_SRC ${VTA_HW_DIR}/src/dpi/module.cc)
 add_library(sw SHARED ${TSIM_SW_SRC})
-target_include_directories(sw PRIVATE ${VTA_DIR}/include ${VTA_DIR}/src)
+target_include_directories(sw PRIVATE ${VTA_HW_DIR}/include ${VTA_HW_DIR}/src)
 if(APPLE)
  set_target_properties(sw PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")

--- a/vta/apps/tsim_example/Makefile
+++ b/vta/apps/tsim_example/Makefile
@@ -20,7 +20,9 @@ export PYTHONPATH:=$(PWD)/python:$(PYTHONPATH)
 BUILD_NAME = build
 build_dir = $(abspath .)/$(BUILD_NAME)
-default: verilog driver
+default: run_verilog
+run_verilog: verilog driver
 	python3 tests/python/verilog_accel.py
 run_chisel: chisel driver

--- a/vta/apps/tsim_example/README.md
+++ b/vta/apps/tsim_example/README.md
@@ -55,7 +55,7 @@ verilator --version
 the supported version of Verilator should be at least 4.012,
 if homebrew (OSX) or package-manager (Linux) does not support that version,
 please install Verilator 4.012 or later from binary or source base on following
-instruction of Verilator wiki.  
+instruction of Verilator wiki.
 https://www.veripool.org/projects/verilator/wiki/Installing
@@ -72,16 +72,16 @@ The default target language for these two implementations is Verilog. The follow
 how to run both of them:
 * Test Verilog backend
-    * Go to `<tvm-root>/vta/apps/tsim_example`
+    * Go to `<tvm-root>/vta/vta-hw/apps/tsim_example`
    * Run `make`
 * Test Chisel3 backend
-    * Go to `<tvm-root>/vta/apps/tsim_example`
+    * Go to `<tvm-root>/vta/vta-hw/apps/tsim_example`
    * Run `make run_chisel`
 * Some pointers
-    * Verilog and Chisel3 tests in `<tvm-root>/vta/apps/tsim_example/tests/python`
+    * Verilog and Chisel3 tests in `<tvm-root>/vta/vta-hw/apps/tsim_example/tests/python`
-    * Verilog accelerator backend `<tvm-root>/vta/apps/tsim_example/hardware/verilog`
+    * Verilog accelerator backend `<tvm-root>/vta/vta-hw/apps/tsim_example/hardware/verilog`
-    * Chisel3 accelerator backend `<tvm-root>/vta/apps/tsim_example/hardware/chisel`
+    * Chisel3 accelerator backend `<tvm-root>/vta/vta-hw/apps/tsim_example/hardware/chisel`
-    * Software C++ driver (backend) that handles the accelerator `<tvm-root>/vta/apps/tsim_example/src/driver.cc`
+    * Software C++ driver (backend) that handles the accelerator `<tvm-root>/vta/vta-hw/apps/tsim_example/src/driver.cc`
-    * Software Python driver (frontend) that handles the accelerator `<tvm-root>/vta/apps/tsim_example/python/accel`
+    * Software Python driver (frontend) that handles the accelerator `<tvm-root>/vta/vta-hw/apps/tsim_example/python/accel`
--- a/vta/apps/tsim_example/hardware/chisel/Makefile
+++ b/vta/apps/tsim_example/hardware/chisel/Makefile
@@ -38,7 +38,7 @@ USE_TRACE = 0
 LIBNAME = libhw
 vta_dir = $(abspath ../../../../)
-tvm_dir = $(abspath ../../../../../)
+tvm_dir = $(abspath ../../../../../../)
 build_dir = $(abspath .)/$(BUILD_NAME)
 verilator_build_dir = $(build_dir)/verilator
 chisel_build_dir = $(build_dir)/chisel

--- a/vta/apps/tsim_example/hardware/chisel/build.sbt
+++ b/vta/apps/tsim_example/hardware/chisel/build.sbt
--- a/vta/apps/tsim_example/hardware/chisel/project/build.properties
+++ b/vta/apps/tsim_example/hardware/chisel/project/build.properties
@@ -17,4 +17,4 @@
 * under the License.
 */
-sbt.version = 1.1.1
+sbt.version = 1.3.2
--- a/vta/apps/tsim_example/hardware/chisel/project/plugins.sbt
+++ b/vta/apps/tsim_example/hardware/chisel/project/plugins.sbt
--- a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala
+++ b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala
--- a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala
+++ b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala
--- a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala
+++ b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala
--- a/vta/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala
+++ b/vta/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala
--- a/vta/apps/tsim_example/hardware/verilog/Makefile
+++ b/vta/apps/tsim_example/hardware/verilog/Makefile
@@ -38,7 +38,7 @@ USE_TRACE = 0
 LIBNAME = libhw
 vta_dir = $(abspath ../../../../)
-tvm_dir = $(abspath ../../../../../)
+tvm_dir = $(abspath ../../../../../../)
 build_dir = $(abspath .)/$(BUILD_NAME)
 verilator_opt = --cc

--- a/vta/apps/tsim_example/hardware/verilog/src/Accel.v
+++ b/vta/apps/tsim_example/hardware/verilog/src/Accel.v
--- a/vta/apps/tsim_example/hardware/verilog/src/Compute.v
+++ b/vta/apps/tsim_example/hardware/verilog/src/Compute.v
--- a/vta/apps/tsim_example/hardware/verilog/src/RegFile.v
+++ b/vta/apps/tsim_example/hardware/verilog/src/RegFile.v
--- a/vta/apps/tsim_example/hardware/verilog/src/TestAccel.v
+++ b/vta/apps/tsim_example/hardware/verilog/src/TestAccel.v
--- a/vta/apps/tsim_example/python/__init__.py
+++ b/vta/apps/tsim_example/python/__init__.py
--- a/vta/apps/tsim_example/python/tsim.py
+++ b/vta/apps/tsim_example/python/tsim.py
--- a/vta/apps/tsim_example/src/driver.cc
+++ b/vta/apps/tsim_example/src/driver.cc
--- a/vta/apps/tsim_example/tests/python/chisel_accel.py
+++ b/vta/apps/tsim_example/tests/python/chisel_accel.py
--- a/vta/apps/tsim_example/tests/python/verilog_accel.py
+++ b/vta/apps/tsim_example/tests/python/verilog_accel.py
--- a/vta/config/README.md
+++ b/vta/config/README.md
--- a/vta/config/de10nano_sample.json
+++ b/vta/config/de10nano_sample.json
--- a/vta/config/fsim_sample.json
+++ b/vta/config/fsim_sample.json
--- a/vta/config/pynq_sample.json
+++ b/vta/config/pynq_sample.json
--- a/vta/config/tsim_sample.json
+++ b/vta/config/tsim_sample.json
--- a/vta/config/ultra96_sample.json
+++ b/vta/config/ultra96_sample.json
--- a/vta/config/vta_config.json
+++ b/vta/config/vta_config.json
--- a/vta/config/vta_config.py
+++ b/vta/config/vta_config.py
@@ -23,7 +23,7 @@ import argparse
 def get_pkg_config(cfg):
    """Get the pkg config object."""
    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    proj_root = os.path.abspath(os.path.join(curr_path, "../../"))
+    proj_root = os.path.abspath(os.path.join(curr_path, "../../../"))
    pkg_config_py = os.path.join(proj_root, "vta/python/vta/pkg_config.py")
    libpkg = {"__file__": pkg_config_py}
    exec(compile(open(pkg_config_py, "rb").read(), pkg_config_py, "exec"), libpkg, libpkg)
@@ -107,9 +107,9 @@ def main():
    curr_path = os.path.dirname(
        os.path.abspath(os.path.expanduser(__file__)))
-    proj_root = os.path.abspath(os.path.join(curr_path, "../../"))
+    proj_root = os.path.abspath(os.path.join(curr_path, "../../../"))
    path_list = [
-        os.path.join(proj_root, "vta/config/vta_config.json")
+        os.path.join(proj_root, "vta/vta-hw/config/vta_config.json")
    ]
    if args.use_cfg:
        path_list = [args.use_cfg]

--- a/vta/hardware/chisel/.gitignore
+++ b/vta/hardware/chisel/.gitignore
--- a/vta/hardware/chisel/Makefile
+++ b/vta/hardware/chisel/Makefile
@@ -21,15 +21,15 @@ endif
 # Change VERILATOR_INC_DIR if Verilator is installed on a different location
 ifeq (, $(VERILATOR_INC_DIR))
-  ifeq (, $(wildcard /usr/local/share/verilator/include/*))
+	ifeq (, $(wildcard /usr/local/share/verilator/include/*))
-    ifeq (, $(wildcard /usr/share/verilator/include/*))
+		ifeq (, $(wildcard /usr/share/verilator/include/*))
-      $(error "Verilator include directory is not set properly")
+			$(error "Verilator include directory is not set properly")
-    else
+		else
-      VERILATOR_INC_DIR := /usr/share/verilator/include
+			VERILATOR_INC_DIR := /usr/share/verilator/include
-    endif
+		endif
-  else
+	else
-      VERILATOR_INC_DIR := /usr/local/share/verilator/include
+			VERILATOR_INC_DIR := /usr/local/share/verilator/include
-  endif
+	endif
 endif
 CONFIG = DefaultDe10Config
@@ -49,7 +49,7 @@ USE_TRACE_FST = 0
 # This will significantly increase the trace size and should only be used
 # on a per need basis for difficult debug problems.
 USE_TRACE_DETAILED = 0
-USE_THREADS = $(shell nproc)
+USE_THREADS = 0
 VTA_LIBNAME = libvta_hw
 UNITTEST_NAME = all
 CXX = g++
@@ -65,7 +65,7 @@ CXX_HAS_ALIGN_NEW := $(shell [ $(CXX_MAJOR) -ge 7 ] && echo true)
 config_test = $(TOP_TEST)$(CONFIG)
 vta_dir = $(abspath ../../)
-tvm_dir = $(abspath ../../../)
+tvm_dir = $(abspath ../../../../)
 verilator_build_dir = $(vta_dir)/$(BUILD_NAME)/verilator
 chisel_build_dir = $(vta_dir)/$(BUILD_NAME)/chisel
@@ -81,14 +81,14 @@ verilator_opt += -Mdir ${verilator_build_dir}
 verilator_opt += -I$(chisel_build_dir)
 ifeq ($(DEBUG), 0)
-  cxx_flags = -O2 -Wall -fvisibility=hidden
+	cxx_flags = -O2 -Wall -fvisibility=hidden
 else
-  cxx_flags = -O0 -g -Wall
+	cxx_flags = -O0 -g -Wall
 endif
 cxx_flags += -std=c++11 -Wno-maybe-uninitialized
 ifeq ($(CXX_HAS_ALIGN_NEW),true)
-  cxx_flags += -faligned-new
+	cxx_flags += -faligned-new
 endif
 cxx_flags += -DVL_TSIM_NAME=V$(TOP_TEST)
 cxx_flags += -DVL_PRINTF=printf
@@ -107,50 +107,50 @@ cxx_flags += -I$(tvm_dir)/3rdparty/dlpack/include
 ld_flags = -fPIC -shared
 ifeq ($(SANITIZE), 1)
-  ifeq ($(DEBUG), 1)
+	ifeq ($(DEBUG), 1)
-    cxx_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
+		cxx_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
-     ld_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
+		ld_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
-  endif
+	endif
 endif
 cxx_objs = $(verilator_build_dir)/verilated.o $(verilator_build_dir)/verilated_dpi.o $(verilator_build_dir)/tsim_device.o
 ifneq ($(USE_TRACE), 0)
-  cxx_flags += -DVM_TRACE=1
+	cxx_flags += -DVM_TRACE=1
-  ifeq ($(USE_TRACE_FST), 1)
+	ifeq ($(USE_TRACE_FST), 1)
-    cxx_flags += -DVM_TRACE_FST
+		cxx_flags += -DVM_TRACE_FST
-    verilator_opt += --trace-fst
+		verilator_opt += --trace-fst
-  else
+	else
-    verilator_opt += --trace
+		verilator_opt += --trace
-  endif
+	endif
-  ifeq ($(USE_TRACE_DETAILED), 1)
+	ifeq ($(USE_TRACE_DETAILED), 1)
-    verilator_opt += --trace-underscore --trace-structs
+		verilator_opt += --trace-underscore --trace-structs
-  endif
+	endif
-  ifeq ($(USE_TRACE_FST), 1)
+	ifeq ($(USE_TRACE_FST), 1)
-    cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).fst
+		cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).fst
-    cxx_objs += $(verilator_build_dir)/verilated_fst_c.o
+		cxx_objs += $(verilator_build_dir)/verilated_fst_c.o
-  else
+	else
-    cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).vcd
+		cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).vcd
-    cxx_objs += $(verilator_build_dir)/verilated_vcd_c.o
+		cxx_objs += $(verilator_build_dir)/verilated_vcd_c.o
-  endif
+	endif
 else
-  cxx_flags += -DVM_TRACE=0
+	cxx_flags += -DVM_TRACE=0
 endif
 ifneq ($(USE_THREADS), 0)
-  verilator_opt += --threads $(USE_THREADS)
+	verilator_opt += --threads $(USE_THREADS)
-  cxx_flags += -DVL_THREADED
+	cxx_flags += -DVL_THREADED
-  cxx_objs += $(verilator_build_dir)/verilated_threads.o
+	cxx_objs += $(verilator_build_dir)/verilated_threads.o
 endif
 VPATH = $(VERILATOR_INC_DIR):$(verilator_build_dir):$(vta_dir)/hardware/dpi
 # The following is to be consistent with cmake
 ifeq ($(shell uname), Darwin)
-  lib_path = $(vta_dir)/$(BUILD_NAME)/$(VTA_LIBNAME).dylib
+	lib_path = $(vta_dir)/$(BUILD_NAME)/$(VTA_LIBNAME).dylib
-  cxx_flags += -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+	cxx_flags += -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
 else
-  lib_path = $(vta_dir)/$(BUILD_NAME)/$(VTA_LIBNAME).so
+	lib_path = $(vta_dir)/$(BUILD_NAME)/$(VTA_LIBNAME).so
 endif
 default: lint lib
@@ -193,4 +193,5 @@ clean:
 cleanall:
 	-rm -rf $(vta_dir)/$(BUILD_NAME)/chisel
 	-rm -rf $(vta_dir)/$(BUILD_NAME)/libvta_hw.so
+	-rm -rf $(vta_dir)/$(BUILD_NAME)/libvta_hw.dylib
 	-rm -rf $(vta_dir)/$(BUILD_NAME)/verilator
--- a/vta/hardware/chisel/README.md
+++ b/vta/hardware/chisel/README.md
@@ -15,16 +15,16 @@
 <!--- specific language governing permissions and limitations -->
 <!--- under the License. -->
-VTA in Chisel 
+VTA in Chisel
 ===================================================
 For contributors who wants to test a chisel module:
 - You can add your test files in  `src/test/scala/unitttest`
 - Add your test name and tests to the `test` object in `src/test/scala/unitttest/Launcher.scala`
 - Check out the provided sample test `mvm` which tests the MatrixVectorComputation module
    in `src/main/scala/core/TensorGemm.scala`
 - Running unit tests: `make test test_name=your_own test_name`
--- a/vta/hardware/chisel/build.sbt
+++ b/vta/hardware/chisel/build.sbt
@@ -62,7 +62,7 @@ resolvers ++= Seq(
 val defaultVersions = Map(
  "chisel3" -> "3.1.7",
-  "chisel-iotesters" -> "[1.2.5,1.3-SNAPSHOT["
+  "chisel-iotesters" -> "1.2.4"
  )
 libraryDependencies ++= Seq("chisel3","chisel-iotesters").map {

--- a/vta/hardware/chisel/project/build.properties
+++ b/vta/hardware/chisel/project/build.properties
@@ -17,4 +17,4 @@
 * under the License.
 */
-sbt.version = 1.1.1
+sbt.version = 1.3.2
--- a/vta/hardware/chisel/project/plugins.sbt
+++ b/vta/hardware/chisel/project/plugins.sbt
--- a/vta/hardware/chisel/scalastyle-config.xml
+++ b/vta/hardware/chisel/scalastyle-config.xml
--- a/vta/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v
+++ b/vta/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v
--- a/vta/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v
+++ b/vta/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v
--- a/vta/hardware/chisel/src/main/resources/verilog/VTASimDPI.v
+++ b/vta/hardware/chisel/src/main/resources/verilog/VTASimDPI.v
--- a/vta/hardware/chisel/src/main/scala/core/Compute.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Compute.scala
--- a/vta/hardware/chisel/src/main/scala/core/Configs.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Configs.scala
--- a/vta/hardware/chisel/src/main/scala/core/Core.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Core.scala
--- a/vta/hardware/chisel/src/main/scala/core/Decode.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Decode.scala
--- a/vta/hardware/chisel/src/main/scala/core/EventCounters.scala
+++ b/vta/hardware/chisel/src/main/scala/core/EventCounters.scala
--- a/vta/hardware/chisel/src/main/scala/core/Fetch.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Fetch.scala
--- a/vta/hardware/chisel/src/main/scala/core/ISA.scala
+++ b/vta/hardware/chisel/src/main/scala/core/ISA.scala
--- a/vta/hardware/chisel/src/main/scala/core/Load.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Load.scala
--- a/vta/hardware/chisel/src/main/scala/core/LoadUop.scala
+++ b/vta/hardware/chisel/src/main/scala/core/LoadUop.scala
--- a/vta/hardware/chisel/src/main/scala/core/Semaphore.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Semaphore.scala
--- a/vta/hardware/chisel/src/main/scala/core/Store.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Store.scala
--- a/vta/hardware/chisel/src/main/scala/core/TensorAlu.scala
+++ b/vta/hardware/chisel/src/main/scala/core/TensorAlu.scala
--- a/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala
+++ b/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala
--- a/vta/hardware/chisel/src/main/scala/core/TensorLoad.scala
+++ b/vta/hardware/chisel/src/main/scala/core/TensorLoad.scala
--- a/vta/hardware/chisel/src/main/scala/core/TensorStore.scala
+++ b/vta/hardware/chisel/src/main/scala/core/TensorStore.scala
--- a/vta/hardware/chisel/src/main/scala/core/TensorUtil.scala
+++ b/vta/hardware/chisel/src/main/scala/core/TensorUtil.scala
--- a/vta/hardware/chisel/src/main/scala/core/package.scala
+++ b/vta/hardware/chisel/src/main/scala/core/package.scala
--- a/vta/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
+++ b/vta/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
--- a/vta/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
+++ b/vta/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
--- a/vta/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala
+++ b/vta/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala
--- a/vta/hardware/chisel/src/main/scala/interface/axi/AXI.scala
+++ b/vta/hardware/chisel/src/main/scala/interface/axi/AXI.scala
--- a/vta/hardware/chisel/src/main/scala/shell/Configs.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/Configs.scala
--- a/vta/hardware/chisel/src/main/scala/shell/IntelShell.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/IntelShell.scala
--- a/vta/hardware/chisel/src/main/scala/shell/SimShell.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/SimShell.scala
--- a/vta/hardware/chisel/src/main/scala/shell/VCR.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/VCR.scala
--- a/vta/hardware/chisel/src/main/scala/shell/VME.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/VME.scala
--- a/vta/hardware/chisel/src/main/scala/shell/VTAShell.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/VTAShell.scala
--- a/vta/hardware/chisel/src/main/scala/shell/XilinxShell.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/XilinxShell.scala
--- a/vta/hardware/chisel/src/main/scala/test/Test.scala
+++ b/vta/hardware/chisel/src/main/scala/test/Test.scala
--- a/vta/hardware/chisel/src/main/scala/util/Config.scala
+++ b/vta/hardware/chisel/src/main/scala/util/Config.scala
--- a/vta/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala
+++ b/vta/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala
--- a/vta/hardware/chisel/src/main/scala/vta/Configs.scala
+++ b/vta/hardware/chisel/src/main/scala/vta/Configs.scala
--- a/vta/hardware/chisel/src/test/scala/unittest/AluTest.scala
+++ b/vta/hardware/chisel/src/test/scala/unittest/AluTest.scala
@@ -16,7 +16,7 @@
 * specific language governing permissions and limitations
 * under the License.
 */
 package unittest
 import chisel3._
@@ -27,7 +27,7 @@ import unittest.util._
 import vta.core._
 class TestAluVector(c: AluVector) extends PeekPokeTester(c) {
  /* alu_ref
   *
   * This is a software function used as a reference for the hardware
@@ -36,11 +36,11 @@ class TestAluVector(c: AluVector) extends PeekPokeTester(c) {
    val size = a.length
    val mask = helper.getMask(log2Ceil(width))
    val res = Array.fill(size) {0}
    if (opcode == 1) {
      for (i <- 0 until size) {
        res(i) = if (a(i) < b(i)) b(i) else a(i)
-      } 
+      }
    } else if (opcode == 2) {
      for (i <- 0 until size) {
        res(i) = a(i) + b(i)
@@ -62,7 +62,7 @@ class TestAluVector(c: AluVector) extends PeekPokeTester(c) {
      }
    }
    return res
-  } 
+  }
  val num_ops = ALU_OP_NUM
  for (i <- 0 until num_ops) {
@@ -73,18 +73,18 @@ class TestAluVector(c: AluVector) extends PeekPokeTester(c) {
    val in_a = dataGen.any
    val in_b = if (op != 4) dataGen.any else dataGen.negative
    val mask = helper.getMask(bits)
-    val res = aluRef(op, in_a, in_b, bits)  
+    val res = aluRef(op, in_a, in_b, bits)
    for (i <- 0 until c.blockOut) {
      poke(c.io.acc_a.data.bits(0)(i), in_a(i) & mask)
      poke(c.io.acc_b.data.bits(0)(i), in_b(i) & mask)
    }
-    poke(c.io.opcode, op) 
+    poke(c.io.opcode, op)
    poke(c.io.acc_a.data.valid, 1)
    poke(c.io.acc_b.data.valid, 1)
    poke(c.io.acc_y.data.valid, 1)
    step(1)
    poke(c.io.acc_a.data.valid, 0)
@@ -94,11 +94,11 @@ class TestAluVector(c: AluVector) extends PeekPokeTester(c) {
    // wait for valid signal
    while (peek(c.io.acc_y.data.valid) == BigInt(0)) {
      step(1) // advance clock
-    } 
+    }
    if (peek(c.io.acc_y.data.valid) == BigInt(1)) {
      for (i <- 0 until c.blockOut) {
          expect(c.io.acc_y.data.bits(0)(i), res(i) & mask)
      }
    }
-  } 
+  }
 }
--- a/vta/hardware/chisel/src/test/scala/unittest/Launcher.scala
+++ b/vta/hardware/chisel/src/test/scala/unittest/Launcher.scala
@@ -16,7 +16,7 @@
 * specific language governing permissions and limitations
 * under the License.
 */
 package unittest
 // taken from https://github.com/freechipsproject/chisel-testers
@@ -36,7 +36,7 @@ class TestConfig extends Config(new CoreConfig ++ new PynqConfig)
 *
 * How to Use:
 * When the user input: sbt 'test:runMain unittest.Launcher mvm'
- * the TestRunner will look for 'mvm' in the map and executes the 
+ * the TestRunner will look for 'mvm' in the map and executes the
 * test that 'mvm' is mapped to
 */
 object Launcher {
@@ -50,8 +50,8 @@ object Launcher {
 		"alu" -> { (manager: TesterOptionsManager) =>
      Driver.execute(() => new AluVector, manager) {
        (c) => new TestAluVector(c)
-      }   
+      }
-    } 
+    }
  )
  def main(args: Array[String]): Unit = {

--- a/vta/hardware/chisel/src/test/scala/unittest/MvmTest.scala
+++ b/vta/hardware/chisel/src/test/scala/unittest/MvmTest.scala
@@ -16,7 +16,7 @@
 * specific language governing permissions and limitations
 * under the License.
 */
 package unittest
 import chisel3._
@@ -27,7 +27,7 @@ import unittest.util._
 import vta.core._
 class TestMatrixVectorMultiplication(c: MatrixVectorMultiplication) extends PeekPokeTester(c) {
  /* mvm_ref
   *
   * This is a software function that computes dot product with a programmable shift
@@ -53,11 +53,11 @@ class TestMatrixVectorMultiplication(c: MatrixVectorMultiplication) extends Peek
    val wgtGen = new RandomArray(c.size, c.wgtBits)
    val in_a = inpGen.any
    val in_b = Array.fill(c.size) { wgtGen.any }
-    val res = mvmRef(in_a, in_b, 0)  
+    val res = mvmRef(in_a, in_b, 0)
    val inpMask = helper.getMask(c.inpBits)
    val wgtMask = helper.getMask(c.wgtBits)
    val accMask = helper.getMask(c.accBits)
    for (i <- 0 until c.size) {
      poke(c.io.inp.data.bits(0)(i), in_a(i) & inpMask)
      poke(c.io.acc_i.data.bits(0)(i), 0)
@@ -65,13 +65,13 @@ class TestMatrixVectorMultiplication(c: MatrixVectorMultiplication) extends Peek
        poke(c.io.wgt.data.bits(i)(j), in_b(i)(j) & wgtMask)
      }
    }
    poke(c.io.reset, 0)
    poke(c.io.inp.data.valid, 1)
    poke(c.io.wgt.data.valid, 1)
    poke(c.io.acc_i.data.valid, 1)
    step(1)
    poke(c.io.inp.data.valid, 0)
@@ -81,7 +81,7 @@ class TestMatrixVectorMultiplication(c: MatrixVectorMultiplication) extends Peek
    // wait for valid signal
    while (peek(c.io.acc_o.data.valid) == BigInt(0)) {
      step(1) // advance clock
-    } 
+    }
    if (peek(c.io.acc_o.data.valid) == BigInt(1)) {
      for (i <- 0 until c.size) {
          expect(c.io.acc_o.data.bits(0)(i), res(i) & accMask)

--- a/vta/hardware/chisel/src/test/scala/unittest/utils/Helper.scala
+++ b/vta/hardware/chisel/src/test/scala/unittest/utils/Helper.scala
--- a/vta/hardware/chisel/src/test/scala/unittest/utils/RandomArray.scala
+++ b/vta/hardware/chisel/src/test/scala/unittest/utils/RandomArray.scala
--- a/vta/hardware/chisel/src/test/scala/unittest/utils/TestRunner.scala
+++ b/vta/hardware/chisel/src/test/scala/unittest/utils/TestRunner.scala
--- a/vta/hardware/dpi/tsim_device.cc
+++ b/vta/hardware/dpi/tsim_device.cc
--- a/vta/hardware/intel/Makefile
+++ b/vta/hardware/intel/Makefile
--- a/vta/hardware/intel/README.md
+++ b/vta/hardware/intel/README.md
--- a/vta/hardware/intel/scripts/compile_design.tcl
+++ b/vta/hardware/intel/scripts/compile_design.tcl
@@ -67,7 +67,7 @@ if {$make_assignments} {
  set_global_assignment -name VERILOG_FILE ${PROJECT_NAME}.v
  set_global_assignment -name SIGNALTAP_FILE ${PROJECT_NAME}.stp
  set_global_assignment -name USE_SIGNALTAP_FILE ${PROJECT_NAME}.stp
  set_location_assignment PIN_V11 -to FPGA_CLK1_50
  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to FPGA_CLK1_50
  set_location_assignment PIN_Y13 -to FPGA_CLK2_50
@@ -91,7 +91,7 @@ if {$make_assignments} {
  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to LED[6]
  set_location_assignment PIN_AA23 -to LED[7]
  set_instance_assignment -name IO_STANDARD "3.3-V LVTTL" -to LED[7]
  for {set i 0} {$i < 32} {incr i} {
    set_instance_assignment -name IO_STANDARD "SSTL-15 CLASS I" -to HPS_DDR3_DQ[$i]
    set_instance_assignment -name INPUT_TERMINATION "PARALLEL 50 OHM WITH CALIBRATION" -to HPS_DDR3_DQ[$i] -tag __hps_sdram_p0
@@ -161,13 +161,13 @@ if {$make_assignments} {
  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_RESET_N -tag __hps_sdram_p0
  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_CK_P -tag __hps_sdram_p0
  set_instance_assignment -name PACKAGE_SKEW_COMPENSATION OFF -to HPS_DDR3_CK_N -tag __hps_sdram_p0
  set_instance_assignment -name PARTITION_HIERARCHY root_partition -to | -section_id Top
  # Commit assignments
  export_assignments
-  load_package flow 
+  load_package flow
  execute_flow -compile
  # Close project

--- a/vta/hardware/intel/scripts/de10_nano_top.v
+++ b/vta/hardware/intel/scripts/de10_nano_top.v
--- a/vta/hardware/intel/scripts/ip/vta/vta_hw.tcl
+++ b/vta/hardware/intel/scripts/ip/vta/vta_hw.tcl
--- a/vta/hardware/intel/scripts/set_attrs.py
+++ b/vta/hardware/intel/scripts/set_attrs.py
--- a/vta/hardware/intel/scripts/set_clocks.sdc
+++ b/vta/hardware/intel/scripts/set_clocks.sdc
--- a/vta/hardware/intel/scripts/soc_system.tcl
+++ b/vta/hardware/intel/scripts/soc_system.tcl
--- a/vta/hardware/xilinx/.gitignore
+++ b/vta/hardware/xilinx/.gitignore
--- a/vta/hardware/xilinx/Makefile
+++ b/vta/hardware/xilinx/Makefile
@@ -17,8 +17,8 @@
 # Directories
 ROOTDIR = $(CURDIR)
-VTA_DIR = $(CURDIR)/../..
+VTA_HW_DIR = $(CURDIR)/../..
-BUILD_DIR = $(VTA_DIR)/build/hardware/xilinx
+BUILD_DIR = $(VTA_HW_DIR)/build/hardware/xilinx
 SCRIPT_DIR = $(CURDIR)/scripts
 SRC_DIR = $(CURDIR)/src
@@ -27,7 +27,7 @@ VIVADO_HLS = vivado_hls
 VIVADO = vivado
 # Process VTA JSON config
-VTA_CONFIG := $(CURDIR)/../../config/vta_config.py
+VTA_CONFIG := $(VTA_HW_DIR)/config/vta_config.py
 # Derive config name
 CONF := $(shell python ${VTA_CONFIG} --cfg-str)
@@ -52,7 +52,7 @@ $(IP_PATH): $(SRC_DIR)/*
 		$(VIVADO_HLS) \
 		-f $(SCRIPT_DIR)/hls.tcl \
 		-tclargs \
-			$(VTA_DIR) \
+			$(VTA_HW_DIR) \
 			${VTA_CONFIG}
 $(BIT_PATH): $(IP_PATH)

--- a/vta/hardware/xilinx/README.md
+++ b/vta/hardware/xilinx/README.md
@@ -15,4 +15,4 @@
 <!--- specific language governing permissions and limitations -->
 <!--- under the License. -->
 Complete instructions on how to build custom FPGA hardware designs are available on the [TVM documentation webpage](https://docs.tvm.ai/vta/install.html#vta-fpga-toolchain-installation).
\ No newline at end of file
--- a/vta/hardware/xilinx/scripts/hls.tcl
+++ b/vta/hardware/xilinx/scripts/hls.tcl
@@ -5,9 +5,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

--- a/vta/hardware/xilinx/scripts/hsi.tcl
+++ b/vta/hardware/xilinx/scripts/hsi.tcl
@@ -5,9 +5,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

--- a/vta/hardware/xilinx/scripts/vivado.tcl
+++ b/vta/hardware/xilinx/scripts/vivado.tcl
@@ -5,9 +5,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

--- a/vta/hardware/xilinx/sim/vta_test.cc
+++ b/vta/hardware/xilinx/sim/vta_test.cc
@@ -6,9 +6,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
- * 
+ *
 *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

--- a/vta/hardware/xilinx/src/vta.cc
+++ b/vta/hardware/xilinx/src/vta.cc
@@ -6,9 +6,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
- * 
+ *
 *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

--- a/vta/hardware/xilinx/src/vta.h
+++ b/vta/hardware/xilinx/src/vta.h
@@ -6,9 +6,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
- * 
+ *
 *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -32,7 +32,7 @@
 #include <vta/hw_spec.h>
 /*!
-* Define HLS stream depth 
+* Define HLS stream depth
 */
 #define PRAGMA_SUB(x) _Pragma (#x)
 #define PRAGMA_HLS(x) PRAGMA_SUB(x)

--- a/vta/include/vta/dpi/module.h
+++ b/vta/include/vta/dpi/module.h
--- a/vta/include/vta/dpi/tsim.h
+++ b/vta/include/vta/dpi/tsim.h
--- a/vta/include/vta/driver.h
+++ b/vta/include/vta/driver.h
--- a/vta/include/vta/hw_spec.h
+++ b/vta/include/vta/hw_spec.h
@@ -6,9 +6,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
- * 
+ *
 *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

--- a/vta/include/vta/sim_tlpp.h
+++ b/vta/include/vta/sim_tlpp.h
@@ -6,9 +6,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
- * 
+ *
 *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -49,7 +49,7 @@ class TlppVerify {
    /*! Return TlppVerify class instance.*/
    static TlppVerify *Global() { static TlppVerify Cls; return &Cls;}
-    /*! 
+    /*!
     *  \brief Loop to process instruction and verify tlpp logic.
     *  \param run_function function pointer to execute instruction.
     *  \param fsim_handle class pointer of function simulator class Device.

--- a/vta/src/de10nano/cma_api.cc
+++ b/vta/src/de10nano/cma_api.cc
--- a/vta/src/de10nano/cma_api.h
+++ b/vta/src/de10nano/cma_api.h
--- a/vta/src/de10nano/de10nano_driver.cc
+++ b/vta/src/de10nano/de10nano_driver.cc
--- a/vta/src/de10nano/de10nano_driver.h
+++ b/vta/src/de10nano/de10nano_driver.h
--- a/vta/src/de10nano/de10nano_mgr.h
+++ b/vta/src/de10nano/de10nano_mgr.h
--- a/vta/src/dpi/module.cc
+++ b/vta/src/dpi/module.cc
--- a/vta/src/pynq/pynq_driver.cc
+++ b/vta/src/pynq/pynq_driver.cc
--- a/vta/src/pynq/pynq_driver.h
+++ b/vta/src/pynq/pynq_driver.h
--- a/vta/src/sim/sim_driver.cc
+++ b/vta/src/sim/sim_driver.cc
--- a/vta/src/sim/sim_tlpp.cc
+++ b/vta/src/sim/sim_tlpp.cc
--- a/vta/src/tsim/tsim_driver.cc
+++ b/vta/src/tsim/tsim_driver.cc
--- a/vta/src/vmem/virtual_memory.cc
+++ b/vta/src/vmem/virtual_memory.cc
--- a/vta/src/vmem/virtual_memory.h
+++ b/vta/src/vmem/virtual_memory.h
--- a/vta/tests/hardware/common/test_lib.cc
+++ b/vta/tests/hardware/common/test_lib.cc
@@ -6,9 +6,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
- * 
+ *
 *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

--- a/vta/tests/hardware/common/test_lib.h
+++ b/vta/tests/hardware/common/test_lib.h
@@ -6,9 +6,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
- * 
+ *
 *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

--- a/vta/tests/hardware/metal_test/Makefile
+++ b/vta/tests/hardware/metal_test/Makefile
--- a/vta/tests/hardware/metal_test/metal_test.cc
+++ b/vta/tests/hardware/metal_test/metal_test.cc