Commit f55609b4 by Thierry Moreau Committed by Jared Roesch

[VTA] Refactor to increase platform coverage (Ultra96 etc.) (#3496)

* hardware refactor for increased FPGA coverage, small optimizations

* fix header

* cleaning up parameters that won't be needed for now

* streamlining makefile, and simplifying tcl scripts

* moving parameter derivation into pkg_config.py, keeping tcl scripts lightweight

* refactoring tcl script to avoid global variables

* deriving AXI signals in pkg_config.py

* unifying address map definition for hardware and software drivers

* single channel design for ultra96 to simplify build

* enable alu by default, no mul opcode for now

* hardware fix

* new bitstream; vta version

* avoid error when env variable is not set

* ultra96 cleanup

* further cleaning up tcl script for bitstream generation

* preliminary rpc server support on ultra96

* rpc server tracker scripts

* ultra96 ldflag

* ultra96 support

* ultra96 support

* cleanup line

* cmake support for ultra96

* simplify memory instantiation

* cleaning up IP parameter initialization

* fix queue instantiation

* 2019.1 transition

* fix macro def

* removing bus width from config

* cleanup

* fix

* turning off testing for now

* cleanup ultra96 ps instantiation

* minor refactor

* adding comments

* upgrading to tophub v0.6

* model used in TVM target now refers to a specific version of VTA for better autoTVM scheduling

* revert change due to bug

* rename driver files to be for zynq-type devices

* streamlining address mapping

* unifying register map offset values between driver and hardware generator

* rely on cma library for cache flush/invalidation

* coherence management

* not make buffer packing depend on data types that can be wider than 64bits

* refactor config derivation to minimize free parameters

* fix environment/pkg config interaction

* adding cfg dump property to pkgconfig:

* fix rpc reconfig

* fix spacing

* cleanup

* fix spacing

* long line fix

* fix spacing and lint

* fix line length

* cmake fix

* environment fix

* renaming after pynq since the driver stack relies on the pynq library - see pynq.io

* update doc

* adding parameterization to  name

* space

* removing reg width

* vta RPC

* update doc on how to edit vta_config.json

* fix path

* fix path
parent bca8ac17
......@@ -17,7 +17,10 @@
# under the License.
PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )"
# Derive target specified by vta_config.json
VTA_CONFIG=${PROJROOT}/vta/config/vta_config.py
TARGET=$(python ${VTA_CONFIG} --target)
export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq
python3 -m vta.exec.rpc_server --tracker fleet:9190 --key pynq
python3 -m vta.exec.rpc_server --tracker fleet:9190 --key $TARGET
......@@ -38,11 +38,16 @@ elseif(PYTHON)
string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_=.]*" VTA_DEFINITIONS "${__vta_defs}")
file(GLOB VTA_RUNTIME_SRCS vta/src/*.cc)
file(GLOB __vta_target_srcs vta/src/${VTA_TARGET}/*.cc)
# Add sim driver sources
if(${VTA_TARGET} STREQUAL "sim")
file(GLOB __vta_target_srcs vta/src/sim/*.cc)
endif()
# Add pynq driver sources
if(${VTA_TARGET} STREQUAL "pynq" OR ${VTA_TARGET} STREQUAL "ultra96")
file(GLOB __vta_target_srcs vta/src/pynq/*.cc)
endif()
list(APPEND VTA_RUNTIME_SRCS ${__vta_target_srcs})
add_library(vta SHARED ${VTA_RUNTIME_SRCS})
# Add tsim driver sources
if(${VTA_TARGET} STREQUAL "tsim")
target_compile_definitions(vta PUBLIC USE_TSIM)
include_directories("vta/include")
......@@ -50,6 +55,8 @@ elseif(PYTHON)
list(APPEND RUNTIME_SRCS ${RUNTIME_DPI_SRCS})
endif()
add_library(vta SHARED ${VTA_RUNTIME_SRCS})
target_include_directories(vta PUBLIC vta/include)
foreach(__def ${VTA_DEFINITIONS})
......@@ -62,7 +69,7 @@ elseif(PYTHON)
endif(APPLE)
# PYNQ rules for Pynq v2.4
if(${VTA_TARGET} STREQUAL "pynq")
if(${VTA_TARGET} STREQUAL "pynq" OR ${VTA_TARGET} STREQUAL "ultra96")
find_library(__cma_lib NAMES cma PATH /usr/lib)
target_link_libraries(vta ${__cma_lib})
endif()
......
......@@ -36,10 +36,6 @@ below.
+=======================+============+========================================================+
| ``TARGET`` | String | The TVM device target. |
+-----------------------+------------+--------------------------------------------------------+
| ``HW_TARGET`` | Int | FPGA frequency in MHz. |
+-----------------------+------------+--------------------------------------------------------+
| ``HW_CLK_TARGET`` | Int | FPGA clock period in ns target for HLS tool. |
+-----------------------+------------+--------------------------------------------------------+
| ``HW_VER`` | String | VTA hardware version number. |
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_INP_WIDTH`` | Int (log2) | Input data type signed integer width. |
......@@ -48,13 +44,9 @@ below.
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_ACC_WIDTH`` | Int (log2) | Accumulator data type signed integer width. |
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_OUT_WIDTH`` | Int (log2) | Output data type signed integer width. |
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_BATCH`` | Int (log2) | VTA matrix multiply intrinsic output dimension 0. |
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_BLOCK_IN`` | Int (log2) | VTA matrix multiply reduction dimension. |
| ``LOG_BATCH`` | Int (log2) | VTA matrix multiply intrinsic input/output dimension 0.|
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_BLOCK_OUT`` | Int (log2) | VTA matrix multiply intrinsic output dimension 1. |
| ``LOG_BLOCK`` | Int (log2) | VTA matrix multiply inner dimensions. |
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_UOP_BUFF_SIZE`` | Int (log2) | Micro-op on-chip buffer in Bytes. |
+-----------------------+------------+--------------------------------------------------------+
......@@ -75,13 +67,8 @@ below.
We provide additional detail below regarding each parameter:
- ``TARGET``: Can be set to ``"pynq"`` or ``"sim"``.
- ``HW_TARGET``: In pynq mode, can be set to ``100``, ``142``, ``167``, or ``200`` MHz.
- ``HW_CLK_TARGET``: The lower the target, the more pipeline stages HLS will insert to achieve timing closure during place and route (this can also slightly decrease performance).
- ``TARGET``: Can be set to ``"pynq"``, ``"ultra96"``, ``"sim"`` (fast simulator), or ``"tsim"`` (cycle accurate sim with verilator).
- ``HW_VER``: Hardware version which increments every time the VTA hardware design changes. This parameter is used to uniquely identify hardware bitstreams.
- ``LOG_OUT_WIDTH``: We recommend matching ``LOG_OUT_WIDTH`` to ``LOG_INP_WIDTH``.
- ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension.
- ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension.
- ``LOG_BLOCK_IN``: Equivalent to B in multiplication of shape (A, B) x (B, C), or typically, the input channel dimension.
- ``LOG_BLOCK_OUT``: Equivalent to C in multiplication of shape (A, B) x (B, C), or typically, the output channel dimension.
- ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension of inner tensor computation.
- ``LOG_BLOCK``: Equivalent to B and C in multiplication of shape (A, B) x (B, C), or typically, the input/output channel dimensions of the inner tensor computation.
......@@ -61,7 +61,7 @@ To do so,
```bash
cd <tvm root>
cp vta/config/vta_config.json vta_config.json
vim vta/config/vta_config.json
# edit vta_config.json
make vta
```
......@@ -118,7 +118,7 @@ cd /home/xilinx/tvm
mkdir build
cp cmake/config.cmake build/.
# Copy pynq specific configuration
cp vta/config/pynq_sample.json build/vta_config.json
cp vta/config/pynq_sample.json vta/config/vta_config.json
cd build
cmake ..
make runtime vta -j2
......@@ -147,13 +147,12 @@ export VTA_PYNQ_RPC_PORT=9091
```
In addition, you'll need to edit the `vta_config.json` file on the host to indicate that we are targeting the Pynq platform, by setting the `TARGET` field to `"pynq"`.
Alternatively, you can copy the default `vta/config/pynq_sample.json` into the TVM root as `vta_config.json`.
> Note: in contrast to our simulation setup, there are no libraries to compile on the host side since the host offloads all of the computation to the Pynq board.
```bash
# On the Host-side
cd <tvm root>
cp vta/config/pynq_sample.json vta_config.json
cp vta/config/pynq_sample.json vta/config/vta_config.json
```
This time again, we will run the 2D convolution testbench.
......@@ -187,28 +186,28 @@ This third and last guide allows users to generate custom VTA bitstreams using f
### Xilinx Toolchain Installation
We recommend using `Vivado 2018.2` since our scripts have been tested to work on this version of the Xilinx toolchains.
We recommend using `Vivado 2019.1` since our scripts have been tested to work on this version of the Xilinx toolchains.
Our guide is written for Linux (Ubuntu) installation.
You’ll need to install Xilinx’ FPGA compilation toolchain, [Vivado HL WebPACK 2018.2](https://www.xilinx.com/products/design-tools/vivado.html), which is a license-free version of the Vivado HLx toolchain.
You’ll need to install Xilinx’ FPGA compilation toolchain, [Vivado HL WebPACK 2019.1](https://www.xilinx.com/products/design-tools/vivado.html), which is a license-free version of the Vivado HLx toolchain.
#### Obtaining and Launching the Vivado GUI Installer
1. Go to the [download webpage](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/vivado-design-tools/2018-2.html), and download the Linux Self Extracting Web Installer for Vivado HLx 2018.2: WebPACK and Editions.
1. Go to the [download webpage](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/vivado-design-tools/2019-1.html), and download the Linux Self Extracting Web Installer for Vivado HLx 2019.1: WebPACK and Editions.
2. You’ll have to sign in with a Xilinx account. This requires a Xilinx account creation that will take 2 minutes.
3. Complete the Name and Address Verification by clicking “Next”, and you will get the opportunity to download a binary file, called `Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin`.
3. Complete the Name and Address Verification by clicking “Next”, and you will get the opportunity to download a binary file, called `Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin`.
4. Now that the file is downloaded, go to your `Downloads` directory, and change the file permissions so it can be executed:
```bash
chmod u+x Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin
chmod u+x Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin
```
5. Now you can execute the binary:
```bash
./Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin
./Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin
```
#### Xilinx Vivado GUI Installer Steps
At this point you've launched the Vivado 2018.2 Installer GUI program.
At this point you've launched the Vivado 2019.1 Installer GUI program.
1. Click “Next” on the *Welcome* screen.
2. On the *Select Install Type* screen, enter your Xilinx user credentials under the “User Authentication” box and select the “Download and Install Now” option before clicking “Next” .
......@@ -230,8 +229,8 @@ At this point you've launched the Vivado 2018.2 Installer GUI program.
The last step is to update your `~/.bashrc` with the following lines. This will include all of the Xilinx binary paths so you can launch compilation scripts from the command line.
```bash
# Xilinx Vivado 2018.2 environment
export XILINX_VIVADO=${XILINX_PATH}/Vivado/2018.2
# Xilinx Vivado 2019.1 environment
export XILINX_VIVADO=${XILINX_PATH}/Vivado/2019.1
export PATH=${XILINX_VIVADO}/bin:${PATH}
```
......
......@@ -44,7 +44,7 @@ PACKAGE_VERSION = {
'opencl': "v0.02",
'mali': "v0.05",
'vta': "v0.05",
'vta': "v0.06",
}
logger = logging.getLogger('autotvm')
......
{
"TARGET" : "pynq",
"HW_FREQ" : 100,
"HW_CLK_TARGET" : 8,
"HW_VER" : "0.0.0",
"HW_VER" : "0.0.1",
"LOG_INP_WIDTH" : 3,
"LOG_WGT_WIDTH" : 3,
"LOG_ACC_WIDTH" : 5,
"LOG_OUT_WIDTH" : 3,
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_BLOCK" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" :15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}
{
"TARGET" : "ultra96",
"HW_VER" : "0.0.1",
"LOG_INP_WIDTH" : 3,
"LOG_WGT_WIDTH" : 3,
"LOG_ACC_WIDTH" : 5,
"LOG_BATCH" : 0,
"LOG_BLOCK" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" :15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}
{
"TARGET" : "sim",
"HW_FREQ" : 100,
"HW_CLK_TARGET" : 7,
"HW_VER" : "0.0.0",
"HW_VER" : "0.0.1",
"LOG_INP_WIDTH" : 3,
"LOG_WGT_WIDTH" : 3,
"LOG_ACC_WIDTH" : 5,
"LOG_OUT_WIDTH" : 3,
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_BLOCK" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
......
......@@ -30,7 +30,6 @@ def get_pkg_config(cfg):
PkgConfig = libpkg["PkgConfig"]
return PkgConfig(cfg, proj_root)
def main():
"""Main function"""
parser = argparse.ArgumentParser()
......@@ -45,7 +44,7 @@ def main():
parser.add_argument("--update", action="store_true",
help="Print out the json option.")
parser.add_argument("--ldflags", action="store_true",
help="print the cflags")
help="print the ldflags")
parser.add_argument("--cfg-json", action="store_true",
help="print all the config json")
parser.add_argument("--save-cfg-json", type=str, default="",
......@@ -54,33 +53,51 @@ def main():
help="print the target")
parser.add_argument("--cfg-str", action="store_true",
help="print the configuration string")
parser.add_argument("--get-inpwidth", action="store_true",
help="returns log of input bitwidth")
parser.add_argument("--get-wgtwidth", action="store_true",
help="returns log of weight bitwidth")
parser.add_argument("--get-accwidth", action="store_true",
help="returns log of accum bitwidth")
parser.add_argument("--get-outwidth", action="store_true",
help="returns log of output bitwidth")
parser.add_argument("--get-batch", action="store_true",
help="returns log of tensor batch dimension")
parser.add_argument("--get-blockin", action="store_true",
help="returns log of tensor block in dimension")
parser.add_argument("--get-blockout", action="store_true",
help="returns log of tensor block out dimension")
parser.add_argument("--get-uopbuffsize", action="store_true",
help="returns log of micro-op buffer size in B")
parser.add_argument("--get-inpbuffsize", action="store_true",
help="returns log of input buffer size in B")
parser.add_argument("--get-wgtbuffsize", action="store_true",
help="returns log of weight buffer size in B")
parser.add_argument("--get-accbuffsize", action="store_true",
help="returns log of accum buffer size in B")
parser.add_argument("--get-outbuffsize", action="store_true",
help="returns log of output buffer size in B")
parser.add_argument("--get-fpgafreq", action="store_true",
parser.add_argument("--get-inp-mem-banks", action="store_true",
help="returns number of input memory banks")
parser.add_argument("--get-inp-mem-width", action="store_true",
help="returns input memory read/write port width")
parser.add_argument("--get-inp-mem-depth", action="store_true",
help="returns input memory depth")
parser.add_argument("--get-inp-mem-axi-ratio", action="store_true",
help="returns ratio between input element width and axi width")
parser.add_argument("--get-wgt-mem-banks", action="store_true",
help="returns number of weight memory banks")
parser.add_argument("--get-wgt-mem-width", action="store_true",
help="returns weight memory read/write port width")
parser.add_argument("--get-wgt-mem-depth", action="store_true",
help="returns weight memory depth")
parser.add_argument("--get-wgt-mem-axi-ratio", action="store_true",
help="returns ratio between weight element width and axi width")
parser.add_argument("--get-out-mem-banks", action="store_true",
help="returns number of output memory banks")
parser.add_argument("--get-out-mem-width", action="store_true",
help="returns output memory read/write port width")
parser.add_argument("--get-out-mem-depth", action="store_true",
help="returns output memory depth")
parser.add_argument("--get-out-mem-axi-ratio", action="store_true",
help="returns ratio between output element width and axi width")
parser.add_argument("--get-axi-cache-bits", action="store_true",
help="returns AXI system ARCACHE/AWCACHE hardcoded bit value")
parser.add_argument("--get-axi-prot-bits", action="store_true",
help="returns AXI system ARPROT/AWPROT hardcoded bit value")
parser.add_argument("--get-ip-reg-map-range", action="store_true",
help="returns ip register map address range")
parser.add_argument("--get-fetch-base-addr", action="store_true",
help="returns fetch module base address")
parser.add_argument("--get-load-base-addr", action="store_true",
help="returns load module base address")
parser.add_argument("--get-compute-base-addr", action="store_true",
help="returns compute module base address")
parser.add_argument("--get-store-base-addr", action="store_true",
help="returns store module base address")
parser.add_argument("--get-fpga-dev", action="store_true",
help="returns FPGA device target")
parser.add_argument("--get-fpga-family", action="store_true",
help="returns FPGA device family")
parser.add_argument("--get-fpga-freq", action="store_true",
help="returns FPGA frequency")
parser.add_argument("--get-fpgaper", action="store_true",
parser.add_argument("--get-fpga-per", action="store_true",
help="returns HLS target clock period")
args = parser.parse_args()
......@@ -92,8 +109,6 @@ def main():
os.path.abspath(os.path.expanduser(__file__)))
proj_root = os.path.abspath(os.path.join(curr_path, "../../"))
path_list = [
os.path.join(proj_root, "vta_config.json"),
os.path.join(proj_root, "build", "vta_config.json"),
os.path.join(proj_root, "vta/config/vta_config.json")
]
if args.use_cfg:
......@@ -102,14 +117,11 @@ def main():
if not ok_path_list:
raise RuntimeError("Cannot find config in %s" % str(path_list))
cfg = json.load(open(ok_path_list[0]))
cfg["LOG_OUT_BUFF_SIZE"] = (
cfg["LOG_ACC_BUFF_SIZE"] +
cfg["LOG_OUT_WIDTH"] -
cfg["LOG_ACC_WIDTH"])
pkg = get_pkg_config(cfg)
if args.target:
print(pkg.target)
print(pkg.TARGET)
if args.defs:
print(" ".join(pkg.macro_defs))
......@@ -119,8 +131,10 @@ def main():
if args.cflags:
cflags_str = " ".join(pkg.cflags)
if cfg["TARGET"] == "pynq":
if pkg.TARGET == "pynq":
cflags_str += " -DVTA_TARGET_PYNQ"
if pkg.TARGET == "ultra96":
cflags_str += " -DVTA_TARGET_ULTRA96"
print(cflags_str)
if args.ldflags:
......@@ -134,63 +148,76 @@ def main():
fo.write(pkg.cfg_json)
if args.cfg_str:
# Needs to match the BITSTREAM string in python/vta/environment.py
cfg_str = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}".format(
(1 << cfg["LOG_BATCH"]),
(1 << cfg["LOG_BLOCK_IN"]),
(1 << cfg["LOG_BLOCK_OUT"]),
(1 << cfg["LOG_INP_WIDTH"]),
(1 << cfg["LOG_WGT_WIDTH"]),
cfg["LOG_UOP_BUFF_SIZE"],
cfg["LOG_INP_BUFF_SIZE"],
cfg["LOG_WGT_BUFF_SIZE"],
cfg["LOG_ACC_BUFF_SIZE"],
cfg["HW_FREQ"],
cfg["HW_CLK_TARGET"],
cfg["HW_VER"].replace('.', '_'))
print(cfg_str)
print(pkg.TARGET + "_" + pkg.bitstream)
if args.get_inp_mem_banks:
print(pkg.inp_mem_banks)
if args.get_inp_mem_width:
print(pkg.inp_mem_width)
if args.get_inp_mem_depth:
print(pkg.inp_mem_depth)
if args.get_inp_mem_axi_ratio:
print(pkg.inp_mem_axi_ratio)
if args.get_wgt_mem_banks:
print(pkg.wgt_mem_banks)
if args.get_wgt_mem_width:
print(pkg.wgt_mem_width)
if args.get_wgt_mem_depth:
print(pkg.wgt_mem_depth)
if args.get_wgt_mem_axi_ratio:
print(pkg.wgt_mem_axi_ratio)
if args.get_out_mem_banks:
print(pkg.out_mem_banks)
if args.get_inpwidth:
print(cfg["LOG_INP_WIDTH"])
if args.get_out_mem_width:
print(pkg.out_mem_width)
if args.get_wgtwidth:
print(cfg["LOG_WGT_WIDTH"])
if args.get_out_mem_depth:
print(pkg.out_mem_depth)
if args.get_accwidth:
print(cfg["LOG_ACC_WIDTH"])
if args.get_out_mem_axi_ratio:
print(pkg.out_mem_axi_ratio)
if args.get_outwidth:
print(cfg["LOG_OUT_WIDTH"])
if args.get_axi_cache_bits:
print(pkg.axi_cache_bits)
if args.get_batch:
print(cfg["LOG_BATCH"])
if args.get_axi_prot_bits:
print(pkg.axi_prot_bits)
if args.get_blockin:
print(cfg["LOG_BLOCK_IN"])
if args.get_ip_reg_map_range:
print(pkg.ip_reg_map_range)
if args.get_blockout:
print(cfg["LOG_BLOCK_OUT"])
if args.get_fetch_base_addr:
print(pkg.fetch_base_addr)
if args.get_uopbuffsize:
print(cfg["LOG_UOP_BUFF_SIZE"])
if args.get_load_base_addr:
print(pkg.load_base_addr)
if args.get_inpbuffsize:
print(cfg["LOG_INP_BUFF_SIZE"])
if args.get_compute_base_addr:
print(pkg.compute_base_addr)
if args.get_wgtbuffsize:
print(cfg["LOG_WGT_BUFF_SIZE"])
if args.get_store_base_addr:
print(pkg.store_base_addr)
if args.get_outbuffsize:
print(cfg["LOG_OUT_BUFF_SIZE"])
if args.get_fpga_dev:
print(pkg.fpga_device)
if args.get_accbuffsize:
print(cfg["LOG_ACC_BUFF_SIZE"])
if args.get_fpga_family:
print(pkg.fpga_family)
if args.get_fpgafreq:
print(cfg["HW_FREQ"])
if args.get_fpga_freq:
print(pkg.fpga_freq)
if args.get_fpgaper:
print(cfg["HW_CLK_TARGET"])
if args.get_fpga_per:
print(pkg.fpga_per)
if __name__ == "__main__":
main()
......@@ -17,81 +17,30 @@
# Directories
ROOTDIR = $(CURDIR)
BUILD_NAME = build
BUILD_DIR = $(ROOTDIR)/../../$(BUILD_NAME)/hardware/xilinx
SCRIPT_DIR = $(ROOTDIR)/scripts
SRC_DIR = $(ROOTDIR)/src
SIM_DIR = $(ROOTDIR)/sim
TEST_DIR = $(ROOTDIR)/../../tests/hardware/common
INCLUDE_DIR = $(ROOTDIR)/../../include
VTA_DIR = $(CURDIR)/../..
BUILD_DIR = $(VTA_DIR)/build/hardware/xilinx
SCRIPT_DIR = $(CURDIR)/scripts
SRC_DIR = $(CURDIR)/src
# Executables
VIVADO_HLS = vivado_hls
VIVADO = vivado
HSI = hsi
# HLS mode
MODE = skip_sim
# Debug flag
DEBUG = false
# SLURM
SLURM = false
# Prevent generation of DSP
NO_DSP = false
# Prevent generation of ALU
NO_ALU = false
# Process VTA JSON config
VTA_CONFIG = python $(CURDIR)/../../config/vta_config.py
CFLAGS := $(shell ${VTA_CONFIG} --cflags)
VTA_TARGET := $(shell ${VTA_CONFIG} --target)
#---------------------
# VTA Parameters
#--------------------
VTA_INP_WIDTH := $(shell ${VTA_CONFIG} --get-inpwidth)
VTA_WGT_WIDTH := $(shell ${VTA_CONFIG} --get-wgtwidth)
VTA_ACC_WIDTH := $(shell ${VTA_CONFIG} --get-accwidth)
VTA_OUT_WIDTH := $(shell ${VTA_CONFIG} --get-outwidth)
VTA_BATCH := $(shell ${VTA_CONFIG} --get-batch)
VTA_IN_BLOCK := $(shell ${VTA_CONFIG} --get-blockin)
VTA_OUT_BLOCK := $(shell ${VTA_CONFIG} --get-blockout)
VTA_UOP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-uopbuffsize)
VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize)
VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize)
VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize)
VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize)
#---------------------
# FPGA Parameters
#--------------------
VTA_CLOCK_FREQ = $(shell ${VTA_CONFIG} --get-fpgafreq)
VTA_TARGET_PER = $(shell ${VTA_CONFIG} --get-fpgaper)
#---------------------
# Compilation parameters
#--------------------
# Number of threads during compilation
VTA_HW_COMP_THREADS = 8
VTA_CONFIG := $(CURDIR)/../../config/vta_config.py
# Derive config name
CONF = $(shell ${VTA_CONFIG} --cfg-str)
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
ifeq ($(SLURM), true)
IP_BUILD_PATH = /scratch/hls/$(CONF)
HW_BUILD_PATH = /scratch/vivado/$(CONF)
endif
CONF := $(shell python ${VTA_CONFIG} --cfg-str)
IP_BUILD_PATH := $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH := $(BUILD_DIR)/vivado/$(CONF)
# IP file path
IP_PATH = $(BUILD_DIR)/hls/$(CONF)/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip
IP_PATH := $(BUILD_DIR)/hls/$(CONF)/vta_compute/soln/impl/ip/xilinx_com_hls_compute_1_0.zip
# Bitstream file path
BIT_PATH = $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit
BIT_PATH := $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit
.PHONY: all ip bit bsp clean clean_all
.PHONY: all ip bit clean clean_all
all: bit
ip: $(IP_PATH)
......@@ -100,37 +49,24 @@ bit: $(BIT_PATH)
$(IP_PATH): $(SRC_DIR)/*
mkdir -p $(IP_BUILD_PATH)
cd $(IP_BUILD_PATH) && \
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
$(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(VTA_TARGET_PER) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
$(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
ifeq ($(SLURM), true)
mkdir -p $(BUILD_DIR)/hls
mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/.
endif
$(VIVADO_HLS) \
-f $(SCRIPT_DIR)/hls.tcl \
-tclargs \
$(VTA_DIR) \
${VTA_CONFIG}
$(BIT_PATH): $(IP_PATH)
mkdir -p $(HW_BUILD_PATH)
cd $(HW_BUILD_PATH) && \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
-tclargs $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) $(VTA_CLOCK_FREQ) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
ifeq ($(SLURM), true)
mkdir -p $(BUILD_DIR)/vivado
mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/.
endif
bsp: $(BIT_PATH)
cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
cd $(HW_BUILD_PATH)/bsp && make
$(VIVADO) \
-mode tcl \
-source $(SCRIPT_DIR)/vivado.tcl \
-tclargs \
$(BUILD_DIR)/hls/$(CONF) \
${VTA_CONFIG}
clean:
rm -rf *.out *.log *.sb figures
rm -rf *.out *.log
cleanall: clean
rm -rf $(BUILD_DIR)
......@@ -14,220 +14,125 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2018 by Contributors
# file: hls.tcl
# brief: HLS generation script.
#
# Command line arguments:
# Arg 1: path to design sources
# Arg 2: path to sim sources
# Arg 3: path to test sources
# Arg 4: path to include sources
# Arg 5: mode
# Arg 6: debug
# Arg 7: no_dsp
# Arg 8: no_alu
# Arg 9: target clock period
# Arg 10: input type width (log)
# Arg 11: weight type width (log)
# Arg 12: accum type width (log)
# Arg 13: output type width (log)
# Arg 14: batch size (log)
# Arg 15: in block size (log)
# Arg 16: out block size (log)
# Arg 17: uop buffer size in B (log)
# Arg 18: inp buffer size in B (log)
# Arg 19: wgt buffer size in B (log)
# Arg 20: acc buffer size in B (log)
# Arg 21: out buffer size in B (log)
if { [llength $argv] eq 23 } {
set src_dir [lindex $argv 2]
set sim_dir [lindex $argv 3]
set test_dir [lindex $argv 4]
set include_dir [lindex $argv 5]
set mode [lindex $argv 6]
set debug [lindex $argv 7]
set no_dsp [lindex $argv 8]
set no_alu [lindex $argv 9]
set target_period [lindex $argv 10]
set inp_width [lindex $argv 11]
set wgt_width [lindex $argv 12]
set acc_width [lindex $argv 13]
set out_width [lindex $argv 14]
set batch [lindex $argv 15]
set block_in [lindex $argv 16]
set block_out [lindex $argv 17]
set uop_buff_size [lindex $argv 18]
set inp_buff_size [lindex $argv 19]
set wgt_buff_size [lindex $argv 20]
set acc_buff_size [lindex $argv 21]
set out_buff_size [lindex $argv 22]
# Arg 1: path to vta root
# Arg 2: path of config param script
if { [llength $argv] eq 4 } {
set root_dir [lindex $argv 2]
set vta_config [lindex $argv 3]
} else {
set src_dir "../src"
set sim_dir "../sim"
set test_dir "../../src/test"
set include_dir "../../include"
set mode "all"
set debug "false"
set no_dsp "true"
set no_alu "false"
set target_period 10
set inp_width 3
set wgt_width 3
set acc_width 5
set out_width 3
set batch 1
set block_in 4
set block_out 4
set uop_buff_size 15
set inp_buff_size 15
set wgt_buff_size 15
set acc_buff_size 17
set out_buff_size 15
puts "Not enough arguments provided!"
exit
}
# Derive paths
set src_dir "$root_dir/hardware/xilinx/src"
set sim_dir "$root_dir/hardware/xilinx/sim"
set test_dir "$root_dir/tests/hardware/common"
# C define flags that we want to pass to the compiler
set cflags [exec python $vta_config --cflags]
# Get the VTA configuration parameters
set ::device [exec python $vta_config --get-fpga-dev]
set ::period [exec python $vta_config --get-fpga-per]
# Get the VTA SRAM reshape/partition factors to get all memories
# to be of the same axi width.
set ::inp_reshape_factor [exec python $vta_config --get-inp-mem-axi-ratio]
set ::inp_partition_factor [exec python $vta_config --get-inp-mem-banks]
set ::wgt_reshape_factor [exec python $vta_config --get-wgt-mem-axi-ratio]
set ::wgt_partition_factor [exec python $vta_config --get-wgt-mem-banks]
set ::out_reshape_factor [exec python $vta_config --get-out-mem-axi-ratio]
set ::out_partition_factor [exec python $vta_config --get-out-mem-banks]
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
# This is necessary because of a Vivado restriction that doesn't allow for
# buses wider than 1024 bits.
proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
proc init_design {} {
# Set device number
set_part {xc7z020clg484-1}
# Set device id
set_part $::device
# Set the clock frequency
create_clock -period $per -name default
# Set input partition factor to (INP_VECTOR_WIDTH*BATCH/1024)
set inp_partition_factor [expr {(1 << ($inp_width + $block_in + $batch)) / 1024}]
if {$inp_partition_factor == 0} {
set_directive_array_reshape -type complete -dim 2 "load" inp_mem
set_directive_array_reshape -type complete -dim 2 "compute" inp_mem
} else {
# Set input reshaping factor below to (1024/INP_VECTOR_WIDTH)
set inp_reshape_factor [expr {1024 / (1 << ($inp_width + $block_in))}]
set_directive_array_partition -type block -factor $inp_partition_factor -dim 2 "load" inp_mem
set_directive_array_partition -type block -factor $inp_partition_factor -dim 2 "compute" inp_mem
set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "load" inp_mem
set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "compute" inp_mem
create_clock -period $::period -name default
# HLS pragmas to reshape/partition the input memory read/write port
set_directive_array_reshape -type block -factor $::inp_reshape_factor -dim 2 "load" inp_mem
set_directive_array_reshape -type block -factor $::inp_reshape_factor -dim 2 "compute" inp_mem
if {$::inp_partition_factor > 1} {
set_directive_array_partition -type block -factor $::inp_partition_factor -dim 2 "load" inp_mem
set_directive_array_partition -type block -factor $::inp_partition_factor -dim 2 "compute" inp_mem
}
# Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/1024)
set wgt_partition_factor [expr {(1 << ($wgt_width + $block_in + $block_out)) / 1024}]
if {$wgt_partition_factor == 0} {
set_directive_array_reshape -type complete -dim 2 "load" wgt_mem
set_directive_array_reshape -type complete -dim 2 "compute" wgt_mem
} else {
# Set weight reshaping factor below to (1024/WGT_VECTOR_WIDTH)
set wgt_reshape_factor [expr {1024 / (1 << ($wgt_width + $block_in))}]
set_directive_array_partition -type block -factor $wgt_partition_factor -dim 2 "load" wgt_mem
set_directive_array_partition -type block -factor $wgt_partition_factor -dim 2 "compute" wgt_mem
set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "load" wgt_mem
set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "compute" wgt_mem
# HLS pragmas to reshape/partition the weight memory read/write port
set_directive_array_reshape -type block -factor $::wgt_reshape_factor -dim 2 "load" wgt_mem
set_directive_array_reshape -type block -factor $::wgt_reshape_factor -dim 2 "compute" wgt_mem
if {$::wgt_partition_factor >1} {
set_directive_array_partition -type block -factor $::wgt_partition_factor -dim 2 "load" wgt_mem
set_directive_array_partition -type block -factor $::wgt_partition_factor -dim 2 "compute" wgt_mem
}
# Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/1024)
set out_partition_factor [expr {(1 << ($out_width + $block_out + $batch)) / 1024}]
if {$out_partition_factor == 0} {
set_directive_array_reshape -type complete -dim 2 "compute" out_mem
set_directive_array_reshape -type complete -dim 2 "store" out_mem
} else {
# Set output reshaping factor below to (1024/OUT_VECTOR_WIDTH)
set out_reshape_factor [expr {1024 / (1 << ($out_width + $block_out))}]
set_directive_array_partition -type block -factor $out_partition_factor -dim 2 "compute" out_mem
set_directive_array_partition -type block -factor $out_partition_factor -dim 2 "store" out_mem
set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "compute" out_mem
set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "store" out_mem
# HLS pragmas to reshape/partition the output memory read/write port
set_directive_array_reshape -type block -factor $::out_reshape_factor -dim 2 "compute" out_mem
set_directive_array_reshape -type block -factor $::out_reshape_factor -dim 2 "store" out_mem
if {$::out_partition_factor > 1} {
set_directive_array_partition -type block -factor $::out_partition_factor -dim 2 "compute" out_mem
set_directive_array_partition -type block -factor $::out_partition_factor -dim 2 "store" out_mem
}
}
# C define flags to pass to compiler
set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
-DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \
-DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
if {$debug=="true"} {
append cflags " -DVTA_DEBUG=1"
}
if {$no_dsp=="true"} {
append cflags " -DNO_DSP"
}
if {$no_alu=="true"} {
append cflags " -DNO_ALU"
}
# HLS behavioral sim
if {$mode=="all" || $mode=="sim"} {
open_project vta_sim
set_top vta
add_files $src_dir/vta.cc -cflags $cflags
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
add_files -tb $test_dir/test_lib.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csim_design -clean
close_project
}
# Behavioral simulation project: top-level function is `vta`; vta_test.cc and
# test_lib.cc are added as testbench (-tb) sources, all compiled with $cflags.
open_project vta_sim
set_top vta
add_files $src_dir/vta.cc -cflags $cflags
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
add_files -tb $test_dir/test_lib.cc -cflags $cflags
open_solution "soln"
# NOTE(review): init_design is presumably the proc defined earlier in this
# script that sets the clock and the memory reshape/partition directives.
init_design
# Run the C simulation, then close the project.
csim_design -clean
close_project
# Generate fetch stage
if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} {
open_project vta_fetch
set_top fetch
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
}
close_project
}
# Fetch stage project: synthesize the `fetch` function from vta.cc in
# solution "soln" and export the result in ip_catalog format.
open_project vta_fetch
set_top fetch
add_files $src_dir/vta.cc -cflags $cflags
open_solution "soln"
# NOTE(review): init_design is presumably the proc defined earlier in this
# script that sets the clock and the memory reshape/partition directives.
init_design
csynth_design
export_design -format ip_catalog
close_project
# Generate load stage
if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} {
open_project vta_load
set_top load
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
}
close_project
}
# Load stage project: synthesize the `load` function from vta.cc in
# solution "soln" and export the result in ip_catalog format.
open_project vta_load
set_top load
add_files $src_dir/vta.cc -cflags $cflags
open_solution "soln"
# NOTE(review): init_design is presumably the proc defined earlier in this
# script that sets the clock and the memory reshape/partition directives.
init_design
csynth_design
export_design -format ip_catalog
close_project
# Generate compute stage
if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} {
open_project vta_compute
set_top compute
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
}
close_project
}
# Compute stage project: synthesize the `compute` function from vta.cc in
# solution "soln" and export the result in ip_catalog format.
open_project vta_compute
set_top compute
add_files $src_dir/vta.cc -cflags $cflags
open_solution "soln"
# NOTE(review): init_design is presumably the proc defined earlier in this
# script that sets the clock and the memory reshape/partition directives.
init_design
csynth_design
export_design -format ip_catalog
close_project
# Generate store stage
if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} {
open_project vta_store
set_top store
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
}
close_project
}
# Store stage project: synthesize the `store` function from vta.cc in
# solution "soln" and export the result in ip_catalog format.
open_project vta_store
set_top store
add_files $src_dir/vta.cc -cflags $cflags
open_solution "soln"
# NOTE(review): init_design is presumably the proc defined earlier in this
# script that sets the clock and the memory reshape/partition directives.
init_design
csynth_design
export_design -format ip_catalog
close_project
exit
......@@ -14,107 +14,67 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2018 by Xilinx, Contributors
# file: vivado.tcl
# brief: Vivado compilation script. Partially automatically generated
# by Vivado.
#
# Check if script is running in correct Vivado version.
set scripts_vivado_version 2018.2
set scripts_vivado_version 2018.3
set current_vivado_version [version -short]
if { [string first $scripts_vivado_version $current_vivado_version] == -1 } {
puts ""
catch {common::send_msg_id "BD_TCL-109" "ERROR" "This script was generated using Vivado \
<$scripts_vivado_version> and is being run in <$current_vivado_version> of Vivado. \
Please run the script in Vivado <$scripts_vivado_version> then open the design in Vivado \
<$current_vivado_version>. Upgrade the design by running \"Tools => Report => Report IP \
Status...\", then run write_bd_tcl to create an updated script."}
<$scripts_vivado_version> and is being run in <$current_vivado_version> of Vivado."}
return 1
}
# Parse argument list, derive the clock to utilize
set clock_id 0
if { [llength $argv] eq 12 } {
if { [llength $argv] eq 2 } {
set ip_path [lindex $argv 0]
set num_threads [lindex $argv 1]
set clock_freq [lindex $argv 2]
set inp_width [expr 1 << [lindex $argv 3]]
set wgt_width [expr 1 << [lindex $argv 4]]
set out_width [expr 1 << [lindex $argv 5]]
set batch [expr 1 << [lindex $argv 6]]
set out_block [expr 1 << [lindex $argv 7]]
set in_block [expr 1 << [lindex $argv 8]]
set inp_mem_size [expr 1 << [lindex $argv 9]]
set wgt_mem_size [expr 1 << [lindex $argv 10]]
set out_mem_size [expr 1 << [lindex $argv 11]]
if {$clock_freq eq 100} {
set clock_id 0
puts "Setting clock frequency to 100MHz"
} elseif {$clock_freq eq 142} {
set clock_id 1
puts "Setting clock frequency to 142MHz"
} elseif {$clock_freq eq 167} {
set clock_id 3
puts "Setting clock frequency to 167MHz"
} elseif {$clock_freq eq 200} {
set clock_id 2
puts "Setting clock frequency to 200MHz"
} else {
set clock_id 0
puts "Unrecognized clock frequency, setting clock to 100MHz"
}
set vta_config [lindex $argv 1]
} else {
puts "Arg list incomplete: <path to ip dir> <num threads> <clock freq> \
<inp width> <wgt_width> <out_width> <batch> <batch> <out_block> <in_block
<inp_mem_size> <wgt_mem_size> <out_mem_size>"
puts "Arg list incomplete: <path to ip dir> <path to vta_config.py>"
return 1
}
# Derive input mem parameters
set inp_mem_width [expr $inp_width * $batch * $in_block]
set inp_bus_width 1024
set inp_part [expr $inp_mem_width / $inp_bus_width]
if {[expr $inp_part == 0]} {
set inp_part 1
set inp_bus_width $inp_mem_width
}
set inp_mem_depth [expr $inp_mem_size * 8 / ($inp_mem_width * $inp_part)]
# Derive weight mem parameters
set wgt_mem_width [expr $wgt_width * $out_block * $in_block]
set wgt_bus_width 1024
set wgt_part [expr $wgt_mem_width / $wgt_bus_width]
if {[expr $wgt_part == 0]} {
set wgt_part 1
set wgt_bus_width $wgt_mem_width
}
set wgt_mem_depth [expr $wgt_mem_size * 8 / ($wgt_mem_width * $wgt_part)]
# Derive output mem parameters
set out_mem_width [expr $out_width * $batch * $out_block]
set out_bus_width 1024
set out_part [expr $out_mem_width / $out_bus_width]
if {[expr $out_part == 0]} {
set out_part 1
set out_bus_width $out_mem_width
}
set out_mem_depth [expr $out_mem_size * 8 / ($out_mem_width * $out_part)]
# User defined paths
# Get the VTA configuration parameters.
# Every row below pairs a Tcl variable with the flag handed to
# `python $vta_config`; whatever that invocation prints becomes the value of
# the variable. Rows are grouped (blank-line separated) as: target/device/
# clock, SRAM dimensions (banks, width, depth per memory), AXI bus signals,
# and the address map. The queries run in the order listed.
foreach {cfg_var cfg_flag} {
  target             --target
  device_family      --get-fpga-family
  clock_freq         --get-fpga-freq

  inp_part           --get-inp-mem-banks
  inp_mem_width      --get-inp-mem-width
  inp_mem_depth      --get-inp-mem-depth
  wgt_part           --get-wgt-mem-banks
  wgt_mem_width      --get-wgt-mem-width
  wgt_mem_depth      --get-wgt-mem-depth
  out_part           --get-out-mem-banks
  out_mem_width      --get-out-mem-width
  out_mem_depth      --get-out-mem-depth

  axi_cache          --get-axi-cache-bits
  axi_prot           --get-axi-prot-bits

  ip_reg_map_range   --get-ip-reg-map-range
  fetch_base_addr    --get-fetch-base-addr
  load_base_addr     --get-load-base-addr
  compute_base_addr  --get-compute-base-addr
  store_base_addr    --get-store-base-addr
} {
  set $cfg_var [exec python $vta_config $cfg_flag]
}
# Drop the loop temporaries so only the configuration variables remain.
unset cfg_var cfg_flag
# Paths to IP library of VTA modules
set proj_name vta
set design_name $proj_name
set proj_path "."
set ip_lib "ip_lib"
set fetch_ip "${ip_path}/vta_fetch/solution0/impl/ip/xilinx_com_hls_fetch_1_0.zip"
set load_ip "${ip_path}/vta_load/solution0/impl/ip/xilinx_com_hls_load_1_0.zip"
set compute_ip "${ip_path}/vta_compute/solution0/impl/ip/xilinx_com_hls_compute_1_0.zip"
set store_ip "${ip_path}/vta_store/solution0/impl/ip/xilinx_com_hls_store_1_0.zip"
# Packaged IP archives for the four VTA stages, located under the "soln"
# solution directory of each stage's project inside $ip_path.
set fetch_ip "${ip_path}/vta_fetch/soln/impl/ip/xilinx_com_hls_fetch_1_0.zip"
set load_ip "${ip_path}/vta_load/soln/impl/ip/xilinx_com_hls_load_1_0.zip"
set compute_ip "${ip_path}/vta_compute/soln/impl/ip/xilinx_com_hls_compute_1_0.zip"
set store_ip "${ip_path}/vta_store/soln/impl/ip/xilinx_com_hls_store_1_0.zip"
# Create custom project
create_project -force $proj_name $proj_path -part xc7z020clg484-1
# Ask the VTA configuration script for the FPGA part and create the project
# for that part; -force is passed to create_project.
set device [exec python $vta_config --get-fpga-dev]
create_project -force $proj_name $proj_path -part $device
# Update IP repository with generated IP
file mkdir $ip_lib
......@@ -125,810 +85,334 @@ update_ip_catalog -add_ip $load_ip -repo_path $ip_lib
update_ip_catalog -add_ip $compute_ip -repo_path $ip_lib
update_ip_catalog -add_ip $store_ip -repo_path $ip_lib
# CHANGE DESIGN NAME HERE
set design_name $proj_name
# Creating design if needed
set errMsg ""
set nRet 0
set cur_design [current_bd_design -quiet]
set list_cells [get_bd_cells -quiet]
if { ${design_name} eq "" } {
# USE CASES:
# 1) Design_name not set
set errMsg "Please set the variable <design_name> to a non-empty value."
set nRet 1
} elseif { ${cur_design} ne "" && ${list_cells} eq "" } {
# USE CASES:
# 2): Current design opened AND is empty AND names same.
# 3): Current design opened AND is empty AND names diff; design_name NOT in project.
# 4): Current design opened AND is empty AND names diff; design_name exists in project.
if { $cur_design ne $design_name } {
common::send_msg_id "BD_TCL-001" "INFO" "Changing value of <design_name> from <$design_name> \
to <$cur_design> since current design is empty."
set design_name [get_property NAME $cur_design]
}
common::send_msg_id "BD_TCL-002" "INFO" "Constructing design in IPI design <$cur_design>..."
} elseif { ${cur_design} ne "" && $list_cells ne "" && $cur_design eq $design_name } {
# USE CASES:
# 5) Current design opened AND has components AND same names.
set errMsg "Design <$design_name> already exists in your project, please set the variable \
<design_name> to another value."
set nRet 1
} elseif { [get_files -quiet ${design_name}.bd] ne "" } {
# USE CASES:
# 6) Current opened design, has components, but diff names, design_name exists in project.
# 7) No opened design, design_name exists in project.
set errMsg "Design <$design_name> already exists in your project, please set the variable \
<design_name> to another value."
set nRet 2
} else {
# USE CASES:
# 8) No opened design, design_name not in project.
# 9) Current opened design, has components, but diff names, design_name not in project.
common::send_msg_id "BD_TCL-003" "INFO" "Currently there is no design <$design_name> in \
project, so creating one..."
create_bd_design $design_name
common::send_msg_id "BD_TCL-004" "INFO" "Making design <$design_name> as current_bd_design."
current_bd_design $design_name
}
common::send_msg_id "BD_TCL-005" "INFO" "Currently the variable <design_name> is equal \
to \"$design_name\"."
if { $nRet != 0 } {
catch {common::send_msg_id "BD_TCL-114" "ERROR" $errMsg}
return $nRet
}
##################################################################
# DESIGN PROCs
# CONFIGURE BLOCK DIAGRAM DESIGN
##################################################################
# Create bd design named $design_name and make it the current block design,
# so that the block-design commands that follow apply to it.
create_bd_design $design_name
current_bd_design $design_name
# Procedure to create entire design; Provide argument to make
# procedure reusable. If parentCell is "", will use root.
proc create_root_design { parentCell clk inp_part wgt_part out_part inp_bus_width inp_mem_depth wgt_bus_width wgt_mem_depth out_bus_width out_mem_depth} {
variable script_folder
if { $parentCell eq "" } {
set parentCell [get_bd_cells /]
}
# Get object for parentCell
set parentObj [get_bd_cells $parentCell]
if { $parentObj == "" } {
catch {common::send_msg_id "BD_TCL-100" "ERROR" "Unable to find parent cell <$parentCell>!"}
return
}
# Make sure parentObj is hier blk
set parentType [get_property TYPE $parentObj]
if { $parentType ne "hier" } {
catch {common::send_msg_id "BD_TCL-101" "ERROR" "Parent <$parentObj> has TYPE = \
<$parentType>. Expected to be <hier>."}
return
}
# Save current instance; Restore later
set oldCurInst [current_bd_instance .]
# Set parent object as current
current_bd_instance $parentObj
# Create interface ports
set DDR [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:ddrx_rtl:1.0 DDR ]
set FIXED_IO [ create_bd_intf_port -mode Master \
-vlnv xilinx.com:display_processing_system7:fixedio_rtl:1.0 FIXED_IO ]
# Create ports
# Create instance: axi_interconnect_1, and set properties
set axi_interconnect_1 \
[ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_1 ]
set_property -dict [ list \
CONFIG.NUM_MI {5} \
] $axi_interconnect_1
# Create instance: axi_smc, and set properties
set axi_smc [ create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 axi_smc ]
# Procedure to initialize FIFO
proc init_fifo_property {fifo width_bytes depth} {
set_property -dict [ list \
CONFIG.NUM_SI {5} \
] $axi_smc
# Create instance: axi_timer_1, and set properties
set axi_timer_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_timer:2.0 axi_timer_1 ]
# Create instance: compute_0, and set properties
set compute_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:compute:1.0 compute_0 ]
set_property -dict [ list \
CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \
CONFIG.C_M_AXI_DATA_PORT_DATA_WIDTH {64} \
CONFIG.C_M_AXI_UOP_PORT_CACHE_VALUE {"1111"} \
] $compute_0
# Create instance: fetch_0, and set properties
set fetch_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:fetch:1.0 fetch_0 ]
set_property -dict [ list \
CONFIG.C_M_AXI_INS_PORT_CACHE_VALUE {"1111"} \
CONFIG.C_M_AXI_INS_PORT_DATA_WIDTH {64} \
] $fetch_0
# Create instance: g2l_queue, and set properties
set g2l_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 g2l_queue ]
set_property -dict [ list \
CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
CONFIG.Full_Flags_Reset_Value {1} \
CONFIG.Full_Threshold_Assert_Value_axis {1023} \
CONFIG.Full_Threshold_Assert_Value_rach {15} \
CONFIG.Full_Threshold_Assert_Value_wach {15} \
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
CONFIG.Input_Depth_axis {1024} \
CONFIG.Input_Depth_axis $depth \
CONFIG.Reset_Type {Asynchronous_Reset} \
CONFIG.TUSER_WIDTH {0} \
] $g2l_queue
# Create instance: g2s_queue, and set properties
set g2s_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 g2s_queue ]
set_property -dict [ list \
CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
CONFIG.Full_Flags_Reset_Value {1} \
CONFIG.Full_Threshold_Assert_Value_axis {1023} \
CONFIG.Full_Threshold_Assert_Value_rach {15} \
CONFIG.Full_Threshold_Assert_Value_wach {15} \
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
CONFIG.Input_Depth_axis {1024} \
CONFIG.Reset_Type {Asynchronous_Reset} \
CONFIG.TUSER_WIDTH {0} \
] $g2s_queue
# Create instance: gemm_queue, and set properties
set gemm_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 gemm_queue ]
set_property -dict [ list \
CONFIG.Empty_Threshold_Assert_Value_axis {510} \
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
CONFIG.Full_Flags_Reset_Value {1} \
CONFIG.Full_Threshold_Assert_Value_axis {511} \
CONFIG.Full_Threshold_Assert_Value_rach {15} \
CONFIG.Full_Threshold_Assert_Value_wach {15} \
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
CONFIG.Input_Depth_axis {512} \
CONFIG.Reset_Type {Asynchronous_Reset} \
CONFIG.TDATA_NUM_BYTES {16} \
CONFIG.TKEEP_WIDTH {16} \
CONFIG.TSTRB_WIDTH {16} \
CONFIG.TUSER_WIDTH {0} \
] $gemm_queue
# Create instance: l2g_queue, and set properties
set l2g_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 l2g_queue ]
set_property -dict [ list \
CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
CONFIG.Full_Flags_Reset_Value {1} \
CONFIG.Full_Threshold_Assert_Value_axis {1023} \
CONFIG.Full_Threshold_Assert_Value_rach {15} \
CONFIG.Full_Threshold_Assert_Value_wach {15} \
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
CONFIG.Input_Depth_axis {1024} \
CONFIG.Reset_Type {Asynchronous_Reset} \
CONFIG.TUSER_WIDTH {0} \
] $l2g_queue
# Create instance: load_0, and set properties
set load_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:load:1.0 load_0 ]
set_property -dict [ list \
CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \
] $load_0
# Create instance: load_queue, and set properties
set load_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 load_queue ]
set_property -dict [ list \
CONFIG.Empty_Threshold_Assert_Value_axis {510} \
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
CONFIG.Full_Flags_Reset_Value {1} \
CONFIG.Full_Threshold_Assert_Value_axis {511} \
CONFIG.Full_Threshold_Assert_Value_rach {15} \
CONFIG.Full_Threshold_Assert_Value_wach {15} \
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
CONFIG.Input_Depth_axis {512} \
CONFIG.Reset_Type {Asynchronous_Reset} \
CONFIG.TDATA_NUM_BYTES {16} \
CONFIG.TKEEP_WIDTH {16} \
CONFIG.TSTRB_WIDTH {16} \
CONFIG.TUSER_WIDTH {0} \
] $load_queue
# Create instance: proc_sys_reset, and set properties
set proc_sys_reset \
[ create_bd_cell -type ip -vlnv xilinx.com:ip:proc_sys_reset:5.0 proc_sys_reset ]
# Create instance: processing_system7_1, and set properties
set processing_system7_1 \
[ create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_1 ]
set_property -dict [ list \
CONFIG.PCW_CAN0_PERIPHERAL_ENABLE {0} \
CONFIG.PCW_ENET0_PERIPHERAL_ENABLE {0} \
CONFIG.PCW_EN_CLK0_PORT {1} \
CONFIG.PCW_EN_CLK1_PORT {1} \
CONFIG.PCW_EN_CLK2_PORT {1} \
CONFIG.PCW_EN_CLK3_PORT {1} \
CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ {100} \
CONFIG.PCW_FPGA1_PERIPHERAL_FREQMHZ {142.86} \
CONFIG.PCW_FPGA2_PERIPHERAL_FREQMHZ {200} \
CONFIG.PCW_FPGA3_PERIPHERAL_FREQMHZ {167} \
CONFIG.PCW_GPIO_MIO_GPIO_ENABLE {0} \
CONFIG.PCW_I2C0_PERIPHERAL_ENABLE {0} \
CONFIG.PCW_IMPORT_BOARD_PRESET {None} \
CONFIG.PCW_IRQ_F2P_INTR {1} \
CONFIG.PCW_QSPI_GRP_SINGLE_SS_ENABLE {0} \
CONFIG.PCW_QSPI_PERIPHERAL_ENABLE {0} \
CONFIG.PCW_SD0_PERIPHERAL_ENABLE {0} \
CONFIG.PCW_USB0_PERIPHERAL_ENABLE {0} \
CONFIG.PCW_USE_DEFAULT_ACP_USER_VAL {1} \
CONFIG.PCW_USE_FABRIC_INTERRUPT {1} \
CONFIG.PCW_USE_HIGH_OCM {1} \
CONFIG.PCW_USE_S_AXI_ACP {1} \
CONFIG.PCW_USE_S_AXI_HP0 {0} \
CONFIG.PCW_USE_S_AXI_HP1 {0} \
CONFIG.PCW_USE_S_AXI_HP2 {0} \
CONFIG.PCW_USE_S_AXI_HP3 {0} \
CONFIG.preset {ZC702} \
] $processing_system7_1
# Create instance: s2g_queue, and set properties
set s2g_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 s2g_queue ]
set_property -dict [ list \
CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
CONFIG.Full_Flags_Reset_Value {1} \
CONFIG.Full_Threshold_Assert_Value_axis {1023} \
CONFIG.Full_Threshold_Assert_Value_rach {15} \
CONFIG.Full_Threshold_Assert_Value_wach {15} \
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
CONFIG.Input_Depth_axis {1024} \
CONFIG.Reset_Type {Asynchronous_Reset} \
CONFIG.TUSER_WIDTH {0} \
] $s2g_queue
# Create instance: store_0, and set properties
set store_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:store:1.0 store_0 ]
set_property -dict [ list \
CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \
] $store_0
# Create instance: store_queue, and set properties
set store_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 store_queue ]
set_property -dict [ list \
CONFIG.Empty_Threshold_Assert_Value_axis {510} \
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
CONFIG.Full_Flags_Reset_Value {1} \
CONFIG.Full_Threshold_Assert_Value_axis {511} \
CONFIG.Full_Threshold_Assert_Value_rach {15} \
CONFIG.Full_Threshold_Assert_Value_wach {15} \
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
CONFIG.Input_Depth_axis {512} \
CONFIG.Reset_Type {Asynchronous_Reset} \
CONFIG.TDATA_NUM_BYTES {16} \
CONFIG.TKEEP_WIDTH {16} \
CONFIG.TSTRB_WIDTH {16} \
CONFIG.TUSER_WIDTH {0} \
] $store_queue
# Create instance: xlconcat_1, and set properties
set xlconcat_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_1 ]
set_property -dict [ list \
CONFIG.NUM_PORTS {5} \
] $xlconcat_1
CONFIG.TDATA_NUM_BYTES $width_bytes \
] $fifo
}
# Create and connect inp_mem partitions
if {${inp_part} > 1} {
for {set i 0} {$i < ${inp_part}} {incr i} {
# Create instance: inp_mem, and set properties
set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 inp_mem_${i} ]
# Procedure to initialize BRAM
proc init_bram_property {bram width depth} {
set_property -dict [ list \
CONFIG.Assume_Synchronous_Clk {true} \
CONFIG.Byte_Size {8} \
CONFIG.Enable_32bit_Address {true} \
CONFIG.Enable_B {Use_ENB_Pin} \
CONFIG.Memory_Type {True_Dual_Port_RAM} \
CONFIG.Read_Width_A $inp_bus_width \
CONFIG.Read_Width_B $inp_bus_width \
CONFIG.Read_Width_A $width \
CONFIG.Read_Width_B $width \
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
CONFIG.Use_Byte_Write_Enable {true} \
CONFIG.Use_RSTA_Pin {true} \
CONFIG.Use_RSTB_Pin {true} \
CONFIG.Write_Depth_A $inp_mem_depth \
CONFIG.Write_Width_A $inp_bus_width \
CONFIG.Write_Width_B $inp_bus_width \
CONFIG.use_bram_block {BRAM_Controller} \
] $inp_mem
# Create interface connections
connect_bd_intf_net -intf_net load_0_inp_mem_${i}_V_PORTA \
[get_bd_intf_pins $inp_mem/BRAM_PORTA] \
[get_bd_intf_pins load_0/inp_mem_${i}_V_PORTA]
connect_bd_intf_net -intf_net compute_0_inp_mem_${i}_V_PORTA \
[get_bd_intf_pins compute_0/inp_mem_${i}_V_PORTA] \
[get_bd_intf_pins $inp_mem/BRAM_PORTB]
}
} else {
CONFIG.Write_Depth_A $depth \
CONFIG.Write_Width_A $width \
CONFIG.Write_Width_B $width \
] $bram
}
# proc_sys_reset: instantiated with its default configuration.
set proc_sys_reset [create_bd_cell -type ip \
  -vlnv xilinx.com:ip:proc_sys_reset:5.0 proc_sys_reset]

# pll_clk: clk_wiz instance. CLKOUT1 is requested at $clock_freq, the reset
# port is named resetn and is active low, and USE_LOCKED is turned off.
set pll_clk [create_bd_cell -type ip -vlnv xilinx.com:ip:clk_wiz:6.0 pll_clk]
set_property -dict [list \
  CONFIG.CLKOUT1_REQUESTED_OUT_FREQ $clock_freq \
  CONFIG.RESET_PORT resetn \
  CONFIG.RESET_TYPE ACTIVE_LOW \
  CONFIG.USE_LOCKED false \
] $pll_clk

# axi_smc0: smartconnect with 5 slave interfaces and 1 master interface.
set axi_smc0 [create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 axi_smc0]
set_property -dict [list CONFIG.NUM_MI 1 CONFIG.NUM_SI 5] $axi_smc0

# axi_xbar: axi_interconnect with 1 slave interface and 4 master interfaces.
set axi_xbar [create_bd_cell -type ip \
  -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_xbar]
set_property -dict [list CONFIG.NUM_MI 4 CONFIG.NUM_SI 1] $axi_xbar

# The four HLS-generated stages follow. Every AXI master port they expose is
# given the cache and protection values held in $axi_cache and $axi_prot.

# fetch_0: instruction port (INS_PORT).
set fetch_0 [create_bd_cell -type ip -vlnv xilinx.com:hls:fetch:1.0 fetch_0]
set_property -dict [list \
  CONFIG.C_M_AXI_INS_PORT_CACHE_VALUE $axi_cache \
  CONFIG.C_M_AXI_INS_PORT_PROT_VALUE $axi_prot \
] $fetch_0

# load_0: data port (DATA_PORT).
set load_0 [create_bd_cell -type ip -vlnv xilinx.com:hls:load:1.0 load_0]
set_property -dict [list \
  CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \
  CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \
] $load_0

# compute_0: data port (DATA_PORT) and micro-op port (UOP_PORT).
set compute_0 [create_bd_cell -type ip -vlnv xilinx.com:hls:compute:1.0 compute_0]
set_property -dict [list \
  CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \
  CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \
  CONFIG.C_M_AXI_UOP_PORT_CACHE_VALUE $axi_cache \
  CONFIG.C_M_AXI_UOP_PORT_PROT_VALUE $axi_prot \
] $compute_0

# store_0: data port (DATA_PORT).
set store_0 [create_bd_cell -type ip -vlnv xilinx.com:hls:store:1.0 store_0]
set_property -dict [list \
  CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \
  CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \
] $store_0
# Create command queues and set properties.
# One fifo_generator cell is created per name in cmd_queue_list and then
# configured through init_fifo_property (width in bytes, depth).
set cmd_queue_list {load_queue gemm_queue store_queue}
foreach cmd_queue $cmd_queue_list {
  set tmp_cmd_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 $cmd_queue ]
  # Width is 16B (128b, as set in hw_spec.h), depth is 512 (depth of FIFO on Zynq 7000 and Zynq Ultrascale+)
  # TODO: derive it from vta_config.h
  # Call the proc as a plain command. Wrapping the call in [ ] at statement
  # position would substitute its result and then try to run that result as
  # a command name.
  init_fifo_property $tmp_cmd_queue 16 512
}
# Create dependence queues and set properties.
# One fifo_generator cell is created per name in dep_queue_list and then
# configured through init_fifo_property (width in bytes, depth).
set dep_queue_list {l2g_queue g2l_queue g2s_queue s2g_queue}
foreach dep_queue $dep_queue_list {
  set tmp_dep_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 $dep_queue ]
  # Width is 1B (min width), depth is 1024
  # TODO: derive it from vta_config.h
  # Call the proc as a plain command. Wrapping the call in [ ] at statement
  # position would substitute its result and then try to run that result as
  # a command name.
  init_fifo_property $tmp_dep_queue 1 1024
}
# Create and connect inp_mem partitions
for {set i 0} {$i < $inp_part} {incr i} {
# Create instance: inp_mem, and set properties
set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 inp_mem ]
set_property -dict [ list \
CONFIG.Byte_Size {8} \
CONFIG.Enable_32bit_Address {true} \
CONFIG.Enable_B {Use_ENB_Pin} \
CONFIG.Memory_Type {True_Dual_Port_RAM} \
CONFIG.Read_Width_A $inp_bus_width \
CONFIG.Read_Width_B $inp_bus_width \
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
CONFIG.Use_Byte_Write_Enable {true} \
CONFIG.Use_RSTA_Pin {true} \
CONFIG.Use_RSTB_Pin {true} \
CONFIG.Write_Depth_A $inp_mem_depth \
CONFIG.Write_Width_A $inp_bus_width \
CONFIG.Write_Width_B $inp_bus_width \
CONFIG.use_bram_block {BRAM_Controller} \
] $inp_mem
set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 inp_mem_${i} ]
[ init_bram_property $inp_mem $inp_mem_width $inp_mem_depth ]
# If module has more than 1 mem port, the naming convention changes
if {$inp_part > 1} {
set porta [get_bd_intf_pins load_0/inp_mem_${i}_V_PORTA]
set portb [get_bd_intf_pins compute_0/inp_mem_${i}_V_PORTA]
} else {
set porta [get_bd_intf_pins load_0/inp_mem_V_PORTA]
set portb [get_bd_intf_pins compute_0/inp_mem_V_PORTA]
}
# Create interface connections
connect_bd_intf_net -intf_net load_0_inp_mem_V_PORTA \
[get_bd_intf_pins $inp_mem/BRAM_PORTA] \
[get_bd_intf_pins load_0/inp_mem_V_PORTA]
$porta
connect_bd_intf_net -intf_net compute_0_inp_mem_V_PORTA \
[get_bd_intf_pins compute_0/inp_mem_V_PORTA] \
[get_bd_intf_pins $inp_mem/BRAM_PORTB]
}
[get_bd_intf_pins $inp_mem/BRAM_PORTB] \
$portb
}
# Create and connect wgt_mem partitions
if {${wgt_part} > 1} {
for {set i 0} {$i < ${wgt_part}} {incr i} {
# Create and connect wgt_mem partitions
for {set i 0} {$i < $wgt_part} {incr i} {
# Create instance: wgt_mem, and set properties
set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 wgt_mem_${i} ]
set_property -dict [ list \
CONFIG.Assume_Synchronous_Clk {true} \
CONFIG.Byte_Size {8} \
CONFIG.Enable_32bit_Address {true} \
CONFIG.Enable_B {Use_ENB_Pin} \
CONFIG.Memory_Type {True_Dual_Port_RAM} \
CONFIG.Read_Width_A $wgt_bus_width \
CONFIG.Read_Width_B $wgt_bus_width \
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
CONFIG.Use_Byte_Write_Enable {true} \
CONFIG.Use_RSTA_Pin {true} \
CONFIG.Use_RSTB_Pin {true} \
CONFIG.Write_Depth_A $wgt_mem_depth \
CONFIG.Write_Width_A $wgt_bus_width \
CONFIG.Write_Width_B $wgt_bus_width \
] $wgt_mem
[ init_bram_property $wgt_mem $wgt_mem_width $wgt_mem_depth ]
# If module has more than 1 mem port, the naming convention changes
if {$wgt_part > 1} {
set porta [get_bd_intf_pins load_0/wgt_mem_${i}_V_PORTA]
set portb [get_bd_intf_pins compute_0/wgt_mem_${i}_V_PORTA]
} else {
set porta [get_bd_intf_pins load_0/wgt_mem_V_PORTA]
set portb [get_bd_intf_pins compute_0/wgt_mem_V_PORTA]
}
# Create interface connections
connect_bd_intf_net -intf_net load_0_wgt_mem_${i}_V_PORTA \
[get_bd_intf_pins load_0/wgt_mem_${i}_V_PORTA] \
[get_bd_intf_pins $wgt_mem/BRAM_PORTA]
[get_bd_intf_pins $wgt_mem/BRAM_PORTA] \
$porta
connect_bd_intf_net -intf_net compute_0_wgt_mem_${i}_V_PORTA \
[get_bd_intf_pins compute_0/wgt_mem_${i}_V_PORTA] \
[get_bd_intf_pins $wgt_mem/BRAM_PORTB]
}
} else {
# Create instance: wgt_mem, and set properties
set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 wgt_mem ]
set_property -dict [ list \
CONFIG.Assume_Synchronous_Clk {true} \
CONFIG.Byte_Size {8} \
CONFIG.Enable_32bit_Address {true} \
CONFIG.Enable_B {Use_ENB_Pin} \
CONFIG.Memory_Type {True_Dual_Port_RAM} \
CONFIG.Read_Width_A $wgt_bus_width \
CONFIG.Read_Width_B $wgt_bus_width \
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
CONFIG.Use_Byte_Write_Enable {true} \
CONFIG.Use_RSTA_Pin {true} \
CONFIG.Use_RSTB_Pin {true} \
CONFIG.Write_Depth_A $wgt_mem_depth \
CONFIG.Write_Width_A $wgt_bus_width \
CONFIG.Write_Width_B $wgt_bus_width \
] $wgt_mem
# Create interface connections
connect_bd_intf_net -intf_net load_0_wgt_mem_V_PORTA \
[get_bd_intf_pins load_0/wgt_mem_V_PORTA] \
[get_bd_intf_pins $wgt_mem/BRAM_PORTA]
connect_bd_intf_net -intf_net compute_0_wgt_mem_V_PORTA \
[get_bd_intf_pins compute_0/wgt_mem_V_PORTA] \
[get_bd_intf_pins $wgt_mem/BRAM_PORTB]
}
[get_bd_intf_pins $wgt_mem/BRAM_PORTB] \
$portb
}
# Create and connect out_mem partitions
if {${out_part} > 1} {
for {set i 0} {$i < ${out_part}} {incr i} {
# Create and connect out_mem partitions
for {set i 0} {$i < $out_part} {incr i} {
# Create instance: out_mem, and set properties
set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 out_mem_${i} ]
set_property -dict [ list \
CONFIG.Byte_Size {8} \
CONFIG.Enable_32bit_Address {true} \
CONFIG.Enable_B {Use_ENB_Pin} \
CONFIG.Memory_Type {True_Dual_Port_RAM} \
CONFIG.Read_Width_A $out_bus_width \
CONFIG.Read_Width_B $out_bus_width \
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
CONFIG.Use_Byte_Write_Enable {true} \
CONFIG.Use_RSTA_Pin {true} \
CONFIG.Use_RSTB_Pin {true} \
CONFIG.Write_Depth_A $out_mem_depth \
CONFIG.Write_Width_A $out_bus_width \
CONFIG.Write_Width_B $out_bus_width \
CONFIG.use_bram_block {BRAM_Controller} \
] $out_mem
[ init_bram_property $out_mem $out_mem_width $out_mem_depth ]
# If module has more than 1 mem port, the naming convention changes
if {$out_part > 1} {
set porta [get_bd_intf_pins compute_0/out_mem_${i}_V_PORTA]
set portb [get_bd_intf_pins store_0/out_mem_${i}_V_PORTA]
} else {
set porta [get_bd_intf_pins compute_0/out_mem_V_PORTA]
set portb [get_bd_intf_pins store_0/out_mem_V_PORTA]
}
# Create interface connections
connect_bd_intf_net -intf_net compute_0_out_mem_${i}_V_PORTA \
[get_bd_intf_pins compute_0/out_mem_${i}_V_PORTA] \
[get_bd_intf_pins $out_mem/BRAM_PORTA]
[get_bd_intf_pins $out_mem/BRAM_PORTA] \
$porta
connect_bd_intf_net -intf_net store_0_out_mem_${i}_V_PORTA \
[get_bd_intf_pins $out_mem/BRAM_PORTB] \
[get_bd_intf_pins store_0/out_mem_${i}_V_PORTA]
}
} else {
# Create instance: out_mem, and set properties
set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 out_mem ]
$portb
}
# Create instance: processing_system, and set properties
if { $device_family eq "zynq-7000" } {
set processing_system [ create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system ]
set_property -dict [ list \
CONFIG.Byte_Size {8} \
CONFIG.Enable_32bit_Address {true} \
CONFIG.Enable_B {Use_ENB_Pin} \
CONFIG.Memory_Type {True_Dual_Port_RAM} \
CONFIG.Read_Width_A $out_bus_width \
CONFIG.Read_Width_B $out_bus_width \
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
CONFIG.Use_Byte_Write_Enable {true} \
CONFIG.Use_RSTA_Pin {true} \
CONFIG.Use_RSTB_Pin {true} \
CONFIG.Write_Depth_A $out_mem_depth \
CONFIG.Write_Width_A $out_bus_width \
CONFIG.Write_Width_B $out_bus_width \
CONFIG.use_bram_block {BRAM_Controller} \
] $out_mem
# Create interface connections
connect_bd_intf_net -intf_net compute_0_out_mem_V_PORTA \
[get_bd_intf_pins compute_0/out_mem_V_PORTA] \
[get_bd_intf_pins $out_mem/BRAM_PORTA]
connect_bd_intf_net -intf_net store_0_out_mem_V_PORTA \
[get_bd_intf_pins $out_mem/BRAM_PORTB] \
[get_bd_intf_pins store_0/out_mem_V_PORTA]
}
CONFIG.PCW_EN_CLK0_PORT {1} \
CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ {100} \
CONFIG.PCW_USE_DEFAULT_ACP_USER_VAL {1} \
CONFIG.PCW_USE_S_AXI_ACP {1} \
CONFIG.preset {ZC702} \
] $processing_system
# Get ports that are specific to the Zynq 7000 processing system
set ps_clk [get_bd_pins processing_system/FCLK_CLK0]
set ps_rstn [get_bd_pins processing_system/FCLK_RESET0_N]
set maxi_clk [get_bd_pins processing_system/M_AXI_GP0_ACLK]
set saxi_clk [get_bd_pins processing_system/S_AXI_ACP_ACLK]
set maxi [get_bd_intf_pins processing_system/M_AXI_GP0]
set saxi [get_bd_intf_pins processing_system/S_AXI_ACP]
} elseif { $device_family eq "zynq-ultrascale+" } {
set processing_system [ create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.2 processing_system ]
set_property -dict [ list \
CONFIG.PSU__FPGA_PL0_ENABLE {1} \
CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ {100} \
CONFIG.PSU__USE__M_AXI_GP0 {1} \
CONFIG.PSU__USE__M_AXI_GP2 {0} \
CONFIG.PSU__USE__S_AXI_GP0 {1}
] $processing_system
# Get ports that are specific to the Zynq Ultrascale MPSoC processing system
set ps_clk [get_bd_pins processing_system/pl_clk0]
set ps_rstn [get_bd_pins processing_system/pl_resetn0]
set maxi_clk [get_bd_pins processing_system/maxihpm0_fpd_aclk]
set saxi_clk [get_bd_pins processing_system/saxihpc0_fpd_aclk]
set maxi [get_bd_intf_pins processing_system/M_AXI_HPM0_FPD]
set saxi [get_bd_intf_pins processing_system/S_AXI_HPC0_FPD]
}
# Create interface connections
connect_bd_intf_net -intf_net axi_interconnect_1_M01_AXI \
[get_bd_intf_pins axi_interconnect_1/M01_AXI] \
[get_bd_intf_pins fetch_0/s_axi_CONTROL_BUS]
connect_bd_intf_net -intf_net axi_interconnect_1_M02_AXI \
[get_bd_intf_pins axi_interconnect_1/M02_AXI] \
[get_bd_intf_pins load_0/s_axi_CONTROL_BUS]
connect_bd_intf_net -intf_net axi_interconnect_1_M03_AXI \
[get_bd_intf_pins axi_interconnect_1/M03_AXI] \
[get_bd_intf_pins compute_0/s_axi_CONTROL_BUS]
connect_bd_intf_net -intf_net axi_interconnect_1_M04_AXI \
[get_bd_intf_pins axi_interconnect_1/M04_AXI] \
[get_bd_intf_pins store_0/s_axi_CONTROL_BUS]
connect_bd_intf_net -intf_net axi_smc_M00_AXI \
[get_bd_intf_pins axi_smc/M00_AXI] \
[get_bd_intf_pins processing_system7_1/S_AXI_ACP]
connect_bd_intf_net -intf_net compute_0_g2l_dep_queue_V \
[get_bd_intf_pins compute_0/g2l_dep_queue_V] \
[get_bd_intf_pins g2l_queue/S_AXIS]
connect_bd_intf_net -intf_net compute_0_g2s_dep_queue_V \
[get_bd_intf_pins compute_0/g2s_dep_queue_V] \
[get_bd_intf_pins g2s_queue/S_AXIS]
connect_bd_intf_net -intf_net compute_0_m_axi_data_port \
[get_bd_intf_pins axi_smc/S02_AXI] \
[get_bd_intf_pins compute_0/m_axi_data_port]
connect_bd_intf_net -intf_net compute_0_m_axi_uop_port \
[get_bd_intf_pins axi_smc/S01_AXI] \
[get_bd_intf_pins compute_0/m_axi_uop_port]
connect_bd_intf_net -intf_net fetch_0_gemm_queue_V_V \
[get_bd_intf_pins fetch_0/gemm_queue_V_V] \
[get_bd_intf_pins gemm_queue/S_AXIS]
connect_bd_intf_net -intf_net fetch_0_l2g_dep_queue_V \
[get_bd_intf_pins l2g_queue/S_AXIS] \
[get_bd_intf_pins load_0/l2g_dep_queue_V]
connect_bd_intf_net -intf_net fetch_0_load_queue_V_V \
[get_bd_intf_pins fetch_0/load_queue_V_V] \
[get_bd_intf_pins load_queue/S_AXIS]
connect_bd_intf_net -intf_net fetch_0_m_axi_ins_port \
[get_bd_intf_pins axi_smc/S00_AXI] \
[get_bd_intf_pins fetch_0/m_axi_ins_port]
connect_bd_intf_net -intf_net fetch_0_store_queue_V_V \
[get_bd_intf_pins fetch_0/store_queue_V_V] \
[get_bd_intf_pins store_queue/S_AXIS]
connect_bd_intf_net -intf_net g2l_queue_M_AXIS \
[get_bd_intf_pins g2l_queue/M_AXIS] \
[get_bd_intf_pins load_0/g2l_dep_queue_V]
connect_bd_intf_net -intf_net g2s_queue_M_AXIS \
[get_bd_intf_pins g2s_queue/M_AXIS] \
[get_bd_intf_pins store_0/g2s_dep_queue_V]
connect_bd_intf_net -intf_net gemm_queue_M_AXIS \
[get_bd_intf_pins compute_0/gemm_queue_V_V] \
[get_bd_intf_pins gemm_queue/M_AXIS]
connect_bd_intf_net -intf_net l2g_queue_M_AXIS \
[get_bd_intf_pins compute_0/l2g_dep_queue_V] \
[get_bd_intf_pins l2g_queue/M_AXIS]
connect_bd_intf_net -intf_net load_0_m_axi_data_port \
[get_bd_intf_pins axi_smc/S03_AXI] \
[get_bd_intf_pins load_0/m_axi_data_port]
connect_bd_intf_net -intf_net load_queue_M_AXIS \
[get_bd_intf_pins load_0/load_queue_V_V] \
[get_bd_intf_pins load_queue/M_AXIS]
connect_bd_intf_net -intf_net processing_system7_1_axi_periph_m00_axi \
[get_bd_intf_pins axi_interconnect_1/M00_AXI] \
[get_bd_intf_pins axi_timer_1/S_AXI]
connect_bd_intf_net -intf_net processing_system7_1_ddr \
[get_bd_intf_ports DDR] \
[get_bd_intf_pins processing_system7_1/DDR]
connect_bd_intf_net -intf_net processing_system7_1_fixed_io \
[get_bd_intf_ports FIXED_IO] \
[get_bd_intf_pins processing_system7_1/FIXED_IO]
connect_bd_intf_net -intf_net processing_system7_1_m_axi_gp0 \
[get_bd_intf_pins axi_interconnect_1/S00_AXI] \
[get_bd_intf_pins processing_system7_1/M_AXI_GP0]
connect_bd_intf_net -intf_net s2g_queue_M_AXIS \
[get_bd_intf_pins compute_0/s2g_dep_queue_V] \
[get_bd_intf_pins s2g_queue/M_AXIS]
connect_bd_intf_net -intf_net store_0_m_axi_data_port \
[get_bd_intf_pins axi_smc/S04_AXI] \
[get_bd_intf_pins store_0/m_axi_data_port]
connect_bd_intf_net -intf_net store_0_s2g_dep_queue_V \
[get_bd_intf_pins s2g_queue/S_AXIS] \
[get_bd_intf_pins store_0/s2g_dep_queue_V]
connect_bd_intf_net -intf_net store_queue_M_AXIS \
[get_bd_intf_pins store_0/store_queue_V_V] \
[get_bd_intf_pins store_queue/M_AXIS]
# Create port connections
connect_bd_net -net axi_timer_1_interrupt \
[get_bd_pins axi_timer_1/interrupt] \
[get_bd_pins xlconcat_1/In0]
connect_bd_net -net compute_0_interrupt \
[get_bd_pins compute_0/interrupt] \
[get_bd_pins xlconcat_1/In3]
connect_bd_net -net fetch_0_interrupt \
[get_bd_pins fetch_0/interrupt] \
[get_bd_pins xlconcat_1/In1]
connect_bd_net -net load_0_interrupt \
[get_bd_pins load_0/interrupt] \
[get_bd_pins xlconcat_1/In2]
connect_bd_net -net proc_sys_reset_interconnect_aresetn \
[get_bd_pins axi_interconnect_1/ARESETN] \
# Create interface connections
connect_bd_intf_net -intf_net axi_xbar_M00_AXI [get_bd_intf_pins axi_xbar/M00_AXI] [get_bd_intf_pins fetch_0/s_axi_CONTROL_BUS]
connect_bd_intf_net -intf_net axi_xbar_M01_AXI [get_bd_intf_pins axi_xbar/M01_AXI] [get_bd_intf_pins load_0/s_axi_CONTROL_BUS]
connect_bd_intf_net -intf_net axi_xbar_M02_AXI [get_bd_intf_pins axi_xbar/M02_AXI] [get_bd_intf_pins compute_0/s_axi_CONTROL_BUS]
connect_bd_intf_net -intf_net axi_xbar_M03_AXI [get_bd_intf_pins axi_xbar/M03_AXI] [get_bd_intf_pins store_0/s_axi_CONTROL_BUS]
connect_bd_intf_net -intf_net fetch_0_l2g_dep_queue_V [get_bd_intf_pins l2g_queue/S_AXIS] [get_bd_intf_pins load_0/l2g_dep_queue_V]
connect_bd_intf_net -intf_net fetch_0_load_queue_V_V [get_bd_intf_pins fetch_0/load_queue_V_V] [get_bd_intf_pins load_queue/S_AXIS]
connect_bd_intf_net -intf_net fetch_0_gemm_queue_V_V [get_bd_intf_pins fetch_0/gemm_queue_V_V] [get_bd_intf_pins gemm_queue/S_AXIS]
connect_bd_intf_net -intf_net fetch_0_store_queue_V_V [get_bd_intf_pins fetch_0/store_queue_V_V] [get_bd_intf_pins store_queue/S_AXIS]
connect_bd_intf_net -intf_net compute_0_g2l_dep_queue_V [get_bd_intf_pins compute_0/g2l_dep_queue_V] [get_bd_intf_pins g2l_queue/S_AXIS]
connect_bd_intf_net -intf_net compute_0_g2s_dep_queue_V [get_bd_intf_pins compute_0/g2s_dep_queue_V] [get_bd_intf_pins g2s_queue/S_AXIS]
connect_bd_intf_net -intf_net store_0_s2g_dep_queue_V [get_bd_intf_pins s2g_queue/S_AXIS] [get_bd_intf_pins store_0/s2g_dep_queue_V]
connect_bd_intf_net -intf_net load_queue_M_AXIS [get_bd_intf_pins load_0/load_queue_V_V] [get_bd_intf_pins load_queue/M_AXIS]
connect_bd_intf_net -intf_net gemm_queue_M_AXIS [get_bd_intf_pins compute_0/gemm_queue_V_V] [get_bd_intf_pins gemm_queue/M_AXIS]
connect_bd_intf_net -intf_net store_queue_M_AXIS [get_bd_intf_pins store_0/store_queue_V_V] [get_bd_intf_pins store_queue/M_AXIS]
connect_bd_intf_net -intf_net l2g_queue_M_AXIS [get_bd_intf_pins compute_0/l2g_dep_queue_V] [get_bd_intf_pins l2g_queue/M_AXIS]
connect_bd_intf_net -intf_net g2l_queue_M_AXIS [get_bd_intf_pins g2l_queue/M_AXIS] [get_bd_intf_pins load_0/g2l_dep_queue_V]
connect_bd_intf_net -intf_net g2s_queue_M_AXIS [get_bd_intf_pins g2s_queue/M_AXIS] [get_bd_intf_pins store_0/g2s_dep_queue_V]
connect_bd_intf_net -intf_net s2g_queue_M_AXIS [get_bd_intf_pins compute_0/s2g_dep_queue_V] [get_bd_intf_pins s2g_queue/M_AXIS]
connect_bd_intf_net -intf_net fetch_0_m_axi_ins_port [get_bd_intf_pins axi_smc0/S00_AXI] [get_bd_intf_pins fetch_0/m_axi_ins_port]
connect_bd_intf_net -intf_net load_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S01_AXI] [get_bd_intf_pins load_0/m_axi_data_port]
connect_bd_intf_net -intf_net compute_0_m_axi_uop_port [get_bd_intf_pins axi_smc0/S02_AXI] [get_bd_intf_pins compute_0/m_axi_uop_port]
connect_bd_intf_net -intf_net compute_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S03_AXI] [get_bd_intf_pins compute_0/m_axi_data_port]
connect_bd_intf_net -intf_net store_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S04_AXI] [get_bd_intf_pins store_0/m_axi_data_port]
connect_bd_intf_net -intf_net axi_smc0_M00_AXI [get_bd_intf_pins axi_smc0/M00_AXI] $saxi
connect_bd_intf_net -intf_net processing_system_m_axi [get_bd_intf_pins axi_xbar/S00_AXI] $maxi
# Create port connections
connect_bd_net -net processing_system_reset \
[get_bd_pins pll_clk/resetn] \
[get_bd_pins proc_sys_reset/ext_reset_in] \
$ps_rstn
connect_bd_net -net ps_clk_net \
[get_bd_pins pll_clk/clk_in1] \
$ps_clk
connect_bd_net -net proc_sys_reset_interconnect_aresetn \
[get_bd_pins axi_xbar/ARESETN] \
[get_bd_pins proc_sys_reset/interconnect_aresetn]
connect_bd_net -net proc_sys_reset_peripheral_aresetn \
[get_bd_pins axi_interconnect_1/M00_ARESETN] \
[get_bd_pins axi_interconnect_1/M01_ARESETN] \
[get_bd_pins axi_interconnect_1/M02_ARESETN] \
[get_bd_pins axi_interconnect_1/M03_ARESETN] \
[get_bd_pins axi_interconnect_1/M04_ARESETN] \
[get_bd_pins axi_interconnect_1/S00_ARESETN] \
[get_bd_pins axi_smc/aresetn] \
[get_bd_pins axi_timer_1/s_axi_aresetn] \
[get_bd_pins compute_0/ap_rst_n] \
connect_bd_net -net proc_sys_reset_peripheral_aresetn \
[get_bd_pins proc_sys_reset/peripheral_aresetn] \
[get_bd_pins axi_smc0/aresetn] \
[get_bd_pins axi_xbar/M00_ARESETN] \
[get_bd_pins axi_xbar/M01_ARESETN] \
[get_bd_pins axi_xbar/M02_ARESETN] \
[get_bd_pins axi_xbar/M03_ARESETN] \
[get_bd_pins axi_xbar/S00_ARESETN] \
[get_bd_pins fetch_0/ap_rst_n] \
[get_bd_pins g2l_queue/s_aresetn] \
[get_bd_pins g2s_queue/s_aresetn] \
[get_bd_pins gemm_queue/s_aresetn] \
[get_bd_pins l2g_queue/s_aresetn] \
[get_bd_pins load_0/ap_rst_n] \
[get_bd_pins load_queue/s_aresetn] \
[get_bd_pins proc_sys_reset/peripheral_aresetn] \
[get_bd_pins s2g_queue/s_aresetn] \
[get_bd_pins store_0/ap_rst_n] \
[get_bd_pins store_queue/s_aresetn]
connect_bd_net -net processing_system7_1_FCLK_CLK \
[get_bd_pins axi_interconnect_1/ACLK] \
[get_bd_pins axi_interconnect_1/M00_ACLK] \
[get_bd_pins axi_interconnect_1/M01_ACLK] \
[get_bd_pins axi_interconnect_1/M02_ACLK] \
[get_bd_pins axi_interconnect_1/M03_ACLK] \
[get_bd_pins axi_interconnect_1/M04_ACLK] \
[get_bd_pins axi_interconnect_1/S00_ACLK] \
[get_bd_pins axi_smc/aclk] \
[get_bd_pins axi_timer_1/s_axi_aclk] \
[get_bd_pins compute_0/ap_clk] \
[get_bd_pins compute_0/ap_rst_n] \
[get_bd_pins load_queue/s_aresetn] \
[get_bd_pins gemm_queue/s_aresetn] \
[get_bd_pins store_queue/s_aresetn] \
[get_bd_pins l2g_queue/s_aresetn] \
[get_bd_pins g2l_queue/s_aresetn] \
[get_bd_pins g2s_queue/s_aresetn] \
[get_bd_pins s2g_queue/s_aresetn]
connect_bd_net -net processing_system_clk \
[get_bd_pins pll_clk/clk_out1] \
[get_bd_pins proc_sys_reset/slowest_sync_clk] \
[get_bd_pins axi_smc0/aclk] \
[get_bd_pins axi_xbar/ACLK] \
[get_bd_pins axi_xbar/M00_ACLK] \
[get_bd_pins axi_xbar/M01_ACLK] \
[get_bd_pins axi_xbar/M02_ACLK] \
[get_bd_pins axi_xbar/M03_ACLK] \
[get_bd_pins axi_xbar/S00_ACLK] \
[get_bd_pins fetch_0/ap_clk] \
[get_bd_pins g2l_queue/s_aclk] \
[get_bd_pins g2s_queue/s_aclk] \
[get_bd_pins gemm_queue/s_aclk] \
[get_bd_pins l2g_queue/s_aclk] \
[get_bd_pins load_0/ap_clk] \
[get_bd_pins compute_0/ap_clk] \
[get_bd_pins store_0/ap_clk] \
[get_bd_pins load_queue/s_aclk] \
[get_bd_pins proc_sys_reset/slowest_sync_clk] \
[get_bd_pins processing_system7_1/FCLK_CLK${clk}] \
[get_bd_pins processing_system7_1/M_AXI_GP0_ACLK] \
[get_bd_pins processing_system7_1/S_AXI_ACP_ACLK] \
[get_bd_pins gemm_queue/s_aclk] \
[get_bd_pins store_queue/s_aclk] \
[get_bd_pins l2g_queue/s_aclk] \
[get_bd_pins g2l_queue/s_aclk] \
[get_bd_pins g2s_queue/s_aclk] \
[get_bd_pins s2g_queue/s_aclk] \
[get_bd_pins store_0/ap_clk] \
[get_bd_pins store_queue/s_aclk]
connect_bd_net -net processing_system7_1_fclk_reset0_n \
[get_bd_pins proc_sys_reset/ext_reset_in] \
[get_bd_pins processing_system7_1/FCLK_RESET0_N]
connect_bd_net -net store_0_interrupt \
[get_bd_pins store_0/interrupt] \
[get_bd_pins xlconcat_1/In4]
connect_bd_net -net xlconcat_1_dout \
[get_bd_pins processing_system7_1/IRQ_F2P] \
[get_bd_pins xlconcat_1/dout]
# Create address segments
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
[get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
SEG_processing_system7_1_ACP_DDR_LOWOCM
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
[get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
SEG_processing_system7_1_ACP_DDR_LOWOCM
create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
[get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
SEG_processing_system7_1_ACP_HIGH_OCM
create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
[get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
SEG_processing_system7_1_ACP_HIGH_OCM
create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
[get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
SEG_processing_system7_1_ACP_IOP
create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
[get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
SEG_processing_system7_1_ACP_IOP
create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
[get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
SEG_processing_system7_1_ACP_M_AXI_GP0
create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
[get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
SEG_processing_system7_1_ACP_M_AXI_GP0
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
[get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
SEG_processing_system7_1_ACP_DDR_LOWOCM
create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
[get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
SEG_processing_system7_1_ACP_HIGH_OCM
create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
[get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
SEG_processing_system7_1_ACP_IOP
create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
[get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
SEG_processing_system7_1_ACP_M_AXI_GP0
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
[get_bd_addr_spaces load_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
SEG_processing_system7_1_ACP_DDR_LOWOCM
create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
[get_bd_addr_spaces load_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
SEG_processing_system7_1_ACP_HIGH_OCM
create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
[get_bd_addr_spaces load_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
SEG_processing_system7_1_ACP_IOP
create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
[get_bd_addr_spaces load_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
SEG_processing_system7_1_ACP_M_AXI_GP0
create_bd_addr_seg -range 0x00010000 -offset 0x42800000 \
[get_bd_addr_spaces processing_system7_1/Data] \
[get_bd_addr_segs axi_timer_1/S_AXI/Reg] SEG_axi_timer_1_Reg
create_bd_addr_seg -range 0x00010000 -offset 0x43C10000 \
[get_bd_addr_spaces processing_system7_1/Data] \
[get_bd_addr_segs compute_0/s_axi_CONTROL_BUS/Reg] SEG_compute_0_Reg
create_bd_addr_seg -range 0x00010000 -offset 0x43C00000 \
[get_bd_addr_spaces processing_system7_1/Data] \
[get_bd_addr_segs fetch_0/s_axi_CONTROL_BUS/Reg] SEG_fetch_0_Reg
create_bd_addr_seg -range 0x00010000 -offset 0x43C20000 \
[get_bd_addr_spaces processing_system7_1/Data] \
[get_bd_addr_segs load_0/s_axi_CONTROL_BUS/Reg] SEG_load_0_Reg
create_bd_addr_seg -range 0x00010000 -offset 0x43C30000 \
[get_bd_addr_spaces processing_system7_1/Data] \
[get_bd_addr_segs store_0/s_axi_CONTROL_BUS/Reg] SEG_store_0_Reg
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
[get_bd_addr_spaces store_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
SEG_processing_system7_1_ACP_DDR_LOWOCM
create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
[get_bd_addr_spaces store_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
SEG_processing_system7_1_ACP_HIGH_OCM
create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
[get_bd_addr_spaces store_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
SEG_processing_system7_1_ACP_IOP
create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
[get_bd_addr_spaces store_0/Data_m_axi_data_port] \
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
SEG_processing_system7_1_ACP_M_AXI_GP0
# Restore current instance
current_bd_instance $oldCurInst
save_bd_design
$maxi_clk \
$saxi_clk
# Create address segments
create_bd_addr_seg -range $ip_reg_map_range -offset $fetch_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs fetch_0/s_axi_CONTROL_BUS/Reg] SEG_fetch_0_Reg
create_bd_addr_seg -range $ip_reg_map_range -offset $load_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs load_0/s_axi_CONTROL_BUS/Reg] SEG_load_0_Reg
create_bd_addr_seg -range $ip_reg_map_range -offset $compute_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs compute_0/s_axi_CONTROL_BUS/Reg] SEG_compute_0_Reg
create_bd_addr_seg -range $ip_reg_map_range -offset $store_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs store_0/s_axi_CONTROL_BUS/Reg] SEG_store_0_Reg
if { $device_family eq "zynq-7000" } {
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces load_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces store_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
} elseif { $device_family eq "zynq-ultrascale+"} {
create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces load_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces store_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
}
# End of create_root_design()
save_bd_design
##################################################################
# MAIN FLOW
# COMPILATION FLOW
##################################################################
create_root_design "" $clock_id $inp_part $wgt_part $out_part $inp_bus_width \
$inp_mem_depth $wgt_bus_width $wgt_mem_depth $out_bus_width $out_mem_depth
# Create top-level wrapper file
make_wrapper -files \
[get_files $proj_path/$proj_name.srcs/sources_1/bd/$proj_name/$proj_name.bd] -top
......@@ -937,8 +421,7 @@ update_compile_order -fileset sources_1
update_compile_order -fileset sim_1
# Run bitstream generation on 8 threads with performance-oriented P&R strategy
# create_run impl_1 -parent_run synth_1 -flow {Vivado Implementation 2017} \
# -strategy "Performance_ExplorePostRoutePhysOpt"
set num_threads 8
launch_runs impl_1 -to_step write_bitstream -jobs $num_threads
wait_on_run impl_1
......
......@@ -35,17 +35,6 @@ int main(void) {
printParameters();
#endif
// Micro op bound
assert(VTA_UOP_GEM_2_1 < VTA_UOP_WIDTH);
assert(VTA_UOP_ALU_1_1 < VTA_UOP_WIDTH);
// Make sure there is no misalignment
assert(VTA_INSN_GEM_9_1 < VTA_INSN_GEM_A_0);
assert(VTA_INSN_MEM_7_1 < VTA_INSN_MEM_8_0);
// Instruction bounds
assert(VTA_INSN_MEM_E_1 < VTA_INS_WIDTH);
assert(VTA_INSN_GEM_F_1 < VTA_INS_WIDTH);
assert(VTA_INSN_ALU_G_1 < VTA_INS_WIDTH);
int status = 0;
// Run ALU test (vector-scalar operators)
......@@ -65,15 +54,15 @@ int main(void) {
status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, false);
// Run blocked GEMM test
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
// Simple GEMM unit test
status |= gemm_test(64, 64, 64, true);
status |= gemm_test(4 * VTA_BATCH, 4 * VTA_BLOCK_OUT, 4 * VTA_BLOCK_IN, false);
return status;
}
......@@ -18,7 +18,6 @@
*/
/*!
* Copyright (c) 2018 by Contributors
* \file vta.cpp
* \brief VTA HLS design.
*/
......@@ -29,13 +28,114 @@
#include "vta.h"
/*!
 * \brief Zero-fill a run of consecutive rows of a 2D buffer.
 * \tparam DATA_T Element type stored in the buffer.
 * \tparam MAT_AXI_RATIO Number of elements in each buffer row.
 * \param sram_idx Row index of the first row to clear. Taken by reference and
 *   incremented once per cleared row, so on return it has advanced by `range`.
 * \param range Number of rows to clear.
 * \param mem Buffer whose rows are overwritten with zeros.
 */
template <typename DATA_T, int MAT_AXI_RATIO>
void reset_mem(
  memop_sram_T &sram_idx,
  memop_sram_T range,
  DATA_T mem[][MAT_AXI_RATIO]) {
  for (int row = 0; row < range; ++row) {
    // Clear every element of the current row; the inner loop is fully unrolled.
    for (int lane = 0; lane < MAT_AXI_RATIO; ++lane) {
#pragma HLS UNROLL
      mem[sram_idx][lane] = 0;
    }
    // Move the caller's row cursor to the next row.
    ++sram_idx;
  }
}
/*!
 * \brief Copy a 2D region from `src` into `dst`, writing zero padding around it.
 *
 * The destination is filled sequentially starting at row `sram_idx`:
 *   1. `y_offset_0` zeroed rows,
 *   2. for each of the `y_size` source rows: `x_pad_0` zeroed rows, then
 *      `x_size` rows copied from `src`, then `x_pad_1` zeroed rows,
 *   3. `y_offset_1` zeroed rows.
 *
 * \tparam DATA_T Element type of both `src` and `dst`.
 * \tparam MAT_AXI_RATIO Number of DATA_T elements per `dst` row; also the
 *   scale applied to `dram_idx` when indexing `src`.
 * \tparam ELEM_BYTES Number of bytes copied per unit of `x_size`.
 * \param src Source buffer (read through a pointer with `volatile` cast away).
 * \param dst Destination buffer.
 * \param sram_idx First destination row to write. Passed by value; the local
 *   copy is advanced by reset_mem() and by the copy loop.
 * \param dram_idx Source index of the first row, in units of MAT_AXI_RATIO
 *   elements.
 * \param y_size Number of source rows to copy.
 * \param x_size Number of destination rows produced per source row.
 * \param x_stride Amount added to `dram_idx` after each source row.
 * \param x_pad_0 Zeroed destination rows written before each copied row.
 * \param x_pad_1 Zeroed destination rows written after each copied row.
 * \param y_offset_0 Zeroed destination rows written before the first row.
 * \param y_offset_1 Zeroed destination rows written after the last row.
 */
template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES>
void load_pad_2d(
volatile DATA_T *src,
DATA_T dst[][MAT_AXI_RATIO],
memop_sram_T sram_idx,
memop_dram_T dram_idx,
memop_size_T y_size,
memop_size_T x_size,
memop_stride_T x_stride,
memop_pad_T x_pad_0,
memop_pad_T x_pad_1,
memop_sram_T y_offset_0,
memop_sram_T y_offset_1) {
#pragma HLS INLINE
// Leading padding: reset_mem advances sram_idx past the rows it clears.
reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, y_offset_0, dst);
for (int y = 0; y < y_size; y++) {
#pragma HLS PIPELINE
// Padding written before the payload of this row.
reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, x_pad_0, dst);
// Payload: x_size * ELEM_BYTES bytes starting at element
// dram_idx * MAT_AXI_RATIO of src. The cast drops `volatile` for memcpy.
memcpy(&dst[sram_idx][0],
(const DATA_T*) &src[dram_idx * MAT_AXI_RATIO],
x_size * ELEM_BYTES);
// Advance the destination past the payload and the source to the next row.
sram_idx += x_size;
dram_idx += x_stride;
// Padding written after the payload of this row.
reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, x_pad_1, dst);
}
// Trailing padding.
reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, y_offset_1, dst);
}
/*!
 * \brief Copy a 2D region from `src` into `dst` without writing any padding.
 *
 * For each of the `y_size` rows, `x_size * ELEM_BYTES` bytes are copied from
 * `src` into `dst`, after which the destination index advances by `x_size`
 * and the source index by `x_stride`.
 *
 * \tparam DATA_T Element type of both `src` and `dst`.
 * \tparam MAT_AXI_RATIO Number of DATA_T elements per `dst` row; also the
 *   scale applied to `dram_idx` when indexing `src`.
 * \tparam ELEM_BYTES Number of bytes copied per unit of `x_size`.
 * \param src Source buffer (read through a pointer with `volatile` cast away).
 * \param dst Destination buffer.
 * \param sram_idx First destination row to write (local copy is advanced).
 * \param dram_idx Source index of the first row, in units of MAT_AXI_RATIO
 *   elements.
 * \param y_size Number of rows to copy.
 * \param x_size Number of destination rows produced per source row.
 * \param x_stride Amount added to `dram_idx` after each row.
 */
template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES>
void load_2d(
volatile DATA_T *src,
DATA_T dst[][MAT_AXI_RATIO],
memop_sram_T sram_idx,
memop_dram_T dram_idx,
memop_size_T y_size,
memop_size_T x_size,
memop_stride_T x_stride) {
#pragma HLS INLINE
for (int y = 0; y < y_size; y++) {
// Copy one row; the cast drops `volatile` so src can be handed to memcpy.
memcpy(&dst[sram_idx][0],
(const DATA_T*) &src[dram_idx * MAT_AXI_RATIO],
x_size * ELEM_BYTES);
// Binds sram_idx to the Mul_LUT core.
// NOTE(review): presumably meant to keep this index arithmetic in LUTs
// rather than DSP blocks -- confirm against the HLS resource report.
#pragma HLS RESOURCE variable = sram_idx core = Mul_LUT
sram_idx += x_size;
dram_idx += x_stride;
}
}
/*!
 * \brief Unpack one tensor stored as wide words into a 2D array of elements.
 *
 * Entry `idx` of `src` holds the tensor as NARROW_W * Y_DIM * X_DIM / WIDE_W
 * wide words. Each word carries WIDE_W / NARROW_W elements, lowest bits
 * first. Elements are written to `dst` in row-major order.
 *
 * \tparam WIDE_T Packed word type; must provide `range(hi, lo)` bit slicing.
 * \tparam NARROW_T Element type of the unpacked tensor.
 * \tparam IDX_T Type of the tensor index.
 * \tparam WIDE_W Bit width of a packed word.
 * \tparam NARROW_W Bit width of one element.
 * \tparam Y_DIM Number of rows in the unpacked tensor.
 * \tparam X_DIM Number of columns in the unpacked tensor.
 * \param idx Index of the tensor to read within `src`.
 * \param src Packed source buffer.
 * \param dst Unpacked destination tensor.
 */
template <typename WIDE_T, typename NARROW_T, typename IDX_T, int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM>
void read_tensor(
  IDX_T idx,
  WIDE_T src[][NARROW_W * Y_DIM * X_DIM / WIDE_W],
  NARROW_T dst[Y_DIM][X_DIM]) {
#pragma HLS INLINE
  // Compile-time packing geometry.
  const int kElemsPerWord = WIDE_W / NARROW_W;
  const int kWordsPerTensor = NARROW_W * Y_DIM * X_DIM / WIDE_W;

  for (int word = 0; word < kWordsPerTensor; ++word) {
    WIDE_T packed = src[idx][word];
    for (int lane = 0; lane < kElemsPerWord; ++lane) {
      // Row-major position of this element within the tensor.
      const int flat = word * kElemsPerWord + lane;
      // Least significant bit of this element inside the packed word.
      const int lsb = lane * NARROW_W;
      dst[flat / X_DIM][flat % X_DIM] =
          static_cast<NARROW_T>(packed.range(lsb + NARROW_W - 1, lsb));
    }
  }
}
/*!
 * \brief Pack a 2D array of elements into wide words and store it.
 *
 * The tensor is traversed in row-major order and WIDE_W / NARROW_W
 * consecutive elements are packed into each wide word, lowest bits first.
 * The resulting NARROW_W * Y_DIM * X_DIM / WIDE_W words are written to entry
 * `idx` of `dst`.
 *
 * \tparam WIDE_T Packed word type; must be initializable from 0 and provide
 *   an assignable `range(hi, lo)` bit slice.
 * \tparam NARROW_T Element type of the unpacked tensor.
 * \tparam IDX_T Type of the tensor index.
 * \tparam WIDE_W Bit width of a packed word.
 * \tparam NARROW_W Bit width of one element.
 * \tparam Y_DIM Number of rows in the unpacked tensor.
 * \tparam X_DIM Number of columns in the unpacked tensor.
 * \param idx Index of the tensor to write within `dst`.
 * \param src Unpacked source tensor.
 * \param dst Packed destination buffer.
 */
template <typename WIDE_T, typename NARROW_T, typename IDX_T, int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM>
void write_tensor(
  IDX_T idx,
  NARROW_T src[Y_DIM][X_DIM],
  WIDE_T dst[][NARROW_W * Y_DIM * X_DIM / WIDE_W]) {
#pragma HLS INLINE
  // Compile-time packing geometry.
  const int kElemsPerWord = WIDE_W / NARROW_W;
  const int kWordsPerTensor = NARROW_W * Y_DIM * X_DIM / WIDE_W;

  for (int word = 0; word < kWordsPerTensor; ++word) {
    // Start from an all-zero word and fill in one element slice at a time.
    WIDE_T packed = 0;
    for (int lane = 0; lane < kElemsPerWord; ++lane) {
      // Row-major position of this element within the tensor.
      const int flat = word * kElemsPerWord + lane;
      // Least significant bit of this element inside the packed word.
      const int lsb = lane * NARROW_W;
      packed.range(lsb + NARROW_W - 1, lsb) = src[flat / X_DIM][flat % X_DIM];
    }
    dst[idx][word] = packed;
  }
}
void fetch(
uint32_t insn_count,
volatile insn_T *insns,
hls::stream<insn_T> &load_queue,
hls::stream<insn_T> &gemm_queue,
hls::stream<insn_T> &store_queue) {
#pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
PRAGMA_HLS(HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS offset = VTA_FETCH_INSN_COUNT_OFFSET)
#pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
#pragma HLS INTERFACE axis port = load_queue
#pragma HLS INTERFACE axis port = gemm_queue
......@@ -43,305 +143,124 @@ void fetch(
#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
INSN_DECODE: for (int pc = 0; pc < insn_count; pc++) {
#pragma HLS PIPELINE II = 1
#pragma HLS PIPELINE
// Read instruction fields
insn_T insn = insns[pc];
insn_T raw_insn = insns[pc];
VTAInsn insn;
insn.generic = *((VTAGenericInsn *) &raw_insn);
// Do some partial decoding
opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0);
memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
opcode_T opcode = insn.generic.opcode;
memop_id_T memory_type = insn.mem.memory_type;
// Push to appropriate instruction queue
if (opcode == VTA_OPCODE_STORE) {
store_queue.write(insn);
} else if (opcode == VTA_OPCODE_LOAD &&
(memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT)) {
load_queue.write(insn);
store_queue.write(raw_insn);
} else if (opcode == VTA_OPCODE_LOAD) {
if (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT) {
load_queue.write(raw_insn);
} else {
gemm_queue.write(insn);
gemm_queue.write(raw_insn);
}
} else {
gemm_queue.write(raw_insn);
}
}
}
/*!
 * \brief Load stage: executes one memory instruction popped from `load_queue`,
 *  filling the input or weight on-chip buffer from DRAM.
 *
 * Sequence visible in the struct-based variant below:
 *  1. Read one instruction word and reinterpret it as VTAMemInsn.
 *  2. If `pop_next_dep` is set, read one token from `g2l_dep_queue`.
 *  3. memory_type == VTA_MEM_ID_INP: call load_pad_2d() on `inputs`/`inp_mem`,
 *     passing the x padding and two pre-multiplied y offsets.
 *     memory_type == VTA_MEM_ID_WGT: call load_2d() on `weights`/`wgt_mem`
 *     (no padding arguments are passed on this path).
 *     Any other memory type performs no transfer.
 *  4. If `push_next_dep` is set, write one token to `l2g_dep_queue`.
 *
 * \param inputs DRAM source for input data (m_axi, bundle data_port).
 * \param weights DRAM source for weight data (m_axi, bundle data_port).
 * \param load_queue Stream of load instructions.
 * \param g2l_dep_queue Dependence tokens read by this stage.
 * \param l2g_dep_queue Dependence tokens written by this stage.
 * \param inp_mem Input buffer (bram interface, RAM_1P).
 * \param wgt_mem Weight buffer (bram interface, RAM_1P).
 *
 * NOTE(review): this span reads like a rendered diff. Parameters, the decode
 *  sequence and the transfer logic each appear in two variants (explicit
 *  bit-range decode + memcpy/zero-fill loops versus VTAMemInsn fields +
 *  load_pad_2d/load_2d), and braces do not balance. Presumably only one
 *  variant belongs in the real file; confirm against the repository.
 *  load_pad_2d() and load_2d() are defined outside this span, so their exact
 *  transfer/padding behaviour is not documented here.
 */
void load(
volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights,
volatile bus_T *inputs,
volatile bus_T *weights,
hls::stream<insn_T> &load_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &l2g_dep_queue,
inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]
) {
#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]) {
#pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
#pragma HLS INTERFACE axis port = load_queue
#pragma HLS INTERFACE axis port = g2l_dep_queue
#pragma HLS INTERFACE axis port = l2g_dep_queue
#pragma HLS INTERFACE bram port = wgt_mem
#pragma HLS INTERFACE bram port = inp_mem
#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
#pragma HLS RESOURCE variable = inp_mem core = RAM_1P
#pragma HLS RESOURCE variable = wgt_mem core = RAM_1P
// Pop load instruction
insn_T insn = load_queue.read();
// Decode instruction
// (bit-range variant: each field is sliced out of the raw word)
bool pop_prev_dependence = insn[VTA_INSN_MEM_1];
bool pop_next_dependence = insn[VTA_INSN_MEM_2];
bool push_prev_dependence = insn[VTA_INSN_MEM_3];
bool push_next_dependence = insn[VTA_INSN_MEM_4];
memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0);
memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0);
memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0);
memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0);
memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0);
memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0);
memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0);
memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0);
memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0);
insn_T raw_insn = load_queue.read();
// Cast to MemInsn
// (struct variant: the word is copied to a local first, then reinterpreted)
insn_T raw_copy = raw_insn;
VTAMemInsn insn = *((VTAMemInsn *) &raw_copy);
// Pop dependence token if instructed
if (pop_next_dependence) {
if (insn.pop_next_dep) {
g2l_dep_queue.read();
}
// Initialize indices
memop_sram_T sram_idx = sram_base;
memop_dram_T dram_idx = dram_base;
// Pre-compute dimensions, and offsets
memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1;
memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1;
memop_sram_T y_offset = x_size_total * y_pad_0;
// Force this computation to be done with LUTs to avoid using too many DSPs
#pragma HLS RESOURCE variable = y_offset core = Mul_LUT
// Skip padding along y dimension
sram_idx += y_offset;
// Perform data transfer from DRAM
// One memcpy per row: x_size elements are copied, then the SRAM index moves
// past the row plus its right padding and the DRAM index advances by x_stride.
for (int y = 0; y < y_size; y++) {
#pragma HLS PIPELINE rewind
// Skip padding along x dimension
sram_idx += x_pad_0;
// Perform data transfer
if (memory_type == VTA_MEM_ID_INP) {
memcpy(&inp_mem[sram_idx][0],
(const inp_vec_T*) &inputs[dram_idx * VTA_BATCH],
x_size * VTA_INP_ELEM_BYTES);
} else {
memcpy(&wgt_mem[sram_idx][0],
(const wgt_vec_T*) &weights[dram_idx * VTA_BLOCK_OUT],
x_size * VTA_WGT_ELEM_BYTES);
}
sram_idx += x_size;
dram_idx += x_stride;
// Skip padding along x dimension
sram_idx += x_pad_1;
}
// Reset SRAM index
sram_idx = sram_base;
// Pad x/y edges with zeros
// Rows that lie entirely in the top/bottom padding are zeroed across the full
// padded width; data rows only have their left and right padding zeroed.
for (int y = 0; y < y_size_total; y++) {
if (y < y_pad_0 || y >= y_pad_0 + y_size) {
for (int x = 0; x < x_size_total; x++) {
#pragma HLS PIPELINE II = 1 rewind
if (memory_type == VTA_MEM_ID_INP) {
for (int i = 0; i < VTA_BATCH; i++) {
inp_mem[sram_idx][i] = 0;
}
} else {
for (int i = 0; i < VTA_BLOCK_OUT; i++) {
wgt_mem[sram_idx][i] = 0;
}
}
sram_idx++;
}
} else {
for (int x = 0; x < x_pad_0; x++) {
#pragma HLS PIPELINE II = 1 rewind
if (memory_type == VTA_MEM_ID_INP) {
for (int i = 0; i < VTA_BATCH; i++) {
inp_mem[sram_idx][i] = 0;
}
} else {
for (int i = 0; i < VTA_BLOCK_OUT; i++) {
wgt_mem[sram_idx][i] = 0;
}
}
sram_idx++;
}
// Step over the payload that the transfer loop already wrote.
sram_idx += x_size;
for (int x = 0; x < x_pad_1; x++) {
#pragma HLS PIPELINE II = 1 rewind
if (memory_type == VTA_MEM_ID_INP) {
for (int i = 0; i < VTA_BATCH; i++) {
inp_mem[sram_idx][i] = 0;
}
} else {
for (int i = 0; i < VTA_BLOCK_OUT; i++) {
wgt_mem[sram_idx][i] = 0;
}
}
sram_idx++;
}
}
// Pre-processing
// x_width is the padded row width; the two products give the number of SRAM
// entries covered by the top (y_pad_0) and bottom (y_pad_1) padding rows.
// Both multiplications are bound to LUT multipliers with a 4-cycle latency.
memop_sram_T x_width = (insn.x_pad_0 + insn.x_size + insn.x_pad_1);
memop_sram_T y_offset_0 = x_width * insn.y_pad_0;
#pragma HLS RESOURCE variable = y_offset_0 core = Mul_LUT latency = 4
memop_sram_T y_offset_1 = x_width * insn.y_pad_1;
#pragma HLS RESOURCE variable = y_offset_1 core = Mul_LUT latency = 4
if (insn.memory_type == VTA_MEM_ID_INP) {
// Inputs: padded 2D load (helper defined outside this span).
load_pad_2d<bus_T, INP_MAT_AXI_RATIO, VTA_INP_ELEM_BYTES>(
inputs,
inp_mem,
insn.sram_base,
insn.dram_base,
insn.y_size,
insn.x_size,
insn.x_stride,
insn.x_pad_0,
insn.x_pad_1,
y_offset_0,
y_offset_1);
} else if (insn.memory_type == VTA_MEM_ID_WGT) {
// Weights: plain 2D load; the instruction's pad fields are not forwarded.
load_2d<bus_T, WGT_MAT_AXI_RATIO, VTA_WGT_ELEM_BYTES>(
weights,
wgt_mem,
insn.sram_base,
insn.dram_base,
insn.y_size,
insn.x_size,
insn.x_stride);
}
// Push dependence token if instructed
if (push_next_dependence) {
if (insn.push_next_dep) {
l2g_dep_queue.write(1);
}
}
void compute(
volatile uint32_t &done,
volatile uop_T *uops,
volatile acc_vec_T *biases,
hls::stream<insn_T> &gemm_queue,
hls::stream<bool> &l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue,
inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT],
out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]
) {
#pragma HLS INTERFACE s_axilite port = done bundle = CONTROL_BUS
#pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
#pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
#pragma HLS INTERFACE axis port = gemm_queue
#pragma HLS INTERFACE axis port = l2g_dep_queue
#pragma HLS INTERFACE axis port = s2g_dep_queue
#pragma HLS INTERFACE axis port = g2l_dep_queue
#pragma HLS INTERFACE axis port = g2s_dep_queue
#pragma HLS INTERFACE bram port = inp_mem
#pragma HLS INTERFACE bram port = wgt_mem
#pragma HLS INTERFACE bram port = out_mem
#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
// This is necessary to connect the SRAM to the load module
#pragma HLS RESOURCE variable = wgt_mem core = RAM_1P
// Micro-op storage
static uop_T uop_mem[VTA_UOP_BUFF_DEPTH];
// Accumulator storage
static acc_vec_T acc_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH];
#pragma HLS ARRAY_PARTITION variable = acc_mem complete dim = 2
// Pop GEMM instruction
insn_T insn = gemm_queue.read();
// Decode
opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0);
bool pop_prev_dependence = insn[VTA_INSN_MEM_1];
bool pop_next_dependence = insn[VTA_INSN_MEM_2];
bool push_prev_dependence = insn[VTA_INSN_MEM_3];
bool push_next_dependence = insn[VTA_INSN_MEM_4];
// Pop dependence token if instructed
if (pop_prev_dependence) {
l2g_dep_queue.read();
}
if (pop_next_dependence) {
s2g_dep_queue.read();
}
// Perform action based on opcode
if (opcode == VTA_OPCODE_FINISH) {
// Set done flag if we reach a FINISH instruction
done = 1;
} else if (opcode == VTA_OPCODE_LOAD || opcode == VTA_OPCODE_STORE) {
// Set done value
done = 0;
// Decode instruction
memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0);
memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0);
memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0);
memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0);
memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0);
memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0);
memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0);
memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0);
memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0);
// Initialize indices
memop_sram_T sram_idx = sram_base;
memop_dram_T dram_idx = dram_base;
/*!
 * \brief GEMM micro-kernel: executes one GEMM instruction over the on-chip
 *  buffers.
 *
 * Behaviour visible in the struct-based variant below:
 *  - `insn_raw` is reinterpreted as VTAGemInsn.
 *  - Three nested loops run iter_out x iter_in x [uop_bgn, uop_end) micro-ops.
 *  - For each micro-op the weight, input and accumulator tensors are unpacked
 *    with read_tensor(), then for every (batch b, output channel oc):
 *      accum = a_tensor[b][oc] + sum over ic of i_tensor[b][ic] * w_tensor[oc][ic]
 *    a_tensor[b][oc] is set to 0 when insn.reset_reg is set, else to accum;
 *    o_tensor[b][oc] always takes the low VTA_OUT_WIDTH bits of accum.
 *  - Both tensors are packed back with write_tensor() into acc_mem / out_mem
 *    at `dst_idx`.
 *  - dst/src/wgt offsets advance by the *_factor_in fields after each inner
 *    iteration and by the *_factor_out fields after each outer iteration.
 *
 * \param insn_raw Raw instruction word (passed by value).
 * \param uop_mem Micro-op buffer, read at index `upc`.
 * \param acc_mem Accumulator buffer, read and written.
 * \param inp_mem Input buffer, read only here.
 * \param wgt_mem Weight buffer, read only here.
 * \param out_mem Output buffer, written only here.
 *
 * NOTE(review): this span reads like a rendered diff. Lines from an older
 *  monolithic compute body (bit-range decode, memcpy-based loads,
 *  w_matrix/o_matrix vectors) are interleaved with the struct-based gemm, and
 *  braces do not balance. The "......@@" line elides source lines, including
 *  the derivation of `dst_idx` and `src_idx` (presumably uop fields plus the
 *  running offsets, by analogy with `wgt_idx` - TODO confirm).
 */
void gemm(
insn_T insn_raw,
uop_T uop_mem[VTA_UOP_BUFF_DEPTH],
bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO],
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
#pragma HLS INLINE
// Pre-compute dimensions, and offsets
memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1;
memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1;
memop_sram_T y_offset = x_size_total * y_pad_0;
// Force this computation to be done with LUTs to avoid using too many DSPs
#pragma HLS RESOURCE variable = y_offset core = Mul_LUT
// Reinterpret the by-value instruction word as the GEMM instruction layout.
VTAGemInsn insn = *((VTAGemInsn *) &insn_raw);
if (memory_type == VTA_MEM_ID_UOP) {
// Perform data transfer
memcpy(&uop_mem[sram_base],
(const uop_T*) &uops[dram_base],
x_size * sizeof(uop_T));
} else {
// Skip vertical padding
sram_idx += y_offset;
// Perform data transfer from DRAM
for (int y = 0; y < y_size; y++) {
#pragma HLS PIPELINE rewind
// Skip padding along x dimension
sram_idx += x_pad_0;
// Perform data transfer
memcpy(&acc_mem[sram_idx][0],
(const acc_vec_T*) &biases[dram_idx * VTA_BATCH],
x_size*VTA_ACC_ELEM_BYTES);
sram_idx += x_size;
dram_idx += x_stride;
// Skip padding along x dimension
sram_idx += x_pad_1;
}
}
} else if (opcode == VTA_OPCODE_GEMM || opcode == VTA_OPCODE_ALU) {
// Set done value
done = 0;
// Decode
bool reset_out = insn[VTA_INSN_GEM_5];
uop_idx_T uop_bgn = insn.range(VTA_INSN_GEM_6_1, VTA_INSN_GEM_6_0);
uop_idx_T uop_end = insn.range(VTA_INSN_GEM_7_1, VTA_INSN_GEM_7_0);
loop_T iter_out = insn.range(VTA_INSN_GEM_8_1, VTA_INSN_GEM_8_0);
loop_T iter_in = insn.range(VTA_INSN_GEM_9_1, VTA_INSN_GEM_9_0);
acc_idx_T dst_factor_out = insn.range(VTA_INSN_GEM_A_1, VTA_INSN_GEM_A_0);
acc_idx_T dst_factor_in = insn.range(VTA_INSN_GEM_B_1, VTA_INSN_GEM_B_0);
inp_idx_T src_factor_out = insn.range(VTA_INSN_GEM_C_1, VTA_INSN_GEM_C_0);
inp_idx_T src_factor_in = insn.range(VTA_INSN_GEM_D_1, VTA_INSN_GEM_D_0);
// GEMM-specific fields
wgt_idx_T wgt_factor_out = insn.range(VTA_INSN_GEM_E_1, VTA_INSN_GEM_E_0);
wgt_idx_T wgt_factor_in = insn.range(VTA_INSN_GEM_F_1, VTA_INSN_GEM_F_0);
// ALU-specific field
aluop_opcode_T alu_opcode = insn.range(VTA_INSN_ALU_E_1, VTA_INSN_ALU_E_0);
bool use_imm = insn[VTA_INSN_ALU_F];
aluop_imm_T imm = insn.range(VTA_INSN_ALU_G_1, VTA_INSN_ALU_G_0);
// Loop offset
// Running base offsets for the outer loop; each starts at zero.
acc_idx_T dst_offset_out = 0;
inp_idx_T src_offset_out = 0;
wgt_idx_T wgt_offset_out = 0;
// Outer Loop
EXE_OUT_LOOP: for (int it_out = 0; it_out < iter_out; it_out++) {
#pragma HLS DEPENDENCE variable = acc_mem inter false
EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) {
// The inner loop restarts from the current outer offsets.
acc_idx_T dst_offset_in = dst_offset_out;
inp_idx_T src_offset_in = src_offset_out;
wgt_idx_T wgt_offset_in = wgt_offset_out;
// Inner Loop
EXE_IN_LOOP: for (int it_in = 0; it_in < iter_in; it_in++) {
// Perform appropriate computation based on opcode
if (opcode == VTA_OPCODE_GEMM) {
// Iterate over micro op
READ_GEMM_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) {
#pragma HLS PIPELINE II = 1 rewind
EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) {
// Iterate over micro op
// Half-open range: uop_end itself is not executed.
READ_GEMM_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) {
#pragma HLS PIPELINE II = 1
// Read micro-op fields
uop_T uop = uop_mem[upc];
......@@ -353,60 +272,83 @@ void compute(
wgt_idx_T wgt_idx =
uop.range(VTA_UOP_GEM_2_1, VTA_UOP_GEM_2_0) + wgt_offset_in;
// Read weight matrix
wgt_vec_T w_matrix[VTA_BLOCK_OUT];
for (int i = 0; i < VTA_BLOCK_OUT; i++) {
w_matrix[i] = wgt_mem[wgt_idx][i];
}
// Read input matrix and accum matrix
acc_vec_T o_matrix[VTA_BATCH];
inp_vec_T i_matrix[VTA_BATCH];
for (int i = 0; i < VTA_BATCH; i++) {
o_matrix[i] = acc_mem[dst_idx][i];
i_matrix[i] = inp_mem[src_idx][i];
}
// Result matrices
acc_vec_T acc_mem_val[VTA_BATCH];
out_vec_T st_buf_val[VTA_BATCH];
// Read in weight tensor
// w_tensor is indexed [output channel][input channel].
wgt_T w_tensor[VTA_BLOCK_OUT][VTA_BLOCK_IN];
read_tensor<bus_T, wgt_T, wgt_idx_T, VTA_BUS_WIDTH, VTA_WGT_WIDTH, VTA_BLOCK_OUT, VTA_BLOCK_IN>(wgt_idx, wgt_mem, w_tensor);
// Read in input tensor
// i_tensor is indexed [batch][input channel].
inp_T i_tensor[VTA_BATCH][VTA_BLOCK_IN];
read_tensor<bus_T, inp_T, inp_idx_T, VTA_BUS_WIDTH, VTA_INP_WIDTH, VTA_BATCH, VTA_BLOCK_IN>(src_idx, inp_mem, i_tensor);
// Read in accum tensor
// a_tensor is indexed [batch][output channel].
acc_T a_tensor[VTA_BATCH][VTA_BLOCK_OUT];
read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, a_tensor);
// Output tensor
out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT];
// Inner GEMM loop
for (int i = 0; i < VTA_BATCH; i++) {
for (int b = 0; b < VTA_BLOCK_OUT; b++) {
for (int b = 0; b < VTA_BATCH; b++) {
for (int oc = 0; oc < VTA_BLOCK_OUT; oc++) {
// Initialize the accumulator values
acc_T accum =
o_matrix[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH);
acc_T accum = a_tensor[b][oc];
// Dot product sum
sum_T tmp = 0;
// Inner matrix multiplication loop (input channel/feature)
for (int k = 0; k < VTA_BLOCK_IN; k++) {
wgt_T w_elem =
w_matrix[b].range((k + 1) * VTA_WGT_WIDTH - 1, k * VTA_WGT_WIDTH);
inp_T i_elem =
i_matrix[i].range((k + 1) * VTA_INP_WIDTH - 1, k * VTA_INP_WIDTH);
mul_T prod = i_elem * w_elem;
#ifdef NO_DSP
#pragma HLS RESOURCE variable = prod core = Mul_LUT
#endif // NO_DSP
tmp += (sum_T) prod;
for (int ic = 0; ic < VTA_BLOCK_IN; ic++) {
wgt_T w_elem = w_tensor[oc][ic];
inp_T i_elem = i_tensor[b][ic];
mul_T prod_dsp = i_elem * w_elem;
tmp += (sum_T) prod_dsp;
}
// Update summation
accum += (acc_T) tmp;
// Update result vector
acc_mem_val[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) =
reset_out ? (acc_T) 0 : accum;
st_buf_val[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
(out_T) accum.range(VTA_OUT_WIDTH - 1, 0);
// Write back result acc_mem
// reset_reg clears the accumulator entry instead of storing the new sum.
a_tensor[b][oc] = insn.reset_reg ? (acc_T) 0 : accum;
// And output vector
// The output keeps only the low VTA_OUT_WIDTH bits of accum, and is taken
// from accum even when reset_reg cleared the accumulator entry above.
o_tensor[b][oc] = (out_T) accum.range(VTA_OUT_WIDTH - 1, 0);
}
// Write to buffers
acc_mem[dst_idx][i] = acc_mem_val[i];
out_mem[dst_idx][i] = st_buf_val[i];
}
// Write the results back into accumulator
write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, a_tensor, acc_mem);
// Write the results back in the output buffer
// out_mem is addressed with the same dst_idx as acc_mem.
write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem);
}
// Update offsets
dst_offset_in += insn.dst_factor_in;
src_offset_in += insn.src_factor_in;
wgt_offset_in += insn.wgt_factor_in;
}
// Update offsets
dst_offset_out += insn.dst_factor_out;
src_offset_out += insn.src_factor_out;
wgt_offset_out += insn.wgt_factor_out;
}
#ifndef NO_ALU
else if (opcode == VTA_OPCODE_ALU) {
}
/*!
 * \brief ALU micro-kernel: executes one element-wise ALU instruction on the
 *  accumulator buffer.
 *
 * Behaviour visible in the struct-based variant below:
 *  - `insn_raw` is reinterpreted as VTAAluInsn.
 *  - Three nested loops run iter_out x iter_in x [uop_bgn, uop_end) micro-ops.
 *  - For each micro-op the source and destination tensors are both unpacked
 *    from acc_mem with read_tensor(). Per element, operand 0 is the destination
 *    value and operand 1 is either insn.imm (when insn.use_imm) or the source
 *    value. MIN/MAX, ADD and SHR are handled; the result overwrites the
 *    destination element and its low VTA_OUT_WIDTH bits go to o_tensor.
 *  - Both tensors are packed back with write_tensor() into acc_mem / out_mem
 *    at `dst_idx`.
 *
 * \param insn_raw Raw instruction word (passed by value).
 * \param uop_mem Micro-op buffer, read at index `upc`.
 * \param acc_mem Accumulator buffer, read and written.
 * \param inp_mem Not referenced in the visible body.
 * \param wgt_mem Not referenced in the visible body.
 * \param out_mem Output buffer, written only here.
 *
 * NOTE(review): this span reads like a rendered diff. Lines from an older
 *  vector-based ALU path (dst_vector/src_vector, cmp_res/add_res/shr_res) are
 *  interleaved with the tensor-based one, and braces do not balance. The
 *  "......@@" line elides source lines, including the derivation of `dst_idx`.
 * NOTE(review): in the tensor-based variant `o_tensor` has no initializer and
 *  is assigned only inside the MIN/MAX, ADD and SHR branches, yet it is always
 *  written to out_mem. For any other alu_opcode the output would be written
 *  from unassigned storage - confirm such opcodes cannot reach this function.
 */
void alu(
insn_T insn_raw,
uop_T uop_mem[VTA_UOP_BUFF_DEPTH],
bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO],
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
#pragma HLS INLINE
// Reinterpret the by-value instruction word as the ALU instruction layout.
VTAAluInsn insn = *((VTAAluInsn *) &insn_raw);
// Loop offset
acc_idx_T dst_offset_out = 0;
inp_idx_T src_offset_out = 0;
// Outer Loop
EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) {
// The inner loop restarts from the current outer offsets.
acc_idx_T dst_offset_in = dst_offset_out;
inp_idx_T src_offset_in = src_offset_out;
// Inner Loop
EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) {
// Iterate over micro op
READ_ALU_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) {
READ_ALU_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) {
#pragma HLS PIPELINE II = 2
// Read micro-op fields
uop_T uop = uop_mem[upc];
......@@ -416,153 +358,197 @@ void compute(
acc_idx_T src_idx =
uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + src_offset_in;
// Read in src tensor
// Source and destination both come from acc_mem.
acc_T src_tensor[VTA_BATCH][VTA_BLOCK_OUT];
read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(src_idx, acc_mem, src_tensor);
// Read in dst tensor
acc_T dst_tensor[VTA_BATCH][VTA_BLOCK_OUT];
read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, dst_tensor);
// Output tensor
out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT];
// Perform ALU op over matrix elements
for (int i = 0; i < VTA_BATCH; i++) {
// Read input matrix and accum matrix
acc_vec_T dst_vector = acc_mem[dst_idx][i];
acc_vec_T src_vector = acc_mem[src_idx][i];
// Result matrices
acc_vec_T cmp_res;
acc_vec_T add_res;
acc_vec_T shr_res;
out_vec_T short_cmp_res;
out_vec_T short_add_res;
out_vec_T short_shr_res;
// Results vector
acc_vec_T res_vec = 0;
for (int b = 0; b < VTA_BLOCK_OUT; b++) {
#pragma HLS PIPELINE II = 1 rewind
// Read in operands
acc_T src_0 = dst_vector.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH);
acc_T src_1 = use_imm ?
(acc_T) imm :
src_vector.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH);
// Operand 0 is the current destination value; operand 1 is the immediate
// when use_imm is set, otherwise the matching source element.
acc_T src_0 = dst_tensor[i][b];
acc_T src_1 = insn.use_imm ? (acc_T) insn.imm : src_tensor[i][b];
// Narrow views of operand 1; `mul_by` is not consumed by any branch visible
// in this span.
aluop_shr_arg_T shft_by = src_1.range(VTA_SHR_ARG_BIT_WIDTH - 1, 0);
aluop_mul_arg_T mul_by = src_1.range(VTA_MUL_ARG_BIT_WIDTH - 1, 0);
if (insn.alu_opcode == VTA_ALU_OPCODE_MIN || insn.alu_opcode == VTA_ALU_OPCODE_MAX) {
// Compute Min/Max
// One comparison serves both opcodes: MIN keeps the smaller operand, MAX the
// larger.
acc_T mix_val = src_0 < src_1 ?
(alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) :
(alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
cmp_res.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = mix_val;
short_cmp_res.range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
(out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0);
(insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) :
(insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
dst_tensor[i][b] = mix_val;
o_tensor[i][b] = (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0);
} else if (insn.alu_opcode == VTA_ALU_OPCODE_ADD) {
// Compute Sum
acc_T add_val =
src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0);
add_res.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = add_val;
short_add_res.range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
(out_T) add_val.range(VTA_OUT_WIDTH - 1, 0);
dst_tensor[i][b] = add_val;
o_tensor[i][b] = (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0);
} else if (insn.alu_opcode == VTA_ALU_OPCODE_SHR) {
// Compute Shift Right
acc_T shr_val =
src_0 >> (aluop_sh_imm_T) src_1.range(VTA_LOG_ACC_WIDTH - 1, 0);
shr_res.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = shr_val;
short_shr_res.range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
(out_T) shr_val.range(VTA_OUT_WIDTH-1, 0);
acc_T shr_val = src_0 >> shft_by;
dst_tensor[i][b] = shr_val;
o_tensor[i][b] = (out_T) shr_val.range(VTA_OUT_WIDTH - 1, 0);
}
// Store to accum memory/store buffer
if (alu_opcode == VTA_ALU_OPCODE_MIN ||
alu_opcode == VTA_ALU_OPCODE_MAX) {
acc_mem[dst_idx][i] = cmp_res;
out_mem[dst_idx][i] = short_cmp_res;
} else if (alu_opcode == VTA_ALU_OPCODE_ADD) {
acc_mem[dst_idx][i] = add_res;
out_mem[dst_idx][i] = short_add_res;
} else if (alu_opcode == VTA_ALU_OPCODE_SHR) {
acc_mem[dst_idx][i] = shr_res;
out_mem[dst_idx][i] = short_shr_res;
}
}
// Write the results back into accumulator
write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, dst_tensor, acc_mem);
// Write the results back in the output buffer
// This write is unconditional; it does not depend on which opcode branch ran.
write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem);
}
// Update offsets
dst_offset_in += insn.dst_factor_in;
src_offset_in += insn.src_factor_in;
}
#endif // NO_ALU
// Update offsets
dst_offset_in += dst_factor_in;
src_offset_in += src_factor_in;
wgt_offset_in += wgt_factor_in;
dst_offset_out += insn.dst_factor_out;
src_offset_out += insn.src_factor_out;
}
}
// Update offsets
dst_offset_out += dst_factor_out;
src_offset_out += src_factor_out;
wgt_offset_out += wgt_factor_out;
/*!
 * \brief Compute stage: executes one instruction popped from `gemm_queue`.
 *
 * Sequence visible below:
 *  1. Read one instruction word and view it through VTAInsn (generic layout).
 *  2. Read a token from l2g_dep_queue if pop_prev_dep is set, and from
 *     s2g_dep_queue if pop_next_dep is set.
 *  3. Write done = 0, then dispatch on the opcode:
 *     - FINISH: write done = 1.
 *     - LOAD:   VTA_MEM_ID_UOP -> memcpy x_size micro-ops from `uops` into
 *               uop_mem; VTA_MEM_ID_ACC -> load_2d() from `biases` into
 *               acc_mem; other memory types perform no transfer here.
 *     - GEMM / ALU: forwarded to gemm() / alu() with the raw instruction copy.
 *  4. Write a token to g2l_dep_queue if push_prev_dep is set, and to
 *     g2s_dep_queue if push_next_dep is set.
 *
 * `uop_mem` and `acc_mem` are function-local statics, so their contents
 * persist from one invocation to the next.
 *
 * \param done Status word (s_axilite at VTA_COMPUTE_DONE_WR_OFFSET).
 * \param uops DRAM micro-op source (m_axi, bundle uop_port).
 * \param biases DRAM accumulator source (m_axi, bundle data_port).
 * \param gemm_queue Stream of compute-stage instructions.
 * \param l2g_dep_queue Dependence tokens read when pop_prev_dep is set.
 * \param s2g_dep_queue Dependence tokens read when pop_next_dep is set.
 * \param g2l_dep_queue Dependence tokens written when push_prev_dep is set.
 * \param g2s_dep_queue Dependence tokens written when push_next_dep is set.
 * \param inp_mem Input buffer, handed to gemm()/alu().
 * \param wgt_mem Weight buffer, handed to gemm()/alu().
 * \param out_mem Output buffer, handed to gemm()/alu().
 *
 * NOTE(review): near the end of this span the two push conditions appear in
 *  two variants each (`push_prev_dependence` and `insn.generic.push_prev_dep`),
 *  as in a rendered diff; the former names are not declared in this function.
 *  Confirm against the repository which variant belongs in the real file.
 */
void compute(
volatile uint32_t &done,
volatile uop_T *uops,
volatile bus_T *biases,
hls::stream<insn_T> &gemm_queue,
hls::stream<bool> &l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue,
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
// PRAGMA_HLS is used here so the offset macro is expanded into the pragma.
PRAGMA_HLS(HLS INTERFACE s_axilite port = done bundle = CONTROL_BUS offset = VTA_COMPUTE_DONE_WR_OFFSET)
#pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
#pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
#pragma HLS INTERFACE axis port = gemm_queue
#pragma HLS INTERFACE axis port = l2g_dep_queue
#pragma HLS INTERFACE axis port = s2g_dep_queue
#pragma HLS INTERFACE axis port = g2l_dep_queue
#pragma HLS INTERFACE axis port = g2s_dep_queue
#pragma HLS INTERFACE bram port = inp_mem
#pragma HLS INTERFACE bram port = wgt_mem
#pragma HLS INTERFACE bram port = out_mem
#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
#pragma HLS RESOURCE variable = inp_mem core = RAM_1P
#pragma HLS RESOURCE variable = wgt_mem core = RAM_1P
#pragma HLS RESOURCE variable = out_mem core = RAM_1P
// Micro-op storage
static uop_T uop_mem[VTA_UOP_BUFF_DEPTH];
// Accumulator storage
static bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO];
#pragma HLS ARRAY_RESHAPE variable = acc_mem complete dim=2
// This is necessary to obtain II=1
#pragma HLS DEPENDENCE variable = acc_mem inter false
// Pop GEMM instruction
insn_T raw_insn = gemm_queue.read();
// Cast to GenericInsn
// The word is copied to `raw_copy` first; that copy is what gets reinterpreted
// here and later handed to gemm()/alu().
VTAInsn insn;
insn_T raw_copy = raw_insn;
insn.generic = *((VTAGenericInsn *) &raw_copy);
// Pop dependence token if instructed
if (insn.generic.pop_prev_dep) {
l2g_dep_queue.read();
}
if (insn.generic.pop_next_dep) {
s2g_dep_queue.read();
}
// Set done value
// Written unconditionally; a FINISH instruction overwrites it with 1 below.
done = 0;
// Perform action based on opcode
if (insn.generic.opcode == VTA_OPCODE_FINISH) {
// Set done flag if we reach a FINISH instruction
done = 1;
} else if (insn.generic.opcode == VTA_OPCODE_LOAD) {
// Initialize indices
memop_sram_T sram_idx = insn.mem.sram_base;
memop_dram_T dram_idx = insn.mem.dram_base;
if (insn.mem.memory_type == VTA_MEM_ID_UOP) {
// Perform data transfer
// Micro-ops are copied as one contiguous run of x_size entries; y_size and
// x_stride are not used on this path.
memcpy(&uop_mem[sram_idx],
(const uop_T*) &uops[dram_idx],
insn.mem.x_size * sizeof(uop_T));
} else if (insn.mem.memory_type == VTA_MEM_ID_ACC) {
// Perform data transfer from DRAM
// Accumulator data uses the 2D loader (defined outside this span); no padding
// arguments are passed.
load_2d<bus_T, ACC_MAT_AXI_RATIO, VTA_ACC_ELEM_BYTES>(
biases,
acc_mem,
sram_idx,
dram_idx,
insn.mem.y_size,
insn.mem.x_size,
insn.mem.x_stride);
}
} else if (insn.generic.opcode == VTA_OPCODE_GEMM) {
gemm(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem);
} else if (insn.generic.opcode == VTA_OPCODE_ALU) {
alu(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem);
}
// Push dependence token if instructed
if (push_prev_dependence) {
if (insn.generic.push_prev_dep) {
g2l_dep_queue.write(1);
}
if (push_next_dependence) {
if (insn.generic.push_next_dep) {
g2s_dep_queue.write(1);
}
}
/*!
 * \brief Store stage: executes one store instruction popped from
 *  `store_queue`, copying rows of the output buffer to DRAM.
 *
 * Sequence visible in the struct-based variant below:
 *  1. Read one instruction word and reinterpret it as VTAMemInsn.
 *  2. If `pop_prev_dep` is set, read one token from `g2s_dep_queue`.
 *  3. For each of `y_size` rows, memcpy `x_size * VTA_OUT_ELEM_BYTES` bytes
 *     from out_mem[sram_idx] to outputs[dram_idx * OUT_MAT_AXI_RATIO], then
 *     advance sram_idx by x_size and dram_idx by x_stride.
 *  4. If `push_prev_dep` is set, write one token to `s2g_dep_queue`.
 *
 * \param outputs DRAM destination (m_axi, bundle data_port).
 * \param store_queue Stream of store instructions.
 * \param g2s_dep_queue Dependence tokens read by this stage.
 * \param s2g_dep_queue Dependence tokens written by this stage.
 * \param out_mem Output buffer (bram interface, RAM_1P); only read here.
 *
 * NOTE(review): this span reads like a rendered diff. The parameter list, the
 *  decode sequence and the copy loop each appear in two variants, and braces
 *  do not balance. The bit-range variant skips x/y padding in SRAM and sizes
 *  the copy with VTA_INP_ELEM_BYTES; the struct-based variant applies no
 *  padding offsets and uses VTA_OUT_ELEM_BYTES. Confirm against the repository
 *  which variant belongs in the real file.
 */
void store(
volatile out_vec_T *outputs,
volatile bus_T *outputs,
hls::stream<insn_T> &store_queue,
hls::stream<bool> &g2s_dep_queue,
hls::stream<bool> &s2g_dep_queue,
out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]
) {
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
#pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port
#pragma HLS INTERFACE axis port = store_queue
#pragma HLS INTERFACE axis port = g2s_dep_queue
#pragma HLS INTERFACE axis port = s2g_dep_queue
#pragma HLS INTERFACE bram port = out_mem
#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
#pragma HLS RESOURCE variable = out_mem core = RAM_1P
// Load buffer
insn_T insn = store_queue.read();
// Decode
bool pop_prev_dependence = insn[VTA_INSN_MEM_1];
bool pop_next_dependence = insn[VTA_INSN_MEM_2];
bool push_prev_dependence = insn[VTA_INSN_MEM_3];
bool push_next_dependence = insn[VTA_INSN_MEM_4];
memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0);
memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0);
memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0);
memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0);
memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0);
memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0);
memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0);
memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0);
memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0);
// Pop store instruction
insn_T raw_insn = store_queue.read();
// Cast to MemInsn
// The word is copied to a local first, then reinterpreted as VTAMemInsn.
insn_T raw_copy = raw_insn;
VTAMemInsn insn = *((VTAMemInsn *) &raw_copy);
// Pop dependence token if instructed
if (pop_prev_dependence) {
if (insn.pop_prev_dep) {
g2s_dep_queue.read();
}
// Initialize indices
memop_sram_T sram_idx = sram_base;
memop_dram_T dram_idx = dram_base;
// Skip padding along y dimension
memop_sram_T y_offset = (x_pad_0 + x_size + x_pad_1) * y_pad_0;
sram_idx += y_offset;
// Force this computation to be done with LUTs to avoid using too many DSPs
#pragma HLS RESOURCE variable = y_offset core = Mul_LUT
memop_sram_T sram_idx = insn.sram_base;
memop_dram_T dram_idx = insn.dram_base;
// Copy along y dimension
for (int y = 0; y < y_size; y++) {
#pragma HLS PIPELINE rewind
// Skip padding along x dimension
sram_idx += x_pad_0;
for (int y = 0; y < insn.y_size; y++) {
#pragma HLS PIPELINE
// Perform data transfer
// const_cast drops the `volatile` qualifier from `outputs` so the pointer can
// be handed to memcpy.
memcpy(
const_cast<out_vec_T*>(&outputs[dram_idx*VTA_BATCH]),
(const out_vec_T*) &out_mem[sram_idx][0],
x_size * VTA_INP_ELEM_BYTES);
sram_idx += x_size;
dram_idx += x_stride;
// Skip padding along x dimension
sram_idx += x_pad_1;
const_cast<bus_T*>(&outputs[dram_idx * OUT_MAT_AXI_RATIO]),
(const bus_T*) &out_mem[sram_idx][0],
insn.x_size * VTA_OUT_ELEM_BYTES);
#pragma HLS RESOURCE variable = sram_idx core = Mul_LUT
// SRAM rows are packed back to back (advance by x_size); DRAM rows are
// x_stride apart.
sram_idx += insn.x_size;
dram_idx += insn.x_stride;
}
// Push dependence token if instructed
if (push_prev_dependence) {
if (insn.push_prev_dep) {
s2g_dep_queue.write(1);
}
}
......@@ -571,10 +557,10 @@ void vta(
uint32_t insn_count,
volatile insn_T *insns,
volatile uop_T *uops,
volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights,
volatile acc_vec_T *biases,
volatile out_vec_T *outputs) {
volatile bus_T *inputs,
volatile bus_T *weights,
volatile bus_T *biases,
volatile bus_T *outputs) {
#pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
#pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
#pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
......@@ -606,14 +592,14 @@ void vta(
hls::stream<bool> s2g_dep_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=s2g_dep_queue)
hls::stream<bool> g2l_dep_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2s_dep_queue)
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2l_dep_queue)
hls::stream<bool> g2s_dep_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2s_dep_queue)
// Instantiate memories
inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH];
wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT];
out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH];
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO];
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO];
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO];
// Push all instructions into the queues
fetch(insn_count, insns, tmp_load_queue, tmp_gemm_queue, tmp_store_queue);
......@@ -642,9 +628,9 @@ void vta(
tmp_load_popped = true;
}
// Check dependences and invoke the load stage
bool pop_next_dependence = tmp_load[VTA_INSN_MEM_2];
if ((pop_next_dependence && !g2l_dep_queue.empty()) ||
!pop_next_dependence) {
VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_load);
if ((insn.pop_next_dep && !g2l_dep_queue.empty()) ||
!insn.pop_next_dep) {
// Push the instruction in the load queue
load_queue.write(tmp_load);
tmp_load_popped = false;
......@@ -662,16 +648,15 @@ void vta(
tmp_gemm_popped = true;
}
// Check dependences and invoke the compute stage
bool pop_prev_dependence = tmp_gemv[VTA_INSN_MEM_1];
bool pop_next_dependence = tmp_gemv[VTA_INSN_MEM_2];
VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv);
if (
(pop_prev_dependence && !l2g_dep_queue.empty() &&
pop_next_dependence && !s2g_dep_queue.empty()) ||
(!pop_prev_dependence && pop_next_dependence &&
(insn.pop_prev_dep && !l2g_dep_queue.empty() &&
insn.pop_next_dep && !s2g_dep_queue.empty()) ||
(!insn.pop_prev_dep && insn.pop_next_dep &&
!s2g_dep_queue.empty()) ||
(pop_prev_dependence && !l2g_dep_queue.empty() &&
!pop_next_dependence) ||
(!pop_prev_dependence && !pop_next_dependence)
(insn.pop_prev_dep && !l2g_dep_queue.empty() &&
!insn.pop_next_dep) ||
(!insn.pop_prev_dep && !insn.pop_next_dep)
) {
// Push the instruction in the gemm queue
gemm_queue.write(tmp_gemv);
......@@ -692,9 +677,10 @@ void vta(
tmp_store_popped = true;
}
// Check dependences and invoke the store stage
bool pop_prev_dependence = tmp_store[VTA_INSN_MEM_1];
if ((pop_prev_dependence && !g2s_dep_queue.empty()) ||
!pop_prev_dependence) {
VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_store);
if ((insn.pop_prev_dep && !g2s_dep_queue.empty()) ||
!insn.pop_prev_dep) {
// Push the instruction in the store queue
store_queue.write(tmp_store);
tmp_store_popped = false;
......@@ -716,10 +702,11 @@ void vta(
}
}
if (tmp_gemm_popped) {
if (l2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_1]) {
VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv);
if (l2g_dep_queue.empty() && insn.pop_prev_dep) {
printf("waiting on l2g\n");
}
if (s2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_2]) {
if (s2g_dep_queue.empty() && insn.pop_next_dep) {
printf("waiting on s2g\n");
}
}
......
......@@ -18,7 +18,6 @@
*/
/*!
* Copyright (c) 2018 by Contributors
* \file vta.h
* \brief Type definitions and prototype for VTA HLS design.
*/
......@@ -32,6 +31,16 @@
#include <vta/hw_spec.h>
/*!
* Define HLS stream depth
*/
#define PRAGMA_SUB(x) _Pragma (#x)
#define PRAGMA_HLS(x) PRAGMA_SUB(x)
#define STREAM_IN_DEPTH 8
/* \typedef bus_T memory bus datatype*/
typedef ap_uint<VTA_BUS_WIDTH> bus_T;
/* \typedef uop_T Micro-op datatype*/
typedef ap_uint<VTA_UOP_WIDTH> uop_T;
......@@ -53,18 +62,6 @@ typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype*/
typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+VTA_LOG_BLOCK_IN+1> sum_T;
/* \typedef inp_vec_T Input vector datatype*/
typedef ap_uint<VTA_INP_WIDTH*VTA_BLOCK_IN> inp_vec_T;
/* \typedef wgt_vec_T Weight vector datatype*/
typedef ap_uint<VTA_WGT_WIDTH*VTA_BLOCK_IN> wgt_vec_T;
/* \typedef acc_vec_T Accumulator vector datatype*/
typedef ap_uint<VTA_ACC_WIDTH*VTA_BLOCK_OUT> acc_vec_T;
/* \typedef out_vec_T Output vector datatype*/
typedef ap_uint<VTA_OUT_WIDTH*VTA_BLOCK_OUT> out_vec_T;
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<VTA_LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
......@@ -107,18 +104,14 @@ typedef ap_uint<VTA_MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_opcode_T ALU operation immediate datatype*/
/* \typedef aluop_imm_T ALU operation immediate datatype*/
typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/
typedef ap_int<VTA_LOG_ACC_WIDTH> aluop_sh_imm_T;
/* \typedef aluop_shr_arg_T ALU operation shift right immediate datatype*/
typedef ap_int<VTA_SHR_ARG_BIT_WIDTH> aluop_shr_arg_T;
/*!
* Define HLS stream depth
*/
#define PRAGMA_SUB(x) _Pragma (#x)
#define PRAGMA_HLS(x) PRAGMA_SUB(x)
#define STREAM_IN_DEPTH 8
/* \typedef aluop_mul_arg_T ALU operation multiply datatype*/
typedef ap_int<VTA_MUL_ARG_BIT_WIDTH> aluop_mul_arg_T;
/*!
* \brief Fetch module.
......@@ -153,13 +146,13 @@ void fetch(
* \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
*/
void load(
volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights,
volatile bus_T *inputs,
volatile bus_T *weights,
hls::stream<insn_T> &load_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &l2g_dep_queue,
inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]);
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]);
/*!
* \brief Compute module.
......@@ -187,15 +180,15 @@ void load(
void compute(
volatile uint32_t &done,
volatile uop_T *uops,
volatile acc_vec_T *biases,
volatile bus_T *biases,
hls::stream<insn_T> &gemm_queue,
hls::stream<bool> &l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue,
out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT],
out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]);
/*!
* \brief Store module.
......@@ -211,11 +204,11 @@ void compute(
* \param out_mem Local output SRAM buffer. Read only single port BRAM.
*/
void store(
volatile out_vec_T *outputs,
volatile bus_T *outputs,
hls::stream<insn_T> &store_queue,
hls::stream<bool> &g2s_dep_queue,
hls::stream<bool> &s2g_dep_queue,
out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]);
/*!
* \brief VTA wrapper for simulation purpose only.
......@@ -232,9 +225,9 @@ void vta(
uint32_t insn_count,
volatile insn_T *insns,
volatile uop_T *uops,
volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights,
volatile acc_vec_T *biases,
volatile out_vec_T *outputs);
volatile bus_T *inputs,
volatile bus_T *weights,
volatile bus_T *biases,
volatile bus_T *outputs);
#endif // VTA_VTA_H_
......@@ -136,19 +136,23 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size);
/*!
* \brief Flushes the region of memory out of the CPU cache to DRAM.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed.
* \param vir_addr Pointer to memory region allocated with VTAMemAlloc to be flushed.
* This needs to be the virtual address.
* \param phy_addr Pointer to memory region allocated with VTAMemAlloc to be flushed.
* This needs to be the physical address.
* \param size Size of the region to flush in Bytes.
*/
void VTAFlushCache(vta_phy_addr_t buf, int size);
void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size);
/*!
* \brief Invalidates the region of memory that is cached.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated.
* \param vir_addr Pointer to memory region allocated with VTAMemAlloc to be invalidated.
* This needs to be the virtual address.
* \param phy_addr Pointer to memory region allocated with VTAMemAlloc to be invalidated.
* This needs to be the physical address.
* \param size Size of the region to invalidate in Bytes.
*/
void VTAInvalidateCache(vta_phy_addr_t buf, int size);
void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size);
#ifdef __cplusplus
}
......
......@@ -18,7 +18,6 @@
*/
/*!
* Copyright (c) 2018 by Contributors
* \file hw_spec.h
* \brief Preprocessor definitions for VTA HLS design and runtime.
*/
......@@ -32,6 +31,9 @@ extern "C" {
#include <stdint.h>
/*! Memory bus width */
#define VTA_BUS_WIDTH (1 << VTA_LOG_BUS_WIDTH)
/*! log2 of instruction data type width */
#define VTA_LOG_INS_WIDTH 7
/*! Instruction data type width */
......@@ -48,10 +50,6 @@ extern "C" {
#define VTA_OUT_WIDTH (1 << VTA_LOG_OUT_WIDTH)
/*! Accumulator data type width */
#define VTA_ACC_WIDTH (1 << VTA_LOG_ACC_WIDTH)
/*! log2 of ALU data type width */
#define VTA_LOG_ALU_WIDTH (VTA_LOG_ACC_WIDTH - 1)
/*! ALU data type width */
#define VTA_ALU_WIDTH (1 << VTA_LOG_ALU_WIDTH)
/*! Batch size (corresponds to A in (A,B)x(B,C) mat mult)*/
#define VTA_BATCH (1 << VTA_LOG_BATCH)
......@@ -60,15 +58,6 @@ extern "C" {
/*! Blocking factor of the outer loop (corresponds to C in (A,B)x(B,C) mat mult) */
#define VTA_BLOCK_OUT (1 << VTA_LOG_BLOCK_OUT)
/*! Weight vector width */
#define VTA_WGT_VECTOR_WIDTH (VTA_WGT_WIDTH * VTA_BLOCK_IN)
/*! Input vector width */
#define VTA_INP_VECTOR_WIDTH (VTA_INP_WIDTH * VTA_BLOCK_IN)
/*! Accumulator vector width */
#define VTA_ACC_VECTOR_WIDTH (VTA_ACC_WIDTH * VTA_BLOCK_OUT)
/*! Output vector width */
#define VTA_OUT_VECTOR_WIDTH (VTA_OUT_WIDTH * VTA_BLOCK_OUT)
/*! On-chip micro-op buffer size in B */
#define VTA_UOP_BUFF_SIZE (1 << VTA_LOG_UOP_BUFF_SIZE)
/*! On-chip weight buffer size in B */
......@@ -78,16 +67,36 @@ extern "C" {
/*! On-chip accumulator buffer size in B */
#define VTA_ACC_BUFF_SIZE (1 << VTA_LOG_ACC_BUFF_SIZE)
/*! Input matrix size in bits */
#define VTA_INP_MATRIX_WIDTH (VTA_INP_WIDTH * VTA_BATCH * VTA_BLOCK_IN)
/*! Weight matrix size in bits */
#define VTA_WGT_MATRIX_WIDTH (VTA_WGT_WIDTH * VTA_BLOCK_OUT * VTA_BLOCK_IN)
/*! Accumulator matrix size in bits */
#define VTA_ACC_MATRIX_WIDTH (VTA_ACC_WIDTH * VTA_BATCH * VTA_BLOCK_OUT)
/*! Output matrix size in bits */
#define VTA_OUT_MATRIX_WIDTH (VTA_OUT_WIDTH * VTA_BATCH * VTA_BLOCK_OUT)
/*! Ratio between input matrix size and axi width */
#define INP_MAT_AXI_RATIO (VTA_INP_MATRIX_WIDTH / VTA_BUS_WIDTH)
/*! Ratio between weight matrix size and axi width */
#define WGT_MAT_AXI_RATIO (VTA_WGT_MATRIX_WIDTH / VTA_BUS_WIDTH)
/*! Ratio between accumulator matrix size and axi width */
#define ACC_MAT_AXI_RATIO (VTA_ACC_MATRIX_WIDTH / VTA_BUS_WIDTH)
/*! Ratio between output matrix size and axi width */
#define OUT_MAT_AXI_RATIO (VTA_OUT_MATRIX_WIDTH / VTA_BUS_WIDTH)
/*! Size of instruction buffer element in B */
#define VTA_INS_ELEM_BYTES (VTA_INS_WIDTH / 8)
/*! Size of uop buffer element in B*/
#define VTA_UOP_ELEM_BYTES (VTA_UOP_WIDTH / 8)
/*! Size of activation buffer element in B*/
#define VTA_INP_ELEM_BYTES (VTA_BATCH * VTA_BLOCK_IN * VTA_INP_WIDTH / 8)
#define VTA_INP_ELEM_BYTES (VTA_INP_MATRIX_WIDTH / 8)
/*! Size of weight buffer element in B*/
#define VTA_WGT_ELEM_BYTES (VTA_BLOCK_OUT * VTA_BLOCK_IN * VTA_WGT_WIDTH / 8)
#define VTA_WGT_ELEM_BYTES (VTA_WGT_MATRIX_WIDTH / 8)
/*! Size of accumulator buffer element in B*/
#define VTA_ACC_ELEM_BYTES (VTA_BATCH * VTA_BLOCK_OUT * VTA_ACC_WIDTH / 8)
#define VTA_ACC_ELEM_BYTES (VTA_ACC_MATRIX_WIDTH / 8)
/*! Size of output buffer element in B*/
#define VTA_OUT_ELEM_BYTES (VTA_OUT_MATRIX_WIDTH / 8)
/*! On-chip micro-op buffer depth */
#define VTA_UOP_BUFF_DEPTH (VTA_UOP_BUFF_SIZE / VTA_UOP_ELEM_BYTES)
......@@ -148,10 +157,14 @@ extern "C" {
#define VTA_MEMOP_PAD_BIT_WIDTH 4
/*! Load/Store Instruction: padding value encoding width*/
#define VTA_MEMOP_PAD_VAL_BIT_WIDTH 2
/*! ALU Instruction: immediate bitwidth*/
#define VTA_ALUOP_IMM_BIT_WIDTH 16
/*! GEMM/ALU Instruction: loop max iter bits */
#define VTA_LOOP_ITER_WIDTH 14
/*! ALU Instruction: immediate bitwidth*/
#define VTA_ALUOP_IMM_BIT_WIDTH 16
/*! ALU Instruction: shift arg bitwidth*/
#define VTA_SHR_ARG_BIT_WIDTH (VTA_LOG_ACC_WIDTH)
/*! ALU Instruction: multiply arg bitwidth*/
#define VTA_MUL_ARG_BIT_WIDTH 8
/*! Mem ID constant: uop memory */
#define VTA_MEM_ID_UOP 0
......@@ -164,186 +177,6 @@ extern "C" {
/*! Mem ID constant: output store buffer */
#define VTA_MEM_ID_OUT 4
// Instruction organization layout:
//
// LOAD/STORE
// _____________________________|_type______________|
// arg 0: opcode | opcode_T |
// arg 1: pop_prev_dependence | bool |
// arg 2: pop_next_dependence | bool |
// arg 3: push_prev_dependence | bool |
// arg 4: push_next_dependence | bool |
// arg 5: memory_type | memop_id_T |
// arg 6: pad_value | memop_pad_val_T |
// arg 7: sram_base | memop_sram_T |
// arg 8: dram_base | memop_dram_T |
// arg 9: y_size | memop_size_T |
// arg a: x_size | memop_size_T |
// arg b: x_stride | memop_stride_T |
// arg c: y_pad_0 | memop_pad_T |
// arg d: y_pad_1 | memop_pad_T |
// arg e: x_pad_0 | memop_pad_T |
// arg f: x_pad_1 | memop_pad_T |
//
// GEMM
// _____________________________|_type______________|
// arg 0: opcode | opcode_T |
// arg 1: pop_prev_dependence | bool |
// arg 2: pop_next_dependence | bool |
// arg 3: push_prev_dependence | bool |
// arg 4: push_next_dependence | bool |
// arg 5: reset_reg | bool |
// arg 6: uop_bgn | uop_idx_T |
// arg 7: uop_end | uop_idx_T |
// arg 8: iteration count ax0 | loop_T |
// arg 9: iteration count ax1 | loop_T |
// arg a: accum idx factor ax0 | acc_idx_T |
// arg b: accum idx factor ax1 | acc_idx_T |
// arg c: input idx factor ax0 | inp_idx_T |
// arg d: input idx factor ax1 | inp_idx_T |
// arg e: weight idx factor ax0 | wgt_idx_T |
// arg f: weight idx factor ax1 | wgt_idx_T |
//
// ALU
// _____________________________|_type______________|
// arg 0: opcode | opcode_T |
// arg 1: pop_prev_dependence | bool |
// arg 2: pop_next_dependence | bool |
// arg 3: push_prev_dependence | bool |
// arg 4: push_next_dependence | bool |
// arg 5: reset_reg | bool |
// arg 6: uop_bgn | uop_idx_T |
// arg 7: uop_end | uop_idx_T |
// arg 8: iteration count ax0 | loop_T |
// arg 9: iteration count ax1 | loop_T |
// arg a: dst idx factor ax0 | acc_idx_T |
// arg b: dst idx factor ax1 | acc_idx_T |
// arg c: src idx factor ax0 | inp_idx_T |
// arg d: src idx factor ax1 | inp_idx_T |
// arg e: alu_opcode | aluop_opcode_T |
// arg f: use_imm | bool |
// arg g: imm | aluop_imm_T |
/*! Load/Store instruction start position of the opcode field */
#define VTA_INSN_MEM_0_0 0
/*! Load/Store instruction end position of the opcode field */
#define VTA_INSN_MEM_0_1 (VTA_INSN_MEM_0_0 + VTA_OPCODE_BIT_WIDTH - 1)
/*! Load/Store instruction position of the pop_prev_dep field */
#define VTA_INSN_MEM_1 (VTA_INSN_MEM_0_1 + 1)
/*! Load/Store instruction position of the pop_next_dep field */
#define VTA_INSN_MEM_2 (VTA_INSN_MEM_1 + 1)
/*! Load/Store instruction position of the push_prev_dependence field */
#define VTA_INSN_MEM_3 (VTA_INSN_MEM_2 + 1)
/*! Load/Store instruction position of the push_next_dependence field */
#define VTA_INSN_MEM_4 (VTA_INSN_MEM_3 + 1)
/*! Load/Store instruction start position of the memory_type field */
#define VTA_INSN_MEM_5_0 (VTA_INSN_MEM_4 + 1)
/*! Load/Store instruction end position of the memory_type field */
#define VTA_INSN_MEM_5_1 (VTA_INSN_MEM_5_0 + VTA_MEMOP_ID_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the sram_base field */
#define VTA_INSN_MEM_6_0 (VTA_INSN_MEM_5_1 + 1)
/*! Load/Store instruction end position of the sram_base field */
#define VTA_INSN_MEM_6_1 (VTA_INSN_MEM_6_0 + VTA_MEMOP_SRAM_ADDR_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the dram_base field */
#define VTA_INSN_MEM_7_0 (VTA_INSN_MEM_6_1 + 1)
/*! Load/Store instruction end position of the dram_base field */
#define VTA_INSN_MEM_7_1 (VTA_INSN_MEM_7_0 + VTA_MEMOP_DRAM_ADDR_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the y_size field */
#define VTA_INSN_MEM_8_0 64
/*! Load/Store instruction end position of the y_size field */
#define VTA_INSN_MEM_8_1 (VTA_INSN_MEM_8_0 + VTA_MEMOP_SIZE_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the x_size field */
#define VTA_INSN_MEM_9_0 (VTA_INSN_MEM_8_1 + 1)
/*! Load/Store instruction end position of the x_size field */
#define VTA_INSN_MEM_9_1 (VTA_INSN_MEM_9_0 + VTA_MEMOP_SIZE_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the x_stride field */
#define VTA_INSN_MEM_A_0 (VTA_INSN_MEM_9_1 + 1)
/*! Load/Store instruction end position of the x_stride field */
#define VTA_INSN_MEM_A_1 (VTA_INSN_MEM_A_0 + VTA_MEMOP_STRIDE_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the y_pad_0 field */
#define VTA_INSN_MEM_B_0 (VTA_INSN_MEM_A_1 + 1)
/*! Load/Store instruction end position of the y_pad_0 field */
#define VTA_INSN_MEM_B_1 (VTA_INSN_MEM_B_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the y_pad_1 field */
#define VTA_INSN_MEM_C_0 (VTA_INSN_MEM_B_1 + 1)
/*! Load/Store instruction end position of the y_pad_1 field */
#define VTA_INSN_MEM_C_1 (VTA_INSN_MEM_C_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the x_pad_0 field */
#define VTA_INSN_MEM_D_0 (VTA_INSN_MEM_C_1 + 1)
/*! Load/Store instruction end position of the x_pad_0 field */
#define VTA_INSN_MEM_D_1 (VTA_INSN_MEM_D_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the x_pad_1 field */
#define VTA_INSN_MEM_E_0 (VTA_INSN_MEM_D_1 + 1)
/*! Load/Store instruction end position of the x_pad_1 field */
#define VTA_INSN_MEM_E_1 (VTA_INSN_MEM_E_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
/*! GEMM instruction start position of the opcode field */
#define VTA_INSN_GEM_0_0 0
/*! GEMM instruction end position of the opcode field */
#define VTA_INSN_GEM_0_1 (VTA_INSN_GEM_0_0 + VTA_OPCODE_BIT_WIDTH - 1)
/*! GEMM instruction position of the pop_prev_dep field */
#define VTA_INSN_GEM_1 (VTA_INSN_GEM_0_1 + 1)
/*! GEMM instruction position of the pop_next_dep field */
#define VTA_INSN_GEM_2 (VTA_INSN_GEM_1 + 1)
/*! GEMM instruction position of the push_prev_dependence field */
#define VTA_INSN_GEM_3 (VTA_INSN_GEM_2 + 1)
/*! GEMM instruction position of the push_next_dependence field */
#define VTA_INSN_GEM_4 (VTA_INSN_GEM_3 + 1)
/*! GEMM instruction position of the reset register bit */
#define VTA_INSN_GEM_5 (VTA_INSN_GEM_4 + 1)
/*! GEMM instruction start position of the uop_bgn field */
#define VTA_INSN_GEM_6_0 (VTA_INSN_GEM_5 + 1)
/*! GEMM instruction end position of the uop_bgn field */
#define VTA_INSN_GEM_6_1 (VTA_INSN_GEM_6_0 + VTA_LOG_UOP_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the uop_end field */
#define VTA_INSN_GEM_7_0 (VTA_INSN_GEM_6_1 + 1)
/*! GEMM instruction end position of the uop_end field */
#define VTA_INSN_GEM_7_1 (VTA_INSN_GEM_7_0 + VTA_LOG_UOP_BUFF_DEPTH + 1 - 1)
/*! GEMM instruction start position of the iter_out field */
#define VTA_INSN_GEM_8_0 (VTA_INSN_GEM_7_1 + 1)
/*! GEMM instruction end position of the iter_out field */
#define VTA_INSN_GEM_8_1 (VTA_INSN_GEM_8_0 + VTA_LOOP_ITER_WIDTH - 1)
/*! GEMM instruction start position of the iter_in field */
#define VTA_INSN_GEM_9_0 (VTA_INSN_GEM_8_1 + 1)
/*! GEMM instruction end position of the iter_in field */
#define VTA_INSN_GEM_9_1 (VTA_INSN_GEM_9_0 + VTA_LOOP_ITER_WIDTH - 1)
/*! GEMM instruction start position of the dst_factor_out field */
#define VTA_INSN_GEM_A_0 64
/*! GEMM instruction end position of the dst_factor_out field */
#define VTA_INSN_GEM_A_1 (VTA_INSN_GEM_A_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the dst_factor_in field */
#define VTA_INSN_GEM_B_0 (VTA_INSN_GEM_A_1 + 1)
/*! GEMM instruction end position of the dst_factor_in field */
#define VTA_INSN_GEM_B_1 (VTA_INSN_GEM_B_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the src_factor_out field */
#define VTA_INSN_GEM_C_0 (VTA_INSN_GEM_B_1 + 1)
/*! GEMM instruction end position of the src_factor_out field */
#define VTA_INSN_GEM_C_1 (VTA_INSN_GEM_C_0 + VTA_LOG_INP_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the src_factor_in field */
#define VTA_INSN_GEM_D_0 (VTA_INSN_GEM_C_1 + 1)
/*! GEMM instruction end position of the src_factor_in field */
#define VTA_INSN_GEM_D_1 (VTA_INSN_GEM_D_0 + VTA_LOG_INP_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the wgt_factor_out field */
#define VTA_INSN_GEM_E_0 (VTA_INSN_GEM_D_1 + 1)
/*! GEMM instruction end position of the wgt_factor_out field */
#define VTA_INSN_GEM_E_1 (VTA_INSN_GEM_E_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the wgt_factor_in field */
#define VTA_INSN_GEM_F_0 (VTA_INSN_GEM_E_1 + 1)
/*! GEMM instruction end position of the wgt_factor_in field */
#define VTA_INSN_GEM_F_1 (VTA_INSN_GEM_F_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)
/*! ALU instruction start position of the alu_opcode field */
#define VTA_INSN_ALU_E_0 (VTA_INSN_GEM_D_1 + 1)
/*! ALU instruction end position of the alu_opcode field */
#define VTA_INSN_ALU_E_1 (VTA_INSN_ALU_E_0 + VTA_ALU_OPCODE_BIT_WIDTH - 1)
/*! ALU instruction position of the use_imm field */
#define VTA_INSN_ALU_F (VTA_INSN_ALU_E_1 + 1)
/*! ALU instruction start position of the immediate field */
#define VTA_INSN_ALU_G_0 (VTA_INSN_ALU_F + 1)
/*! ALU instruction end position of the immediate field */
#define VTA_INSN_ALU_G_1 (VTA_INSN_ALU_G_0 + VTA_ALUOP_IMM_BIT_WIDTH - 1)
/*! GEMM Micro-op start position of the acc_idx field */
#define VTA_UOP_GEM_0_0 0
/*! GEMM Micro-op end position of the acc_idx field */
......@@ -368,8 +201,20 @@ extern "C" {
/*! \brief VTA generic instruction */
typedef struct {
uint64_t word_0 : 64;
uint64_t word_1 : 64;
/*! \brief The instruction opcode */
uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
/*! \brief Unused in this instruction */
uint64_t pop_prev_dep : 1;
/*! \brief Pop dependence token from GEMM stage */
uint64_t pop_next_dep : 1;
/*! \brief Unused in this instruction */
uint64_t push_prev_dep : 1;
/*! \brief Push dependence token to GEMM stage */
uint64_t push_next_dep : 1;
/*! \brief Padding */
uint64_t pad_0 : 64 - VTA_OPCODE_BIT_WIDTH - 4;
/*! \brief Padding */
uint64_t pad_1 : 64;
} VTAGenericInsn;
/*! \brief VTA load/store instruction
......
......@@ -45,10 +45,11 @@ def get_bitstream_path():
# Derive destination path
cache_dir = os.getenv("VTA_CACHE_PATH", os.path.join(os.getenv("HOME"), ".vta_cache/"))
cache_dir = os.path.join(cache_dir, env.TARGET)
cache_dir = os.path.join(cache_dir, env.HW_VER.replace('.', '_'))
# Create the directory if it didn't exist
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
bit_path = os.path.join(cache_dir, env.BITSTREAM)
bit_path = os.path.join(cache_dir, env.BITSTREAM) + ".bit"
return bit_path
......@@ -63,7 +64,7 @@ def download_bitstream():
bit = get_bitstream_path()
url = os.path.join(BITSTREAM_URL, env.TARGET)
url = os.path.join(url, env.HW_VER)
url = os.path.join(url, env.BITSTREAM)
url = os.path.join(url, env.BITSTREAM + ".bit")
try:
download(url, bit)
......
......@@ -113,15 +113,9 @@ class Environment(object):
# initialization function
def __init__(self, cfg):
self.__dict__.update(cfg)
for key in PkgConfig.cfg_keys:
if key not in cfg:
raise ValueError("Expect key %s in cfg" % key)
# derive output buffer size
self.LOG_OUT_BUFF_SIZE = (
self.LOG_ACC_BUFF_SIZE +
self.LOG_OUT_WIDTH -
self.LOG_ACC_WIDTH)
# Produce the derived parameters and update dict
self.pkg = self.pkg_config(cfg)
self.__dict__.update(self.pkg.cfg_dict)
# data type width
self.INP_WIDTH = 1 << self.LOG_INP_WIDTH
self.WGT_WIDTH = 1 << self.LOG_WGT_WIDTH
......@@ -154,25 +148,15 @@ class Environment(object):
self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8
self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8
self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8
# Configuration bitstream name
self.BITSTREAM = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}.bit".format(
(1 << cfg["LOG_BATCH"]),
(1 << cfg["LOG_BLOCK_IN"]),
(1 << cfg["LOG_BLOCK_OUT"]),
(1 << cfg["LOG_INP_WIDTH"]),
(1 << cfg["LOG_WGT_WIDTH"]),
cfg["LOG_UOP_BUFF_SIZE"],
cfg["LOG_INP_BUFF_SIZE"],
cfg["LOG_WGT_BUFF_SIZE"],
cfg["LOG_ACC_BUFF_SIZE"],
cfg["HW_FREQ"],
cfg["HW_CLK_TARGET"],
cfg["HW_VER"].replace('.', '_'))
# dtypes
self.acc_dtype = "int%d" % self.ACC_WIDTH
self.inp_dtype = "int%d" % self.INP_WIDTH
self.wgt_dtype = "int%d" % self.WGT_WIDTH
self.out_dtype = "int%d" % self.OUT_WIDTH
# bitstream name
self.BITSTREAM = self.pkg.bitstream
# model string
self.MODEL = self.TARGET + "_" + self.BITSTREAM
# lazy cached members
self.mock_mode = False
self._mock_env = None
......@@ -187,11 +171,15 @@ class Environment(object):
def __exit__(self, ptype, value, trace):
Environment.current = self._last_env
def pkg_config(self):
def pkg_config(self, cfg):
"""PkgConfig instance"""
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
proj_root = os.path.abspath(os.path.join(curr_path, "../../"))
return PkgConfig(self.__dict__, proj_root)
return PkgConfig(cfg, proj_root)
@property
def cfg_dict(self):
return self.pkg.cfg_dict
@property
def dev(self):
......@@ -236,13 +224,15 @@ class Environment(object):
@property
def target(self):
return tvm.target.vta(model=self.TARGET)
return tvm.target.vta(model=self.MODEL)
@property
def target_host(self):
"""The target host"""
if self.TARGET == "pynq":
return "llvm -target=armv7-none-linux-gnueabihf"
if self.TARGET == "ultra96":
return "llvm -target=aarch64-linux-gnu"
if self.TARGET == "sim" or self.TARGET == "tsim":
return "llvm"
raise ValueError("Unknown target %s" % self.TARGET)
......@@ -316,21 +306,18 @@ def coproc_dep_pop(op):
def _init_env():
"""Iniitalize the default global env"""
"""Initialize the default global env"""
curr_path = os.path.dirname(
os.path.abspath(os.path.expanduser(__file__)))
proj_root = os.path.abspath(os.path.join(curr_path, "../../../"))
path_list = [
os.path.join(curr_path, "vta_config.json"),
os.path.join(proj_root, "build", "vta_config.json"),
os.path.join(proj_root, "vta_config.json"),
os.path.join(proj_root, "vta/config/vta_config.json")
]
path_list = [p for p in path_list if os.path.exists(p)]
if not path_list:
raise RuntimeError(
"Error: {} not found.make sure you have config.json in your vta root"
.format(filename))
return Environment(json.load(open(path_list[0])))
"Error: vta_config.json not found.")
cfg = json.load(open(path_list[0]))
return Environment(cfg)
Environment.current = _init_env()
......@@ -38,49 +38,209 @@ class PkgConfig(object):
"""
cfg_keys = [
"TARGET",
"HW_FREQ",
"HW_CLK_TARGET",
"HW_VER",
"LOG_INP_WIDTH",
"LOG_WGT_WIDTH",
"LOG_ACC_WIDTH",
"LOG_OUT_WIDTH",
"LOG_BATCH",
"LOG_BLOCK_IN",
"LOG_BLOCK_OUT",
"LOG_BLOCK",
"LOG_UOP_BUFF_SIZE",
"LOG_INP_BUFF_SIZE",
"LOG_WGT_BUFF_SIZE",
"LOG_ACC_BUFF_SIZE",
]
def __init__(self, cfg, proj_root):
# include path
# Derived parameters
cfg["LOG_BLOCK_IN"] = cfg["LOG_BLOCK"]
cfg["LOG_BLOCK_OUT"] = cfg["LOG_BLOCK"]
cfg["LOG_OUT_WIDTH"] = cfg["LOG_INP_WIDTH"]
cfg["LOG_OUT_BUFF_SIZE"] = (
cfg["LOG_ACC_BUFF_SIZE"] +
cfg["LOG_OUT_WIDTH"] -
cfg["LOG_ACC_WIDTH"])
# Update cfg now that we've extended it
self.__dict__.update(cfg)
# Include path
self.include_path = [
"-I%s/include" % proj_root,
"-I%s/vta/include" % proj_root,
"-I%s/3rdparty/dlpack/include" % proj_root,
"-I%s/3rdparty/dmlc-core/include" % proj_root
]
# List of source files that can be used to build standalone library.
self.lib_source = []
self.lib_source += glob.glob("%s/vta/src/*.cc" % proj_root)
self.lib_source += glob.glob("%s/vta/src/%s/*.cc" % (proj_root, cfg["TARGET"]))
# macro keys
self.macro_defs = []
self.cfg_dict = {}
for key in self.cfg_keys:
self.macro_defs.append("-DVTA_%s=%s" % (key, str(cfg[key])))
self.cfg_dict[key] = cfg[key]
if self.TARGET in ["pynq", "ultra96"]:
# add pynq drivers for any board that uses pynq driver stack (see pynq.io)
self.lib_source += glob.glob("%s/vta/src/pynq/*.cc" % (proj_root))
self.target = cfg["TARGET"]
if self.target == "pynq":
# Linker flags
if self.TARGET in ["pynq", "ultra96"]:
self.ldflags = [
"-L/usr/lib",
"-l:libcma.so"]
else:
self.ldflags = []
# Derive bitstream config string.
self.bitstream = "{}x{}_i{}w{}a{}_{}_{}_{}_{}".format(
(1 << cfg["LOG_BATCH"]),
(1 << cfg["LOG_BLOCK"]),
(1 << cfg["LOG_INP_WIDTH"]),
(1 << cfg["LOG_WGT_WIDTH"]),
(1 << cfg["LOG_ACC_WIDTH"]),
cfg["LOG_UOP_BUFF_SIZE"],
cfg["LOG_INP_BUFF_SIZE"],
cfg["LOG_WGT_BUFF_SIZE"],
cfg["LOG_ACC_BUFF_SIZE"])
# Derive FPGA parameters from target
# - device: part number
# - family: fpga family
# - freq: PLL frequency
# - per: clock period to achieve in HLS
# (how aggressively design is pipelined)
# - axi_bus_width: axi bus width used for DMA transactions
# (property of FPGA memory interface)
# - axi_cache_bits: ARCACHE/AWCACHE signals for the AXI bus
# (e.g. 1111 is write-back read and write allocate)
# - axi_prot_bits: ARPROT/AWPROT signals for the AXI bus
if self.TARGET == "ultra96":
self.fpga_device = "xczu3eg-sbva484-1-e"
self.fpga_family = "zynq-ultrascale+"
self.fpga_freq = 333
self.fpga_per = 2
self.fpga_log_axi_bus_width = 7
self.axi_prot_bits = '010'
# IP register address map
self.ip_reg_map_range = "0x1000"
self.fetch_base_addr = "0xA0000000"
self.load_base_addr = "0xA0001000"
self.compute_base_addr = "0xA0002000"
self.store_base_addr = "0xA0003000"
else:
# By default, we use the pynq parameters
self.fpga_device = "xc7z020clg484-1"
self.fpga_family = "zynq-7000"
self.fpga_freq = 100
self.fpga_per = 7
self.fpga_log_axi_bus_width = 6
self.axi_prot_bits = '000'
# IP register address map
self.ip_reg_map_range = "0x1000"
self.fetch_base_addr = "0x43C00000"
self.load_base_addr = "0x43C01000"
self.compute_base_addr = "0x43C02000"
self.store_base_addr = "0x43C03000"
# Set coherence settings
coherent = True
if coherent:
self.axi_cache_bits = '1111'
self.coherent = True
# Define IP memory mapped registers offsets.
# In HLS 0x00-0x0C is reserved for block-level I/O protocol.
# Make sure to leave 8B between register offsets to maintain
# compatibility with 64bit systems.
self.fetch_insn_count_offset = 0x10
self.fetch_insn_addr_offset = self.fetch_insn_count_offset + 0x08
self.load_inp_addr_offset = 0x10
self.load_wgt_addr_offset = self.load_inp_addr_offset + 0x08
self.compute_done_wr_offet = 0x10
self.compute_done_rd_offet = self.compute_done_wr_offet + 0x08
self.compute_uop_addr_offset = self.compute_done_rd_offet + 0x08
self.compute_bias_addr_offset = self.compute_uop_addr_offset + 0x08
self.store_out_addr_offset = 0x10
# Derive SRAM parameters
# The goal here is to determine how many memory banks are needed,
# how deep and wide each bank needs to be. This is derived from
# the size of each memory element (result of data width, and tensor shape),
# and also how wide a memory can be as permitted by the FPGA tools.
#
# The mem axi ratio is a parameter used by HLS to resize memories
# so memory read/write ports are the same size as the design axi bus width.
#
# Max bus width allowed (property of FPGA vendor toolchain)
max_bus_width = 1024
# Bus width of a memory interface
mem_bus_width = 1 << self.fpga_log_axi_bus_width
# Input memory
inp_mem_bus_width = 1 << (cfg["LOG_INP_WIDTH"] + \
cfg["LOG_BATCH"] + \
cfg["LOG_BLOCK_IN"])
self.inp_mem_size = 1 << cfg["LOG_INP_BUFF_SIZE"] # bytes
self.inp_mem_banks = (inp_mem_bus_width + \
max_bus_width - 1) // \
max_bus_width
self.inp_mem_width = min(inp_mem_bus_width, max_bus_width)
self.inp_mem_depth = self.inp_mem_size * 8 // inp_mem_bus_width
self.inp_mem_axi_ratio = self.inp_mem_width // mem_bus_width
# Weight memory
wgt_mem_bus_width = 1 << (cfg["LOG_WGT_WIDTH"] + \
cfg["LOG_BLOCK_IN"] + \
cfg["LOG_BLOCK_OUT"])
self.wgt_mem_size = 1 << cfg["LOG_WGT_BUFF_SIZE"] # bytes
self.wgt_mem_banks = (wgt_mem_bus_width + \
max_bus_width - 1) // \
max_bus_width
self.wgt_mem_width = min(wgt_mem_bus_width, max_bus_width)
self.wgt_mem_depth = self.wgt_mem_size * 8 // wgt_mem_bus_width
self.wgt_mem_axi_ratio = self.wgt_mem_width // mem_bus_width
# Output memory
out_mem_bus_width = 1 << (cfg["LOG_OUT_WIDTH"] + \
cfg["LOG_BATCH"] + \
cfg["LOG_BLOCK_OUT"])
self.out_mem_size = 1 << cfg["LOG_OUT_BUFF_SIZE"] # bytes
self.out_mem_banks = (out_mem_bus_width + \
max_bus_width - 1) // \
max_bus_width
self.out_mem_width = min(out_mem_bus_width, max_bus_width)
self.out_mem_depth = self.out_mem_size * 8 // out_mem_bus_width
self.out_mem_axi_ratio = self.out_mem_width // mem_bus_width
# Macro defs
self.macro_defs = []
self.cfg_dict = {}
for key in cfg:
self.macro_defs.append("-DVTA_%s=%s" % (key, str(cfg[key])))
self.cfg_dict[key] = cfg[key]
self.macro_defs.append("-DVTA_LOG_BUS_WIDTH=%s" % (self.fpga_log_axi_bus_width))
# Macros used by the VTA driver
self.macro_defs.append("-DVTA_IP_REG_MAP_RANGE=%s" % (self.ip_reg_map_range))
self.macro_defs.append("-DVTA_FETCH_ADDR=%s" % (self.fetch_base_addr))
self.macro_defs.append("-DVTA_LOAD_ADDR=%s" % (self.load_base_addr))
self.macro_defs.append("-DVTA_COMPUTE_ADDR=%s" % (self.compute_base_addr))
self.macro_defs.append("-DVTA_STORE_ADDR=%s" % (self.store_base_addr))
# IP register offsets
self.macro_defs.append("-DVTA_FETCH_INSN_COUNT_OFFSET=%s" % \
(self.fetch_insn_count_offset))
self.macro_defs.append("-DVTA_FETCH_INSN_ADDR_OFFSET=%s" % \
(self.fetch_insn_addr_offset))
self.macro_defs.append("-DVTA_LOAD_INP_ADDR_OFFSET=%s" % \
(self.load_inp_addr_offset))
self.macro_defs.append("-DVTA_LOAD_WGT_ADDR_OFFSET=%s" % \
(self.load_wgt_addr_offset))
self.macro_defs.append("-DVTA_COMPUTE_DONE_WR_OFFSET=%s" % \
(self.compute_done_wr_offet))
self.macro_defs.append("-DVTA_COMPUTE_DONE_RD_OFFSET=%s" % \
(self.compute_done_rd_offet))
self.macro_defs.append("-DVTA_COMPUTE_UOP_ADDR_OFFSET=%s" % \
(self.compute_uop_addr_offset))
self.macro_defs.append("-DVTA_COMPUTE_BIAS_ADDR_OFFSET=%s" % \
(self.compute_bias_addr_offset))
self.macro_defs.append("-DVTA_STORE_OUT_ADDR_OFFSET=%s" % \
(self.store_out_addr_offset))
# Coherency
if coherent:
self.macro_defs.append("-DVTA_COHERENT_ACCESSES=true")
else:
self.macro_defs.append("-DVTA_COHERENT_ACCESSES=false")
@property
def cflags(self):
return self.include_path + self.macro_defs
......
......@@ -48,9 +48,12 @@ def pynq_bitstream_program(bitstream_path):
bitstream.download()
def bitstream_program(target, bitstream):
if target == 'pynq':
if target in ['pynq', 'ultra96']:
pynq_bitstream_program(bitstream)
elif target != 'sim':
elif target in ['sim', 'tsim']:
# In simulation, bit stream programming is a no-op
return
else:
raise RuntimeError("Unknown target {}".format(target))
if __name__ == "__main__":
......
......@@ -30,7 +30,7 @@ def reconfig_runtime(remote):
"""
env = get_env()
freconfig = remote.get_function("tvm.contrib.vta.reconfig_runtime")
freconfig(env.pkg_config().cfg_json)
freconfig(env.pkg.cfg_json)
def program_fpga(remote, bitstream=None):
......
......@@ -33,7 +33,6 @@ def run(run_func):
env = get_env()
if env.TARGET in ["sim", "tsim"]:
# Talk to local RPC if necessary to debug RPC server.
# Compile vta on your host with make at the root.
# Make sure TARGET is set to "sim" in the config.json file.
......@@ -53,21 +52,20 @@ def run(run_func):
assert simulator.enabled()
run_func(env, rpc.LocalSession())
elif env.TARGET == "pynq":
elif env.TARGET in ["pynq", "ultra96"]:
# The environment variables below should be set if we are using
# a tracker to obtain a remote for a test device
tracket_host = os.environ.get("TVM_TRACKER_HOST", None)
tracket_port = os.environ.get("TVM_TRACKER_PORT", None)
tracker_host = os.environ.get("TVM_TRACKER_HOST", None)
tracker_port = os.environ.get("TVM_TRACKER_PORT", None)
# Otherwise, we can set the variables below to directly
# obtain a remote from a test device
pynq_host = os.environ.get("VTA_PYNQ_RPC_HOST", None)
pynq_port = os.environ.get("VTA_PYNQ_RPC_PORT", None)
# Run device from fleet node if env variables are defined
if tracket_host and tracket_port:
if tracker_host and tracker_port:
remote = autotvm.measure.request_remote(env.TARGET,
tracket_host,
int(tracket_port),
tracker_host,
int(tracker_port),
timeout=10000)
run_func(env, remote)
else:
......@@ -78,3 +76,6 @@ def run(run_func):
else:
raise RuntimeError(
"Please set the VTA_PYNQ_RPC_HOST and VTA_PYNQ_RPC_PORT environment variables")
else:
raise RuntimeError("Unknown target %s" % env.TARGET)
......@@ -15,12 +15,9 @@
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2018 by Contributors
*
* \file pynq_driver.c
* \brief VTA driver for Pynq board.
* \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io).
*/
#include <vta/driver.h>
......@@ -53,19 +50,19 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
void VTAFlushCache(vta_phy_addr_t buf, int size) {
// Call the xlnkFlushCache on the CMA buffer
void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
// Call the cma_flush_cache on the CMA buffer
// so that the FPGA can read the buffer data.
xlnkFlushCache(reinterpret_cast<void*>(buf), size);
cma_flush_cache(vir_addr, phy_addr, size);
}
void VTAInvalidateCache(vta_phy_addr_t buf, int size) {
// Call the xlnkInvalidateCache on the CMA buffer
void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
// Call the cma_invalidate_cache on the CMA buffer
// so that the host re-reads the buffer data from memory instead of stale cache lines.
xlnkInvalidateCache(reinterpret_cast<void*>(buf), size);
cma_invalidate_cache(vir_addr, phy_addr, size);
}
void *VTAMapRegister(uint32_t addr, size_t length) {
void *VTAMapRegister(uint32_t addr) {
// Align the base address with the pages
uint32_t virt_base = addr & ~(getpagesize() - 1);
// Calculate base address offset w.r.t the base address
......@@ -73,16 +70,16 @@ void *VTAMapRegister(uint32_t addr, size_t length) {
// Open file and mmap
uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC);
return mmap(NULL,
(length+virt_offset),
(VTA_IP_REG_MAP_RANGE + virt_offset),
PROT_READ|PROT_WRITE,
MAP_SHARED,
mmap_file,
virt_base);
}
void VTAUnmapRegister(void *vta, size_t length) {
void VTAUnmapRegister(void *vta) {
// Unmap memory
int status = munmap(vta, length);
int status = munmap(vta, VTA_IP_REG_MAP_RANGE);
assert(status == 0);
}
......@@ -98,39 +95,30 @@ class VTADevice {
public:
VTADevice() {
// VTA stage handles
vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR);
vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR);
vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR);
vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR);
}
~VTADevice() {
// Close VTA stage handle
VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE);
VTAUnmapRegister(vta_load_handle_, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle_, VTA_RANGE);
VTAUnmapRegister(vta_store_handle_, VTA_RANGE);
VTAUnmapRegister(vta_fetch_handle_);
VTAUnmapRegister(vta_load_handle_);
VTAUnmapRegister(vta_compute_handle_);
VTAUnmapRegister(vta_store_handle_);
}
int Run(vta_phy_addr_t insn_phy_addr,
uint32_t insn_count,
uint32_t wait_cycles) {
// NOTE: Register address map is derived from the auto-generated
// driver files available under hardware/build/vivado/<design>/export/driver
// FETCH @ 0x10 : Data signal of insn_count_V
VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_count);
// FETCH @ 0x18 : Data signal of insns_V
VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_phy_addr);
// LOAD @ 0x10 : Data signal of inputs_V
VTAWriteMappedReg(vta_load_handle_, 0x10, 0);
// LOAD @ 0x18 : Data signal of weight_V
VTAWriteMappedReg(vta_load_handle_, 0x18, 0);
// COMPUTE @ 0x20 : Data signal of uops_V
VTAWriteMappedReg(vta_compute_handle_, 0x20, 0);
// COMPUTE @ 0x28 : Data signal of biases_V
VTAWriteMappedReg(vta_compute_handle_, 0x28, 0);
// STORE @ 0x10 : Data signal of outputs_V
VTAWriteMappedReg(vta_store_handle_, 0x10, 0);
VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_COUNT_OFFSET, insn_count);
VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy_addr);
VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_INP_ADDR_OFFSET, 0);
VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_WGT_ADDR_OFFSET, 0);
VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_UOP_ADDR_OFFSET, 0);
VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_BIAS_ADDR_OFFSET, 0);
VTAWriteMappedReg(vta_store_handle_, VTA_STORE_OUT_ADDR_OFFSET, 0);
// VTA start
VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START);
......@@ -141,7 +129,7 @@ class VTADevice {
// Loop until the VTA is done
unsigned t, flag = 0;
for (t = 0; t < wait_cycles; ++t) {
flag = VTAReadMappedReg(vta_compute_handle_, 0x18);
flag = VTAReadMappedReg(vta_compute_handle_, VTA_COMPUTE_DONE_RD_OFFSET);
if (flag == VTA_DONE) break;
std::this_thread::yield();
}
......
......@@ -15,12 +15,9 @@
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2018 by Contributors
* \file vta_pynq_driver.h
* \brief VTA driver for Pynq board.
*
* \file pynq_driver.h
* \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io).
*/
#ifndef VTA_PYNQ_PYNQ_DRIVER_H_
......@@ -41,23 +38,21 @@ extern "C" {
#include <time.h>
#include <unistd.h>
#ifdef __arm__
#if defined(__arm__) || defined(__aarch64__)
#include <libxlnk_cma.h>
#else
void* cma_alloc(size_t size, int cached);
void cma_free(void* buf);
uint32_t cma_get_phy_addr(void* buf);
void cma_flush_cache(void* buf, unsigned int phys_addr, int size);
void cma_invalidate_cache(void* buf, unsigned int phys_addr, int size);
#endif
void xlnkFlushCache(void* buf, int size);
void xlnkInvalidateCache(void* buf, int size);
void *VTAMapRegister(uint32_t addr, size_t length);
void VTAUnmapRegister(void *vta, size_t length);
void *VTAMapRegister(uint32_t addr);
void VTAUnmapRegister(void *vta);
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val);
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
/*! \brief VTA configuration register address range */
#define VTA_RANGE 0x100
/*! \brief VTA configuration register start value */
#define VTA_START 0x1
/*! \brief VTA configuration register auto-restart value */
......@@ -65,27 +60,6 @@ uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
/*! \brief VTA configuration register done value */
#define VTA_DONE 0x1
/*! \brief VTA fetch stage configuration register address
* from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_FETCH_ADDR 0x43C00000
/*! \brief VTA compute stage configuration register address
* from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_COMPUTE_ADDR 0x43C10000
/*! \brief VTA load stage configuration register address
* from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_LOAD_ADDR 0x43C20000
/*! \brief VTA store stage configuration register address
* from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_STORE_ADDR 0x43C30000
#ifdef __cplusplus
}
#endif
......
......@@ -44,8 +44,10 @@ namespace vta {
static_assert(VTA_UOP_WIDTH == sizeof(VTAUop) * 8,
"VTA_UOP_WIDTH do not match VTAUop size");
/*! \brief Enable coherent access between VTA and CPU (used on shared mem systems). */
static const bool kBufferCoherent = true;
/*! \brief Enable coherent access of data buffers between VTA and CPU */
static const bool kBufferCoherent = VTA_COHERENT_ACCESSES;
/*! \brief Always cache buffers (otherwise, write back to DRAM from CPU) */
static const bool kAlwaysCache = true;
/*!
* \brief Data buffer represents data on CMA.
......@@ -65,8 +67,10 @@ struct DataBuffer {
* \param size The size of the data.
*/
void InvalidateCache(size_t offset, size_t size) {
if (!kBufferCoherent) {
VTAInvalidateCache(phy_addr_ + offset, size);
if (!kBufferCoherent && kAlwaysCache) {
VTAInvalidateCache(reinterpret_cast<char *>(data_) + offset,
phy_addr_ + offset,
size);
}
}
/*!
......@@ -75,8 +79,10 @@ struct DataBuffer {
* \param size The size of the data.
*/
void FlushCache(size_t offset, size_t size) {
if (!kBufferCoherent) {
VTAFlushCache(phy_addr_ + offset, size);
if (!kBufferCoherent && kAlwaysCache) {
VTAFlushCache(reinterpret_cast<char *>(data_) + offset,
phy_addr_ + offset,
size);
}
}
/*!
......@@ -102,7 +108,7 @@ struct DataBuffer {
* \param size The size of the buffer.
*/
static DataBuffer* Alloc(size_t size) {
void* data = VTAMemAlloc(size, 1);
void* data = VTAMemAlloc(size, kAlwaysCache);
CHECK(data != nullptr);
DataBuffer* buffer = new DataBuffer();
buffer->data_ = data;
......@@ -469,7 +475,9 @@ class UopQueue : public BaseQueue<VTAUop> {
// Flush if we're using a shared memory system
// and if interface is non-coherent
if (!coherent_ && always_cache_) {
VTAFlushCache(fpga_buff_phy_, offset);
VTAFlushCache(fpga_buff_,
fpga_buff_phy_,
offset);
}
}
......@@ -860,7 +868,9 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
// Flush if we're using a shared memory system
// and if interface is non-coherent
if (!coherent_ && always_cache_) {
VTAFlushCache(fpga_buff_phy_, buff_size);
VTAFlushCache(fpga_buff_,
fpga_buff_phy_,
buff_size);
}
}
......@@ -1302,9 +1312,9 @@ class CommandQueue {
// The kernel we are currently recording
UopKernel* record_kernel_{nullptr};
// Micro op queue
UopQueue<VTA_MAX_XFER, true, true> uop_queue_;
UopQueue<VTA_MAX_XFER, kBufferCoherent, kAlwaysCache> uop_queue_;
// instruction queue
InsnQueue<VTA_MAX_XFER, true, true> insn_queue_;
InsnQueue<VTA_MAX_XFER, kBufferCoherent, kAlwaysCache> insn_queue_;
// Device handle
VTADeviceHandle device_{nullptr};
#ifdef USE_TSIM
......
......@@ -615,10 +615,10 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
void VTAFlushCache(vta_phy_addr_t buf, int size) {
void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
}
void VTAInvalidateCache(vta_phy_addr_t buf, int size) {
void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
}
VTADeviceHandle VTADeviceAlloc() {
......
......@@ -228,10 +228,10 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
void VTAFlushCache(vta_phy_addr_t buf, int size) {
void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
}
void VTAInvalidateCache(vta_phy_addr_t buf, int size) {
void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
}
VTADeviceHandle VTADeviceAlloc() {
......
......@@ -18,7 +18,6 @@
*/
/*!
* Copyright (c) 2018 by Contributors
* \file test_lib.cpp
* \brief Test library for the VTA design simulation and driver tests.
*/
......@@ -32,10 +31,10 @@ uint64_t vta(
uint32_t insn_count,
VTAGenericInsn *insns,
VTAUop *uops,
inp_T *inputs,
wgt_T *weights,
acc_T *biases,
inp_T *outputs) {
uint32_t *inputs,
uint32_t *weights,
uint32_t *biases,
uint32_t *outputs) {
// Performance counter variables
uint64_t t_fpga;
struct timespec start, stop;
......@@ -53,18 +52,18 @@ uint64_t vta(
snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
// Get VTA handles
void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR);
void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR);
void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR);
void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR);
// Physical address pointers
uint32_t insn_phy = insns ? VTAMemGetPhyAddr(insns) : 0;
uint32_t uop_phy = uops ? VTAMemGetPhyAddr(uops) : 0;
uint32_t input_phy = inputs ? VTAMemGetPhyAddr(inputs) : 0;
uint32_t weight_phy = weights ? VTAMemGetPhyAddr(weights) : 0;
uint32_t bias_phy = biases ? VTAMemGetPhyAddr(biases) : 0;
uint32_t output_phy = outputs ? VTAMemGetPhyAddr(outputs) : 0;
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
#if VTA_DEBUG == 1
printf("INFO - Starting FPGA!\n");
......@@ -72,20 +71,13 @@ uint64_t vta(
clock_gettime(CLOCK_REALTIME, &start);
// FETCH @ 0x10 : Data signal of insn_count_V
VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
// FETCH @ 0x18 : Data signal of insns_V
if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
// LOAD @ 0x10 : Data signal of inputs_V
if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
// LOAD @ 0x18 : Data signal of weight_V
if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
// COMPUTE @ 0x20 : Data signal of uops_V
if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
// COMPUTE @ 0x28 : Data signal of biases_V
if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
// STORE @ 0x10 : Data signal of outputs_V
if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_COUNT_OFFSET, insn_count);
if (insns) VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy);
if (inputs) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_INP_ADDR_OFFSET, input_phy);
if (weights) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_WGT_ADDR_OFFSET, weight_phy);
if (uops) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_UOP_ADDR_OFFSET, uop_phy);
if (biases) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_BIAS_ADDR_OFFSET, bias_phy);
if (outputs) VTAWriteMappedReg(vta_store_handle, VTA_STORE_OUT_ADDR_OFFSET, output_phy);
// VTA start
VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
......@@ -95,7 +87,7 @@ uint64_t vta(
int flag = 0, t = 0;
for (t = 0; t < 10000000; ++t) {
flag = VTAReadMappedReg(vta_compute_handle, 0x18);
flag = VTAReadMappedReg(vta_compute_handle, VTA_COMPUTE_DONE_RD_OFFSET);
if (flag & VTA_DONE) break;
}
......@@ -111,10 +103,10 @@ uint64_t vta(
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
// Unmap VTA register
VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
VTAUnmapRegister(vta_load_handle, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
VTAUnmapRegister(vta_store_handle, VTA_RANGE);
VTAUnmapRegister(vta_fetch_handle);
VTAUnmapRegister(vta_load_handle);
VTAUnmapRegister(vta_compute_handle);
VTAUnmapRegister(vta_store_handle);
return t_fpga;
}
......@@ -147,27 +139,30 @@ const char* getOpcodeString(int opcode, bool use_imm) {
} else if (opcode == VTA_ALU_OPCODE_SHR) {
return "shr";
}
// else if (opcode == VTA_ALU_OPCODE_MUL) {
// return "mul";
// }
return "unknown op";
}
template <typename T, int T_WIDTH>
void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block) {
template <typename DST_T, int DST_T_WIDTH, typename SRC_T, int SRC_T_WIDTH>
void packBuffer(DST_T *dst, SRC_T **src, int y_size, int x_size, int y_block, int x_block) {
assert((SRC_T_WIDTH * x_block * y_block) % DST_T_WIDTH == 0);
assert(DST_T_WIDTH <= 64);
int buffer_idx = 0;
int ratio = DST_T_WIDTH / SRC_T_WIDTH;
long long int mask = (1ULL << SRC_T_WIDTH) - 1;
DST_T tmp = 0;
for (int i = 0; i < y_size / y_block; i++) {
for (int j = 0; j < x_size / x_block; j++) {
for (int k = 0; k < y_block; k++) {
if (T_WIDTH < 8) {
for (int l = 0; l < x_block; l += 8 / T_WIDTH) {
dst[buffer_idx] = 0;
for (int m = 0; m < 8 / T_WIDTH; m++) {
dst[buffer_idx] |= (src[i * y_block + k][j * x_block + l + m] &
((1ULL << T_WIDTH) - 1)) << (m * T_WIDTH);
}
buffer_idx++;
}
} else {
for (int l = 0; l < x_block; l++) {
dst[buffer_idx++] = src[i * y_block + k][j * x_block + l];
int block_idx = l + k * x_block;
tmp |= (src[i * y_block + k][j * x_block + l] & mask) << ((block_idx % ratio) * SRC_T_WIDTH);
// When tmp is packed, write to destination array
if (block_idx % ratio == ratio - 1) {
dst[buffer_idx++] = tmp;
tmp = 0;
}
}
}
......@@ -175,23 +170,20 @@ void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_bloc
}
}
template <typename T, int T_WIDTH>
void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block) {
template <typename DST_T, int DST_T_WIDTH, typename SRC_T, int SRC_T_WIDTH>
void unpackBuffer(DST_T **dst, SRC_T *src, int y_size, int x_size, int y_block, int x_block) {
assert((DST_T_WIDTH * x_block * y_block) % SRC_T_WIDTH == 0);
int buffer_idx = 0;
long long int mask = (1ULL << DST_T_WIDTH) - 1;
int ratio = SRC_T_WIDTH / DST_T_WIDTH;
for (int i = 0; i < y_size / y_block; i++) {
for (int j = 0; j < x_size / x_block; j++) {
for (int k = 0; k < y_block; k++) {
if (T_WIDTH < 8) {
for (int l = 0; l < x_block; l += 8 / T_WIDTH) {
for (int m = 0; m < 8 / T_WIDTH; m++) {
dst[i * y_block + k][j * x_block + l + m] = (src[buffer_idx] >> (m * T_WIDTH))
& ((1 << T_WIDTH) - 1);
}
buffer_idx++;
}
} else {
for (int l = 0; l < x_block; l++) {
dst[i * y_block + k][j * x_block + l] = src[buffer_idx++];
int block_idx = l + k * x_block;
dst[i * y_block + k][j * x_block + l] = (src[buffer_idx] >> ((block_idx % ratio) * DST_T_WIDTH)) & mask;
if (block_idx % ratio == ratio - 1) {
buffer_idx++;
}
}
}
......@@ -199,7 +191,7 @@ void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_bl
}
}
template <typename T, int T_WIDTH>
template <typename T>
T ** allocInit2dArray(int rows, int cols) {
// Allocate
T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
......@@ -209,8 +201,23 @@ T ** allocInit2dArray(int rows, int cols) {
// Init
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
array[i][j] =
static_cast<T>(rand_r(&globalSeed) % (1LL << (T_WIDTH - 1)) - (1LL << (T_WIDTH - 2)));
array[i][j] = static_cast<T>(rand_r(&globalSeed));
}
}
return array;
}
template <typename T>
T ** allocSet2dArray(int rows, int cols, int val) {
// Allocate
T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
for (int i = 0; i < rows; i++) {
array[i] = static_cast<T *>(malloc(sizeof(T) * cols));
}
// Init
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
array[i][j] = static_cast<T>(val);
}
}
return array;
......@@ -563,45 +570,6 @@ void printParameters() {
printf("VTA_ACC_ELEM_BYTES: %d\n", VTA_ACC_ELEM_BYTES);
printf("VTA_BLOCK_IN: %d\n", VTA_BLOCK_IN);
printf("VTA_BLOCK_OUT: %d\n", VTA_BLOCK_OUT);
printf("VTA_INSN_MEM_0 [%d-%d]\n", VTA_INSN_MEM_0_0, VTA_INSN_MEM_0_1);
printf("VTA_INSN_MEM_1 [%d]\n", VTA_INSN_MEM_1);
printf("VTA_INSN_MEM_2 [%d]\n", VTA_INSN_MEM_2);
printf("VTA_INSN_MEM_3 [%d]\n", VTA_INSN_MEM_3);
printf("VTA_INSN_MEM_4 [%d]\n", VTA_INSN_MEM_4);
printf("VTA_INSN_MEM_5 [%d-%d]\n", VTA_INSN_MEM_5_0, VTA_INSN_MEM_5_1);
printf("VTA_INSN_MEM_6 [%d-%d]\n", VTA_INSN_MEM_6_0, VTA_INSN_MEM_6_1);
printf("VTA_INSN_MEM_7 [%d-%d]\n", VTA_INSN_MEM_7_0, VTA_INSN_MEM_7_1);
printf("VTA_INSN_MEM_8 [%d-%d]\n", VTA_INSN_MEM_8_0, VTA_INSN_MEM_8_1);
printf("VTA_INSN_MEM_9 [%d-%d]\n", VTA_INSN_MEM_9_0, VTA_INSN_MEM_9_1);
printf("VTA_INSN_MEM_A [%d-%d]\n", VTA_INSN_MEM_A_0, VTA_INSN_MEM_A_1);
printf("VTA_INSN_MEM_B [%d-%d]\n", VTA_INSN_MEM_B_0, VTA_INSN_MEM_B_1);
printf("VTA_INSN_MEM_C [%d-%d]\n", VTA_INSN_MEM_C_0, VTA_INSN_MEM_C_1);
printf("VTA_INSN_MEM_D [%d-%d]\n", VTA_INSN_MEM_D_0, VTA_INSN_MEM_D_1);
printf("VTA_INSN_MEM_E [%d-%d]\n", VTA_INSN_MEM_E_0, VTA_INSN_MEM_E_1);
printf("VTA_INSN_GEM_0 [%d-%d]\n", VTA_INSN_GEM_0_0, VTA_INSN_GEM_0_1);
printf("VTA_INSN_GEM_1 [%d]\n", VTA_INSN_GEM_1);
printf("VTA_INSN_GEM_2 [%d]\n", VTA_INSN_GEM_2);
printf("VTA_INSN_GEM_3 [%d]\n", VTA_INSN_GEM_3);
printf("VTA_INSN_GEM_4 [%d]\n", VTA_INSN_GEM_4);
printf("VTA_INSN_GEM_5 [%d]\n", VTA_INSN_GEM_5);
printf("VTA_INSN_GEM_6 [%d-%d]\n", VTA_INSN_GEM_6_0, VTA_INSN_GEM_6_1);
printf("VTA_INSN_GEM_7 [%d-%d]\n", VTA_INSN_GEM_7_0, VTA_INSN_GEM_7_1);
printf("VTA_INSN_GEM_8 [%d-%d]\n", VTA_INSN_GEM_8_0, VTA_INSN_GEM_8_1);
printf("VTA_INSN_GEM_9 [%d-%d]\n", VTA_INSN_GEM_9_0, VTA_INSN_GEM_9_1);
printf("VTA_INSN_GEM_A [%d-%d]\n", VTA_INSN_GEM_A_0, VTA_INSN_GEM_A_1);
printf("VTA_INSN_GEM_B [%d-%d]\n", VTA_INSN_GEM_B_0, VTA_INSN_GEM_B_1);
printf("VTA_INSN_GEM_C [%d-%d]\n", VTA_INSN_GEM_C_0, VTA_INSN_GEM_C_1);
printf("VTA_INSN_GEM_D [%d-%d]\n", VTA_INSN_GEM_D_0, VTA_INSN_GEM_D_1);
printf("VTA_INSN_GEM_E [%d-%d]\n", VTA_INSN_GEM_E_0, VTA_INSN_GEM_E_1);
printf("VTA_INSN_GEM_F [%d-%d]\n", VTA_INSN_GEM_F_0, VTA_INSN_GEM_F_1);
printf("VTA_INSN_ALU_E [%d-%d]\n", VTA_INSN_ALU_E_0, VTA_INSN_ALU_E_1);
printf("VTA_INSN_ALU_F [%d]\n", VTA_INSN_ALU_F);
printf("VTA_INSN_ALU_G [%d-%d]\n", VTA_INSN_ALU_G_0, VTA_INSN_ALU_G_1);
printf("VTA_UOP_GEM_0 [%d-%d]\n", VTA_UOP_GEM_0_0, VTA_UOP_GEM_0_1);
printf("VTA_UOP_GEM_1 [%d-%d]\n", VTA_UOP_GEM_1_0, VTA_UOP_GEM_1_1);
printf("VTA_UOP_GEM_2 [%d-%d]\n", VTA_UOP_GEM_2_0, VTA_UOP_GEM_2_1);
printf("VTA_UOP_ALU_0 [%d-%d]\n", VTA_UOP_ALU_0_0, VTA_UOP_ALU_0_1);
printf("VTA_UOP_ALU_1 [%d-%d]\n", VTA_UOP_ALU_1_0, VTA_UOP_ALU_1_1);
}
void printInstruction(int num_insn, VTAGenericInsn *insns) {
......@@ -742,7 +710,6 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
// Some assertions
assert(batch % VTA_BATCH == 0);
assert(vector_size % VTA_BLOCK_OUT == 0);
assert(!(opcode == VTA_ALU_OPCODE_SHR && !use_imm));
printf("=====================================================================================\n");
printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n",
getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression);
......@@ -764,17 +731,21 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
for (int b = 0; b < batch / VTA_BATCH; b++) {
if (opcode == VTA_ALU_OPCODE_MIN) {
immediate[b] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
} else if (opcode == VTA_ALU_OPCODE_MAX) {
immediate[b] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
} else if (opcode == VTA_ALU_OPCODE_ADD) {
immediate[b] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
} else if (opcode == VTA_ALU_OPCODE_SHR) {
immediate[b] = static_cast<acc_T>(
rand_r(&globalSeed) % VTA_ACC_WIDTH - VTA_ACC_WIDTH/2);
rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
}
// else if (opcode == VTA_ALU_OPCODE_MUL) {
// immediate[b] = static_cast<acc_T>(
// rand_r(&globalSeed) % (1LL << (VTA_MUL_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_MUL_ARG_BIT_WIDTH - 2)));
// }
}
// Initialize instructions
......@@ -845,7 +816,10 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
} else if (opcode == VTA_ALU_OPCODE_ADD) {
inputs[i][j] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 2)) - (1LL << (VTA_INP_WIDTH - 3)));
} else if (opcode == VTA_ALU_OPCODE_SHR) {
inputs[i][j] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
}
}
}
......@@ -854,54 +828,55 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
for (int i = 0; i < batch; i++) {
for (int j = 0; j < vector_size; j++) {
acc_T tmp = 0;
acc_T out_val = 0;
acc_T imm_val = immediate[i / VTA_BATCH];
acc_T src_val = inputs[i][j + vector_size];
if (opcode == VTA_ALU_OPCODE_MIN) {
if (!use_imm) {
tmp = inputs[i][j] < inputs[i][j + vector_size] ?
inputs[i][j] :
inputs[i][j + vector_size];
out_val = inputs[i][j] < src_val ? inputs[i][j] : src_val;
} else {
tmp = inputs[i][j] < immediate[i / VTA_BATCH] ?
inputs[i][j] :
immediate[i / VTA_BATCH];
out_val = inputs[i][j] < imm_val ? inputs[i][j] : imm_val;
}
} else if (opcode == VTA_ALU_OPCODE_MAX) {
if (!use_imm) {
tmp = inputs[i][j] > inputs[i][j + vector_size] ?
inputs[i][j] :
inputs[i][j + vector_size];
out_val = inputs[i][j] > src_val ? inputs[i][j] : src_val;
} else {
tmp = inputs[i][j] > immediate[i / VTA_BATCH] ?
inputs[i][j] :
immediate[i / VTA_BATCH];
out_val = inputs[i][j] > imm_val ? inputs[i][j] : imm_val;
}
} else if (opcode == VTA_ALU_OPCODE_ADD) {
if (!use_imm) {
tmp = inputs[i][j] + inputs[i][j + vector_size];
out_val = inputs[i][j] + src_val;
} else {
tmp = inputs[i][j] + immediate[i / VTA_BATCH];
out_val = inputs[i][j] + imm_val;
}
} else if (opcode == VTA_ALU_OPCODE_SHR) {
if (immediate[i / VTA_BATCH] >= 0) {
tmp = inputs[i][j] >> immediate[i / VTA_BATCH];
if (!use_imm) {
if (src_val >= 0) {
out_val = inputs[i][j] >> src_val;
} else {
out_val = inputs[i][j] << (0 - src_val);
}
} else {
if (imm_val >= 0) {
out_val = inputs[i][j] >> imm_val;
} else {
tmp = inputs[i][j] << (0 - immediate[i / VTA_BATCH]);
out_val = inputs[i][j] << (0 - imm_val);
}
}
// Set
outputs_ref[i][j] = (out_T) tmp;
}
outputs_ref[i][j] = (out_T) out_val;
}
}
// Pack input buffer
acc_T *bias_buf =
static_cast<acc_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets));
packBuffer<acc_T, VTA_ACC_WIDTH>(
uint32_t *bias_buf = static_cast<uint32_t *>(
allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets));
packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(
bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT);
// Prepare output buffer
out_T *output_buf =
static_cast<out_T *>(allocBuffer(VTA_INP_ELEM_BYTES * batch * tx_size * input_sets));
uint32_t *output_buf = static_cast<uint32_t *>(
allocBuffer(VTA_OUT_ELEM_BYTES * batch * tx_size * input_sets));
#ifdef NO_SIM
// Invoke the VTA
......@@ -914,15 +889,15 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
vta(ins_size,
(volatile insn_T *) insn_buf,
(volatile uop_T *) uop_buf,
(volatile inp_vec_T *) NULL,
(volatile wgt_vec_T *) NULL,
(volatile acc_vec_T *) bias_buf,
(volatile out_vec_T *) output_buf);
(volatile bus_T *) NULL,
(volatile bus_T *) NULL,
(volatile bus_T *) bias_buf,
(volatile bus_T *) output_buf);
#endif
// Unpack output buffer
out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
unpackBuffer<out_T, VTA_OUT_WIDTH>(outputs,
unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
output_buf,
batch,
vector_size,
......@@ -1123,11 +1098,11 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
#endif
// Initialize inputs
inp_T **inputs = allocInit2dArray<inp_T, VTA_INP_WIDTH>(batch, in_feat);
inp_T **inputs = allocInit2dArray<inp_T>(batch, in_feat);
// Initialize weights
wgt_T **weights = allocInit2dArray<wgt_T, VTA_WGT_WIDTH>(out_feat, in_feat);
wgt_T **weights = allocInit2dArray<wgt_T>(out_feat, in_feat);
// Initialize biases
acc_T **biases = allocInit2dArray<acc_T, VTA_ACC_WIDTH>(batch, out_feat);
acc_T **biases = allocInit2dArray<acc_T>(batch, out_feat);
// Reference GEMM implementation
out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat);
......@@ -1143,31 +1118,35 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
}
// Prepare the input buffer
inp_T *input_buf = static_cast<inp_T *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
packBuffer<inp_T, VTA_INP_WIDTH>(input_buf,
uint32_t *input_buf = static_cast<uint32_t *>(
allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
packBuffer<uint32_t, 32, inp_T, VTA_INP_WIDTH>(input_buf,
inputs,
batch,
in_feat,
VTA_BATCH,
VTA_BLOCK_IN);
// Prepare the weight buffer
wgt_T *weight_buf = static_cast<wgt_T *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
packBuffer<wgt_T, VTA_WGT_WIDTH>(weight_buf,
uint32_t *weight_buf = static_cast<uint32_t *>(
allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
packBuffer<uint32_t, 32, wgt_T, VTA_WGT_WIDTH>(weight_buf,
weights,
out_feat,
in_feat,
VTA_BLOCK_OUT,
VTA_BLOCK_IN);
// Prepare the bias buffer
acc_T *bias_buf = static_cast<acc_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
packBuffer<acc_T, VTA_ACC_WIDTH>(bias_buf,
uint32_t *bias_buf = static_cast<uint32_t *>(
allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(bias_buf,
biases,
batch,
out_feat,
VTA_BATCH,
VTA_BLOCK_OUT);
// Prepare the output buffer
out_T *output_buf = static_cast<out_T *>(allocBuffer(VTA_INP_ELEM_BYTES * out_size));
uint32_t *output_buf = static_cast<uint32_t *>(
allocBuffer(VTA_INP_ELEM_BYTES * out_size));
#ifdef NO_SIM
// Invoke the VTA
......@@ -1187,15 +1166,15 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
vta(ins_size,
(volatile insn_T *) insn_buf,
(volatile uop_T *) uop_buf,
(volatile inp_vec_T *) input_buf,
(volatile wgt_vec_T *) weight_buf,
(volatile acc_vec_T *) bias_buf,
(volatile out_vec_T *) output_buf);
(volatile bus_T *) input_buf,
(volatile bus_T *) weight_buf,
(volatile bus_T *) bias_buf,
(volatile bus_T *) output_buf);
#endif
// Unpack output data
out_T **outputs = alloc2dArray<out_T>(batch, out_feat);
unpackBuffer<out_T, VTA_OUT_WIDTH>(outputs,
unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
output_buf,
batch,
out_feat,
......@@ -1352,11 +1331,11 @@ int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression
#endif
// Initialize inputs
inp_T **inputs = allocInit2dArray<inp_T, VTA_INP_WIDTH>(batch, in_channels);
inp_T **inputs = allocInit2dArray<inp_T>(batch, in_channels);
// Initialize weights
wgt_T **weights = allocInit2dArray<wgt_T, VTA_WGT_WIDTH>(out_channels, in_channels);
wgt_T **weights = allocInit2dArray<wgt_T>(out_channels, in_channels);
// Initialize biases
acc_T **biases = allocInit2dArray<acc_T, VTA_ACC_WIDTH>(batch, out_channels);
acc_T **biases = allocInit2dArray<acc_T>(batch, out_channels);
// Reference GEMM implementation
out_T **outputs_ref = alloc2dArray<out_T>(batch, out_channels);
......@@ -1372,31 +1351,31 @@ int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression
}
// Prepare the input buffer
inp_T *input_buf = static_cast<inp_T *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
packBuffer<inp_T, VTA_INP_WIDTH>(input_buf,
uint32_t *input_buf = static_cast<uint32_t *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
packBuffer<uint32_t, 32, inp_T, VTA_INP_WIDTH>(input_buf,
inputs,
batch,
in_channels,
VTA_BATCH,
VTA_BLOCK_IN);
// Prepare the weight buffer
wgt_T *weight_buf = static_cast<wgt_T *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
packBuffer<wgt_T, VTA_WGT_WIDTH>(weight_buf,
uint32_t *weight_buf = static_cast<uint32_t *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
packBuffer<uint32_t, 32, wgt_T, VTA_WGT_WIDTH>(weight_buf,
weights,
out_channels,
in_channels,
VTA_BLOCK_OUT,
VTA_BLOCK_IN);
// Prepare the bias buffer
acc_T *bias_buf = static_cast<acc_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
packBuffer<acc_T, VTA_ACC_WIDTH>(bias_buf,
uint32_t *bias_buf = static_cast<uint32_t *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(bias_buf,
biases,
batch,
out_channels,
VTA_BATCH,
VTA_BLOCK_OUT);
// Prepare the output buffer
out_T *output_buf = static_cast<out_T *>(allocBuffer(VTA_INP_ELEM_BYTES * out_size));
uint32_t *output_buf = static_cast<uint32_t *>(allocBuffer(VTA_OUT_ELEM_BYTES * out_size));
#ifdef NO_SIM
// Invoke the VTA
......@@ -1416,15 +1395,15 @@ int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression
vta(ins_size,
(volatile insn_T *) insn_buf,
(volatile uop_T *) uop_buf,
(volatile inp_vec_T *) input_buf,
(volatile wgt_vec_T *) weight_buf,
(volatile acc_vec_T *) bias_buf,
(volatile out_vec_T *) output_buf);
(volatile bus_T *) input_buf,
(volatile bus_T *) weight_buf,
(volatile bus_T *) bias_buf,
(volatile bus_T *) output_buf);
#endif
// Unpack output data
out_T **outputs = alloc2dArray<out_T>(batch, out_channels);
unpackBuffer<out_T, VTA_OUT_WIDTH>(outputs,
unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
output_buf,
batch,
out_channels,
......
......@@ -18,7 +18,6 @@
*/
/*!
* Copyright (c) 2018 by Contributors
* \file test_lib.cpp
* \brief Test library for the VTA design simulation and driver tests.
*/
......@@ -40,7 +39,6 @@
#include "../../../src/pynq/pynq_driver.h"
#endif // VTA_TARGET_PYNQ
typedef uint64_t axi_T;
typedef uint32_t uop_T;
typedef int8_t wgt_T;
typedef int8_t inp_T;
......@@ -95,15 +93,25 @@ template <typename T, int T_WIDTH>
void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block);
/*!
* \brief Allocates and initializes a 2D array in the heap.
* \brief Allocates and randomly initializes a 2D array in the heap.
* \param rows Number of rows.
* \param cols Number of columns.
* \return Pointer to the 2D array.
*/
template <typename T, int T_WIDTH>
template <typename T>
T ** allocInit2dArray(int rows, int cols);
/*!
* \brief Allocates a 2D array in the heap and initializes it to a set value.
* \tparam T Element type of the array.
* \param rows Number of rows.
* \param cols Number of columns.
* \param val Value to set the whole array to.
* \return Pointer to the 2D array.
* \note val is declared as int regardless of T.
*       NOTE(review): presumably it is converted to T when stored; confirm
*       against the definition before passing values outside T's range.
*/
template <typename T>
T ** allocSet2dArray(int rows, int cols, int val);
/*!
* \brief Allocates a 2D array in the heap.
* \param rows Number of rows.
* \param cols Number of columns.
......
......@@ -24,7 +24,7 @@ def test_env():
def test_env_scope():
env = vta.get_env()
cfg = env.pkg_config().cfg_dict
cfg = env.cfg_dict
cfg["TARGET"] = "xyz"
with vta.Environment(cfg):
assert vta.get_env().TARGET == "xyz"
......
......@@ -100,9 +100,9 @@ if env.TARGET not in ["sim", "tsim"]:
# the host, make sure you've set the variables below to the IP of
# your board.
device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
device_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))
device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091")
if not tracker_host or not tracker_port:
remote = rpc.connect(device_host, device_port)
remote = rpc.connect(device_host, int(device_port))
else:
remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment