Commit f55609b4 by Thierry Moreau Committed by Jared Roesch

[VTA] Refactor to increase platform coverage (Ultra96 etc.) (#3496)

* hardware refactor for increased FPGA coverage, small optimizations

* fix header

* cleaning up parameters that won't be needed for now

* streamlining makefile, and simplifying tcl scripts

* moving parameter derivation into pkg_config.py, keeping tcl scripts lightweight

* refactoring tcl script to avoid global variables

* deriving AXI signals in pkg_config.py

* unifying address map definition for hardware and software drivers

* single channel design for ultra96 to simplify build

* enable alu by default, no mul opcode for now

* hardware fix

* new bitstream; vta version

* avoid error when env variable is not set

* ultra96 cleanup

* further cleaning up tcl script for bitstream generation

* preliminary rpc server support on ultra96

* rpc server tracker scripts

* ultra96 ldflag

* ultra96 support

* ultra96 support

* cleanup line

* cmake support for ultra96

* simplify memory instantiation

* cleaning up IP parameter initialization

* fix queue instantiation

* 2019.1 transition

* fix macro def

* removing bus width from config

* cleanup

* fix

* turning off testing for now

* cleanup ultra96 ps insantiation

* minor refactor

* adding comments

* upgrading to tophub v0.6

* model used in TVM target now refers to a specific version of VTA for better autoTVM scheduling

* revert change due to bug

* rename driver files to be for zynq-type devices

* streamlining address mapping

* unifying register map offset values between driver and hardware generator

* rely on cma library for cache flush/invalidation

* coherence management

* not make buffer packing depend on data types that can be wider than 64bits

* refactor config derivation to minimize free parameters

* fix environment/pkg config interaction

* adding cfg dump property to pkgconfig:

* fix rpc reconfig

* fix spacing

* cleanup

* fix spacing

* long line fix

* fix spacing and lint

* fix line length

* cmake fix

* environment fix

* renaming after pynq since the driver stack relies on the pynq library - see pynq.io

* update doc

* adding parameterization to  name

* space

* removing reg width

* vta RPC

* update doc on how to edit vta_config.json

* fix path

* fix path
parent bca8ac17
......@@ -17,7 +17,10 @@
# under the License.
PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )"
# Derive target specified by vta_config.json
VTA_CONFIG=${PROJROOT}/vta/config/vta_config.py
TARGET=$(python ${VTA_CONFIG} --target)
export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq
python3 -m vta.exec.rpc_server --tracker fleet:9190 --key pynq
python3 -m vta.exec.rpc_server --tracker fleet:9190 --key $TARGET
......@@ -38,11 +38,16 @@ elseif(PYTHON)
string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_=.]*" VTA_DEFINITIONS "${__vta_defs}")
file(GLOB VTA_RUNTIME_SRCS vta/src/*.cc)
file(GLOB __vta_target_srcs vta/src/${VTA_TARGET}/*.cc)
# Add sim driver sources
if(${VTA_TARGET} STREQUAL "sim")
file(GLOB __vta_target_srcs vta/src/sim/*.cc)
endif()
# Add pynq driver sources
if(${VTA_TARGET} STREQUAL "pynq" OR ${VTA_TARGET} STREQUAL "ultra96")
file(GLOB __vta_target_srcs vta/src/pynq/*.cc)
endif()
list(APPEND VTA_RUNTIME_SRCS ${__vta_target_srcs})
add_library(vta SHARED ${VTA_RUNTIME_SRCS})
# Add tsim driver sources
if(${VTA_TARGET} STREQUAL "tsim")
target_compile_definitions(vta PUBLIC USE_TSIM)
include_directories("vta/include")
......@@ -50,6 +55,8 @@ elseif(PYTHON)
list(APPEND RUNTIME_SRCS ${RUNTIME_DPI_SRCS})
endif()
add_library(vta SHARED ${VTA_RUNTIME_SRCS})
target_include_directories(vta PUBLIC vta/include)
foreach(__def ${VTA_DEFINITIONS})
......@@ -62,7 +69,7 @@ elseif(PYTHON)
endif(APPLE)
# PYNQ rules for Pynq v2.4
if(${VTA_TARGET} STREQUAL "pynq")
if(${VTA_TARGET} STREQUAL "pynq" OR ${VTA_TARGET} STREQUAL "ultra96")
find_library(__cma_lib NAMES cma PATH /usr/lib)
target_link_libraries(vta ${__cma_lib})
endif()
......
......@@ -36,10 +36,6 @@ below.
+=======================+============+========================================================+
| ``TARGET`` | String | The TVM device target. |
+-----------------------+------------+--------------------------------------------------------+
| ``HW_TARGET`` | Int | FPGA frequency in MHz. |
+-----------------------+------------+--------------------------------------------------------+
| ``HW_CLK_TARGET`` | Int | FPGA clock period in ns target for HLS tool. |
+-----------------------+------------+--------------------------------------------------------+
| ``HW_VER`` | String | VTA hardware version number. |
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_INP_WIDTH`` | Int (log2) | Input data type signed integer width. |
......@@ -48,13 +44,9 @@ below.
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_ACC_WIDTH`` | Int (log2) | Accumulator data type signed integer width. |
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_OUT_WIDTH`` | Int (log2) | Output data type signed integer width. |
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_BATCH`` | Int (log2) | VTA matrix multiply intrinsic output dimension 0. |
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_BLOCK_IN`` | Int (log2) | VTA matrix multiply reduction dimension. |
| ``LOG_BATCH`` | Int (log2) | VTA matrix multiply intrinsic input/output dimension 0.|
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_BLOCK_OUT`` | Int (log2) | VTA matrix multiply intrinsic output dimension 1. |
| ``LOG_BLOCK`` | Int (log2) | VTA matrix multiply inner dimensions. |
+-----------------------+------------+--------------------------------------------------------+
| ``LOG_UOP_BUFF_SIZE`` | Int (log2) | Micro-op on-chip buffer in Bytes. |
+-----------------------+------------+--------------------------------------------------------+
......@@ -75,13 +67,8 @@ below.
We provide additional detail below regarding each parameter:
- ``TARGET``: Can be set to ``"pynq"`` or ``"sim"``.
- ``HW_TARGET``: In pynq mode, can be set to ``100``, ``142``, ``167``, or ``200`` MHz.
- ``HW_CLK_TARGET``: The lower the target, the more pipeline stages HLS will insert to achieve timing closure during place and route (this can also slightly decrease performance).
- ``TARGET``: Can be set to ``"pynq"``, ``"ultra96"``, ``"sim"`` (fast simulator), or ``"tsim"`` (cycle accurate sim with verilator).
- ``HW_VER``: Hardware version which increments everytime the VTA hardware design changes. This parameter is used to uniquely idenfity hardware bitstreams.
- ``LOG_OUT_WIDTH``: We recommend matching ``LOG_OUT_WIDTH`` to ``LOG_INP_WIDTH``.
- ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension.
- ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension.
- ``LOG_BLOCK_IN``: Equivalent to B in multiplication of shape (A, B) x (B, C), or typically, the input channel dimension.
- ``LOG_BLOCK_OUT``: Equivalent to C in multiplication of shape (A, B) x (B, C), or typically, the output channel dimension.
- ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension of inner tensor computation.
- ``LOG_BLOCK``: Equivalent to B and C in multiplication of shape (A, B) x (B, C), or typically, the input/output channel dimensions of the innter tensor computation.
......@@ -61,7 +61,7 @@ To do so,
```bash
cd <tvm root>
cp vta/config/vta_config.json vta_config.json
vim vta/config/vta_config.json
# edit vta_config.json
make vta
```
......@@ -118,7 +118,7 @@ cd /home/xilinx/tvm
mkdir build
cp cmake/config.cmake build/.
# Copy pynq specific configuration
cp vta/config/pynq_sample.json build/vta_config.json
cp vta/config/pynq_sample.json vta/config/vta_config.json
cd build
cmake ..
make runtime vta -j2
......@@ -147,13 +147,12 @@ export VTA_PYNQ_RPC_PORT=9091
```
In addition, you'll need to edit the `vta_config.json` file on the host to indicate that we are targeting the Pynq platform, by setting the `TARGET` field to `"pynq"`.
Alternatively, you can copy the default `vta/config/pynq_sample.json` into the TVM root as `vta_config.json`.
> Note: in contrast to our simulation setup, there are no libraries to compile on the host side since the host offloads all of the computation to the Pynq board.
```bash
# On the Host-side
cd <tvm root>
cp vta/config/pynq_sample.json vta_config.json
cp vta/config/pynq_sample.json vta/config/vta_config.json
```
This time again, we will run the 2D convolution testbench.
......@@ -187,28 +186,28 @@ This third and last guide allows users to generate custom VTA bitstreams using f
### Xilinx Toolchain Installation
We recommend using `Vivado 2018.2` since our scripts have been tested to work on this version of the Xilinx toolchains.
We recommend using `Vivado 2019.1` since our scripts have been tested to work on this version of the Xilinx toolchains.
Our guide is written for Linux (Ubuntu) installation.
You’ll need to install Xilinx’ FPGA compilation toolchain, [Vivado HL WebPACK 2018.2](https://www.xilinx.com/products/design-tools/vivado.html), which a license-free version of the Vivado HLx toolchain.
You’ll need to install Xilinx’ FPGA compilation toolchain, [Vivado HL WebPACK 2019.1](https://www.xilinx.com/products/design-tools/vivado.html), which a license-free version of the Vivado HLx toolchain.
#### Obtaining and Launching the Vivado GUI Installer
1. Go to the [download webpage](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/vivado-design-tools/2018-2.html), and download the Linux Self Extracting Web Installer for Vivado HLx 2018.2: WebPACK and Editions.
1. Go to the [download webpage](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/vivado-design-tools/2019-1.html), and download the Linux Self Extracting Web Installer for Vivado HLx 2019.1: WebPACK and Editions.
2. You’ll have to sign in with a Xilinx account. This requires a Xilinx account creation that will take 2 minutes.
3. Complete the Name and Address Verification by clicking “Next”, and you will get the opportunity to download a binary file, called `Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin`.
3. Complete the Name and Address Verification by clicking “Next”, and you will get the opportunity to download a binary file, called `Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin`.
4. Now that the file is downloaded, go to your `Downloads` directory, and change the file permissions so it can be executed:
```bash
chmod u+x Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin
chmod u+x Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin
```
5. Now you can execute the binary:
```bash
./Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin
./Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin
```
#### Xilinx Vivado GUI Installer Steps
At this point you've launched the Vivado 2018.2 Installer GUI program.
At this point you've launched the Vivado 2019.1 Installer GUI program.
1. Click “Next” on the *Welcome* screen.
2. On the *Select Install Type* screen, enter your Xilinx user credentials under the “User Authentication” box and select the “Download and Install Now” option before clicking “Next” .
......@@ -230,8 +229,8 @@ At this point you've launched the Vivado 2018.2 Installer GUI program.
The last step is to update your `~/.bashrc` with the following lines. This will include all of the Xilinx binary paths so you can launch compilation scripts from the command line.
```bash
# Xilinx Vivado 2018.2 environment
export XILINX_VIVADO=${XILINX_PATH}/Vivado/2018.2
# Xilinx Vivado 2019.1 environment
export XILINX_VIVADO=${XILINX_PATH}/Vivado/2019.1
export PATH=${XILINX_VIVADO}/bin:${PATH}
```
......
......@@ -44,7 +44,7 @@ PACKAGE_VERSION = {
'opencl': "v0.02",
'mali': "v0.05",
'vta': "v0.05",
'vta': "v0.06",
}
logger = logging.getLogger('autotvm')
......
{
"TARGET" : "pynq",
"HW_FREQ" : 100,
"HW_CLK_TARGET" : 8,
"HW_VER" : "0.0.0",
"HW_VER" : "0.0.1",
"LOG_INP_WIDTH" : 3,
"LOG_WGT_WIDTH" : 3,
"LOG_ACC_WIDTH" : 5,
"LOG_OUT_WIDTH" : 3,
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_BLOCK" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" :15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}
{
"TARGET" : "ultra96",
"HW_VER" : "0.0.1",
"LOG_INP_WIDTH" : 3,
"LOG_WGT_WIDTH" : 3,
"LOG_ACC_WIDTH" : 5,
"LOG_BATCH" : 0,
"LOG_BLOCK" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" :15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}
{
"TARGET" : "sim",
"HW_FREQ" : 100,
"HW_CLK_TARGET" : 7,
"HW_VER" : "0.0.0",
"HW_VER" : "0.0.1",
"LOG_INP_WIDTH" : 3,
"LOG_WGT_WIDTH" : 3,
"LOG_ACC_WIDTH" : 5,
"LOG_OUT_WIDTH" : 3,
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_BLOCK" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
......
......@@ -17,81 +17,30 @@
# Directories
ROOTDIR = $(CURDIR)
BUILD_NAME = build
BUILD_DIR = $(ROOTDIR)/../../$(BUILD_NAME)/hardware/xilinx
SCRIPT_DIR = $(ROOTDIR)/scripts
SRC_DIR = $(ROOTDIR)/src
SIM_DIR = $(ROOTDIR)/sim
TEST_DIR = $(ROOTDIR)/../../tests/hardware/common
INCLUDE_DIR = $(ROOTDIR)/../../include
VTA_DIR = $(CURDIR)/../..
BUILD_DIR = $(VTA_DIR)/build/hardware/xilinx
SCRIPT_DIR = $(CURDIR)/scripts
SRC_DIR = $(CURDIR)/src
# Executables
VIVADO_HLS = vivado_hls
VIVADO = vivado
HSI = hsi
# HLS mode
MODE = skip_sim
# Debug flag
DEBUG = false
# SLURM
SLURM = false
# Prevent generation of DSP
NO_DSP = false
# Prevent generation of ALU
NO_ALU = false
# Process VTA JSON config
VTA_CONFIG = python $(CURDIR)/../../config/vta_config.py
CFLAGS := $(shell ${VTA_CONFIG} --cflags)
VTA_TARGET := $(shell ${VTA_CONFIG} --target)
#---------------------
# VTA Parameters
#--------------------
VTA_INP_WIDTH := $(shell ${VTA_CONFIG} --get-inpwidth)
VTA_WGT_WIDTH := $(shell ${VTA_CONFIG} --get-wgtwidth)
VTA_ACC_WIDTH := $(shell ${VTA_CONFIG} --get-accwidth)
VTA_OUT_WIDTH := $(shell ${VTA_CONFIG} --get-outwidth)
VTA_BATCH := $(shell ${VTA_CONFIG} --get-batch)
VTA_IN_BLOCK := $(shell ${VTA_CONFIG} --get-blockin)
VTA_OUT_BLOCK := $(shell ${VTA_CONFIG} --get-blockout)
VTA_UOP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-uopbuffsize)
VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize)
VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize)
VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize)
VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize)
#---------------------
# FPGA Parameters
#--------------------
VTA_CLOCK_FREQ = $(shell ${VTA_CONFIG} --get-fpgafreq)
VTA_TARGET_PER = $(shell ${VTA_CONFIG} --get-fpgaper)
#---------------------
# Compilation parameters
#--------------------
# Number of threads during compilation
VTA_HW_COMP_THREADS = 8
VTA_CONFIG := $(CURDIR)/../../config/vta_config.py
# Derive config name
CONF = $(shell ${VTA_CONFIG} --cfg-str)
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
ifeq ($(SLURM), true)
IP_BUILD_PATH = /scratch/hls/$(CONF)
HW_BUILD_PATH = /scratch/vivado/$(CONF)
endif
CONF := $(shell python ${VTA_CONFIG} --cfg-str)
IP_BUILD_PATH := $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH := $(BUILD_DIR)/vivado/$(CONF)
# IP file path
IP_PATH = $(BUILD_DIR)/hls/$(CONF)/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip
IP_PATH := $(BUILD_DIR)/hls/$(CONF)/vta_compute/soln/impl/ip/xilinx_com_hls_compute_1_0.zip
# Bitstream file path
BIT_PATH = $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit
BIT_PATH := $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit
.PHONY: all ip bit bsp clean clean_all
.PHONY: all ip bit clean clean_all
all: bit
ip: $(IP_PATH)
......@@ -100,37 +49,24 @@ bit: $(BIT_PATH)
$(IP_PATH): $(SRC_DIR)/*
mkdir -p $(IP_BUILD_PATH)
cd $(IP_BUILD_PATH) && \
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
$(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(VTA_TARGET_PER) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
$(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
ifeq ($(SLURM), true)
mkdir -p $(BUILD_DIR)/hls
mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/.
endif
$(VIVADO_HLS) \
-f $(SCRIPT_DIR)/hls.tcl \
-tclargs \
$(VTA_DIR) \
${VTA_CONFIG}
$(BIT_PATH): $(IP_PATH)
mkdir -p $(HW_BUILD_PATH)
cd $(HW_BUILD_PATH) && \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
-tclargs $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) $(VTA_CLOCK_FREQ) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
ifeq ($(SLURM), true)
mkdir -p $(BUILD_DIR)/vivado
mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/.
endif
bsp: $(BIT_PATH)
cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
cd $(HW_BUILD_PATH)/bsp && make
$(VIVADO) \
-mode tcl \
-source $(SCRIPT_DIR)/vivado.tcl \
-tclargs \
$(BUILD_DIR)/hls/$(CONF) \
${VTA_CONFIG}
clean:
rm -rf *.out *.log *.sb figures
rm -rf *.out *.log
cleanall: clean
rm -rf $(BUILD_DIR)
......@@ -35,17 +35,6 @@ int main(void) {
printParameters();
#endif
// Micro op bound
assert(VTA_UOP_GEM_2_1 < VTA_UOP_WIDTH);
assert(VTA_UOP_ALU_1_1 < VTA_UOP_WIDTH);
// Make sure there is no misaligment
assert(VTA_INSN_GEM_9_1 < VTA_INSN_GEM_A_0);
assert(VTA_INSN_MEM_7_1 < VTA_INSN_MEM_8_0);
// Instruction bounds
assert(VTA_INSN_MEM_E_1 < VTA_INS_WIDTH);
assert(VTA_INSN_GEM_F_1 < VTA_INS_WIDTH);
assert(VTA_INSN_ALU_G_1 < VTA_INS_WIDTH);
int status = 0;
// Run ALU test (vector-scalar operators)
......@@ -65,15 +54,15 @@ int main(void) {
status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, false);
// Run blocked GEMM test
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
// Simple GEMM unit test
status |= gemm_test(64, 64, 64, true);
status |= gemm_test(4 * VTA_BATCH, 4 * VTA_BLOCK_OUT, 4 * VTA_BLOCK_IN, false);
return status;
}
......@@ -18,7 +18,6 @@
*/
/*!
* Copyright (c) 2018 by Contributors
* \file vta.h
* \brief Type definitions and prototype for VTA HLS design.
*/
......@@ -32,6 +31,16 @@
#include <vta/hw_spec.h>
/*!
* Define HLS stream depth
*/
#define PRAGMA_SUB(x) _Pragma (#x)
#define PRAGMA_HLS(x) PRAGMA_SUB(x)
#define STREAM_IN_DEPTH 8
/* \typedef bus_T memory bus datatype*/
typedef ap_uint<VTA_BUS_WIDTH> bus_T;
/* \typedef uop_T Micro-op datatype*/
typedef ap_uint<VTA_UOP_WIDTH> uop_T;
......@@ -53,18 +62,6 @@ typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype*/
typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+VTA_LOG_BLOCK_IN+1> sum_T;
/* \typedef inp_vec_T Input vector datatype*/
typedef ap_uint<VTA_INP_WIDTH*VTA_BLOCK_IN> inp_vec_T;
/* \typedef wgt_vec_T Weight vector datatype*/
typedef ap_uint<VTA_WGT_WIDTH*VTA_BLOCK_IN> wgt_vec_T;
/* \typedef acc_vec_T Accumulator vector datatype*/
typedef ap_uint<VTA_ACC_WIDTH*VTA_BLOCK_OUT> acc_vec_T;
/* \typedef out_vec_T Output vector datatype*/
typedef ap_uint<VTA_OUT_WIDTH*VTA_BLOCK_OUT> out_vec_T;
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<VTA_LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
......@@ -107,18 +104,14 @@ typedef ap_uint<VTA_MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_opcode_T ALU operation immediate datatype*/
/* \typedef aluop_imm_T ALU operation immediate datatype*/
typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/
typedef ap_int<VTA_LOG_ACC_WIDTH> aluop_sh_imm_T;
/* \typedef aluop_shr_arg_T ALU operation shift right immediate datatype*/
typedef ap_int<VTA_SHR_ARG_BIT_WIDTH> aluop_shr_arg_T;
/*!
* Define HLS stream depth
*/
#define PRAGMA_SUB(x) _Pragma (#x)
#define PRAGMA_HLS(x) PRAGMA_SUB(x)
#define STREAM_IN_DEPTH 8
/* \typedef aluop_mul_arg_T ALU operation multiply datatype*/
typedef ap_int<VTA_MUL_ARG_BIT_WIDTH> aluop_mul_arg_T;
/*!
* \brief Fetch module.
......@@ -153,13 +146,13 @@ void fetch(
* \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
*/
void load(
volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights,
volatile bus_T *inputs,
volatile bus_T *weights,
hls::stream<insn_T> &load_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &l2g_dep_queue,
inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]);
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]);
/*!
* \brief Compute module.
......@@ -187,15 +180,15 @@ void load(
void compute(
volatile uint32_t &done,
volatile uop_T *uops,
volatile acc_vec_T *biases,
volatile bus_T *biases,
hls::stream<insn_T> &gemm_queue,
hls::stream<bool> &l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue,
out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT],
out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]);
/*!
* \brief Store module.
......@@ -211,11 +204,11 @@ void compute(
* \param out_mem Local output SRAM buffer. Read only single port BRAM.
*/
void store(
volatile out_vec_T *outputs,
volatile bus_T *outputs,
hls::stream<insn_T> &store_queue,
hls::stream<bool> &g2s_dep_queue,
hls::stream<bool> &s2g_dep_queue,
out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]);
/*!
* \brief VTA wrapper for simulation purpose only.
......@@ -232,9 +225,9 @@ void vta(
uint32_t insn_count,
volatile insn_T *insns,
volatile uop_T *uops,
volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights,
volatile acc_vec_T *biases,
volatile out_vec_T *outputs);
volatile bus_T *inputs,
volatile bus_T *weights,
volatile bus_T *biases,
volatile bus_T *outputs);
#endif // VTA_VTA_H_
......@@ -136,19 +136,23 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size);
/*!
* \brief Flushes the region of memory out of the CPU cache to DRAM.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed.
* This need to be the physical address.
* \param vir_addr Pointer to memory region allocated with VTAMemAlloc to be flushed.
* This need to be the virtual address.
* \param phy_addr Pointer to memory region allocated with VTAMemAlloc to be flushed.
* This need to be the physical address.
* \param size Size of the region to flush in Bytes.
*/
void VTAFlushCache(vta_phy_addr_t buf, int size);
void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size);
/*!
* \brief Invalidates the region of memory that is cached.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated.
* This need to be the physical address.
* \param vir_addr Pointer to memory region allocated with VTAMemAlloc to be invalidated.
* This need to be the virtual address.
* \param phy_addr Pointer to memory region allocated with VTAMemAlloc to be invalidated.
* This need to be the physical address.
* \param size Size of the region to invalidate in Bytes.
*/
void VTAInvalidateCache(vta_phy_addr_t buf, int size);
void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size);
#ifdef __cplusplus
}
......
......@@ -45,10 +45,11 @@ def get_bitstream_path():
# Derive destination path
cache_dir = os.getenv("VTA_CACHE_PATH", os.path.join(os.getenv("HOME"), ".vta_cache/"))
cache_dir = os.path.join(cache_dir, env.TARGET)
cache_dir = os.path.join(cache_dir, env.HW_VER.replace('.', '_'))
# Create the directory if it didn't exist
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
bit_path = os.path.join(cache_dir, env.BITSTREAM)
bit_path = os.path.join(cache_dir, env.BITSTREAM) + ".bit"
return bit_path
......@@ -63,7 +64,7 @@ def download_bitstream():
bit = get_bitstream_path()
url = os.path.join(BITSTREAM_URL, env.TARGET)
url = os.path.join(url, env.HW_VER)
url = os.path.join(url, env.BITSTREAM)
url = os.path.join(url, env.BITSTREAM + ".bit")
try:
download(url, bit)
......
......@@ -113,15 +113,9 @@ class Environment(object):
# initialization function
def __init__(self, cfg):
self.__dict__.update(cfg)
for key in PkgConfig.cfg_keys:
if key not in cfg:
raise ValueError("Expect key %s in cfg" % key)
# derive output buffer size
self.LOG_OUT_BUFF_SIZE = (
self.LOG_ACC_BUFF_SIZE +
self.LOG_OUT_WIDTH -
self.LOG_ACC_WIDTH)
# Produce the derived parameters and update dict
self.pkg = self.pkg_config(cfg)
self.__dict__.update(self.pkg.cfg_dict)
# data type width
self.INP_WIDTH = 1 << self.LOG_INP_WIDTH
self.WGT_WIDTH = 1 << self.LOG_WGT_WIDTH
......@@ -154,25 +148,15 @@ class Environment(object):
self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8
self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8
self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8
# Configuration bitstream name
self.BITSTREAM = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}.bit".format(
(1 << cfg["LOG_BATCH"]),
(1 << cfg["LOG_BLOCK_IN"]),
(1 << cfg["LOG_BLOCK_OUT"]),
(1 << cfg["LOG_INP_WIDTH"]),
(1 << cfg["LOG_WGT_WIDTH"]),
cfg["LOG_UOP_BUFF_SIZE"],
cfg["LOG_INP_BUFF_SIZE"],
cfg["LOG_WGT_BUFF_SIZE"],
cfg["LOG_ACC_BUFF_SIZE"],
cfg["HW_FREQ"],
cfg["HW_CLK_TARGET"],
cfg["HW_VER"].replace('.', '_'))
# dtypes
self.acc_dtype = "int%d" % self.ACC_WIDTH
self.inp_dtype = "int%d" % self.INP_WIDTH
self.wgt_dtype = "int%d" % self.WGT_WIDTH
self.out_dtype = "int%d" % self.OUT_WIDTH
# bistream name
self.BITSTREAM = self.pkg.bitstream
# model string
self.MODEL = self.TARGET + "_" + self.BITSTREAM
# lazy cached members
self.mock_mode = False
self._mock_env = None
......@@ -187,11 +171,15 @@ class Environment(object):
def __exit__(self, ptype, value, trace):
Environment.current = self._last_env
def pkg_config(self):
def pkg_config(self, cfg):
"""PkgConfig instance"""
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
proj_root = os.path.abspath(os.path.join(curr_path, "../../"))
return PkgConfig(self.__dict__, proj_root)
return PkgConfig(cfg, proj_root)
@property
def cfg_dict(self):
return self.pkg.cfg_dict
@property
def dev(self):
......@@ -236,13 +224,15 @@ class Environment(object):
@property
def target(self):
return tvm.target.vta(model=self.TARGET)
return tvm.target.vta(model=self.MODEL)
@property
def target_host(self):
"""The target host"""
if self.TARGET == "pynq":
return "llvm -target=armv7-none-linux-gnueabihf"
if self.TARGET == "ultra96":
return "llvm -target=aarch64-linux-gnu"
if self.TARGET == "sim" or self.TARGET == "tsim":
return "llvm"
raise ValueError("Unknown target %s" % self.TARGET)
......@@ -316,21 +306,18 @@ def coproc_dep_pop(op):
def _init_env():
"""Iniitalize the default global env"""
"""Initialize the default global env"""
curr_path = os.path.dirname(
os.path.abspath(os.path.expanduser(__file__)))
proj_root = os.path.abspath(os.path.join(curr_path, "../../../"))
path_list = [
os.path.join(curr_path, "vta_config.json"),
os.path.join(proj_root, "build", "vta_config.json"),
os.path.join(proj_root, "vta_config.json"),
os.path.join(proj_root, "vta/config/vta_config.json")
]
path_list = [p for p in path_list if os.path.exists(p)]
if not path_list:
raise RuntimeError(
"Error: {} not found.make sure you have config.json in your vta root"
.format(filename))
return Environment(json.load(open(path_list[0])))
"Error: vta_config.json not found.")
cfg = json.load(open(path_list[0]))
return Environment(cfg)
Environment.current = _init_env()
......@@ -48,9 +48,12 @@ def pynq_bitstream_program(bitstream_path):
bitstream.download()
def bitstream_program(target, bitstream):
if target == 'pynq':
if target in ['pynq', 'ultra96']:
pynq_bitstream_program(bitstream)
elif target != 'sim':
elif target in ['sim', 'tsim']:
# In simulation, bit stream programming is a no-op
return
else:
raise RuntimeError("Unknown target {}".format(target))
if __name__ == "__main__":
......
......@@ -30,7 +30,7 @@ def reconfig_runtime(remote):
"""
env = get_env()
freconfig = remote.get_function("tvm.contrib.vta.reconfig_runtime")
freconfig(env.pkg_config().cfg_json)
freconfig(env.pkg.cfg_json)
def program_fpga(remote, bitstream=None):
......
......@@ -33,7 +33,6 @@ def run(run_func):
env = get_env()
if env.TARGET in ["sim", "tsim"]:
# Talk to local RPC if necessary to debug RPC server.
# Compile vta on your host with make at the root.
# Make sure TARGET is set to "sim" in the config.json file.
......@@ -53,21 +52,20 @@ def run(run_func):
assert simulator.enabled()
run_func(env, rpc.LocalSession())
elif env.TARGET == "pynq":
elif env.TARGET in ["pynq", "ultra96"]:
# The environment variables below should be set if we are using
# a tracker to obtain a remote for a test device
tracket_host = os.environ.get("TVM_TRACKER_HOST", None)
tracket_port = os.environ.get("TVM_TRACKER_PORT", None)
tracker_host = os.environ.get("TVM_TRACKER_HOST", None)
tracker_port = os.environ.get("TVM_TRACKER_PORT", None)
# Otherwise, we can set the variables below to directly
# obtain a remote from a test device
pynq_host = os.environ.get("VTA_PYNQ_RPC_HOST", None)
pynq_port = os.environ.get("VTA_PYNQ_RPC_PORT", None)
# Run device from fleet node if env variables are defined
if tracket_host and tracket_port:
if tracker_host and tracker_port:
remote = autotvm.measure.request_remote(env.TARGET,
tracket_host,
int(tracket_port),
tracker_host,
int(tracker_port),
timeout=10000)
run_func(env, remote)
else:
......@@ -78,3 +76,6 @@ def run(run_func):
else:
raise RuntimeError(
"Please set the VTA_PYNQ_RPC_HOST and VTA_PYNQ_RPC_PORT environment variables")
else:
raise RuntimeError("Unknown target %s" % env.TARGET)
......@@ -15,12 +15,9 @@
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2018 by Contributors
*
* \file pynq_driver.c
* \brief VTA driver for Pynq board.
* \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io).
*/
#include <vta/driver.h>
......@@ -53,19 +50,19 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
void VTAFlushCache(vta_phy_addr_t buf, int size) {
// Call the xlnkFlushCache on the CMA buffer
void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
// Call the cma_flush_cache on the CMA buffer
// so that the FPGA can read the buffer data.
xlnkFlushCache(reinterpret_cast<void*>(buf), size);
cma_flush_cache(vir_addr, phy_addr, size);
}
void VTAInvalidateCache(vta_phy_addr_t buf, int size) {
// Call the xlnkInvalidateCache on the CMA buffer
void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
// Call the cma_invalidate_cache on the CMA buffer
// so that the host needs to read the buffer data.
xlnkInvalidateCache(reinterpret_cast<void*>(buf), size);
cma_invalidate_cache(vir_addr, phy_addr, size);
}
void *VTAMapRegister(uint32_t addr, size_t length) {
void *VTAMapRegister(uint32_t addr) {
// Align the base address with the pages
uint32_t virt_base = addr & ~(getpagesize() - 1);
// Calculate base address offset w.r.t the base address
......@@ -73,16 +70,16 @@ void *VTAMapRegister(uint32_t addr, size_t length) {
// Open file and mmap
uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC);
return mmap(NULL,
(length+virt_offset),
(VTA_IP_REG_MAP_RANGE + virt_offset),
PROT_READ|PROT_WRITE,
MAP_SHARED,
mmap_file,
virt_base);
}
void VTAUnmapRegister(void *vta, size_t length) {
void VTAUnmapRegister(void *vta) {
// Unmap memory
int status = munmap(vta, length);
int status = munmap(vta, VTA_IP_REG_MAP_RANGE);
assert(status == 0);
}
......@@ -98,39 +95,30 @@ class VTADevice {
public:
VTADevice() {
// VTA stage handles
vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR);
vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR);
vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR);
vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR);
}
~VTADevice() {
// Close VTA stage handle
VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE);
VTAUnmapRegister(vta_load_handle_, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle_, VTA_RANGE);
VTAUnmapRegister(vta_store_handle_, VTA_RANGE);
VTAUnmapRegister(vta_fetch_handle_);
VTAUnmapRegister(vta_load_handle_);
VTAUnmapRegister(vta_compute_handle_);
VTAUnmapRegister(vta_store_handle_);
}
int Run(vta_phy_addr_t insn_phy_addr,
uint32_t insn_count,
uint32_t wait_cycles) {
// NOTE: Register address map is derived from the auto-generated
// driver files available under hardware/build/vivado/<design>/export/driver
// FETCH @ 0x10 : Data signal of insn_count_V
VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_count);
// FETCH @ 0x18 : Data signal of insns_V
VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_phy_addr);
// LOAD @ 0x10 : Data signal of inputs_V
VTAWriteMappedReg(vta_load_handle_, 0x10, 0);
// LOAD @ 0x18 : Data signal of weight_V
VTAWriteMappedReg(vta_load_handle_, 0x18, 0);
// COMPUTE @ 0x20 : Data signal of uops_V
VTAWriteMappedReg(vta_compute_handle_, 0x20, 0);
// COMPUTE @ 0x28 : Data signal of biases_V
VTAWriteMappedReg(vta_compute_handle_, 0x28, 0);
// STORE @ 0x10 : Data signal of outputs_V
VTAWriteMappedReg(vta_store_handle_, 0x10, 0);
VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_COUNT_OFFSET, insn_count);
VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy_addr);
VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_INP_ADDR_OFFSET, 0);
VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_WGT_ADDR_OFFSET, 0);
VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_UOP_ADDR_OFFSET, 0);
VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_BIAS_ADDR_OFFSET, 0);
VTAWriteMappedReg(vta_store_handle_, VTA_STORE_OUT_ADDR_OFFSET, 0);
// VTA start
VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START);
......@@ -141,7 +129,7 @@ class VTADevice {
// Loop until the VTA is done
unsigned t, flag = 0;
for (t = 0; t < wait_cycles; ++t) {
flag = VTAReadMappedReg(vta_compute_handle_, 0x18);
flag = VTAReadMappedReg(vta_compute_handle_, VTA_COMPUTE_DONE_RD_OFFSET);
if (flag == VTA_DONE) break;
std::this_thread::yield();
}
......
......@@ -6,21 +6,18 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2018 by Contributors
* \file vta_pynq_driver.h
* \brief VTA driver for Pynq board.
*
* \file pynq_driver.h
* \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io).
*/
#ifndef VTA_PYNQ_PYNQ_DRIVER_H_
......@@ -41,23 +38,21 @@ extern "C" {
#include <time.h>
#include <unistd.h>
#ifdef __arm__
#if defined(__arm__) || defined(__aarch64__)
#include <libxlnk_cma.h>
#else
void* cma_alloc(size_t size, int cached);
void cma_free(void* buf);
uint32_t cma_get_phy_addr(void* buf);
void cma_flush_cache(void* buf, unsigned int phys_addr, int size);
void cma_invalidate_cache(void* buf, unsigned int phys_addr, int size);
#endif
void xlnkFlushCache(void* buf, int size);
void xlnkInvalidateCache(void* buf, int size);
void *VTAMapRegister(uint32_t addr, size_t length);
void VTAUnmapRegister(void *vta, size_t length);
void *VTAMapRegister(uint32_t addr);
void VTAUnmapRegister(void *vta);
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val);
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
/*! \brief VTA configuration register address range */
#define VTA_RANGE 0x100
/*! \brief VTA configuration register start value */
#define VTA_START 0x1
/*! \brief VTA configuration register auto-restart value */
......@@ -65,27 +60,6 @@ uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
/*! \brief VTA configuration register done value */
#define VTA_DONE 0x1
/*! \brief VTA fetch stage configuration register address
* from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_FETCH_ADDR 0x43C00000
/*! \brief VTA compute stage configuration register address
* from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_COMPUTE_ADDR 0x43C10000
/*! \brief VTA compute stage configuration register address
* from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_LOAD_ADDR 0x43C20000
/*! \brief VTA store stage configuration register address
* from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_STORE_ADDR 0x43C30000
#ifdef __cplusplus
}
#endif
......
......@@ -44,8 +44,10 @@ namespace vta {
static_assert(VTA_UOP_WIDTH == sizeof(VTAUop) * 8,
"VTA_UOP_WIDTH do not match VTAUop size");
/*! \brief Enable coherent access between VTA and CPU (used on shared mem systems). */
static const bool kBufferCoherent = true;
/*! \brief Enable coherent access of data buffers between VTA and CPU */
static const bool kBufferCoherent = VTA_COHERENT_ACCESSES;
/*! \brief Always cache buffers (otherwise, write back to DRAM from CPU) */
static const bool kAlwaysCache = true;
/*!
* \brief Data buffer represents data on CMA.
......@@ -65,8 +67,10 @@ struct DataBuffer {
* \param size The size of the data.
*/
void InvalidateCache(size_t offset, size_t size) {
if (!kBufferCoherent) {
VTAInvalidateCache(phy_addr_ + offset, size);
if (!kBufferCoherent && kAlwaysCache) {
VTAInvalidateCache(reinterpret_cast<char *>(data_) + offset,
phy_addr_ + offset,
size);
}
}
/*!
......@@ -75,8 +79,10 @@ struct DataBuffer {
* \param size The size of the data.
*/
void FlushCache(size_t offset, size_t size) {
if (!kBufferCoherent) {
VTAFlushCache(phy_addr_ + offset, size);
if (!kBufferCoherent && kAlwaysCache) {
VTAFlushCache(reinterpret_cast<char *>(data_) + offset,
phy_addr_ + offset,
size);
}
}
/*!
......@@ -102,7 +108,7 @@ struct DataBuffer {
* \param size The size of the buffer.
*/
static DataBuffer* Alloc(size_t size) {
void* data = VTAMemAlloc(size, 1);
void* data = VTAMemAlloc(size, kAlwaysCache);
CHECK(data != nullptr);
DataBuffer* buffer = new DataBuffer();
buffer->data_ = data;
......@@ -469,7 +475,9 @@ class UopQueue : public BaseQueue<VTAUop> {
// Flush if we're using a shared memory system
// and if interface is non-coherent
if (!coherent_ && always_cache_) {
VTAFlushCache(fpga_buff_phy_, offset);
VTAFlushCache(fpga_buff_,
fpga_buff_phy_,
offset);
}
}
......@@ -860,7 +868,9 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
// Flush if we're using a shared memory system
// and if interface is non-coherent
if (!coherent_ && always_cache_) {
VTAFlushCache(fpga_buff_phy_, buff_size);
VTAFlushCache(fpga_buff_,
fpga_buff_phy_,
buff_size);
}
}
......@@ -1302,9 +1312,9 @@ class CommandQueue {
// The kernel we are currently recording
UopKernel* record_kernel_{nullptr};
// Micro op queue
UopQueue<VTA_MAX_XFER, true, true> uop_queue_;
UopQueue<VTA_MAX_XFER, kBufferCoherent, kAlwaysCache> uop_queue_;
// instruction queue
InsnQueue<VTA_MAX_XFER, true, true> insn_queue_;
InsnQueue<VTA_MAX_XFER, kBufferCoherent, kAlwaysCache> insn_queue_;
// Device handle
VTADeviceHandle device_{nullptr};
#ifdef USE_TSIM
......
......@@ -615,10 +615,10 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
void VTAFlushCache(vta_phy_addr_t buf, int size) {
void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
}
void VTAInvalidateCache(vta_phy_addr_t buf, int size) {
void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
}
VTADeviceHandle VTADeviceAlloc() {
......
......@@ -228,10 +228,10 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
void VTAFlushCache(vta_phy_addr_t buf, int size) {
void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
}
void VTAInvalidateCache(vta_phy_addr_t buf, int size) {
void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
}
VTADeviceHandle VTADeviceAlloc() {
......
......@@ -18,7 +18,6 @@
*/
/*!
* Copyright (c) 2018 by Contributors
* \file test_lib.cpp
* \brief Test library for the VTA design simulation and driver tests.
*/
......@@ -40,7 +39,6 @@
#include "../../../src/pynq/pynq_driver.h"
#endif // VTA_TARGET_PYNQ
typedef uint64_t axi_T;
typedef uint32_t uop_T;
typedef int8_t wgt_T;
typedef int8_t inp_T;
......@@ -95,15 +93,25 @@ template <typename T, int T_WIDTH>
void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block);
/*!
* \brief Allocates and initializes a 2D array in the heap.
* \brief Allocates and randomly initializes a 2D array in the heap.
* \param rows Number of rows.
* \param cols Number of columns.
* \return Pointer to the 2D array.
*/
template <typename T, int T_WIDTH>
template <typename T>
T ** allocInit2dArray(int rows, int cols);
/*!
* \brief Allocates and initializes a 2D array to a set value in the heap.
* \param rows Number of rows.
* \param cols Number of columns.
* \param val Value to set the whole array to.
* \return Pointer to the 2D array.
*/
template <typename T>
T ** allocSet2dArray(int rows, int cols, int val);
/*!
* \brief Allocates a 2D array in the heap.
* \param rows Number of rows.
* \param cols Number of columns.
......
......@@ -24,7 +24,7 @@ def test_env():
def test_env_scope():
env = vta.get_env()
cfg = env.pkg_config().cfg_dict
cfg = env.cfg_dict
cfg["TARGET"] = "xyz"
with vta.Environment(cfg):
assert vta.get_env().TARGET == "xyz"
......
......@@ -100,9 +100,9 @@ if env.TARGET not in ["sim", "tsim"]:
# the host, make sure you've set the variables below to the IP of
# your board.
device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
device_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))
device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091")
if not tracker_host or not tracker_port:
remote = rpc.connect(device_host, device_port)
remote = rpc.connect(device_host, int(device_port))
else:
remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment