Commit 56a0dea8 by Thierry Moreau Committed by Tianqi Chen

[REFACTOR] Macro standardization, lint tests (#7)

* code refactoring

* code refactoring

* code refactoring

* code refactoring

* fixing macro

* refactoring, tests, makefile

* style - making sure lint test pass

* prefixed macros with VTA, fixed bugs
parent 28a10b69
......@@ -76,7 +76,7 @@ lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ)
lint: pylint cpplint
cpplint:
python nnvm/dmlc-core/scripts/lint.py vta cpp include src
python nnvm/dmlc-core/scripts/lint.py vta cpp include src hardware tests
pylint:
pylint python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc
......
# Directories
ROOTDIR = $(CURDIR)
BUILD_DIR = $(ROOTDIR)/build
BUILD_DIR = $(ROOTDIR)/../../build/hardware/vivado
SCRIPT_DIR = $(ROOTDIR)/scripts
SRC_DIR = $(ROOTDIR)/src
SIM_DIR = $(ROOTDIR)/sim
......@@ -27,20 +27,21 @@ include $(config)
#--------------------
# Number of threads during compilation
NUM_THREADS = 8
VTA_HW_COMP_THREADS = 8
# Target Frequency
CLOCK_FREQ = 100
VTA_HW_COMP_CLOCK_FREQ = 100
# Timing closure compensation (0 for none, 3 for highest)
TIMING_CLOSURE_COMP = 0
VTA_HW_COMP_TIMING_COMP = 0
# Derive clock target period
TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" )
TARGET_PER = \
$(shell echo "$$(( (1000 + $(VTA_HW_COMP_CLOCK_FREQ) - 1) / $(VTA_HW_COMP_CLOCK_FREQ) - $(VTA_HW_COMP_TIMING_COMP)))" )
# Derive config name
CONF = \
$(BATCH)x$(IN_BLOCK)x$(OUT_BLOCK)_$(INP_WIDTH)bx$(WGT_WIDTH)b_$(CLOCK_FREQ)MHz_$(TARGET_PER)ns
$(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
......@@ -53,23 +54,23 @@ ip:
cd $(IP_BUILD_PATH) && \
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
$(LOG_BATCH) $(LOG_BLOCK_OUT) $(LOG_BLOCK_IN) \
$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)
$(VTA_LOG_INP_WIDTH) $(VTA_LOG_WGT_WIDTH) $(VTA_LOG_ACC_WIDTH) $(VTA_LOG_OUT_WIDTH) \
$(VTA_LOG_BATCH) $(VTA_LOG_BLOCK_OUT) $(VTA_LOG_BLOCK_IN) \
$(VTA_LOG_UOP_BUFF_SIZE) $(VTA_LOG_INP_BUFF_SIZE) $(VTA_LOG_WGT_BUFF_SIZE) \
$(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE)
bit: ip
mkdir -p $(HW_BUILD_PATH)
cd $(HW_BUILD_PATH) && \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
-tclargs $(IP_BUILD_PATH) $(NUM_THREADS) $(CLOCK_FREQ) \
$(INP_WIDTH) $(WGT_WIDTH) $(OUT_WIDTH) \
$(BATCH) $(IN_BLOCK) $(OUT_BLOCK) \
$(INP_BUFF_SIZE) $(WGT_BUFF_SIZE) $(OUT_BUFF_SIZE)
-tclargs $(IP_BUILD_PATH) $(VTA_HW_COMP_THREADS) $(VTA_HW_COMP_CLOCK_FREQ) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
driver: bit
cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
cd $(HW_BUILD_PATH)/bsp && make
clean:
rm -rf build
\ No newline at end of file
rm -rf $(BUILD_DIR)
\ No newline at end of file
......@@ -63,12 +63,12 @@ if { [llength $argv] eq 19 } {
# C define flags to pass to compiler
set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
-DLOG_UOP_BUFF_SIZE=$uop_buff_size -DLOG_INP_BUFF_SIZE=$inp_buff_size \
-DLOG_WGT_BUFF_SIZE=$wgt_buff_size -DLOG_ACC_BUFF_SIZE=$acc_buff_size \
-DLOG_OUT_BUFF_SIZE=$out_buff_size"
-DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
-DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \
-DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
# This is necessary because of a Vivado restriction that doesn't allow for
......
......@@ -11,52 +11,49 @@
#include "../src/vta.h"
#include "../../../tests/hardware/common/test_lib.h"
int main(void)
{
#if DEBUG==1
int main(void) {
// The debug switch is VTA_DEBUG: that is the macro the HLS script defines
// on the compiler command line (-DVTA_DEBUG=...).
#if VTA_DEBUG == 1
printParameters();
#endif
// Buffer indexing
assert(LOG_ACC_BUFF_DEPTH>=LOG_INP_BUFF_DEPTH);
assert(VTA_LOG_ACC_BUFF_DEPTH >= VTA_LOG_INP_BUFF_DEPTH);
// Micro op bound
assert(UOP_GEM_3_1<UOP_WIDTH);
assert(UOP_ALU_3_1<UOP_WIDTH);
assert(VTA_UOP_GEM_3_1 < VTA_UOP_WIDTH);
assert(VTA_UOP_ALU_3_1 < VTA_UOP_WIDTH);
// Instruction alignment checks
assert(INSN_MEM_7_1<INSN_MEM_8_0);
assert(INSN_GEM_8_1<INSN_GEM_9_0);
assert(VTA_INSN_MEM_7_1 < VTA_INSN_MEM_8_0);
assert(VTA_INSN_GEM_8_1 < VTA_INSN_GEM_9_0);
// Instruction bounds
assert(INSN_MEM_E_1<INS_WIDTH);
assert(INSN_GEM_E_1<INS_WIDTH);
assert(INSN_ALU_F_1<INS_WIDTH);
assert(VTA_INSN_MEM_E_1 < VTA_INS_WIDTH);
assert(VTA_INSN_GEM_E_1 < VTA_INS_WIDTH);
assert(VTA_INSN_ALU_F_1 < VTA_INS_WIDTH);
int status = 0;
// Run ALU test (vector-scalar operators)
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MIN, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MIN, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, false);
// Run ALU test (vector-vector operators)
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MIN, false, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MIN, false, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, false);
// Run blocked GEMM test
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
return status;
}
......@@ -3,96 +3,96 @@
* \file vta.h
* \brief Type definitions and prototype for VTA HLS design.
*/
#ifndef VTA_MAIN_H_
#define VTA_MAIN_H_
#ifndef VTA_VTA_H_
#define VTA_VTA_H_
#include <assert.h>
#include <ap_axi_sdata.h>
#include <ap_int.h>
#include <assert.h>
#include <hls_stream.h>
#include <vta/hw_spec.h>
/* \typedef uop_T Micro-op datatype*/
typedef ap_uint<UOP_WIDTH> uop_T;
typedef ap_uint<VTA_UOP_WIDTH> uop_T;
/* \typedef inp_T Input datatype*/
typedef ap_int<INP_WIDTH> inp_T;
typedef ap_int<VTA_INP_WIDTH> inp_T;
/* \typedef wgt_T Weight datatype*/
typedef ap_int<WGT_WIDTH> wgt_T;
typedef ap_int<VTA_WGT_WIDTH> wgt_T;
/* \typedef out_T Output datatype*/
typedef ap_int<OUT_WIDTH> out_T;
typedef ap_int<VTA_OUT_WIDTH> out_T;
/* \typedef acc_T Accumulator datatype*/
typedef ap_int<ACC_WIDTH> acc_T;
typedef ap_int<VTA_ACC_WIDTH> acc_T;
/* \typedef mul_T Multiplier output datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+VTA_LOG_BLOCK_IN+1> sum_T;
/* \typedef inp_vec_T Input vector datatype*/
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
typedef ap_uint<VTA_INP_WIDTH*VTA_BLOCK_IN> inp_vec_T;
/* \typedef wgt_vec_T Weight vector datatype*/
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
typedef ap_uint<VTA_WGT_WIDTH*VTA_BLOCK_IN> wgt_vec_T;
/* \typedef acc_vec_T Accumulator vector datatype*/
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
typedef ap_uint<VTA_ACC_WIDTH*VTA_BLOCK_OUT> acc_vec_T;
/* \typedef out_vec_T Output vector datatype*/
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
typedef ap_uint<VTA_OUT_WIDTH*VTA_BLOCK_OUT> out_vec_T;
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
typedef ap_uint<VTA_LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
/* \typedef inp_idx_T Input SRAM index datatype*/
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
typedef ap_uint<VTA_LOG_INP_BUFF_DEPTH+1> inp_idx_T;
/* \typedef wgt_idx_T Weight SRAM index datatype*/
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
typedef ap_uint<VTA_LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
/* \typedef acc_idx_T Accumulator SRAM index datatype*/
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
typedef ap_uint<VTA_LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
/* \typedef opcode_T Opcode datatype*/
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
typedef ap_uint<VTA_OPCODE_BIT_WIDTH> opcode_T;
/* \typedef insn_T Instruction datatype*/
typedef ap_uint<INS_WIDTH> insn_T;
typedef ap_uint<VTA_INS_WIDTH> insn_T;
/* \typedef loop_T Loop bound datatype*/
typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
typedef ap_uint<VTA_LOOP_ITER_WIDTH> loop_T;
/* \typedef memop_id_T Memory operation ID datatype*/
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
typedef ap_uint<VTA_MEMOP_ID_BIT_WIDTH> memop_id_T;
/* \typedef memop_sram_T Memory operation SRAM index datatype*/
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
typedef ap_uint<VTA_MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
/* \typedef memop_dram_T Memory operation DRAM index datatype*/
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
typedef ap_uint<VTA_MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
/* \typedef memop_size_T Memory operation range datatype*/
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
typedef ap_uint<VTA_MEMOP_SIZE_BIT_WIDTH> memop_size_T;
/* \typedef memop_stride_T Memory operation stride datatype*/
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
typedef ap_uint<VTA_MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
/* \typedef memop_pad_T Memory operation pad width datatype*/
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
typedef ap_uint<VTA_MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_imm_T ALU operation immediate datatype*/
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
typedef ap_uint<VTA_LOG_ACC_WIDTH> aluop_sh_imm_T;
/*!
* \brief Fetch module.
......@@ -104,12 +104,12 @@ typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
* \param store_queue Store instruction queue. AXI-stream FIFO.
*/
void fetch (
void fetch(
uint32_t insn_count,
volatile insn_T *insns,
hls::stream<insn_T> &load_queue,
hls::stream<insn_T> &gemm_queue,
hls::stream<insn_T> &store_queue);
hls::stream<insn_T> *load_queue,
hls::stream<insn_T> *gemm_queue,
hls::stream<insn_T> *store_queue);
/*!
* \brief Load module.
......@@ -126,15 +126,14 @@ void fetch (
* \param inp_mem Local input SRAM buffer. Write only single port BRAM.
* \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
*/
void load (
void load(
volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights,
hls::stream<insn_T> &load_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &l2g_dep_queue,
inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]
);
hls::stream<insn_T> *load_queue,
hls::stream<bool> *g2l_dep_queue,
hls::stream<bool> *l2g_dep_queue,
inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]);
/*!
* \brief Compute module.
......@@ -159,19 +158,18 @@ void load (
* \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
* \param out_mem Local output SRAM buffer. Write only single port BRAM.
*/
void compute (
volatile uint32_t &done,
void compute(
volatile uint32_t *done,
volatile uop_T *uops,
volatile acc_vec_T *biases,
hls::stream<insn_T> &gemm_queue,
hls::stream<bool> &l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue,
out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT],
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
);
hls::stream<insn_T> *gemm_queue,
hls::stream<bool> *l2g_dep_queue,
hls::stream<bool> *s2g_dep_queue,
hls::stream<bool> *g2l_dep_queue,
hls::stream<bool> *g2s_dep_queue,
out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT],
out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
/*!
* \brief Store module.
......@@ -186,13 +184,12 @@ void compute (
* AXI-stream FIFO.
* \param out_mem Local output SRAM buffer. Read only single port BRAM.
*/
void store (
void store(
volatile out_vec_T *outputs,
hls::stream<insn_T> &store_queue,
hls::stream<bool> &g2s_dep_queue,
hls::stream<bool> &s2g_dep_queue,
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
);
hls::stream<insn_T> *store_queue,
hls::stream<bool> *g2s_dep_queue,
hls::stream<bool> *s2g_dep_queue,
out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
/*!
* \brief VTA wrapper for simulation purpose only.
......@@ -205,7 +202,7 @@ void store (
* \param biases Bias data base address in DRAM. AXI-4 master port.
* \param outputs Output data base address in DRAM. AXI-4 master port.
*/
void vta (
void vta(
uint32_t insn_count,
volatile insn_T *insns,
volatile uop_T *uops,
......@@ -214,4 +211,4 @@ void vta (
volatile acc_vec_T *biases,
volatile out_vec_T *outputs);
#endif // VTA_MAIN_H_
\ No newline at end of file
#endif // VTA_VTA_H_
......@@ -14,10 +14,10 @@ extern "C" {
#include <stdlib.h>
#include <stdint.h>
/*! \brief Memory management constants with libxlnk_cma */
#define CACHED 1
/*! \brief Memory management constants with libxlnk_cma */
#define NOT_CACHED 0
/*! \brief Memory management constants */
#define VTA_CACHED 1
/*! \brief Memory management constants */
#define VTA_NOT_CACHED 0
/*! \brief VTA command handle */
typedef void * VTAHandle;
......
......@@ -27,70 +27,72 @@ ADD_LDFLAGS=
ADD_CFLAGS=
# the hardware target
TARGET=PYNQ_TARGET
TARGET = VTA_PYNQ_TARGET
#---------------------
# VTA hardware parameters
#--------------------
# Log of input/activation width in bits (default 3 -> 8 bits)
LOG_INP_WIDTH = 3
VTA_LOG_INP_WIDTH = 3
# Log of kernel weight width in bits (default 3 -> 8 bits)
LOG_WGT_WIDTH = 3
VTA_LOG_WGT_WIDTH = 3
# Log of accum width in bits (default 5 -> 32 bits)
LOG_ACC_WIDTH = 5
VTA_LOG_ACC_WIDTH = 5
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0
VTA_LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_BLOCK_IN = 4
VTA_LOG_BLOCK_IN = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_BLOCK_OUT = 4
VTA_LOG_BLOCK_OUT = 4
# Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15
VTA_LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15
VTA_LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15
VTA_LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17
VTA_LOG_ACC_BUFF_SIZE = 17
#---------------------
# Derived VTA hardware parameters
#--------------------
# Input width in bits
INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
VTA_INP_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_INP_WIDTH) ))" )
# Weight width in bits
WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
VTA_WGT_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_WGT_WIDTH) ))" )
# Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
VTA_LOG_OUT_WIDTH = $(VTA_LOG_INP_WIDTH)
# Output width in bits
OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
VTA_OUT_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_OUT_WIDTH) ))" )
# Tensor batch size
BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
VTA_BATCH = $(shell echo "$$(( 1 << $(VTA_LOG_BATCH) ))" )
# Tensor inner block size
IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_IN) ))" )
VTA_IN_BLOCK = $(shell echo "$$(( 1 << $(VTA_LOG_BLOCK_IN) ))" )
# Tensor outer block size
OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_OUT) ))" )
VTA_OUT_BLOCK = $(shell echo "$$(( 1 << $(VTA_LOG_BLOCK_OUT) ))" )
# Uop buffer size in Bytes
UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
VTA_UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_UOP_BUFF_SIZE) ))" )
# Inp buffer size in Bytes
INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
VTA_INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_INP_BUFF_SIZE) ))" )
# Wgt buffer size in Bytes
WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
VTA_WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_WGT_BUFF_SIZE) ))" )
# Acc buffer size in Bytes
ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
VTA_ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_ACC_BUFF_SIZE) ))" )
# Log of out buffer size in Bytes
LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
VTA_LOG_OUT_BUFF_SIZE = \
$(shell echo "$$(( $(VTA_LOG_ACC_BUFF_SIZE) + $(VTA_LOG_OUT_WIDTH) - $(VTA_LOG_ACC_WIDTH) ))" )
# Out buffer size in Bytes
OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
# Derived from VTA_LOG_OUT_BUFF_SIZE (the VTA_-prefixed log size computed above).
VTA_OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_OUT_BUFF_SIZE) ))" )
# Update ADD_CFLAGS
ADD_CFLAGS += \
-D$(TARGET) \
-DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_BLOCK_IN) -DLOG_BLOCK_OUT=$(LOG_BLOCK_OUT) \
-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
\ No newline at end of file
-DVTA_LOG_WGT_WIDTH=$(VTA_LOG_WGT_WIDTH) -DVTA_LOG_INP_WIDTH=$(VTA_LOG_INP_WIDTH) \
-DVTA_LOG_ACC_WIDTH=$(VTA_LOG_ACC_WIDTH) -DVTA_LOG_OUT_WIDTH=$(VTA_LOG_OUT_WIDTH) \
-DVTA_LOG_BATCH=$(VTA_LOG_BATCH) \
-DVTA_LOG_BLOCK_IN=$(VTA_LOG_BLOCK_IN) -DVTA_LOG_BLOCK_OUT=$(VTA_LOG_BLOCK_OUT) \
-DVTA_LOG_UOP_BUFF_SIZE=$(VTA_LOG_UOP_BUFF_SIZE) -DVTA_LOG_INP_BUFF_SIZE=$(VTA_LOG_INP_BUFF_SIZE) \
-DVTA_LOG_WGT_BUFF_SIZE=$(VTA_LOG_WGT_BUFF_SIZE) -DVTA_LOG_ACC_BUFF_SIZE=$(VTA_LOG_ACC_BUFF_SIZE) \
-DVTA_LOG_OUT_BUFF_SIZE=$(VTA_LOG_OUT_BUFF_SIZE)
......@@ -29,65 +29,61 @@ void VTAInvalidateCache(void* buf, int size) {
}
void *VTAMapRegister(uint32_t addr, size_t length) {
// Align the base address with the pages
uint32_t virt_base = addr & ~(getpagesize() - 1);
// Calculate base address offset w.r.t the base address
uint32_t virt_offset = addr - virt_base;
// Open file and mmap
uint32_t mmap_file = open(DEV_MEM_PATH, O_RDWR|O_SYNC);
return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base);
uint32_t mmap_file = open(VTA_PYNQ_DEV_MEM_PATH, O_RDWR|O_SYNC);
return mmap(NULL,
(length+virt_offset),
PROT_READ|PROT_WRITE,
MAP_SHARED,
mmap_file,
virt_base);
}
void VTAUnmapRegister(void *vta, size_t length) {
// Unmap memory
int status = munmap(vta, length);
assert(status==0);
assert(status == 0);
}
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
*((volatile uint32_t *) (((char *) base_addr) + offset)) = val;
*((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset)) = val;
}
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
return *((volatile uint32_t *) (((char *) base_addr) + offset));
return *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset));
}
void VTAProgram(const char* bitstream) {
int elem;
FILE *src, *dst, *partial;
partial = fopen(BS_IS_PARTIAL, "w");
partial = fopen(VTA_PYNQ_BS_IS_PARTIAL, "w");
if (partial == NULL) {
printf("Cannot open partial config file %s\n", BS_IS_PARTIAL);
printf("Cannot open partial config file %s\n", VTA_PYNQ_BS_IS_PARTIAL);
// fopen failed, so there is no stream to close here; passing NULL to
// fclose is undefined behavior.
exit(1);
}
fputc('0', partial);
fclose(partial);
src = fopen(bitstream, "rb");
if (src == NULL) {
printf("Cannot open bitstream %s\n", bitstream);
exit(1);
}
dst = fopen(BS_XDEVCFG, "wb");
dst = fopen(VTA_PYNQ_BS_XDEVCFG, "wb");
if (dst == NULL) {
printf("Cannot open device file %s\n", BS_XDEVCFG);
printf("Cannot open device file %s\n", VTA_PYNQ_BS_XDEVCFG);
// dst is NULL here and must not be passed to fclose; release the
// bitstream file that was opened successfully instead.
fclose(src);
exit(1);
}
}
elem = fgetc(src);
while (elem != EOF) {
fputc(elem, dst);
elem = fgetc(src);
}
fclose(src);
fclose(dst);
}
......@@ -4,8 +4,8 @@
* \brief VTA driver for Pynq board.
*/
#ifndef VTA_PYNQ_DRIVER_H_
#define VTA_PYNQ_DRIVER_H_
#ifndef VTA_PYNQ_PYNQ_DRIVER_H_
#define VTA_PYNQ_PYNQ_DRIVER_H_
#ifdef __cplusplus
extern "C" {
......@@ -32,17 +32,20 @@ void xlnkFlushCache(void* buf, int size);
void xlnkInvalidateCache(void* buf, int size);
#endif
/*! \brief partial bitstream status file path */
#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
/*! \brief bitstream destination file path */
#define BS_XDEVCFG "/dev/xdevcfg"
/*! \brief (Pynq only) Partial bitstream status file path */
#define VTA_PYNQ_BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
/*! \brief (Pynq only) Bitstream destination file path */
#define VTA_PYNQ_BS_XDEVCFG "/dev/xdevcfg"
/*! \brief Path to /dev/mem */
#define DEV_MEM_PATH "/dev/mem"
/*! \brief MMIO driver constant */
#define MMIO_WORD_LENGTH 4
/*! \brief MMIO driver constant */
#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
/*! \brief (Pynq only) Path to /dev/mem */
#define VTA_PYNQ_DEV_MEM_PATH "/dev/mem"
/*! \brief (Pynq only) MMIO driver constant: MMIO word length */
#define VTA_PYNQ_MMIO_WORD_LENGTH 4
/*! \brief (Pynq only) MMIO driver constant: mask clearing the bits below the MMIO word length */
#define VTA_PYNQ_MMIO_WORD_MASK (~(VTA_PYNQ_MMIO_WORD_LENGTH - 1))
/*! \brief Physically contiguous buffer size limit */
#define VTA_MAX_XFER (1<<22)
/*! \brief VTA configuration register address range */
#define VTA_RANGE 0x100
......@@ -74,10 +77,7 @@ void xlnkInvalidateCache(void* buf, int size);
*/
#define VTA_STORE_ADDR 0x43C30000
/*! \brief Buffer size limit */
#define MAX_XFER (1<<22)
#ifdef __cplusplus
}
#endif
#endif // VTA_PYNQ_DRIVER_H_
\ No newline at end of file
#endif // VTA_PYNQ_PYNQ_DRIVER_H_
\ No newline at end of file
// simply include the driver for now.
/*!
* Copyright (c) 2018 by Contributors
* \file vta_device_api.cc
* \brief VTA device API for TVM
*/
#include <tvm/runtime/registry.h>
#include <dmlc/thread_local.h>
#include <vta/runtime.h>
#include "../../tvm/src/runtime/workspace_pool.h"
#include "../../nnvm/tvm/src/runtime/workspace_pool.h"
namespace tvm {
namespace runtime {
......
......@@ -4,8 +4,8 @@
* \brief Test library for the VTA design simulation and driver tests.
*/
#ifndef VTA_TESTLIB_H_
#define VTA_TESTLIB_H_
#ifndef TESTS_HARDWARE_COMMON_TEST_LIB_H_
#define TESTS_HARDWARE_COMMON_TEST_LIB_H_
#include <assert.h>
#include <stdint.h>
......@@ -17,9 +17,9 @@
#include <vta/driver.h>
#ifdef PYNQ_TARGET
#ifdef VTA_PYNQ_TARGET
#include "../../../src/pynq/pynq_driver.h"
#endif //PYNQ_TARGET
#endif // VTA_PYNQ_TARGET
typedef uint64_t axi_T;
typedef uint32_t uop_T;
......@@ -28,7 +28,7 @@ typedef int8_t inp_T;
typedef int8_t out_T;
typedef int32_t acc_T;
uint64_t vta (
uint64_t vta(
uint32_t insn_count,
VTAGenericInsn *insns,
VTAUop *uops,
......@@ -37,11 +37,11 @@ uint64_t vta (
acc_T *biases,
inp_T *outputs);
#else //NO_SIM
#else // NO_SIM
#include "../../../hardware/vivado/src/vta.h"
#endif //NO_SIM
#endif // NO_SIM
/*!
* \brief Returns opcode string.
......@@ -300,4 +300,4 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
int virtual_threads);
#endif // VTA_TESTLIB_H_
\ No newline at end of file
#endif // TESTS_HARDWARE_COMMON_TEST_LIB_H_
......@@ -14,7 +14,7 @@
#include "../common/test_lib.h"
// VTA invocation (present the same abstraction as in the simulation tests)
uint64_t vta (
uint64_t vta(
uint32_t insn_count,
VTAGenericInsn *insns,
VTAUop *uops,
......@@ -22,24 +22,23 @@ uint64_t vta (
wgt_T *weights,
acc_T *biases,
inp_T *outputs) {
// Performance counter variables
uint64_t t_fpga;
struct timespec start, stop;
// Derive bitstream file
char bitstream[64];
char bitstream[128];
char str_batch_size[4];
char str_block_out_size[4];
char str_block_in_size[4];
char str_block_bit_width[4];
sprintf(str_batch_size, "%d", BATCH);
sprintf(str_block_out_size, "%d", BLOCK_OUT);
sprintf(str_block_in_size, "%d", BLOCK_IN);
sprintf(str_block_bit_width, "%d", WGT_WIDTH);
strcpy(bitstream, "vta.bit");
snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH);
snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT);
snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN);
snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH);
snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
#if DEBUG==1
#if VTA_DEBUG == 1
printf("INFO - Programming FPGA: %s!\n", bitstream);
#endif
......@@ -59,7 +58,7 @@ uint64_t vta (
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
#if DEBUG==1
#if VTA_DEBUG == 1
printf("INFO - Starting FPGA!\n");
#endif
......@@ -92,14 +91,13 @@ uint64_t vta (
if (flag & VTA_DONE) break;
}
if (t==10000000) {
if (t == 10000000) {
printf("\tWARNING: VTA TIMEOUT!!!!\n");
}
#if DEBUG==1
else {
#if VTA_DEBUG == 1
} else {
printf("INFO - FPGA Finished!\n");
}
#endif
}
clock_gettime(CLOCK_REALTIME, &stop);
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
......@@ -111,43 +109,40 @@ uint64_t vta (
VTAUnmapRegister(vta_store_handle, VTA_RANGE);
return t_fpga;
};
int main(void)
{
}
#if DEBUG==1
int main(void) {
#if VTA_DEBUG == 1
printParameters();
#endif
int status = 0;
// Run ALU test (vector-scalar operators)
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, false);
// Run ALU test (vector-vector operators)
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, false);
// Run blocked GEMM test
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
if (status==0) {
// Report the aggregated result of all unit tests (status is non-zero if any failed).
if (status == 0) {
printf("\nINFO - Unit tests successful!\n");
} else {
printf("\nINFO - Unit tests failed!\n");
}
return status;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment