Commit 56a0dea8 by Thierry Moreau Committed by Tianqi Chen

[REFACTOR] Macro standardization, lint tests (#7)

* code refactoring

* code refactoring

* code refactoring

* code refactoring

* fixing macro

* refactoring, tests, makefile

* style - making sure lint test pass

* prefixed macros with VTA, fixed bugs
parent 28a10b69
...@@ -76,7 +76,7 @@ lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ) ...@@ -76,7 +76,7 @@ lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ)
lint: pylint cpplint lint: pylint cpplint
cpplint: cpplint:
python nnvm/dmlc-core/scripts/lint.py vta cpp include src python nnvm/dmlc-core/scripts/lint.py vta cpp include src hardware tests
pylint: pylint:
pylint python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc pylint python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc
......
# Directories # Directories
ROOTDIR = $(CURDIR) ROOTDIR = $(CURDIR)
BUILD_DIR = $(ROOTDIR)/build BUILD_DIR = $(ROOTDIR)/../../build/hardware/vivado
SCRIPT_DIR = $(ROOTDIR)/scripts SCRIPT_DIR = $(ROOTDIR)/scripts
SRC_DIR = $(ROOTDIR)/src SRC_DIR = $(ROOTDIR)/src
SIM_DIR = $(ROOTDIR)/sim SIM_DIR = $(ROOTDIR)/sim
...@@ -27,20 +27,21 @@ include $(config) ...@@ -27,20 +27,21 @@ include $(config)
#-------------------- #--------------------
# Number of threads during compilation # Number of threads during compilation
NUM_THREADS = 8 VTA_HW_COMP_THREADS = 8
# Target Frequency # Target Frequency
CLOCK_FREQ = 100 VTA_HW_COMP_CLOCK_FREQ = 100
# Timing closure compensation (0 for none, 3 for highest) # Timing closure compensation (0 for none, 3 for highest)
TIMING_CLOSURE_COMP = 0 VTA_HW_COMP_TIMING_COMP = 0
# Derive clock target period # Derive clock target period
TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" ) TARGET_PER = \
$(shell echo "$$(( (1000 + $(VTA_HW_COMP_CLOCK_FREQ) - 1) / $(VTA_HW_COMP_CLOCK_FREQ) - $(VTA_HW_COMP_TIMING_COMP)))" )
# Derive config name # Derive config name
CONF = \ CONF = \
$(BATCH)x$(IN_BLOCK)x$(OUT_BLOCK)_$(INP_WIDTH)bx$(WGT_WIDTH)b_$(CLOCK_FREQ)MHz_$(TARGET_PER)ns $(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF) IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF) HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
...@@ -53,23 +54,23 @@ ip: ...@@ -53,23 +54,23 @@ ip:
cd $(IP_BUILD_PATH) && \ cd $(IP_BUILD_PATH) && \
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \ $(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \ -tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \ $(VTA_LOG_INP_WIDTH) $(VTA_LOG_WGT_WIDTH) $(VTA_LOG_ACC_WIDTH) $(VTA_LOG_OUT_WIDTH) \
$(LOG_BATCH) $(LOG_BLOCK_OUT) $(LOG_BLOCK_IN) \ $(VTA_LOG_BATCH) $(VTA_LOG_BLOCK_OUT) $(VTA_LOG_BLOCK_IN) \
$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \ $(VTA_LOG_UOP_BUFF_SIZE) $(VTA_LOG_INP_BUFF_SIZE) $(VTA_LOG_WGT_BUFF_SIZE) \
$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE) $(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE)
bit: ip bit: ip
mkdir -p $(HW_BUILD_PATH) mkdir -p $(HW_BUILD_PATH)
cd $(HW_BUILD_PATH) && \ cd $(HW_BUILD_PATH) && \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \ $(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
-tclargs $(IP_BUILD_PATH) $(NUM_THREADS) $(CLOCK_FREQ) \ -tclargs $(IP_BUILD_PATH) $(VTA_HW_COMP_THREADS) $(VTA_HW_COMP_CLOCK_FREQ) \
$(INP_WIDTH) $(WGT_WIDTH) $(OUT_WIDTH) \ $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \
$(BATCH) $(IN_BLOCK) $(OUT_BLOCK) \ $(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(INP_BUFF_SIZE) $(WGT_BUFF_SIZE) $(OUT_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
driver: bit driver: bit
cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
cd $(HW_BUILD_PATH)/bsp && make cd $(HW_BUILD_PATH)/bsp && make
clean: clean:
rm -rf build rm -rf $(BUILD_DIR)
\ No newline at end of file \ No newline at end of file
...@@ -63,12 +63,12 @@ if { [llength $argv] eq 19 } { ...@@ -63,12 +63,12 @@ if { [llength $argv] eq 19 } {
# C define flags to pass to compiler # C define flags to pass to compiler
set cflags "-I $include_dir -I $src_dir -I $test_dir \ set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \ -DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \ -DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \
-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \ -DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
-DLOG_UOP_BUFF_SIZE=$uop_buff_size -DLOG_INP_BUFF_SIZE=$inp_buff_size \ -DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
-DLOG_WGT_BUFF_SIZE=$wgt_buff_size -DLOG_ACC_BUFF_SIZE=$acc_buff_size \ -DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
-DLOG_OUT_BUFF_SIZE=$out_buff_size" -DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
# Initializes the HLS design and sets HLS pragmas for memory partitioning. # Initializes the HLS design and sets HLS pragmas for memory partitioning.
# This is necessary because of a Vivado restriction that doesn't allow for # This is necessary because of a Vivado restriction that doesn't allow for
......
...@@ -11,52 +11,49 @@ ...@@ -11,52 +11,49 @@
#include "../src/vta.h" #include "../src/vta.h"
#include "../../../tests/hardware/common/test_lib.h" #include "../../../tests/hardware/common/test_lib.h"
int main(void) int main(void) {
{ #if VTA_DEBUG == 1
#if DEBUG==1
printParameters(); printParameters();
#endif #endif
// Buffer indexing // Buffer indexing
assert(LOG_ACC_BUFF_DEPTH>=LOG_INP_BUFF_DEPTH); assert(VTA_LOG_ACC_BUFF_DEPTH >= VTA_LOG_INP_BUFF_DEPTH);
// Micro op bound // Micro op bound
assert(UOP_GEM_3_1<UOP_WIDTH); assert(VTA_UOP_GEM_3_1 < VTA_UOP_WIDTH);
assert(UOP_ALU_3_1<UOP_WIDTH); assert(VTA_UOP_ALU_3_1 < VTA_UOP_WIDTH);
// Instruction alignment checks // Instruction alignment checks
assert(INSN_MEM_7_1<INSN_MEM_8_0); assert(VTA_INSN_MEM_7_1 < VTA_INSN_MEM_8_0);
assert(INSN_GEM_8_1<INSN_GEM_9_0); assert(VTA_INSN_GEM_8_1 < VTA_INSN_GEM_9_0);
// Instruction bounds // Instruction bounds
assert(INSN_MEM_E_1<INS_WIDTH); assert(VTA_INSN_MEM_E_1 < VTA_INS_WIDTH);
assert(INSN_GEM_E_1<INS_WIDTH); assert(VTA_INSN_GEM_E_1 < VTA_INS_WIDTH);
assert(INSN_ALU_F_1<INS_WIDTH); assert(VTA_INSN_ALU_F_1 < VTA_INS_WIDTH);
int status = 0; int status = 0;
// Run ALU test (vector-scalar operators) // Run ALU test (vector-scalar operators)
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MIN, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MIN, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, false);
// Run ALU test (vector-vector operators) // Run ALU test (vector-vector operators)
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MIN, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MIN, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, false);
// Run blocked GEMM test // Run blocked GEMM test
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
return status; return status;
}
}
\ No newline at end of file
...@@ -3,96 +3,96 @@ ...@@ -3,96 +3,96 @@
* \file vta.h * \file vta.h
* \brief Type definitions and prototype for VTA HLS design. * \brief Type definitions and prototype for VTA HLS design.
*/ */
#ifndef VTA_MAIN_H_ #ifndef VTA_VTA_H_
#define VTA_MAIN_H_ #define VTA_VTA_H_
#include <assert.h>
#include <ap_axi_sdata.h> #include <ap_axi_sdata.h>
#include <ap_int.h> #include <ap_int.h>
#include <assert.h>
#include <hls_stream.h> #include <hls_stream.h>
#include <vta/hw_spec.h> #include <vta/hw_spec.h>
/* \typedef uop_T Micro-op datatype*/ /* \typedef uop_T Micro-op datatype*/
typedef ap_uint<UOP_WIDTH> uop_T; typedef ap_uint<VTA_UOP_WIDTH> uop_T;
/* \typedef inp_T Input datatype*/ /* \typedef inp_T Input datatype*/
typedef ap_int<INP_WIDTH> inp_T; typedef ap_int<VTA_INP_WIDTH> inp_T;
/* \typedef wgt_T Weight datatype*/ /* \typedef wgt_T Weight datatype*/
typedef ap_int<WGT_WIDTH> wgt_T; typedef ap_int<VTA_WGT_WIDTH> wgt_T;
/* \typedef out_T Output datatype*/ /* \typedef out_T Output datatype*/
typedef ap_int<OUT_WIDTH> out_T; typedef ap_int<VTA_OUT_WIDTH> out_T;
/* \typedef acc_T Accumulator datatype*/ /* \typedef acc_T Accumulator datatype*/
typedef ap_int<ACC_WIDTH> acc_T; typedef ap_int<VTA_ACC_WIDTH> acc_T;
/* \typedef mul_T Multiplier output datatype*/ /* \typedef mul_T Multiplier output datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T; typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype*/ /* \typedef sum_T GEMM accumulator datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T; typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+VTA_LOG_BLOCK_IN+1> sum_T;
/* \typedef inp_vec_T Input vector datatype*/ /* \typedef inp_vec_T Input vector datatype*/
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T; typedef ap_uint<VTA_INP_WIDTH*VTA_BLOCK_IN> inp_vec_T;
/* \typedef wgt_vec_T Weight vector datatype*/ /* \typedef wgt_vec_T Weight vector datatype*/
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T; typedef ap_uint<VTA_WGT_WIDTH*VTA_BLOCK_IN> wgt_vec_T;
/* \typedef acc_vec_T Accumulator vector datatype*/ /* \typedef acc_vec_T Accumulator vector datatype*/
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T; typedef ap_uint<VTA_ACC_WIDTH*VTA_BLOCK_OUT> acc_vec_T;
/* \typedef out_vec_T Output vector datatype*/ /* \typedef out_vec_T Output vector datatype*/
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T; typedef ap_uint<VTA_OUT_WIDTH*VTA_BLOCK_OUT> out_vec_T;
/* \typedef uop_idx_T Micro-op SRAM index datatype*/ /* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T; typedef ap_uint<VTA_LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
/* \typedef inp_idx_T Input SRAM index datatype*/ /* \typedef inp_idx_T Input SRAM index datatype*/
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T; typedef ap_uint<VTA_LOG_INP_BUFF_DEPTH+1> inp_idx_T;
/* \typedef wgt_idx_T Weight SRAM index datatype*/ /* \typedef wgt_idx_T Weight SRAM index datatype*/
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T; typedef ap_uint<VTA_LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
/* \typedef acc_idx_T Accumulator SRAM index datatype*/ /* \typedef acc_idx_T Accumulator SRAM index datatype*/
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T; typedef ap_uint<VTA_LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
/* \typedef opcode_T Opcode datatype*/ /* \typedef opcode_T Opcode datatype*/
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T; typedef ap_uint<VTA_OPCODE_BIT_WIDTH> opcode_T;
/* \typedef insn_T Instruction datatype*/ /* \typedef insn_T Instruction datatype*/
typedef ap_uint<INS_WIDTH> insn_T; typedef ap_uint<VTA_INS_WIDTH> insn_T;
/* \typedef loop_T Loop bound datatype*/ /* \typedef loop_T Loop bound datatype*/
typedef ap_uint<LOOP_ITER_WIDTH> loop_T; typedef ap_uint<VTA_LOOP_ITER_WIDTH> loop_T;
/* \typedef memop_id_T Memory operation ID datatype*/ /* \typedef memop_id_T Memory operation ID datatype*/
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T; typedef ap_uint<VTA_MEMOP_ID_BIT_WIDTH> memop_id_T;
/* \typedef memop_sram_T Memory operation SRAM index datatype*/ /* \typedef memop_sram_T Memory operation SRAM index datatype*/
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T; typedef ap_uint<VTA_MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
/* \typedef memop_dram_T Memory operation DRAM index datatype*/ /* \typedef memop_dram_T Memory operation DRAM index datatype*/
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T; typedef ap_uint<VTA_MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
/* \typedef memop_size_T Memory operation range datatype*/ /* \typedef memop_size_T Memory operation range datatype*/
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T; typedef ap_uint<VTA_MEMOP_SIZE_BIT_WIDTH> memop_size_T;
/* \typedef memop_stride_T Memory operation stride datatype*/ /* \typedef memop_stride_T Memory operation stride datatype*/
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T; typedef ap_uint<VTA_MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
/* \typedef memop_pad_T Memory operation pad width datatype*/ /* \typedef memop_pad_T Memory operation pad width datatype*/
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T; typedef ap_uint<VTA_MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/ /* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T; typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_imm_T ALU operation immediate datatype*/ /* \typedef aluop_imm_T ALU operation immediate datatype*/
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T; typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/ /* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T; typedef ap_uint<VTA_LOG_ACC_WIDTH> aluop_sh_imm_T;
/*! /*!
* \brief Fetch module. * \brief Fetch module.
...@@ -104,12 +104,12 @@ typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T; ...@@ -104,12 +104,12 @@ typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
* \param gemm_queue GEMM instruction queue. AXI-stream FIFO. * \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
* \param store_queue Store instruction queue. AXI-stream FIFO. * \param store_queue Store instruction queue. AXI-stream FIFO.
*/ */
void fetch ( void fetch(
uint32_t insn_count, uint32_t insn_count,
volatile insn_T *insns, volatile insn_T *insns,
hls::stream<insn_T> &load_queue, hls::stream<insn_T> *load_queue,
hls::stream<insn_T> &gemm_queue, hls::stream<insn_T> *gemm_queue,
hls::stream<insn_T> &store_queue); hls::stream<insn_T> *store_queue);
/*! /*!
* \brief Load module. * \brief Load module.
...@@ -126,15 +126,14 @@ void fetch ( ...@@ -126,15 +126,14 @@ void fetch (
* \param inp_mem Local input SRAM buffer. Write only single port BRAM. * \param inp_mem Local input SRAM buffer. Write only single port BRAM.
* \param wgt_mem Local weight SRAM buffer. Write only single port BRAM. * \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
*/ */
void load ( void load(
volatile inp_vec_T *inputs, volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights, volatile wgt_vec_T *weights,
hls::stream<insn_T> &load_queue, hls::stream<insn_T> *load_queue,
hls::stream<bool> &g2l_dep_queue, hls::stream<bool> *g2l_dep_queue,
hls::stream<bool> &l2g_dep_queue, hls::stream<bool> *l2g_dep_queue,
inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH], inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT] wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]);
);
/*! /*!
* \brief Compute module. * \brief Compute module.
...@@ -159,19 +158,18 @@ void load ( ...@@ -159,19 +158,18 @@ void load (
* \param wgt_mem Local weight SRAM buffer. Read only single port BRAM. * \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
* \param out_mem Local output SRAM buffer. Write only single port BRAM. * \param out_mem Local output SRAM buffer. Write only single port BRAM.
*/ */
void compute ( void compute(
volatile uint32_t &done, volatile uint32_t *done,
volatile uop_T *uops, volatile uop_T *uops,
volatile acc_vec_T *biases, volatile acc_vec_T *biases,
hls::stream<insn_T> &gemm_queue, hls::stream<insn_T> *gemm_queue,
hls::stream<bool> &l2g_dep_queue, hls::stream<bool> *l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue, hls::stream<bool> *s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue, hls::stream<bool> *g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue, hls::stream<bool> *g2s_dep_queue,
out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH], out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT], wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT],
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH] out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
);
/*! /*!
* \brief Store module. * \brief Store module.
...@@ -186,13 +184,12 @@ void compute ( ...@@ -186,13 +184,12 @@ void compute (
* AXI-stream FIFO. * AXI-stream FIFO.
* \param out_mem Local output SRAM buffer. Read only single port BRAM. * \param out_mem Local output SRAM buffer. Read only single port BRAM.
*/ */
void store ( void store(
volatile out_vec_T *outputs, volatile out_vec_T *outputs,
hls::stream<insn_T> &store_queue, hls::stream<insn_T> *store_queue,
hls::stream<bool> &g2s_dep_queue, hls::stream<bool> *g2s_dep_queue,
hls::stream<bool> &s2g_dep_queue, hls::stream<bool> *s2g_dep_queue,
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH] out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
);
/*! /*!
* \brief VTA wrapper for simulation purpose only. * \brief VTA wrapper for simulation purpose only.
...@@ -205,7 +202,7 @@ void store ( ...@@ -205,7 +202,7 @@ void store (
* \param biases Bias data base address in DRAM. AXI-4 master port. * \param biases Bias data base address in DRAM. AXI-4 master port.
* \param outputs Output data base address in DRAM. AXI-4 master port. * \param outputs Output data base address in DRAM. AXI-4 master port.
*/ */
void vta ( void vta(
uint32_t insn_count, uint32_t insn_count,
volatile insn_T *insns, volatile insn_T *insns,
volatile uop_T *uops, volatile uop_T *uops,
...@@ -214,4 +211,4 @@ void vta ( ...@@ -214,4 +211,4 @@ void vta (
volatile acc_vec_T *biases, volatile acc_vec_T *biases,
volatile out_vec_T *outputs); volatile out_vec_T *outputs);
#endif // VTA_MAIN_H_ #endif // VTA_VTA_H_
\ No newline at end of file
...@@ -14,10 +14,10 @@ extern "C" { ...@@ -14,10 +14,10 @@ extern "C" {
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
/*! \brief Memory management constants with libxlnk_cma */ /*! \brief Memory management constants */
#define CACHED 1 #define VTA_CACHED 1
/*! \brief Memory management constants with libxlnk_cma */ /*! \brief Memory management constants */
#define NOT_CACHED 0 #define VTA_NOT_CACHED 0
/*! \brief VTA command handle */ /*! \brief VTA command handle */
typedef void * VTAHandle; typedef void * VTAHandle;
...@@ -97,4 +97,4 @@ void VTAProgram(const char* bitstream); ...@@ -97,4 +97,4 @@ void VTAProgram(const char* bitstream);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif // VTA_DRIVER_H_ #endif // VTA_DRIVER_H_
...@@ -27,70 +27,72 @@ ADD_LDFLAGS= ...@@ -27,70 +27,72 @@ ADD_LDFLAGS=
ADD_CFLAGS= ADD_CFLAGS=
# the hardware target # the hardware target
TARGET=PYNQ_TARGET TARGET = VTA_PYNQ_TARGET
#--------------------- #---------------------
# VTA hardware parameters # VTA hardware parameters
#-------------------- #--------------------
# Log of input/activation width in bits (default 3 -> 8 bits) # Log of input/activation width in bits (default 3 -> 8 bits)
LOG_INP_WIDTH = 3 VTA_LOG_INP_WIDTH = 3
# Log of kernel weight width in bits (default 3 -> 8 bits) # Log of kernel weight width in bits (default 3 -> 8 bits)
LOG_WGT_WIDTH = 3 VTA_LOG_WGT_WIDTH = 3
# Log of accum width in bits (default 5 -> 32 bits) # Log of accum width in bits (default 5 -> 32 bits)
LOG_ACC_WIDTH = 5 VTA_LOG_ACC_WIDTH = 5
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication) # Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0 VTA_LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication) # Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_BLOCK_IN = 4 VTA_LOG_BLOCK_IN = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication) # Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_BLOCK_OUT = 4 VTA_LOG_BLOCK_OUT = 4
# Log of uop buffer size in Bytes # Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15 VTA_LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes # Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15 VTA_LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes # Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15 VTA_LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes # Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17 VTA_LOG_ACC_BUFF_SIZE = 17
#--------------------- #---------------------
# Derived VTA hardware parameters # Derived VTA hardware parameters
#-------------------- #--------------------
# Input width in bits # Input width in bits
INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" ) VTA_INP_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_INP_WIDTH) ))" )
# Weight width in bits # Weight width in bits
WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" ) VTA_WGT_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_WGT_WIDTH) ))" )
# Log of output width in bits # Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH) VTA_LOG_OUT_WIDTH = $(VTA_LOG_INP_WIDTH)
# Output width in bits # Output width in bits
OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" ) VTA_OUT_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_OUT_WIDTH) ))" )
# Tensor batch size # Tensor batch size
BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" ) VTA_BATCH = $(shell echo "$$(( 1 << $(VTA_LOG_BATCH) ))" )
# Tensor inner block size # Tensor inner block size
IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_IN) ))" ) VTA_IN_BLOCK = $(shell echo "$$(( 1 << $(VTA_LOG_BLOCK_IN) ))" )
# Tensor outer block size # Tensor outer block size
OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_OUT) ))" ) VTA_OUT_BLOCK = $(shell echo "$$(( 1 << $(VTA_LOG_BLOCK_OUT) ))" )
# Uop buffer size in Bytes # Uop buffer size in Bytes
UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" ) VTA_UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_UOP_BUFF_SIZE) ))" )
# Inp buffer size in Bytes # Inp buffer size in Bytes
INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" ) VTA_INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_INP_BUFF_SIZE) ))" )
# Wgt buffer size in Bytes # Wgt buffer size in Bytes
WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" ) VTA_WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_WGT_BUFF_SIZE) ))" )
# Acc buffer size in Bytes # Acc buffer size in Bytes
ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" ) VTA_ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_ACC_BUFF_SIZE) ))" )
# Log of out buffer size in Bytes # Log of out buffer size in Bytes
LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" ) VTA_LOG_OUT_BUFF_SIZE = \
$(shell echo "$$(( $(VTA_LOG_ACC_BUFF_SIZE) + $(VTA_LOG_OUT_WIDTH) - $(VTA_LOG_ACC_WIDTH) ))" )
# Out buffer size in Bytes # Out buffer size in Bytes
OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" ) VTA_OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_OUT_BUFF_SIZE) ))" )
# Update ADD_CFLAGS # Update ADD_CFLAGS
ADD_CFLAGS += \ ADD_CFLAGS += \
-D$(TARGET) \ -D$(TARGET) \
-DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \ -DVTA_LOG_WGT_WIDTH=$(VTA_LOG_WGT_WIDTH) -DVTA_LOG_INP_WIDTH=$(VTA_LOG_INP_WIDTH) \
-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \ -DVTA_LOG_ACC_WIDTH=$(VTA_LOG_ACC_WIDTH) -DVTA_LOG_OUT_WIDTH=$(VTA_LOG_OUT_WIDTH) \
-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_BLOCK_IN) -DLOG_BLOCK_OUT=$(LOG_BLOCK_OUT) \ -DVTA_LOG_BATCH=$(VTA_LOG_BATCH) \
-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \ -DVTA_LOG_BLOCK_IN=$(VTA_LOG_BLOCK_IN) -DVTA_LOG_BLOCK_OUT=$(VTA_LOG_BLOCK_OUT) \
-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \ -DVTA_LOG_UOP_BUFF_SIZE=$(VTA_LOG_UOP_BUFF_SIZE) -DVTA_LOG_INP_BUFF_SIZE=$(VTA_LOG_INP_BUFF_SIZE) \
-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE) -DVTA_LOG_WGT_BUFF_SIZE=$(VTA_LOG_WGT_BUFF_SIZE) -DVTA_LOG_ACC_BUFF_SIZE=$(VTA_LOG_ACC_BUFF_SIZE) \
\ No newline at end of file -DVTA_LOG_OUT_BUFF_SIZE=$(VTA_LOG_OUT_BUFF_SIZE)
...@@ -29,65 +29,61 @@ void VTAInvalidateCache(void* buf, int size) { ...@@ -29,65 +29,61 @@ void VTAInvalidateCache(void* buf, int size) {
} }
void *VTAMapRegister(uint32_t addr, size_t length) { void *VTAMapRegister(uint32_t addr, size_t length) {
// Align the base address with the pages // Align the base address with the pages
uint32_t virt_base = addr & ~(getpagesize() - 1); uint32_t virt_base = addr & ~(getpagesize() - 1);
// Calculate base address offset w.r.t the base address // Calculate base address offset w.r.t the base address
uint32_t virt_offset = addr - virt_base; uint32_t virt_offset = addr - virt_base;
// Open file and mmap // Open file and mmap
uint32_t mmap_file = open(DEV_MEM_PATH, O_RDWR|O_SYNC); uint32_t mmap_file = open(VTA_PYNQ_DEV_MEM_PATH, O_RDWR|O_SYNC);
return mmap(NULL,
return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base); (length+virt_offset),
PROT_READ|PROT_WRITE,
MAP_SHARED,
mmap_file,
virt_base);
} }
void VTAUnmapRegister(void *vta, size_t length) { void VTAUnmapRegister(void *vta, size_t length) {
// Unmap memory // Unmap memory
int status = munmap(vta, length); int status = munmap(vta, length);
assert(status==0); assert(status == 0);
} }
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) { void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
*((volatile uint32_t *) (((char *) base_addr) + offset)) = val; *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset)) = val;
} }
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) { uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
return *((volatile uint32_t *) (((char *) base_addr) + offset)); return *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset));
} }
void VTAProgram(const char* bitstream) { void VTAProgram(const char* bitstream) {
int elem; int elem;
FILE *src, *dst, *partial; FILE *src, *dst, *partial;
partial = fopen(VTA_PYNQ_BS_IS_PARTIAL, "w");
partial = fopen(BS_IS_PARTIAL, "w");
if (partial == NULL) { if (partial == NULL) {
printf("Cannot open partial config file %s\n", BS_IS_PARTIAL); printf("Cannot open partial config file %s\n", VTA_PYNQ_BS_IS_PARTIAL);
fclose(partial); fclose(partial);
exit(1); exit(1);
} }
fputc('0', partial); fputc('0', partial);
fclose(partial); fclose(partial);
src = fopen(bitstream, "rb"); src = fopen(bitstream, "rb");
if (src == NULL) { if (src == NULL) {
printf("Cannot open bitstream %s\n", bitstream); printf("Cannot open bitstream %s\n", bitstream);
exit(1); exit(1);
} }
dst = fopen(VTA_PYNQ_BS_XDEVCFG, "wb");
dst = fopen(BS_XDEVCFG, "wb");
if (dst == NULL) { if (dst == NULL) {
printf("Cannot open device file %s\n", BS_XDEVCFG); printf("Cannot open device file %s\n", VTA_PYNQ_BS_XDEVCFG);
fclose(dst); fclose(dst);
exit(1); exit(1);
} }
elem = fgetc(src); elem = fgetc(src);
while (elem != EOF) { while (elem != EOF) {
fputc(elem, dst); fputc(elem, dst);
elem = fgetc(src); elem = fgetc(src);
} }
fclose(src); fclose(src);
fclose(dst); fclose(dst);
}
}
\ No newline at end of file
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
* \brief VTA driver for Pynq board. * \brief VTA driver for Pynq board.
*/ */
#ifndef VTA_PYNQ_DRIVER_H_ #ifndef VTA_PYNQ_PYNQ_DRIVER_H_
#define VTA_PYNQ_DRIVER_H_ #define VTA_PYNQ_PYNQ_DRIVER_H_
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
...@@ -32,17 +32,20 @@ void xlnkFlushCache(void* buf, int size); ...@@ -32,17 +32,20 @@ void xlnkFlushCache(void* buf, int size);
void xlnkInvalidateCache(void* buf, int size); void xlnkInvalidateCache(void* buf, int size);
#endif #endif
/*! \brief partial bitstream status file path */ /*! \brief (Pynq only) Partial bitstream status file path */
#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream" #define VTA_PYNQ_BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
/*! \brief bitstream destination file path */ /*! \brief (Pynq only) Bitstream destination file path */
#define BS_XDEVCFG "/dev/xdevcfg" #define VTA_PYNQ_BS_XDEVCFG "/dev/xdevcfg"
/*! \brief Path to /dev/mem */ /*! \brief (Pynq only) Path to /dev/mem */
#define DEV_MEM_PATH "/dev/mem" #define VTA_PYNQ_DEV_MEM_PATH "/dev/mem"
/*! \brief MMIO driver constant */ /*! \brief (Pynq only) MMIO driver constant */
#define MMIO_WORD_LENGTH 4 #define VTA_PYNQ_MMIO_WORD_LENGTH 4
/*! \brief MMIO driver constant */ /*! \brief (Pynq only) MMIO driver constant */
#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1)) #define VTA_PYNQ_MMIO_WORD_MASK (~(VTA_PYNQ_MMIO_WORD_LENGTH - 1))
/*! \brief Physically contiguous buffer size limit */
#define VTA_MAX_XFER (1<<22)
/*! \brief VTA configuration register address range */ /*! \brief VTA configuration register address range */
#define VTA_RANGE 0x100 #define VTA_RANGE 0x100
...@@ -74,10 +77,7 @@ void xlnkInvalidateCache(void* buf, int size); ...@@ -74,10 +77,7 @@ void xlnkInvalidateCache(void* buf, int size);
*/ */
#define VTA_STORE_ADDR 0x43C30000 #define VTA_STORE_ADDR 0x43C30000
/*! \brief Buffer size limit */
#define MAX_XFER (1<<22)
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif // VTA_PYNQ_DRIVER_H_ #endif // VTA_PYNQ_PYNQ_DRIVER_H_
\ No newline at end of file \ No newline at end of file
// simply include the driver for now. /*!
* Copyright (c) 2018 by Contributors
* \file vta_device_api.cc
* \brief VTA device API for TVM
*/
#include <tvm/runtime/registry.h> #include <tvm/runtime/registry.h>
#include <dmlc/thread_local.h> #include <dmlc/thread_local.h>
#include <vta/runtime.h> #include <vta/runtime.h>
#include "../../tvm/src/runtime/workspace_pool.h"
#include "../../nnvm/tvm/src/runtime/workspace_pool.h"
namespace tvm { namespace tvm {
namespace runtime { namespace runtime {
......
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
* \brief Test library for the VTA design simulation and driver tests. * \brief Test library for the VTA design simulation and driver tests.
*/ */
#ifndef VTA_TESTLIB_H_ #ifndef TESTS_HARDWARE_COMMON_TEST_LIB_H_
#define VTA_TESTLIB_H_ #define TESTS_HARDWARE_COMMON_TEST_LIB_H_
#include <assert.h> #include <assert.h>
#include <stdint.h> #include <stdint.h>
...@@ -17,9 +17,9 @@ ...@@ -17,9 +17,9 @@
#include <vta/driver.h> #include <vta/driver.h>
#ifdef PYNQ_TARGET #ifdef VTA_PYNQ_TARGET
#include "../../../src/pynq/pynq_driver.h" #include "../../../src/pynq/pynq_driver.h"
#endif //PYNQ_TARGET #endif // VTA_PYNQ_TARGET
typedef uint64_t axi_T; typedef uint64_t axi_T;
typedef uint32_t uop_T; typedef uint32_t uop_T;
...@@ -28,7 +28,7 @@ typedef int8_t inp_T; ...@@ -28,7 +28,7 @@ typedef int8_t inp_T;
typedef int8_t out_T; typedef int8_t out_T;
typedef int32_t acc_T; typedef int32_t acc_T;
uint64_t vta ( uint64_t vta(
uint32_t insn_count, uint32_t insn_count,
VTAGenericInsn *insns, VTAGenericInsn *insns,
VTAUop *uops, VTAUop *uops,
...@@ -37,11 +37,11 @@ uint64_t vta ( ...@@ -37,11 +37,11 @@ uint64_t vta (
acc_T *biases, acc_T *biases,
inp_T *outputs); inp_T *outputs);
#else //NO_SIM #else // NO_SIM
#include "../../../hardware/vivado/src/vta.h" #include "../../../hardware/vivado/src/vta.h"
#endif //NO_SIM #endif // NO_SIM
/*! /*!
* \brief Returns opcode string. * \brief Returns opcode string.
...@@ -300,4 +300,4 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp ...@@ -300,4 +300,4 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
int virtual_threads); int virtual_threads);
#endif // VTA_TESTLIB_H_ #endif // TESTS_HARDWARE_COMMON_TEST_LIB_H_
\ No newline at end of file
...@@ -14,140 +14,135 @@ ...@@ -14,140 +14,135 @@
#include "../common/test_lib.h" #include "../common/test_lib.h"
// VTA invocation (present the same abstraction as in the simulation tests) // VTA invocation (present the same abstraction as in the simulation tests)
uint64_t vta ( uint64_t vta(
uint32_t insn_count, uint32_t insn_count,
VTAGenericInsn *insns, VTAGenericInsn *insns,
VTAUop *uops, VTAUop *uops,
inp_T *inputs, inp_T *inputs,
wgt_T *weights, wgt_T *weights,
acc_T *biases, acc_T *biases,
inp_T *outputs) { inp_T *outputs) {
// Performance counter variables
// Performance counter variables uint64_t t_fpga;
uint64_t t_fpga; struct timespec start, stop;
struct timespec start, stop;
// Derive bitstream file
// Derive bitstream file char bitstream[128];
char bitstream[64]; char str_batch_size[4];
char str_batch_size[4]; char str_block_out_size[4];
char str_block_out_size[4]; char str_block_in_size[4];
char str_block_in_size[4]; char str_block_bit_width[4];
char str_block_bit_width[4]; snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH);
sprintf(str_batch_size, "%d", BATCH); snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT);
sprintf(str_block_out_size, "%d", BLOCK_OUT); snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN);
sprintf(str_block_in_size, "%d", BLOCK_IN); snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH);
sprintf(str_block_bit_width, "%d", WGT_WIDTH); snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
strcpy(bitstream, "vta.bit");
#if VTA_DEBUG == 1
#if DEBUG==1 printf("INFO - Programming FPGA: %s!\n", bitstream);
printf("INFO - Programming FPGA: %s!\n", bitstream);
#endif #endif
// Program VTA // Program VTA
VTAProgram(bitstream); VTAProgram(bitstream);
// Get VTA handles // Get VTA handles
VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
// Physical address pointers // Physical address pointers
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0; uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0; uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0; uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0; uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0; uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0; uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
#if DEBUG==1 #if VTA_DEBUG == 1
printf("INFO - Starting FPGA!\n"); printf("INFO - Starting FPGA!\n");
#endif #endif
clock_gettime(CLOCK_REALTIME, &start); clock_gettime(CLOCK_REALTIME, &start);
// FETCH @ 0x10 : Data signal of insn_count_V // FETCH @ 0x10 : Data signal of insn_count_V
VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count); VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
// FETCH @ 0x18 : Data signal of insns_V // FETCH @ 0x18 : Data signal of insns_V
if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy); if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
// LOAD @ 0x10 : Data signal of inputs_V // LOAD @ 0x10 : Data signal of inputs_V
if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy); if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
// LOAD @ 0x18 : Data signal of weight_V // LOAD @ 0x18 : Data signal of weight_V
if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy); if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
// COMPUTE @ 0x20 : Data signal of uops_V // COMPUTE @ 0x20 : Data signal of uops_V
if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy); if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
// COMPUTE @ 0x28 : Data signal of biases_V // COMPUTE @ 0x28 : Data signal of biases_V
if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy); if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
// STORE @ 0x10 : Data signal of outputs_V // STORE @ 0x10 : Data signal of outputs_V
if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy); if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
// VTA start // VTA start
VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1); VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
VTAWriteMappedReg(vta_load_handle, 0x0, 0x81); VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81); VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_store_handle, 0x0, 0x81); VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
int flag = 0, t = 0; int flag = 0, t = 0;
for (t = 0; t < 10000000; ++t) { for (t = 0; t < 10000000; ++t) {
flag = VTAReadMappedReg(vta_compute_handle, 0x18); flag = VTAReadMappedReg(vta_compute_handle, 0x18);
if (flag & VTA_DONE) break; if (flag & VTA_DONE) break;
} }
if (t==10000000) { if (t == 10000000) {
printf("\tWARNING: VTA TIMEOUT!!!!\n"); printf("\tWARNING: VTA TIMEOUT!!!!\n");
} #if VTA_DEBUG == 1
#if DEBUG==1 } else {
else { printf("INFO - FPGA Finished!\n");
printf("INFO - FPGA Finished!\n");
}
#endif #endif
}
clock_gettime(CLOCK_REALTIME, &stop); clock_gettime(CLOCK_REALTIME, &stop);
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec); t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
// Unmap VTA register // Unmap VTA register
VTAUnmapRegister(vta_fetch_handle, VTA_RANGE); VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
VTAUnmapRegister(vta_load_handle, VTA_RANGE); VTAUnmapRegister(vta_load_handle, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle, VTA_RANGE); VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
VTAUnmapRegister(vta_store_handle, VTA_RANGE); VTAUnmapRegister(vta_store_handle, VTA_RANGE);
return t_fpga; return t_fpga;
}; }
int main(void)
{
#if DEBUG==1 int main(void) {
printParameters(); #if VTA_DEBUG == 1
printParameters();
#endif #endif
int status = 0; int status = 0;
// Run ALU test (vector-scalar operators) // Run ALU test (vector-scalar operators)
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, false);
// Run ALU test (vector-vector operators) // Run ALU test (vector-vector operators)
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, false);
// Run blocked GEMM test // Run blocked GEMM test
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
if (status==0) { if (status == 0) {
printf("\nINFO - Unit tests successful!\n"); printf("\nINFO - Unit tests successful!\n");
} else { } else {
printf("\nINFO - Unit tests failed!\n"); printf("\nINFO - Unit tests failed!\n");
} }
return status; return status;
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment