Commit 56a0dea8 by Thierry Moreau Committed by Tianqi Chen

[REFACTOR] Macro standardization, lint tests (#7)

* code refactoring

* code refactoring

* code refactoring

* code refactoring

* fixing macro

* refactoring, tests, makefile

* style - making sure lint tests pass

* prefixed macros with VTA, fixed bugs
parent 28a10b69
...@@ -76,7 +76,7 @@ lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ) ...@@ -76,7 +76,7 @@ lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ)
lint: pylint cpplint lint: pylint cpplint
cpplint: cpplint:
python nnvm/dmlc-core/scripts/lint.py vta cpp include src python nnvm/dmlc-core/scripts/lint.py vta cpp include src hardware tests
pylint: pylint:
pylint python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc pylint python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc
......
# Directories # Directories
ROOTDIR = $(CURDIR) ROOTDIR = $(CURDIR)
BUILD_DIR = $(ROOTDIR)/build BUILD_DIR = $(ROOTDIR)/../../build/hardware/vivado
SCRIPT_DIR = $(ROOTDIR)/scripts SCRIPT_DIR = $(ROOTDIR)/scripts
SRC_DIR = $(ROOTDIR)/src SRC_DIR = $(ROOTDIR)/src
SIM_DIR = $(ROOTDIR)/sim SIM_DIR = $(ROOTDIR)/sim
...@@ -27,20 +27,21 @@ include $(config) ...@@ -27,20 +27,21 @@ include $(config)
#-------------------- #--------------------
# Number of threads during compilation # Number of threads during compilation
NUM_THREADS = 8 VTA_HW_COMP_THREADS = 8
# Target Frequency # Target Frequency
CLOCK_FREQ = 100 VTA_HW_COMP_CLOCK_FREQ = 100
# Timing closure compensation (0 for none, 3 for highest) # Timing closure compensation (0 for none, 3 for highest)
TIMING_CLOSURE_COMP = 0 VTA_HW_COMP_TIMING_COMP = 0
# Derive clock target period # Derive clock target period
TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" ) TARGET_PER = \
$(shell echo "$$(( (1000 + $(VTA_HW_COMP_CLOCK_FREQ) - 1) / $(VTA_HW_COMP_CLOCK_FREQ) - $(VTA_HW_COMP_TIMING_COMP)))" )
# Derive config name # Derive config name
CONF = \ CONF = \
$(BATCH)x$(IN_BLOCK)x$(OUT_BLOCK)_$(INP_WIDTH)bx$(WGT_WIDTH)b_$(CLOCK_FREQ)MHz_$(TARGET_PER)ns $(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF) IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF) HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
...@@ -53,23 +54,23 @@ ip: ...@@ -53,23 +54,23 @@ ip:
cd $(IP_BUILD_PATH) && \ cd $(IP_BUILD_PATH) && \
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \ $(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \ -tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \ $(VTA_LOG_INP_WIDTH) $(VTA_LOG_WGT_WIDTH) $(VTA_LOG_ACC_WIDTH) $(VTA_LOG_OUT_WIDTH) \
$(LOG_BATCH) $(LOG_BLOCK_OUT) $(LOG_BLOCK_IN) \ $(VTA_LOG_BATCH) $(VTA_LOG_BLOCK_OUT) $(VTA_LOG_BLOCK_IN) \
$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \ $(VTA_LOG_UOP_BUFF_SIZE) $(VTA_LOG_INP_BUFF_SIZE) $(VTA_LOG_WGT_BUFF_SIZE) \
$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE) $(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE)
bit: ip bit: ip
mkdir -p $(HW_BUILD_PATH) mkdir -p $(HW_BUILD_PATH)
cd $(HW_BUILD_PATH) && \ cd $(HW_BUILD_PATH) && \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \ $(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
-tclargs $(IP_BUILD_PATH) $(NUM_THREADS) $(CLOCK_FREQ) \ -tclargs $(IP_BUILD_PATH) $(VTA_HW_COMP_THREADS) $(VTA_HW_COMP_CLOCK_FREQ) \
$(INP_WIDTH) $(WGT_WIDTH) $(OUT_WIDTH) \ $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(OUT_WIDTH) \
$(BATCH) $(IN_BLOCK) $(OUT_BLOCK) \ $(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(INP_BUFF_SIZE) $(WGT_BUFF_SIZE) $(OUT_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
driver: bit driver: bit
cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
cd $(HW_BUILD_PATH)/bsp && make cd $(HW_BUILD_PATH)/bsp && make
clean: clean:
rm -rf build rm -rf $(BUILD_DIR)
\ No newline at end of file \ No newline at end of file
...@@ -63,12 +63,12 @@ if { [llength $argv] eq 19 } { ...@@ -63,12 +63,12 @@ if { [llength $argv] eq 19 } {
# C define flags to pass to compiler # C define flags to pass to compiler
set cflags "-I $include_dir -I $src_dir -I $test_dir \ set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \ -DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \ -DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \
-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \ -DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
-DLOG_UOP_BUFF_SIZE=$uop_buff_size -DLOG_INP_BUFF_SIZE=$inp_buff_size \ -DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
-DLOG_WGT_BUFF_SIZE=$wgt_buff_size -DLOG_ACC_BUFF_SIZE=$acc_buff_size \ -DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
-DLOG_OUT_BUFF_SIZE=$out_buff_size" -DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
# Initializes the HLS design and sets HLS pragmas for memory partitioning. # Initializes the HLS design and sets HLS pragmas for memory partitioning.
# This is necessary because of a Vivado restriction that doesn't allow for # This is necessary because of a Vivado restriction that doesn't allow for
......
...@@ -11,52 +11,49 @@ ...@@ -11,52 +11,49 @@
#include "../src/vta.h" #include "../src/vta.h"
#include "../../../tests/hardware/common/test_lib.h" #include "../../../tests/hardware/common/test_lib.h"
int main(void) int main(void) {
{ #if DEBUG == 1
#if DEBUG==1
printParameters(); printParameters();
#endif #endif
// Buffer indexing // Buffer indexing
assert(LOG_ACC_BUFF_DEPTH>=LOG_INP_BUFF_DEPTH); assert(VTA_LOG_ACC_BUFF_DEPTH >= VTA_LOG_INP_BUFF_DEPTH);
// Micro op bound // Micro op bound
assert(UOP_GEM_3_1<UOP_WIDTH); assert(VTA_UOP_GEM_3_1 < VTA_UOP_WIDTH);
assert(UOP_ALU_3_1<UOP_WIDTH); assert(VTA_UOP_ALU_3_1 < VTA_UOP_WIDTH);
// Instruction alignment checks // Instruction alignment checks
assert(INSN_MEM_7_1<INSN_MEM_8_0); assert(VTA_INSN_MEM_7_1 < VTA_INSN_MEM_8_0);
assert(INSN_GEM_8_1<INSN_GEM_9_0); assert(VTA_INSN_GEM_8_1 < VTA_INSN_GEM_9_0);
// Instruction bounds // Instruction bounds
assert(INSN_MEM_E_1<INS_WIDTH); assert(VTA_INSN_MEM_E_1 < VTA_INS_WIDTH);
assert(INSN_GEM_E_1<INS_WIDTH); assert(VTA_INSN_GEM_E_1 < VTA_INS_WIDTH);
assert(INSN_ALU_F_1<INS_WIDTH); assert(VTA_INSN_ALU_F_1 < VTA_INS_WIDTH);
int status = 0; int status = 0;
// Run ALU test (vector-scalar operators) // Run ALU test (vector-scalar operators)
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MIN, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MIN, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, false);
// Run ALU test (vector-vector operators) // Run ALU test (vector-vector operators)
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MIN, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MIN, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, false);
// Run blocked GEMM test // Run blocked GEMM test
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
return status; return status;
} }
...@@ -10,80 +10,78 @@ ...@@ -10,80 +10,78 @@
#include "./vta.h" #include "./vta.h"
void fetch ( void fetch(
uint32_t insn_count, uint32_t insn_count,
volatile insn_T *insns, volatile insn_T *insns,
hls::stream<insn_T> &load_queue, hls::stream<insn_T> *load_queue,
hls::stream<insn_T> &gemm_queue, hls::stream<insn_T> *gemm_queue,
hls::stream<insn_T> &store_queue) { hls::stream<insn_T> *store_queue) {
#pragma HLS INTERFACE s_axilite port=insn_count bundle=CONTROL_BUS #pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
#pragma HLS INTERFACE m_axi port=insns offset=slave bundle=ins_port #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
#pragma HLS INTERFACE axis port=load_queue #pragma HLS INTERFACE axis port = load_queue
#pragma HLS INTERFACE axis port=gemm_queue #pragma HLS INTERFACE axis port = gemm_queue
#pragma HLS INTERFACE axis port=store_queue #pragma HLS INTERFACE axis port = store_queue
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
INSN_DECODE: for (int pc = 0; pc < insn_count; pc ++) { INSN_DECODE: for (int pc = 0; pc < insn_count; pc++) {
#pragma HLS PIPELINE II=1 #pragma HLS PIPELINE II = 1
// Read instruction fields // Read instruction fields
insn_T insn = insns[pc]; insn_T insn = insns[pc];
// Do some partial decoding // Do some partial decoding
opcode_T opcode = insn.range(INSN_MEM_0_1, INSN_MEM_0_0); opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0);
memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0); memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
// Push to appropriate instruction queue // Push to appropriate instruction queue
if (opcode == OPCODE_STORE) { if (opcode == VTA_OPCODE_STORE) {
store_queue.write(insn); store_queue->write(insn);
} else if (opcode == OPCODE_LOAD && } else if (opcode == VTA_OPCODE_LOAD &&
(memory_type == MEM_ID_INP || memory_type == MEM_ID_WGT)) { (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT)) {
load_queue.write(insn); load_queue->write(insn);
} else { } else {
gemm_queue.write(insn); gemm_queue->write(insn);
} }
} }
} }
void load ( void load(
volatile inp_vec_T *inputs, volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights, volatile wgt_vec_T *weights,
hls::stream<insn_T> &load_queue, hls::stream<insn_T> *load_queue,
hls::stream<bool> &g2l_dep_queue, hls::stream<bool> *g2l_dep_queue,
hls::stream<bool> &l2g_dep_queue, hls::stream<bool> *l2g_dep_queue,
inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH], inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT] wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]
) { ) {
#pragma HLS INTERFACE m_axi port=weights offset=slave bundle=data_port #pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
#pragma HLS INTERFACE m_axi port=inputs offset=slave bundle=data_port #pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
#pragma HLS INTERFACE axis port=load_queue #pragma HLS INTERFACE axis port = load_queue
#pragma HLS INTERFACE axis port=g2l_dep_queue #pragma HLS INTERFACE axis port = g2l_dep_queue
#pragma HLS INTERFACE axis port=l2g_dep_queue #pragma HLS INTERFACE axis port = l2g_dep_queue
#pragma HLS INTERFACE bram port=wgt_mem #pragma HLS INTERFACE bram port = wgt_mem
#pragma HLS INTERFACE bram port=inp_mem #pragma HLS INTERFACE bram port = inp_mem
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
// #pragma HLS ARRAY_PARTITION variable=inp_mem complete dim=2
// Pop load instruction // Pop load instruction
insn_T insn = load_queue.read(); insn_T insn = load_queue->read();
// Decode instruction // Decode instruction
bool pop_prev_dependence = insn[INSN_MEM_1]; bool pop_prev_dependence = insn[VTA_INSN_MEM_1];
bool pop_next_dependence = insn[INSN_MEM_2]; bool pop_next_dependence = insn[VTA_INSN_MEM_2];
bool push_prev_dependence = insn[INSN_MEM_3]; bool push_prev_dependence = insn[VTA_INSN_MEM_3];
bool push_next_dependence = insn[INSN_MEM_4]; bool push_next_dependence = insn[VTA_INSN_MEM_4];
memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0); memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0); memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0);
memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0); memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0);
memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0); memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0);
memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0); memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0);
memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0); memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0);
memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0); memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0);
memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0); memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0);
memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0); memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0);
memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0); memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0);
// Pop dependence token if instructed // Pop dependence token if instructed
if (pop_next_dependence) { if (pop_next_dependence) {
g2l_dep_queue.read(); g2l_dep_queue->read();
} }
// Initialize indices // Initialize indices
...@@ -94,29 +92,26 @@ void load ( ...@@ -94,29 +92,26 @@ void load (
memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1; memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1;
memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1; memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1;
memop_sram_T y_offset = x_size_total * y_pad_0; memop_sram_T y_offset = x_size_total * y_pad_0;
#pragma HLS RESOURCE variable=y_offset core=Mul_LUT // Force this computation to be done with LUTs to avoid using too many DSPs
#pragma HLS RESOURCE variable = y_offset core = Mul_LUT
// Skip padding along y dimension // Skip padding along y dimension
sram_idx += y_offset; sram_idx += y_offset;
// Perform data transfer from DRAM // Perform data transfer from DRAM
for (int y = 0; y < y_size; y ++) { for (int y = 0; y < y_size; y++) {
#pragma HLS PIPELINE rewind #pragma HLS PIPELINE rewind
// Skip padding along x dimension // Skip padding along x dimension
sram_idx += x_pad_0; sram_idx += x_pad_0;
// Perform data transfer // Perform data transfer
if (memory_type == MEM_ID_INP) { if (memory_type == VTA_MEM_ID_INP) {
memcpy( memcpy(&inp_mem[sram_idx][0],
&inp_mem[sram_idx][0], (const inp_vec_T*) &inputs[dram_idx * VTA_BATCH],
(const inp_vec_T*) &inputs[dram_idx * BATCH], x_size * VTA_INP_ELEM_BYTES);
x_size * INP_ELEM_BYTES
);
} else { } else {
memcpy( memcpy(&wgt_mem[sram_idx][0],
&wgt_mem[sram_idx][0], (const wgt_vec_T*) &weights[dram_idx * VTA_BLOCK_OUT],
(const wgt_vec_T*) &weights[dram_idx * BLOCK_OUT], x_size * VTA_WGT_ELEM_BYTES);
x_size * WGT_ELEM_BYTES
);
} }
sram_idx += x_size; sram_idx += x_size;
dram_idx += x_stride; dram_idx += x_stride;
...@@ -127,136 +122,130 @@ void load ( ...@@ -127,136 +122,130 @@ void load (
// Reset SRAM index // Reset SRAM index
sram_idx = sram_base; sram_idx = sram_base;
// Pad x/y edges with zeros // Pad x/y edges with zeros
for (int y = 0; y < y_size_total; y ++) { for (int y = 0; y < y_size_total; y++) {
if (y < y_pad_0 || y >= y_pad_0 + y_size) { if (y < y_pad_0 || y >= y_pad_0 + y_size) {
for (int x = 0; x < x_size_total; x ++) { for (int x = 0; x < x_size_total; x++) {
#pragma HLS PIPELINE II=1 rewind #pragma HLS PIPELINE II = 1 rewind
if (memory_type == MEM_ID_INP) { if (memory_type == VTA_MEM_ID_INP) {
for (int i = 0; i < BATCH; i ++) { for (int i = 0; i < VTA_BATCH; i++) {
inp_mem[sram_idx][i] = 0; inp_mem[sram_idx][i] = 0;
} }
} else { } else {
for (int i = 0; i < BLOCK_OUT; i ++) { for (int i = 0; i < VTA_BLOCK_OUT; i++) {
wgt_mem[sram_idx][i] = 0; wgt_mem[sram_idx][i] = 0;
} }
} }
sram_idx ++; sram_idx++;
} }
} else { } else {
for (int x = 0; x < x_pad_0; x ++) { for (int x = 0; x < x_pad_0; x++) {
#pragma HLS PIPELINE II=1 rewind #pragma HLS PIPELINE II = 1 rewind
if (memory_type == MEM_ID_INP) { if (memory_type == VTA_MEM_ID_INP) {
for (int i = 0; i < BATCH; i ++) { for (int i = 0; i < VTA_BATCH; i++) {
inp_mem[sram_idx][i] = 0; inp_mem[sram_idx][i] = 0;
} }
} else { } else {
for (int i = 0; i < BLOCK_OUT; i ++) { for (int i = 0; i < VTA_BLOCK_OUT; i++) {
wgt_mem[sram_idx][i] = 0; wgt_mem[sram_idx][i] = 0;
} }
} }
sram_idx ++; sram_idx++;
} }
sram_idx += x_size; sram_idx += x_size;
for (int x = 0; x < x_pad_1; x ++) { for (int x = 0; x < x_pad_1; x++) {
#pragma HLS PIPELINE II=1 rewind #pragma HLS PIPELINE II = 1 rewind
if (memory_type == MEM_ID_INP) { if (memory_type == VTA_MEM_ID_INP) {
for (int i = 0; i < BATCH; i ++) { for (int i = 0; i < VTA_BATCH; i++) {
inp_mem[sram_idx][i] = 0; inp_mem[sram_idx][i] = 0;
} }
} else { } else {
for (int i = 0; i < BLOCK_OUT; i ++) { for (int i = 0; i < VTA_BLOCK_OUT; i++) {
wgt_mem[sram_idx][i] = 0; wgt_mem[sram_idx][i] = 0;
} }
} }
sram_idx ++; sram_idx++;
} }
} }
} }
// Push dependence token if instructed // Push dependence token if instructed
if (push_next_dependence) { if (push_next_dependence) {
l2g_dep_queue.write(1); l2g_dep_queue->write(1);
} }
} }
void compute ( void compute(
volatile uint32_t &done, volatile uint32_t *done,
volatile uop_T *uops, volatile uop_T *uops,
volatile acc_vec_T *biases, volatile acc_vec_T *biases,
hls::stream<insn_T> &gemm_queue, hls::stream<insn_T> *gemm_queue,
hls::stream<bool> &l2g_dep_queue, hls::stream<bool> *l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue, hls::stream<bool> *s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue, hls::stream<bool> *g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue, hls::stream<bool> *g2s_dep_queue,
out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH], out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT], wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT],
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH] out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]
) { ) {
#pragma HLS INTERFACE s_axilite port=done bundle=CONTROL_BUS #pragma HLS INTERFACE s_axilite port = done bundle = CONTROL_BUS
#pragma HLS INTERFACE m_axi port=uops offset=slave bundle=uop_port #pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
#pragma HLS INTERFACE m_axi port=biases offset=slave bundle=data_port #pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
#pragma HLS INTERFACE axis port=gemm_queue #pragma HLS INTERFACE axis port = gemm_queue
#pragma HLS INTERFACE axis port=l2g_dep_queue #pragma HLS INTERFACE axis port = l2g_dep_queue
#pragma HLS INTERFACE axis port=s2g_dep_queue #pragma HLS INTERFACE axis port = s2g_dep_queue
#pragma HLS INTERFACE axis port=g2l_dep_queue #pragma HLS INTERFACE axis port = g2l_dep_queue
#pragma HLS INTERFACE axis port=g2s_dep_queue #pragma HLS INTERFACE axis port = g2s_dep_queue
#pragma HLS INTERFACE bram port=inp_mem #pragma HLS INTERFACE bram port = inp_mem
#pragma HLS INTERFACE bram port=wgt_mem #pragma HLS INTERFACE bram port = wgt_mem
#pragma HLS INTERFACE bram port=out_mem #pragma HLS INTERFACE bram port = out_mem
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
// #pragma HLS ARRAY_PARTITION variable=inp_mem complete dim=2
// #pragma HLS ARRAY_PARTITION variable=out_mem complete dim=2
// This is necessary to connect the SRAM to the load module // This is necessary to connect the SRAM to the load module
#pragma HLS RESOURCE variable=wgt_mem core=RAM_1P #pragma HLS RESOURCE variable = wgt_mem core = RAM_1P
// Micro-op storage // Micro-op storage
static uop_T uop_mem[UOP_BUFF_DEPTH]; static uop_T uop_mem[VTA_UOP_BUFF_DEPTH];
// Accumulator storage // Accumulator storage
static acc_vec_T acc_mem[ACC_BUFF_DEPTH][BATCH]; static acc_vec_T acc_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH];
#pragma HLS ARRAY_PARTITION variable=acc_mem complete dim=2 #pragma HLS ARRAY_PARTITION variable = acc_mem complete dim = 2
// Pop GEMM instruction // Pop GEMM instruction
insn_T insn = gemm_queue.read(); insn_T insn = gemm_queue->read();
// Decode // Decode
opcode_T opcode = insn.range(INSN_MEM_0_1, INSN_MEM_0_0); opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0);
bool pop_prev_dependence = insn[INSN_MEM_1]; bool pop_prev_dependence = insn[VTA_INSN_MEM_1];
bool pop_next_dependence = insn[INSN_MEM_2]; bool pop_next_dependence = insn[VTA_INSN_MEM_2];
bool push_prev_dependence = insn[INSN_MEM_3]; bool push_prev_dependence = insn[VTA_INSN_MEM_3];
bool push_next_dependence = insn[INSN_MEM_4]; bool push_next_dependence = insn[VTA_INSN_MEM_4];
// Pop dependence token if instructed // Pop dependence token if instructed
if (pop_prev_dependence) { if (pop_prev_dependence) {
l2g_dep_queue.read(); l2g_dep_queue->read();
} }
if (pop_next_dependence) { if (pop_next_dependence) {
s2g_dep_queue.read(); s2g_dep_queue->read();
} }
// Perform action based on opcode // Perform action based on opcode
if (opcode == OPCODE_FINISH) { if (opcode == VTA_OPCODE_FINISH) {
// Set done flag if we reach a FINISH instruction // Set done flag if we reach a FINISH instruction
done = 1; *done = 1;
} else if (opcode == VTA_OPCODE_LOAD || opcode == VTA_OPCODE_STORE) {
} else if (opcode == OPCODE_LOAD || opcode == OPCODE_STORE) {
// Set done value // Set done value
done = 0; *done = 0;
// Decode instruction // Decode instruction
memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0); memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0); memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0);
memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0); memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0);
memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0); memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0);
memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0); memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0);
memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0); memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0);
memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0); memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0);
memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0); memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0);
memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0); memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0);
memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0); memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0);
// Initialize indices // Initialize indices
memop_sram_T sram_idx = sram_base; memop_sram_T sram_idx = sram_base;
...@@ -266,220 +255,202 @@ void compute ( ...@@ -266,220 +255,202 @@ void compute (
memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1; memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1;
memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1; memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1;
memop_sram_T y_offset = x_size_total * y_pad_0; memop_sram_T y_offset = x_size_total * y_pad_0;
#pragma HLS RESOURCE variable=y_offset core=Mul_LUT // Force this computation to be done with LUTs to avoid using too many DSPs
#pragma HLS RESOURCE variable = y_offset core = Mul_LUT
if (memory_type == MEM_ID_UOP) { if (memory_type == VTA_MEM_ID_UOP) {
// Perform data transfer // Perform data transfer
memcpy( memcpy(&uop_mem[sram_base],
&uop_mem[sram_base],
(const uop_T*) &uops[dram_base], (const uop_T*) &uops[dram_base],
x_size * sizeof(uop_T) x_size * sizeof(uop_T));
);
} else { } else {
// Skip vertical padding // Skip vertical padding
sram_idx += y_offset; sram_idx += y_offset;
// Perform data transfer from DRAM // Perform data transfer from DRAM
for (int y = 0; y < y_size; y ++) { for (int y = 0; y < y_size; y++) {
#pragma HLS PIPELINE rewind #pragma HLS PIPELINE rewind
// Skip padding along x dimension // Skip padding along x dimension
sram_idx += x_pad_0; sram_idx += x_pad_0;
// Perform data transfer // Perform data transfer
memcpy( memcpy(&acc_mem[sram_idx][0],
&acc_mem[sram_idx][0], (const acc_vec_T*) &biases[dram_idx * VTA_BATCH],
(const acc_vec_T*) &biases[dram_idx * BATCH], x_size*VTA_ACC_ELEM_BYTES);
x_size*ACC_ELEM_BYTES
);
sram_idx += x_size; sram_idx += x_size;
dram_idx += x_stride; dram_idx += x_stride;
// Skip padding along x dimension // Skip padding along x dimension
sram_idx += x_pad_1; sram_idx += x_pad_1;
} }
} }
} else if (opcode == VTA_OPCODE_GEMM || opcode == VTA_OPCODE_ALU) {
} else if (opcode == OPCODE_GEMM || opcode == OPCODE_ALU) {
// Set done value // Set done value
done = 0; *done = 0;
// Decode // Decode
uop_idx_T uop_bgn = insn.range(INSN_GEM_5_1, INSN_GEM_5_0); uop_idx_T uop_bgn = insn.range(VTA_INSN_GEM_5_1, VTA_INSN_GEM_5_0);
uop_idx_T uop_end = insn.range(INSN_GEM_6_1, INSN_GEM_6_0); uop_idx_T uop_end = insn.range(VTA_INSN_GEM_6_1, VTA_INSN_GEM_6_0);
loop_T iter_out = insn.range(INSN_GEM_7_1, INSN_GEM_7_0); loop_T iter_out = insn.range(VTA_INSN_GEM_7_1, VTA_INSN_GEM_7_0);
loop_T iter_in = insn.range(INSN_GEM_8_1, INSN_GEM_8_0); loop_T iter_in = insn.range(VTA_INSN_GEM_8_1, VTA_INSN_GEM_8_0);
acc_idx_T dst_factor_out = insn.range(INSN_GEM_9_1, INSN_GEM_9_0); acc_idx_T dst_factor_out = insn.range(VTA_INSN_GEM_9_1, VTA_INSN_GEM_9_0);
acc_idx_T dst_factor_in = insn.range(INSN_GEM_A_1, INSN_GEM_A_0); acc_idx_T dst_factor_in = insn.range(VTA_INSN_GEM_A_1, VTA_INSN_GEM_A_0);
inp_idx_T src_factor_out = insn.range(INSN_GEM_B_1, INSN_GEM_B_0); inp_idx_T src_factor_out = insn.range(VTA_INSN_GEM_B_1, VTA_INSN_GEM_B_0);
inp_idx_T src_factor_in = insn.range(INSN_GEM_C_1, INSN_GEM_C_0); inp_idx_T src_factor_in = insn.range(VTA_INSN_GEM_C_1, VTA_INSN_GEM_C_0);
// GEMM-specific fields // GEMM-specific fields
wgt_idx_T wgt_factor_out = insn.range(INSN_GEM_D_1, INSN_GEM_D_0); wgt_idx_T wgt_factor_out = insn.range(VTA_INSN_GEM_D_1, VTA_INSN_GEM_D_0);
wgt_idx_T wgt_factor_in = insn.range(INSN_GEM_E_1, INSN_GEM_E_0); wgt_idx_T wgt_factor_in = insn.range(VTA_INSN_GEM_E_1, VTA_INSN_GEM_E_0);
// ALU-specific field // ALU-specific field
aluop_opcode_T alu_opcode = insn.range(INSN_ALU_D_1, INSN_ALU_D_0); aluop_opcode_T alu_opcode = insn.range(VTA_INSN_ALU_D_1, VTA_INSN_ALU_D_0);
bool use_imm = insn[INSN_ALU_E]; bool use_imm = insn[VTA_INSN_ALU_E];
aluop_imm_T imm = insn.range(INSN_ALU_F_1, INSN_ALU_F_0); aluop_imm_T imm = insn.range(VTA_INSN_ALU_F_1, VTA_INSN_ALU_F_0);
acc_idx_T dst_offset_out = 0; acc_idx_T dst_offset_out = 0;
inp_idx_T src_offset_out = 0; inp_idx_T src_offset_out = 0;
wgt_idx_T wgt_offset_out = 0; wgt_idx_T wgt_offset_out = 0;
// Outer Loop // Outer Loop
EXE_OUT_LOOP: for (int it_out = 0; it_out < iter_out; it_out ++) { EXE_OUT_LOOP: for (int it_out = 0; it_out < iter_out; it_out++) {
#pragma HLS DEPENDENCE variable=acc_mem inter false #pragma HLS DEPENDENCE variable = acc_mem inter false
acc_idx_T dst_offset_in = dst_offset_out; acc_idx_T dst_offset_in = dst_offset_out;
inp_idx_T src_offset_in = src_offset_out; inp_idx_T src_offset_in = src_offset_out;
wgt_idx_T wgt_offset_in = wgt_offset_out; wgt_idx_T wgt_offset_in = wgt_offset_out;
// Inner Loop // Inner Loop
EXE_IN_LOOP: for (int it_in = 0; it_in < iter_in; it_in ++) { EXE_IN_LOOP: for (int it_in = 0; it_in < iter_in; it_in++) {
// Perform appropriate computation based on opcode // Perform appropriate computation based on opcode
if (opcode == OPCODE_GEMM) { if (opcode == VTA_OPCODE_GEMM) {
// Iterate over micro op // Iterate over micro op
READ_GEMM_UOP: for (int upc = uop_bgn; upc < uop_end; upc ++) { READ_GEMM_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) {
#pragma HLS PIPELINE II=1 rewind #pragma HLS PIPELINE II = 1 rewind
// Read micro-op fields // Read micro-op fields
uop_T uop = uop_mem[upc]; uop_T uop = uop_mem[upc];
// Decode indices // Decode indices
bool reset_out = uop[UOP_GEM_0]; bool reset_out = uop[VTA_UOP_GEM_0];
acc_idx_T dst_idx = acc_idx_T dst_idx =
uop.range(UOP_GEM_1_1, UOP_GEM_1_0) + dst_offset_in; uop.range(VTA_UOP_GEM_1_1, VTA_UOP_GEM_1_0) + dst_offset_in;
acc_idx_T src_idx = acc_idx_T src_idx =
uop.range(UOP_GEM_2_1, UOP_GEM_2_0) + src_offset_in; uop.range(VTA_UOP_GEM_2_1, VTA_UOP_GEM_2_0) + src_offset_in;
wgt_idx_T wgt_idx = wgt_idx_T wgt_idx =
uop.range(UOP_GEM_3_1, UOP_GEM_3_0) + wgt_offset_in; uop.range(VTA_UOP_GEM_3_1, VTA_UOP_GEM_3_0) + wgt_offset_in;
// Read weight matrix // Read weight matrix
wgt_vec_T w_matrix[BLOCK_OUT]; wgt_vec_T w_matrix[VTA_BLOCK_OUT];
for (int i = 0; i < BLOCK_OUT; i ++) { for (int i = 0; i < VTA_BLOCK_OUT; i++) {
w_matrix[i] = wgt_mem[wgt_idx][i]; w_matrix[i] = wgt_mem[wgt_idx][i];
} }
// Read input matrix and accum matrix // Read input matrix and accum matrix
acc_vec_T o_matrix[BATCH]; acc_vec_T o_matrix[VTA_BATCH];
out_vec_T i_matrix[BATCH]; out_vec_T i_matrix[VTA_BATCH];
for (int i = 0; i < BATCH; i ++) { for (int i = 0; i < VTA_BATCH; i++) {
o_matrix[i] = acc_mem[dst_idx][i]; o_matrix[i] = acc_mem[dst_idx][i];
i_matrix[i] = inp_mem[src_idx][i]; i_matrix[i] = inp_mem[src_idx][i];
} }
// Result matrices // Result matrices
acc_vec_T acc_mem_val[BATCH]; acc_vec_T acc_mem_val[VTA_BATCH];
out_vec_T st_buf_val[BATCH]; out_vec_T st_buf_val[VTA_BATCH];
// Inner GEMM loop // Inner GEMM loop
for (int i = 0; i < BATCH; i ++) { for (int i = 0; i < VTA_BATCH; i++) {
for (int b = 0; b < BLOCK_OUT; b ++) { for (int b = 0; b < VTA_BLOCK_OUT; b++) {
// Initialize the accumulator values // Initialize the accumulator values
acc_T accum = acc_T accum =
o_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH); o_matrix[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH);
// Dot product sum // Dot product sum
sum_T tmp = 0; sum_T tmp = 0;
// Inner matrix multiplication loop (input channel/feature) // Inner matrix multiplication loop (input channel/feature)
for (int k=0; k<BLOCK_IN; k++) { for (int k = 0; k < VTA_BLOCK_IN; k++) {
wgt_T w_elem = wgt_T w_elem =
w_matrix[b].range((k + 1) * WGT_WIDTH - 1, k * WGT_WIDTH); w_matrix[b].range((k + 1) * VTA_WGT_WIDTH - 1, k * VTA_WGT_WIDTH);
inp_T i_elem = inp_T i_elem =
i_matrix[i].range((k + 1) * INP_WIDTH - 1, k * INP_WIDTH); i_matrix[i].range((k + 1) * VTA_INP_WIDTH - 1, k * VTA_INP_WIDTH);
mul_T prod = i_elem * w_elem; mul_T prod = i_elem * w_elem;
tmp += (sum_T) prod; tmp += (sum_T) prod;
} }
// Update summation // Update summation
accum += (acc_T) tmp; accum += (acc_T) tmp;
// Update result vector // Update result vector
acc_mem_val[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) = acc_mem_val[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) =
reset_out ? (acc_T) 0 : accum; reset_out ? (acc_T) 0 : accum;
st_buf_val[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) = st_buf_val[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
(inp_T) accum.range(INP_WIDTH - 1, 0); (inp_T) accum.range(VTA_OUT_WIDTH - 1, 0);
} }
// Write to buffers // Write to buffers
acc_mem[dst_idx][i] = acc_mem_val[i]; acc_mem[dst_idx][i] = acc_mem_val[i];
out_mem[dst_idx][i] = st_buf_val[i]; out_mem[dst_idx][i] = st_buf_val[i];
} }
} }
} else if (opcode == VTA_OPCODE_ALU) {
} else if (opcode == OPCODE_ALU) {
// Iterate over micro op // Iterate over micro op
READ_ALU_UOP: for (int upc = uop_bgn; upc < uop_end; upc ++) { READ_ALU_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) {
// Read micro-op fields // Read micro-op fields
uop_T uop = uop_mem[upc]; uop_T uop = uop_mem[upc];
// Decode // Decode
bool reset_out = uop[UOP_ALU_0]; bool reset_out = uop[VTA_UOP_ALU_0];
acc_idx_T dst_idx = acc_idx_T dst_idx =
uop.range(UOP_ALU_1_1, UOP_ALU_1_0) + dst_offset_in; uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + dst_offset_in;
acc_idx_T src_idx = acc_idx_T src_idx =
uop.range(UOP_ALU_2_1, UOP_ALU_2_0) + src_offset_in; uop.range(VTA_UOP_ALU_2_1, VTA_UOP_ALU_2_0) + src_offset_in;
// Read input matrix and accum matrix // Read input matrix and accum matrix
acc_vec_T dst_matrix[BATCH]; acc_vec_T dst_matrix[VTA_BATCH];
acc_vec_T src_matrix[BATCH]; acc_vec_T src_matrix[VTA_BATCH];
for (int i = 0; i < BATCH; i ++) { for (int i = 0; i < VTA_BATCH; i++) {
#pragma HLS UNROLL complete #pragma HLS UNROLL complete
dst_matrix[i] = acc_mem[dst_idx][i]; dst_matrix[i] = acc_mem[dst_idx][i];
src_matrix[i] = acc_mem[src_idx][i]; src_matrix[i] = acc_mem[src_idx][i];
} }
// Result matrices // Result matrices
acc_vec_T cmp_res[BATCH]; acc_vec_T cmp_res[VTA_BATCH];
acc_vec_T add_res[BATCH]; acc_vec_T add_res[VTA_BATCH];
acc_vec_T shr_res[BATCH]; acc_vec_T shr_res[VTA_BATCH];
out_vec_T short_cmp_res[BATCH]; out_vec_T short_cmp_res[VTA_BATCH];
out_vec_T short_add_res[BATCH]; out_vec_T short_add_res[VTA_BATCH];
out_vec_T short_shr_res[BATCH]; out_vec_T short_shr_res[VTA_BATCH];
// Perform ALU op over matrix elements // Perform ALU op over matrix elements
for (int i = 0; i < BATCH; i ++) { for (int i = 0; i < VTA_BATCH; i++) {
#pragma HLS PIPELINE II=1 rewind #pragma HLS PIPELINE II = 1 rewind
// Results vector // Results vector
acc_vec_T res_vec = 0; acc_vec_T res_vec = 0;
for (int b = 0; b < BLOCK_OUT; b ++) { for (int b = 0; b < VTA_BLOCK_OUT; b++) {
// Read in operands // Read in operands
acc_T src_0 = acc_T src_0 = dst_matrix[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH);
dst_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH); acc_T src_1 = use_imm ?
acc_T src_1 =
use_imm ?
(acc_T) imm : (acc_T) imm :
src_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH); src_matrix[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH);
// Compute Min/Max // Compute Min/Max
acc_T mix_val = acc_T mix_val = src_0 < src_1 ?
src_0 < src_1 ? (alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) :
(alu_opcode == ALU_OPCODE_MIN ? src_0 : src_1) : (alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
(alu_opcode == ALU_OPCODE_MIN ? src_1 : src_0); cmp_res[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = mix_val;
cmp_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) = short_cmp_res[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
mix_val; (inp_T) mix_val.range(VTA_OUT_WIDTH - 1, 0);
short_cmp_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
(inp_T) mix_val.range(INP_WIDTH - 1, 0);
// Compute Sum // Compute Sum
acc_T add_val = acc_T add_val =
src_0.range(ACC_WIDTH - 1, 0) + src_1.range(ACC_WIDTH - 1, 0); src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0);
add_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) = add_res[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = add_val;
add_val; short_add_res[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
short_add_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) = (inp_T) add_val.range(VTA_OUT_WIDTH - 1, 0);
(inp_T) add_val.range(INP_WIDTH - 1, 0);
// Compute Shift // Compute Shift
acc_T shr_val = acc_T shr_val =
src_0 >> (aluop_sh_imm_T) src_1.range(LOG_ACC_WIDTH - 1, 0); src_0 >> (aluop_sh_imm_T) src_1.range(VTA_LOG_ACC_WIDTH - 1, 0);
shr_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) = shr_res[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = shr_val;
shr_val; short_shr_res[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
short_shr_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) = (inp_T) shr_val.range(VTA_OUT_WIDTH-1, 0);
(inp_T) shr_val.range(INP_WIDTH-1, 0);
} }
// Store to accum memory/store buffer // Store to accum memory/store buffer
if (alu_opcode == ALU_OPCODE_MIN || if (alu_opcode == VTA_ALU_OPCODE_MIN ||
alu_opcode == ALU_OPCODE_MAX) { alu_opcode == VTA_ALU_OPCODE_MAX) {
acc_mem[dst_idx][i] = cmp_res[i]; acc_mem[dst_idx][i] = cmp_res[i];
out_mem[dst_idx][i] = short_cmp_res[i]; out_mem[dst_idx][i] = short_cmp_res[i];
} else if (alu_opcode==ALU_OPCODE_ADD) { } else if (alu_opcode == VTA_ALU_OPCODE_ADD) {
acc_mem[dst_idx][i] = add_res[i]; acc_mem[dst_idx][i] = add_res[i];
out_mem[dst_idx][i] = short_add_res[i]; out_mem[dst_idx][i] = short_add_res[i];
} else if (alu_opcode==ALU_OPCODE_SHR) { } else if (alu_opcode == VTA_ALU_OPCODE_SHR) {
acc_mem[dst_idx][i] = shr_res[i]; acc_mem[dst_idx][i] = shr_res[i];
out_mem[dst_idx][i] = short_shr_res[i]; out_mem[dst_idx][i] = short_shr_res[i];
} }
...@@ -502,51 +473,49 @@ void compute ( ...@@ -502,51 +473,49 @@ void compute (
// Push dependence token if instructed // Push dependence token if instructed
if (push_prev_dependence) { if (push_prev_dependence) {
g2l_dep_queue.write(1); g2l_dep_queue->write(1);
} }
if (push_next_dependence) { if (push_next_dependence) {
g2s_dep_queue.write(1); g2s_dep_queue->write(1);
} }
} }
void store ( void store(
volatile out_vec_T *outputs, volatile out_vec_T *outputs,
hls::stream<insn_T> &store_queue, hls::stream<insn_T> *store_queue,
hls::stream<bool> &g2s_dep_queue, hls::stream<bool> *g2s_dep_queue,
hls::stream<bool> &s2g_dep_queue, hls::stream<bool> *s2g_dep_queue,
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH] out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]
) { ) {
#pragma HLS INTERFACE m_axi port=outputs offset=slave bundle=data_port #pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port
#pragma HLS INTERFACE axis port=store_queue #pragma HLS INTERFACE axis port = store_queue
#pragma HLS INTERFACE axis port=g2s_dep_queue #pragma HLS INTERFACE axis port = g2s_dep_queue
#pragma HLS INTERFACE axis port=s2g_dep_queue #pragma HLS INTERFACE axis port = s2g_dep_queue
#pragma HLS INTERFACE bram port=out_mem #pragma HLS INTERFACE bram port = out_mem
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
// #pragma HLS ARRAY_PARTITION variable=out_mem complete dim=2
// Load buffer // Load buffer
insn_T insn = store_queue.read(); insn_T insn = store_queue->read();
// Decode // Decode
bool pop_prev_dependence = insn[INSN_MEM_1]; bool pop_prev_dependence = insn[VTA_INSN_MEM_1];
bool pop_next_dependence = insn[INSN_MEM_2]; bool pop_next_dependence = insn[VTA_INSN_MEM_2];
bool push_prev_dependence = insn[INSN_MEM_3]; bool push_prev_dependence = insn[VTA_INSN_MEM_3];
bool push_next_dependence = insn[INSN_MEM_4]; bool push_next_dependence = insn[VTA_INSN_MEM_4];
memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0); memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0); memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0);
memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0); memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0);
memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0); memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0);
memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0); memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0);
memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0); memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0);
memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0); memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0);
memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0); memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0);
memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0); memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0);
memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0); memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0);
// Pop dependence token if instructed // Pop dependence token if instructed
if (pop_prev_dependence) { if (pop_prev_dependence) {
g2s_dep_queue.read(); g2s_dep_queue->read();
} }
// Initialize indices // Initialize indices
...@@ -556,18 +525,19 @@ void store ( ...@@ -556,18 +525,19 @@ void store (
// Skip padding along y dimension // Skip padding along y dimension
memop_sram_T y_offset = (x_pad_0 + x_size + x_pad_1) * y_pad_0; memop_sram_T y_offset = (x_pad_0 + x_size + x_pad_1) * y_pad_0;
sram_idx += y_offset; sram_idx += y_offset;
#pragma HLS RESOURCE variable=y_offset core=Mul_LUT // Force this computation to be done with LUTs to avoid using too many DSPs
#pragma HLS RESOURCE variable = y_offset core = Mul_LUT
// Copy along y dimension // Copy along y dimension
for (int y = 0; y < y_size; y ++) { for (int y = 0; y < y_size; y++) {
#pragma HLS PIPELINE rewind #pragma HLS PIPELINE rewind
// Skip padding along x dimension // Skip padding along x dimension
sram_idx += x_pad_0; sram_idx += x_pad_0;
// Perform data transfer // Perform data transfer
memcpy( memcpy(
(out_vec_T *) &outputs[dram_idx*BATCH], const_cast<out_vec_T*>(&outputs[dram_idx*VTA_BATCH]),
(const out_vec_T*) &out_mem[sram_idx][0], (const out_vec_T*) &out_mem[sram_idx][0],
x_size * INP_ELEM_BYTES); x_size * VTA_INP_ELEM_BYTES);
sram_idx += x_size; sram_idx += x_size;
dram_idx += x_stride; dram_idx += x_stride;
// Skip padding along x dimension // Skip padding along x dimension
...@@ -576,11 +546,11 @@ void store ( ...@@ -576,11 +546,11 @@ void store (
// Push dependence token if instructed // Push dependence token if instructed
if (push_prev_dependence) { if (push_prev_dependence) {
s2g_dep_queue.write(1); s2g_dep_queue->write(1);
} }
} }
void vta ( void vta(
uint32_t insn_count, uint32_t insn_count,
volatile insn_T *insns, volatile insn_T *insns,
volatile uop_T *uops, volatile uop_T *uops,
...@@ -588,14 +558,14 @@ void vta ( ...@@ -588,14 +558,14 @@ void vta (
volatile wgt_vec_T *weights, volatile wgt_vec_T *weights,
volatile acc_vec_T *biases, volatile acc_vec_T *biases,
volatile out_vec_T *outputs) { volatile out_vec_T *outputs) {
#pragma HLS INTERFACE s_axilite port=insn_count bundle=CONTROL_BUS #pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
#pragma HLS INTERFACE m_axi port=insns offset=slave bundle=ins_port #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
#pragma HLS INTERFACE m_axi port=uops offset=slave bundle=uop_port #pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
#pragma HLS INTERFACE m_axi port=inputs offset=slave bundle=data_port #pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
#pragma HLS INTERFACE m_axi port=weights offset=slave bundle=data_port #pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
#pragma HLS INTERFACE m_axi port=biases offset=slave bundle=data_port #pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
#pragma HLS INTERFACE m_axi port=outputs offset=slave bundle=data_port #pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
// Instantiate temporary instruction queues (used for peeking) // Instantiate temporary instruction queues (used for peeking)
hls::stream<insn_T> tmp_load_queue; hls::stream<insn_T> tmp_load_queue;
...@@ -614,18 +584,12 @@ void vta ( ...@@ -614,18 +584,12 @@ void vta (
hls::stream<bool> g2s_dep_queue; hls::stream<bool> g2s_dep_queue;
// Instantiate memories // Instantiate memories
inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH]; inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH];
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]; wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT];
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]; out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH];
// Push all instructions into the queues // Push all instructions into the queues
fetch( fetch(insn_count, insns, &tmp_load_queue, &tmp_gemm_queue, &tmp_store_queue);
insn_count,
insns,
tmp_load_queue,
tmp_gemm_queue,
tmp_store_queue
);
// Global done indicator // Global done indicator
uint32_t done = 0; uint32_t done = 0;
...@@ -651,21 +615,13 @@ void vta ( ...@@ -651,21 +615,13 @@ void vta (
tmp_load_popped = true; tmp_load_popped = true;
} }
// Check dependences and invoke the load stage // Check dependences and invoke the load stage
bool pop_next_dependence = tmp_load[INSN_MEM_2]; bool pop_next_dependence = tmp_load[VTA_INSN_MEM_2];
if ((pop_next_dependence && !g2l_dep_queue.empty()) || if ((pop_next_dependence && !g2l_dep_queue.empty()) ||
!pop_next_dependence) { !pop_next_dependence) {
// Push the instruction in the load queue // Push the instruction in the load queue
load_queue.write(tmp_load); load_queue.write(tmp_load);
tmp_load_popped = false; tmp_load_popped = false;
load( load(inputs, weights, &load_queue, &g2l_dep_queue, &l2g_dep_queue, inp_mem, wgt_mem);
inputs,
weights,
load_queue,
g2l_dep_queue,
l2g_dep_queue,
inp_mem,
wgt_mem
);
} else { } else {
// Execution of load stage pending on completion of other stages, so break here... // Execution of load stage pending on completion of other stages, so break here...
break; break;
...@@ -679,8 +635,8 @@ void vta ( ...@@ -679,8 +635,8 @@ void vta (
tmp_gemm_popped = true; tmp_gemm_popped = true;
} }
// Check dependences and invoke the compute stage // Check dependences and invoke the compute stage
bool pop_prev_dependence = tmp_gemv[INSN_MEM_1]; bool pop_prev_dependence = tmp_gemv[VTA_INSN_MEM_1];
bool pop_next_dependence = tmp_gemv[INSN_MEM_2]; bool pop_next_dependence = tmp_gemv[VTA_INSN_MEM_2];
if ( if (
(pop_prev_dependence && !l2g_dep_queue.empty() && (pop_prev_dependence && !l2g_dep_queue.empty() &&
pop_next_dependence && !s2g_dep_queue.empty()) || pop_next_dependence && !s2g_dep_queue.empty()) ||
...@@ -693,19 +649,8 @@ void vta ( ...@@ -693,19 +649,8 @@ void vta (
// Push the instruction in the gemm queue // Push the instruction in the gemm queue
gemm_queue.write(tmp_gemv); gemm_queue.write(tmp_gemv);
tmp_gemm_popped = false; tmp_gemm_popped = false;
compute( compute(&done, uops, biases, &gemm_queue, &l2g_dep_queue, &s2g_dep_queue,
done, &g2l_dep_queue, &g2s_dep_queue, inp_mem, wgt_mem, out_mem);
uops,
biases,
gemm_queue,
l2g_dep_queue,
s2g_dep_queue,
g2l_dep_queue,
g2s_dep_queue,
inp_mem,
wgt_mem,
out_mem
);
} else { } else {
// Execution of compute stage pending on completion of other stages, // Execution of compute stage pending on completion of other stages,
// so break here... // so break here...
...@@ -720,19 +665,13 @@ void vta ( ...@@ -720,19 +665,13 @@ void vta (
tmp_store_popped = true; tmp_store_popped = true;
} }
// Check dependences and invoke the store stage // Check dependences and invoke the store stage
bool pop_prev_dependence = tmp_store[INSN_MEM_1]; bool pop_prev_dependence = tmp_store[VTA_INSN_MEM_1];
if ((pop_prev_dependence && !g2s_dep_queue.empty()) || if ((pop_prev_dependence && !g2s_dep_queue.empty()) ||
!pop_prev_dependence) { !pop_prev_dependence) {
// Push the instruction in the store queue // Push the instruction in the store queue
store_queue.write(tmp_store); store_queue.write(tmp_store);
tmp_store_popped = false; tmp_store_popped = false;
store( store(outputs, &store_queue, &g2s_dep_queue, &s2g_dep_queue, out_mem);
outputs,
store_queue,
g2s_dep_queue,
s2g_dep_queue,
out_mem
);
} else { } else {
// Execution of store stage pending on completion of other stages, so break here... // Execution of store stage pending on completion of other stages, so break here...
break; break;
...@@ -742,7 +681,7 @@ void vta ( ...@@ -742,7 +681,7 @@ void vta (
if (done) { if (done) {
break; break;
} }
exit_counter ++; exit_counter++;
if (exit_counter > 1000) { if (exit_counter > 1000) {
if (tmp_load_popped) { if (tmp_load_popped) {
if (g2l_dep_queue.empty()) { if (g2l_dep_queue.empty()) {
...@@ -750,10 +689,10 @@ void vta ( ...@@ -750,10 +689,10 @@ void vta (
} }
} }
if (tmp_gemm_popped) { if (tmp_gemm_popped) {
if (l2g_dep_queue.empty() && tmp_gemv[INSN_MEM_1]) { if (l2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_1]) {
printf("waiting on l2g\n"); printf("waiting on l2g\n");
} }
if (s2g_dep_queue.empty() && tmp_gemv[INSN_MEM_2]) { if (s2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_2]) {
printf("waiting on s2g\n"); printf("waiting on s2g\n");
} }
} }
...@@ -772,17 +711,17 @@ void vta ( ...@@ -772,17 +711,17 @@ void vta (
int s2g_count = 0; int s2g_count = 0;
int g2l_count = 0; int g2l_count = 0;
int g2s_count = 0; int g2s_count = 0;
while(l2g_dep_queue.read_nb(tmp_tok)) { while (l2g_dep_queue.read_nb(tmp_tok)) {
l2g_count ++; l2g_count++;
} }
while(s2g_dep_queue.read_nb(tmp_tok)) { while (s2g_dep_queue.read_nb(tmp_tok)) {
s2g_count ++; s2g_count++;
} }
while(g2l_dep_queue.read_nb(tmp_tok)) { while (g2l_dep_queue.read_nb(tmp_tok)) {
g2l_count ++; g2l_count++;
} }
while(g2s_dep_queue.read_nb(tmp_tok)) { while (g2s_dep_queue.read_nb(tmp_tok)) {
g2s_count ++; g2s_count++;
} }
// Each of the four dependence queues must have been fully drained: check every counter exactly once // Each of the four dependence queues must have been fully drained: check every counter exactly once
assert(l2g_count == 0 && s2g_count == 0 && g2l_count == 0 && g2s_count == 0); assert(l2g_count == 0 && s2g_count == 0 && g2l_count == 0 && g2s_count == 0);
......
...@@ -3,96 +3,96 @@ ...@@ -3,96 +3,96 @@
* \file vta.h * \file vta.h
* \brief Type definitions and prototype for VTA HLS design. * \brief Type definitions and prototype for VTA HLS design.
*/ */
#ifndef VTA_MAIN_H_ #ifndef VTA_VTA_H_
#define VTA_MAIN_H_ #define VTA_VTA_H_
#include <assert.h>
#include <ap_axi_sdata.h> #include <ap_axi_sdata.h>
#include <ap_int.h> #include <ap_int.h>
#include <assert.h>
#include <hls_stream.h> #include <hls_stream.h>
#include <vta/hw_spec.h> #include <vta/hw_spec.h>
/* \typedef uop_T Micro-op datatype*/ /* \typedef uop_T Micro-op datatype*/
typedef ap_uint<UOP_WIDTH> uop_T; typedef ap_uint<VTA_UOP_WIDTH> uop_T;
/* \typedef inp_T Input datatype*/ /* \typedef inp_T Input datatype*/
typedef ap_int<INP_WIDTH> inp_T; typedef ap_int<VTA_INP_WIDTH> inp_T;
/* \typedef wgt_T Weight datatype*/ /* \typedef wgt_T Weight datatype*/
typedef ap_int<WGT_WIDTH> wgt_T; typedef ap_int<VTA_WGT_WIDTH> wgt_T;
/* \typedef out_T Output datatype*/ /* \typedef out_T Output datatype*/
typedef ap_int<OUT_WIDTH> out_T; typedef ap_int<VTA_OUT_WIDTH> out_T;
/* \typedef acc_T Accumulator datatype*/ /* \typedef acc_T Accumulator datatype*/
typedef ap_int<ACC_WIDTH> acc_T; typedef ap_int<VTA_ACC_WIDTH> acc_T;
/* \typedef mul_T Multiplier output datatype*/ /* \typedef mul_T Multiplier output datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T; typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype*/ /* \typedef sum_T GEMM accumulator datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T; typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+VTA_LOG_BLOCK_IN+1> sum_T;
/* \typedef inp_vec_T Input vector datatype*/ /* \typedef inp_vec_T Input vector datatype*/
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T; typedef ap_uint<VTA_INP_WIDTH*VTA_BLOCK_IN> inp_vec_T;
/* \typedef wgt_vec_T Weight vector datatype*/ /* \typedef wgt_vec_T Weight vector datatype*/
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T; typedef ap_uint<VTA_WGT_WIDTH*VTA_BLOCK_IN> wgt_vec_T;
/* \typedef acc_vec_T Accumulator vector datatype*/ /* \typedef acc_vec_T Accumulator vector datatype*/
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T; typedef ap_uint<VTA_ACC_WIDTH*VTA_BLOCK_OUT> acc_vec_T;
/* \typedef out_vec_T Output vector datatype*/ /* \typedef out_vec_T Output vector datatype*/
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T; typedef ap_uint<VTA_OUT_WIDTH*VTA_BLOCK_OUT> out_vec_T;
/* \typedef uop_idx_T Micro-op SRAM index datatype*/ /* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T; typedef ap_uint<VTA_LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
/* \typedef inp_idx_T Input SRAM index datatype*/ /* \typedef inp_idx_T Input SRAM index datatype*/
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T; typedef ap_uint<VTA_LOG_INP_BUFF_DEPTH+1> inp_idx_T;
/* \typedef wgt_idx_T Weight SRAM index datatype*/ /* \typedef wgt_idx_T Weight SRAM index datatype*/
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T; typedef ap_uint<VTA_LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
/* \typedef acc_idx_T Accumulator SRAM index datatype*/ /* \typedef acc_idx_T Accumulator SRAM index datatype*/
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T; typedef ap_uint<VTA_LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
/* \typedef opcode_T Opcode datatype*/ /* \typedef opcode_T Opcode datatype*/
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T; typedef ap_uint<VTA_OPCODE_BIT_WIDTH> opcode_T;
/* \typedef insn_T Instruction datatype*/ /* \typedef insn_T Instruction datatype*/
typedef ap_uint<INS_WIDTH> insn_T; typedef ap_uint<VTA_INS_WIDTH> insn_T;
/* \typedef loop_T Loop bound datatype*/ /* \typedef loop_T Loop bound datatype*/
typedef ap_uint<LOOP_ITER_WIDTH> loop_T; typedef ap_uint<VTA_LOOP_ITER_WIDTH> loop_T;
/* \typedef memop_id_T Memory operation ID datatype*/ /* \typedef memop_id_T Memory operation ID datatype*/
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T; typedef ap_uint<VTA_MEMOP_ID_BIT_WIDTH> memop_id_T;
/* \typedef memop_sram_T Memory operation SRAM index datatype*/ /* \typedef memop_sram_T Memory operation SRAM index datatype*/
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T; typedef ap_uint<VTA_MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
/* \typedef memop_dram_T Memory operation DRAM index datatype*/ /* \typedef memop_dram_T Memory operation DRAM index datatype*/
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T; typedef ap_uint<VTA_MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
/* \typedef memop_size_T Memory operation range datatype*/ /* \typedef memop_size_T Memory operation range datatype*/
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T; typedef ap_uint<VTA_MEMOP_SIZE_BIT_WIDTH> memop_size_T;
/* \typedef memop_stride_T Memory operation stride datatype*/ /* \typedef memop_stride_T Memory operation stride datatype*/
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T; typedef ap_uint<VTA_MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
/* \typedef memop_pad_T Memory operation pad width datatype*/ /* \typedef memop_pad_T Memory operation pad width datatype*/
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T; typedef ap_uint<VTA_MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/ /* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T; typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_imm_T ALU operation immediate datatype*/ /* \typedef aluop_imm_T ALU operation immediate datatype*/
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T; typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/ /* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T; typedef ap_uint<VTA_LOG_ACC_WIDTH> aluop_sh_imm_T;
/*! /*!
* \brief Fetch module. * \brief Fetch module.
...@@ -104,12 +104,12 @@ typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T; ...@@ -104,12 +104,12 @@ typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
* \param gemm_queue GEMM instruction queue. AXI-stream FIFO. * \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
* \param store_queue Store instruction queue. AXI-stream FIFO. * \param store_queue Store instruction queue. AXI-stream FIFO.
*/ */
void fetch ( void fetch(
uint32_t insn_count, uint32_t insn_count,
volatile insn_T *insns, volatile insn_T *insns,
hls::stream<insn_T> &load_queue, hls::stream<insn_T> *load_queue,
hls::stream<insn_T> &gemm_queue, hls::stream<insn_T> *gemm_queue,
hls::stream<insn_T> &store_queue); hls::stream<insn_T> *store_queue);
/*! /*!
* \brief Load module. * \brief Load module.
...@@ -126,15 +126,14 @@ void fetch ( ...@@ -126,15 +126,14 @@ void fetch (
* \param inp_mem Local input SRAM buffer. Write only single port BRAM. * \param inp_mem Local input SRAM buffer. Write only single port BRAM.
* \param wgt_mem Local weight SRAM buffer. Write only single port BRAM. * \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
*/ */
void load ( void load(
volatile inp_vec_T *inputs, volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights, volatile wgt_vec_T *weights,
hls::stream<insn_T> &load_queue, hls::stream<insn_T> *load_queue,
hls::stream<bool> &g2l_dep_queue, hls::stream<bool> *g2l_dep_queue,
hls::stream<bool> &l2g_dep_queue, hls::stream<bool> *l2g_dep_queue,
inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH], inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT] wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]);
);
/*! /*!
* \brief Compute module. * \brief Compute module.
...@@ -159,19 +158,18 @@ void load ( ...@@ -159,19 +158,18 @@ void load (
* \param wgt_mem Local weight SRAM buffer. Read only single port BRAM. * \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
* \param out_mem Local output SRAM buffer. Write only single port BRAM. * \param out_mem Local output SRAM buffer. Write only single port BRAM.
*/ */
void compute ( void compute(
volatile uint32_t &done, volatile uint32_t *done,
volatile uop_T *uops, volatile uop_T *uops,
volatile acc_vec_T *biases, volatile acc_vec_T *biases,
hls::stream<insn_T> &gemm_queue, hls::stream<insn_T> *gemm_queue,
hls::stream<bool> &l2g_dep_queue, hls::stream<bool> *l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue, hls::stream<bool> *s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue, hls::stream<bool> *g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue, hls::stream<bool> *g2s_dep_queue,
out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH], out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT], wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT],
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH] out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
);
/*! /*!
* \brief Store module. * \brief Store module.
...@@ -186,13 +184,12 @@ void compute ( ...@@ -186,13 +184,12 @@ void compute (
* AXI-stream FIFO. * AXI-stream FIFO.
* \param out_mem Local output SRAM buffer. Read only single port BRAM. * \param out_mem Local output SRAM buffer. Read only single port BRAM.
*/ */
void store ( void store(
volatile out_vec_T *outputs, volatile out_vec_T *outputs,
hls::stream<insn_T> &store_queue, hls::stream<insn_T> *store_queue,
hls::stream<bool> &g2s_dep_queue, hls::stream<bool> *g2s_dep_queue,
hls::stream<bool> &s2g_dep_queue, hls::stream<bool> *s2g_dep_queue,
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH] out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
);
/*! /*!
* \brief VTA wrapper for simulation purpose only. * \brief VTA wrapper for simulation purpose only.
...@@ -205,7 +202,7 @@ void store ( ...@@ -205,7 +202,7 @@ void store (
* \param biases Bias data base address in DRAM. AXI-4 master port. * \param biases Bias data base address in DRAM. AXI-4 master port.
* \param outputs Output data base address in DRAM. AXI-4 master port. * \param outputs Output data base address in DRAM. AXI-4 master port.
*/ */
void vta ( void vta(
uint32_t insn_count, uint32_t insn_count,
volatile insn_T *insns, volatile insn_T *insns,
volatile uop_T *uops, volatile uop_T *uops,
...@@ -214,4 +211,4 @@ void vta ( ...@@ -214,4 +211,4 @@ void vta (
volatile acc_vec_T *biases, volatile acc_vec_T *biases,
volatile out_vec_T *outputs); volatile out_vec_T *outputs);
#endif // VTA_MAIN_H_ #endif // VTA_VTA_H_
\ No newline at end of file
...@@ -14,10 +14,10 @@ extern "C" { ...@@ -14,10 +14,10 @@ extern "C" {
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
/*! \brief Memory management constants with libxlnk_cma */ /*! \brief Memory management constants */
#define CACHED 1 #define VTA_CACHED 1
/*! \brief Memory management constants with libxlnk_cma */ /*! \brief Memory management constants */
#define NOT_CACHED 0 #define VTA_NOT_CACHED 0
/*! \brief VTA command handle */ /*! \brief VTA command handle */
typedef void * VTAHandle; typedef void * VTAHandle;
......
...@@ -14,150 +14,153 @@ extern "C" { ...@@ -14,150 +14,153 @@ extern "C" {
#include <stdint.h> #include <stdint.h>
/*! log2 of instruction data type width */ /*! log2 of instruction data type width */
#define LOG_INS_WIDTH 7 #define VTA_LOG_INS_WIDTH 7
/*! Instruction data type width */ /*! Instruction data type width */
#define INS_WIDTH (1<<LOG_INS_WIDTH) #define VTA_INS_WIDTH (1 << VTA_LOG_INS_WIDTH)
/*! log2 of micro op data type width */ /*! log2 of micro op data type width */
#define LOG_UOP_WIDTH 5 #define VTA_LOG_UOP_WIDTH 5
/*! Micro Op data type width */ /*! Micro Op data type width */
#define UOP_WIDTH (1<<LOG_UOP_WIDTH) #define VTA_UOP_WIDTH (1 << VTA_LOG_UOP_WIDTH)
/*! Weight data type width */ /*! Weight data type width */
#define WGT_WIDTH (1<<LOG_WGT_WIDTH) #define VTA_WGT_WIDTH (1 << VTA_LOG_WGT_WIDTH)
/*! Input data type width */ /*! Input data type width */
#define INP_WIDTH (1<<LOG_INP_WIDTH) #define VTA_INP_WIDTH (1 << VTA_LOG_INP_WIDTH)
/*! Output data type width */ /*! Output data type width */
#define OUT_WIDTH (1<<LOG_OUT_WIDTH) #define VTA_OUT_WIDTH (1 << VTA_LOG_OUT_WIDTH)
/*! Accumulator data type width */ /*! Accumulator data type width */
#define ACC_WIDTH (1<<LOG_ACC_WIDTH) #define VTA_ACC_WIDTH (1 << VTA_LOG_ACC_WIDTH)
/*! log2 of ALU data type width */ /*! log2 of ALU data type width */
#define LOG_ALU_WIDTH (LOG_ACC_WIDTH-1) #define VTA_LOG_ALU_WIDTH (VTA_LOG_ACC_WIDTH - 1)
/*! ALU data type width */ /*! ALU data type width */
#define ALU_WIDTH (1<<LOG_ALU_WIDTH) #define VTA_ALU_WIDTH (1 << VTA_LOG_ALU_WIDTH)
/*! Batch size (corresponds to A in (A,B)x(B,C) mat mult)*/ /*! Batch size (corresponds to A in (A,B)x(B,C) mat mult)*/
#define BATCH (1<<LOG_BATCH) #define VTA_BATCH (1 << VTA_LOG_BATCH)
/*! Blocking factor of inner most loop (corresponds to B in (A,B)x(B,C) mat mult) */ /*! Blocking factor of inner most loop (corresponds to B in (A,B)x(B,C) mat mult) */
#define BLOCK_IN (1<<LOG_BLOCK_IN) #define VTA_BLOCK_IN (1 << VTA_LOG_BLOCK_IN)
/*! Blocking factor of the outer loop (corresponds to C in (A,B)x(B,C) mat mult) */ /*! Blocking factor of the outer loop (corresponds to C in (A,B)x(B,C) mat mult) */
#define BLOCK_OUT (1<<LOG_BLOCK_OUT) #define VTA_BLOCK_OUT (1 << VTA_LOG_BLOCK_OUT)
/*! Weight vector width */ /*! Weight vector width */
#define WGT_VECTOR_WIDTH (WGT_WIDTH*BLOCK_IN) #define VTA_WGT_VECTOR_WIDTH (VTA_WGT_WIDTH * VTA_BLOCK_IN)
/*! Input vector width */ /*! Input vector width */
#define INP_VECTOR_WIDTH (INP_WIDTH*BLOCK_IN) #define VTA_INP_VECTOR_WIDTH (VTA_INP_WIDTH * VTA_BLOCK_IN)
/*! Accumulator vector width */ /*! Accumulator vector width */
#define ACC_VECTOR_WIDTH (ACC_WIDTH*BLOCK_OUT) #define VTA_ACC_VECTOR_WIDTH (VTA_ACC_WIDTH * VTA_BLOCK_OUT)
/*! Output vector width */ /*! Output vector width */
#define OUT_VECTOR_WIDTH (OUT_WIDTH*BLOCK_OUT) #define VTA_OUT_VECTOR_WIDTH (VTA_OUT_WIDTH * VTA_BLOCK_OUT)
/*! On-chip micro-op buffer size in B */ /*! On-chip micro-op buffer size in B */
#define UOP_BUFF_SIZE (1<<LOG_UOP_BUFF_SIZE) #define VTA_UOP_BUFF_SIZE (1 << VTA_LOG_UOP_BUFF_SIZE)
/*! On-chip weight buffer size in B */ /*! On-chip weight buffer size in B */
#define WGT_BUFF_SIZE (1<<LOG_WGT_BUFF_SIZE) #define VTA_WGT_BUFF_SIZE (1 << VTA_LOG_WGT_BUFF_SIZE)
/*! On-chip activation buffer size in B */ /*! On-chip activation buffer size in B */
#define INP_BUFF_SIZE (1<<LOG_INP_BUFF_SIZE) #define VTA_INP_BUFF_SIZE (1 << VTA_LOG_INP_BUFF_SIZE)
/*! On-chip accumulator buffer size in B */ /*! On-chip accumulator buffer size in B */
#define ACC_BUFF_SIZE (1<<LOG_ACC_BUFF_SIZE) #define VTA_ACC_BUFF_SIZE (1 << VTA_LOG_ACC_BUFF_SIZE)
/*! Size of instruction buffer element in B */ /*! Size of instruction buffer element in B */
#define INS_ELEM_BYTES (INS_WIDTH/8) #define VTA_INS_ELEM_BYTES (VTA_INS_WIDTH / 8)
/*! Size of uop buffer element in B*/ /*! Size of uop buffer element in B*/
#define UOP_ELEM_BYTES (UOP_WIDTH/8) #define VTA_UOP_ELEM_BYTES (VTA_UOP_WIDTH / 8)
/*! Size of activation buffer element in B*/ /*! Size of activation buffer element in B*/
#define INP_ELEM_BYTES (BATCH*BLOCK_IN*INP_WIDTH/8) #define VTA_INP_ELEM_BYTES (VTA_BATCH * VTA_BLOCK_IN * VTA_INP_WIDTH / 8)
/*! Size of weight buffer element in B*/ /*! Size of weight buffer element in B*/
#define WGT_ELEM_BYTES (BLOCK_OUT*BLOCK_IN*WGT_WIDTH/8) #define VTA_WGT_ELEM_BYTES (VTA_BLOCK_OUT * VTA_BLOCK_IN * VTA_WGT_WIDTH / 8)
/*! Size of accumulator buffer element in B*/ /*! Size of accumulator buffer element in B*/
#define ACC_ELEM_BYTES (BATCH*BLOCK_OUT*ACC_WIDTH/8) #define VTA_ACC_ELEM_BYTES (VTA_BATCH * VTA_BLOCK_OUT * VTA_ACC_WIDTH / 8)
/*! On-chip micro-op buffer depth */ /*! On-chip micro-op buffer depth */
#define UOP_BUFF_DEPTH (UOP_BUFF_SIZE/UOP_ELEM_BYTES) #define VTA_UOP_BUFF_DEPTH (VTA_UOP_BUFF_SIZE / VTA_UOP_ELEM_BYTES)
/*! log2 of on-chip micro-op buffer depth */ /*! log2 of on-chip micro-op buffer depth */
#define LOG_UOP_BUFF_DEPTH (LOG_UOP_BUFF_SIZE-LOG_UOP_WIDTH+3) #define VTA_LOG_UOP_BUFF_DEPTH (VTA_LOG_UOP_BUFF_SIZE - VTA_LOG_UOP_WIDTH + 3)
/*! On-chip weight buffer depth */ /*! On-chip weight buffer depth */
#define WGT_BUFF_DEPTH (WGT_BUFF_SIZE/WGT_ELEM_BYTES) #define VTA_WGT_BUFF_DEPTH (VTA_WGT_BUFF_SIZE / VTA_WGT_ELEM_BYTES)
/*! log2 of on-chip weight buffer depth */ /*! log2 of on-chip weight buffer depth */
#define LOG_WGT_BUFF_DEPTH (LOG_WGT_BUFF_SIZE-LOG_BLOCK_OUT-LOG_BLOCK_IN-LOG_WGT_WIDTH+3) #define VTA_LOG_WGT_BUFF_DEPTH \
(VTA_LOG_WGT_BUFF_SIZE - VTA_LOG_BLOCK_OUT - VTA_LOG_BLOCK_IN - VTA_LOG_WGT_WIDTH + 3)
/*! On-chip activation buffer depth */ /*! On-chip activation buffer depth */
#define INP_BUFF_DEPTH (INP_BUFF_SIZE/INP_ELEM_BYTES) #define VTA_INP_BUFF_DEPTH (VTA_INP_BUFF_SIZE / VTA_INP_ELEM_BYTES)
/*! log2 of on-chip activation buffer depth */ /*! log2 of on-chip activation buffer depth */
#define LOG_INP_BUFF_DEPTH (LOG_INP_BUFF_SIZE-LOG_BATCH-LOG_BLOCK_IN-LOG_INP_WIDTH+3) #define VTA_LOG_INP_BUFF_DEPTH \
(VTA_LOG_INP_BUFF_SIZE - VTA_LOG_BATCH - VTA_LOG_BLOCK_IN - VTA_LOG_INP_WIDTH + 3)
/*! On-chip accumulator buffer depth */ /*! On-chip accumulator buffer depth */
#define ACC_BUFF_DEPTH (ACC_BUFF_SIZE/ACC_ELEM_BYTES) #define VTA_ACC_BUFF_DEPTH (VTA_ACC_BUFF_SIZE / VTA_ACC_ELEM_BYTES)
/*! log2 of on-chip accumulator buffer depth */ /*! log2 of on-chip accumulator buffer depth */
#define LOG_ACC_BUFF_DEPTH (LOG_ACC_BUFF_SIZE-LOG_BATCH-LOG_BLOCK_OUT-LOG_ACC_WIDTH+3) #define VTA_LOG_ACC_BUFF_DEPTH \
(VTA_LOG_ACC_BUFF_SIZE - VTA_LOG_BATCH - VTA_LOG_BLOCK_OUT - VTA_LOG_ACC_WIDTH + 3)
/*! Instruction opcode field bitwidth */ /*! Instruction opcode field bitwidth */
#define OPCODE_BIT_WIDTH 3 #define VTA_OPCODE_BIT_WIDTH 3
/*! ALU opcode field bitwidth */ /*! ALU opcode field bitwidth */
#define ALU_OPCODE_BIT_WIDTH 3 #define VTA_ALU_OPCODE_BIT_WIDTH 3
/*! ALU instruction reset mode bitwidth */ /*! ALU instruction reset mode bitwidth */
#define ALU_RESET_BIT_WIDTH 2 #define VTA_ALU_RESET_BIT_WIDTH 2
/*! Opcode: load encoding */ /*! Opcode: load encoding */
#define OPCODE_LOAD 0 #define VTA_OPCODE_LOAD 0
/*! Opcode: store encoding */ /*! Opcode: store encoding */
#define OPCODE_STORE 1 #define VTA_OPCODE_STORE 1
/*! Opcode: GEMM encoding */ /*! Opcode: GEMM encoding */
#define OPCODE_GEMM 2 #define VTA_OPCODE_GEMM 2
/*! Opcode: finish encoding */ /*! Opcode: finish encoding */
#define OPCODE_FINISH 3 #define VTA_OPCODE_FINISH 3
/*! Opcode: ALU encoding */ /*! Opcode: ALU encoding */
#define OPCODE_ALU 4 #define VTA_OPCODE_ALU 4
/*! ALU opcode: unary min op */ /*! ALU opcode: unary min op */
#define ALU_OPCODE_MIN 0 #define VTA_ALU_OPCODE_MIN 0
/*! ALU opcode: unary max op */ /*! ALU opcode: unary max op */
#define ALU_OPCODE_MAX 1 #define VTA_ALU_OPCODE_MAX 1
/*! ALU opcode: binary add op */ /*! ALU opcode: binary add op */
#define ALU_OPCODE_ADD 2 #define VTA_ALU_OPCODE_ADD 2
/*! ALU opcode: binary sub op [NOT IMPLEMENTED] */ /*! ALU opcode: binary sub op [NOT IMPLEMENTED] */
#define ALU_OPCODE_SUB 3 #define VTA_ALU_OPCODE_SUB 3
/*! ALU opcode: binary mul op [NOT IMPLEMENTED] */ /*! ALU opcode: binary mul op [NOT IMPLEMENTED] */
#define ALU_OPCODE_MUL 4 #define VTA_ALU_OPCODE_MUL 4
/*! ALU opcode: shift left by immediate op */ /*! ALU opcode: shift left by immediate op */
#define ALU_OPCODE_SHL 5 #define VTA_ALU_OPCODE_SHL 5
/*! ALU opcode: shift right by immediate op [NOT IMPLEMENTED] */ /*! ALU opcode: shift right by immediate op [NOT IMPLEMENTED] */
#define ALU_OPCODE_SHR 6 #define VTA_ALU_OPCODE_SHR 6
/*! ALU instruction reset mode: set to min */ /*! ALU instruction reset mode: set to min */
#define ALU_RESET_MIN 3 #define VTA_ALU_RESET_MIN 3
/*! ALU instruction reset mode: set to zero */ /*! ALU instruction reset mode: set to zero */
#define ALU_RESET_ZERO 0 #define VTA_ALU_RESET_ZERO 0
/*! ALU instruction reset mode: no reset */ /*! ALU instruction reset mode: no reset */
#define ALU_NO_RESET 2 #define VTA_ALU_NO_RESET 2
/*! ALU instruction reset mode: set to max */ /*! ALU instruction reset mode: set to max */
#define ALU_RESET_MAX 1 #define VTA_ALU_RESET_MAX 1
/*! Memory type field bitwidth */ /*! Memory type field bitwidth */
#define MEMOP_ID_BIT_WIDTH 2 #define VTA_MEMOP_ID_BIT_WIDTH 2
/*! Load/Store Instruction: SRAM address width*/ /*! Load/Store Instruction: SRAM address width*/
#define MEMOP_SRAM_ADDR_BIT_WIDTH 16 #define VTA_MEMOP_SRAM_ADDR_BIT_WIDTH 16
/*! Load/Store Instruction: DRAM address width*/ /*! Load/Store Instruction: DRAM address width*/
#define MEMOP_DRAM_ADDR_BIT_WIDTH 32 #define VTA_MEMOP_DRAM_ADDR_BIT_WIDTH 32
/*! Load/Store Instruction: transfer size width*/ /*! Load/Store Instruction: transfer size width*/
#define MEMOP_SIZE_BIT_WIDTH 16 #define VTA_MEMOP_SIZE_BIT_WIDTH 16
/*! Load/Store Instruction: stride size width*/ /*! Load/Store Instruction: stride size width*/
#define MEMOP_STRIDE_BIT_WIDTH 16 #define VTA_MEMOP_STRIDE_BIT_WIDTH 16
/*! Load/Store Instruction: padding width*/ /*! Load/Store Instruction: padding width*/
#define MEMOP_PAD_BIT_WIDTH 4 #define VTA_MEMOP_PAD_BIT_WIDTH 4
/*! Load/Store Instruction: padding value encoding width*/ /*! Load/Store Instruction: padding value encoding width*/
#define MEMOP_PAD_VAL_BIT_WIDTH 2 #define VTA_MEMOP_PAD_VAL_BIT_WIDTH 2
/*! ALU Instruction: immediate bitwidth*/ /*! ALU Instruction: immediate bitwidth*/
#define ALUOP_IMM_BIT_WIDTH 16 #define VTA_ALUOP_IMM_BIT_WIDTH 16
/*! GEMM/ALU Instruction: loop max iter bits */ /*! GEMM/ALU Instruction: loop max iter bits */
#define LOOP_ITER_WIDTH 15 #define VTA_LOOP_ITER_WIDTH 15
/*! Mem ID constant: uop memory */ /*! Mem ID constant: uop memory */
#define MEM_ID_UOP 0 #define VTA_MEM_ID_UOP 0
/*! Mem ID constant: weight memory */ /*! Mem ID constant: weight memory */
#define MEM_ID_WGT 1 #define VTA_MEM_ID_WGT 1
/*! Mem ID constant: input memory */ /*! Mem ID constant: input memory */
#define MEM_ID_INP 2 #define VTA_MEM_ID_INP 2
/*! Mem ID constant: accumulator/bias memory */ /*! Mem ID constant: accumulator/bias memory */
#define MEM_ID_ACC 3 #define VTA_MEM_ID_ACC 3
/*! Mem ID constant: output store buffer */ /*! Mem ID constant: output store buffer */
#define MEM_ID_OUT 4 #define VTA_MEM_ID_OUT 4
// Instruction organization layout: // Instruction organization layout:
// //
...@@ -218,152 +221,152 @@ extern "C" { ...@@ -218,152 +221,152 @@ extern "C" {
// arg f: imm | alu_imm_T | // arg f: imm | alu_imm_T |
/*! Load/Store instruction start position of the opcode field */ /*! Load/Store instruction start position of the opcode field */
#define INSN_MEM_0_0 0 #define VTA_INSN_MEM_0_0 0
/*! Load/Store instruction end position of the opcode field */ /*! Load/Store instruction end position of the opcode field */
#define INSN_MEM_0_1 (INSN_MEM_0_0+OPCODE_BIT_WIDTH-1) #define VTA_INSN_MEM_0_1 (VTA_INSN_MEM_0_0 + VTA_OPCODE_BIT_WIDTH - 1)
/*! Load/Store instruction position of the pop_prev_dep field */ /*! Load/Store instruction position of the pop_prev_dep field */
#define INSN_MEM_1 (INSN_MEM_0_1+1) #define VTA_INSN_MEM_1 (VTA_INSN_MEM_0_1 + 1)
/*! Load/Store instruction position of the pop_next_dep field */ /*! Load/Store instruction position of the pop_next_dep field */
#define INSN_MEM_2 (INSN_MEM_1+1) #define VTA_INSN_MEM_2 (VTA_INSN_MEM_1 + 1)
/*! Load/Store instruction position of the push_prev_dependence field */ /*! Load/Store instruction position of the push_prev_dependence field */
#define INSN_MEM_3 (INSN_MEM_2+1) #define VTA_INSN_MEM_3 (VTA_INSN_MEM_2 + 1)
/*! Load/Store instruction position of the push_next_dependence field */ /*! Load/Store instruction position of the push_next_dependence field */
#define INSN_MEM_4 (INSN_MEM_3+1) #define VTA_INSN_MEM_4 (VTA_INSN_MEM_3 + 1)
/*! Load/Store instruction start position of the memory_type field */ /*! Load/Store instruction start position of the memory_type field */
#define INSN_MEM_5_0 (INSN_MEM_4+1) #define VTA_INSN_MEM_5_0 (VTA_INSN_MEM_4 + 1)
/*! Load/Store instruction end position of the memory_type field */ /*! Load/Store instruction end position of the memory_type field */
#define INSN_MEM_5_1 (INSN_MEM_5_0+MEMOP_ID_BIT_WIDTH-1) #define VTA_INSN_MEM_5_1 (VTA_INSN_MEM_5_0 + VTA_MEMOP_ID_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the sram_base field */ /*! Load/Store instruction start position of the sram_base field */
#define INSN_MEM_6_0 (INSN_MEM_5_1+1) #define VTA_INSN_MEM_6_0 (VTA_INSN_MEM_5_1 + 1)
/*! Load/Store instruction end position of the sram_base field */ /*! Load/Store instruction end position of the sram_base field */
#define INSN_MEM_6_1 (INSN_MEM_6_0+MEMOP_SRAM_ADDR_BIT_WIDTH-1) #define VTA_INSN_MEM_6_1 (VTA_INSN_MEM_6_0 + VTA_MEMOP_SRAM_ADDR_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the dram_base field */ /*! Load/Store instruction start position of the dram_base field */
#define INSN_MEM_7_0 (INSN_MEM_6_1+1) #define VTA_INSN_MEM_7_0 (VTA_INSN_MEM_6_1 + 1)
/*! Load/Store instruction end position of the dram_base field */ /*! Load/Store instruction end position of the dram_base field */
#define INSN_MEM_7_1 (INSN_MEM_7_0+MEMOP_DRAM_ADDR_BIT_WIDTH-1) #define VTA_INSN_MEM_7_1 (VTA_INSN_MEM_7_0 + VTA_MEMOP_DRAM_ADDR_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the y_size field */ /*! Load/Store instruction start position of the y_size field */
#define INSN_MEM_8_0 64 #define VTA_INSN_MEM_8_0 64
/*! Load/Store instruction end position of the y_size field */ /*! Load/Store instruction end position of the y_size field */
#define INSN_MEM_8_1 (INSN_MEM_8_0+MEMOP_SIZE_BIT_WIDTH-1) #define VTA_INSN_MEM_8_1 (VTA_INSN_MEM_8_0 + VTA_MEMOP_SIZE_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the x_size field */ /*! Load/Store instruction start position of the x_size field */
#define INSN_MEM_9_0 (INSN_MEM_8_1+1) #define VTA_INSN_MEM_9_0 (VTA_INSN_MEM_8_1 + 1)
/*! Load/Store instruction end position of the x_size field */ /*! Load/Store instruction end position of the x_size field */
#define INSN_MEM_9_1 (INSN_MEM_9_0+MEMOP_SIZE_BIT_WIDTH-1) #define VTA_INSN_MEM_9_1 (VTA_INSN_MEM_9_0 + VTA_MEMOP_SIZE_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the x_stride field */ /*! Load/Store instruction start position of the x_stride field */
#define INSN_MEM_A_0 (INSN_MEM_9_1+1) #define VTA_INSN_MEM_A_0 (VTA_INSN_MEM_9_1 + 1)
/*! Load/Store instruction end position of the x_stride field */ /*! Load/Store instruction end position of the x_stride field */
#define INSN_MEM_A_1 (INSN_MEM_A_0+MEMOP_STRIDE_BIT_WIDTH-1) #define VTA_INSN_MEM_A_1 (VTA_INSN_MEM_A_0 + VTA_MEMOP_STRIDE_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the y_pad_0 field */ /*! Load/Store instruction start position of the y_pad_0 field */
#define INSN_MEM_B_0 (INSN_MEM_A_1+1) #define VTA_INSN_MEM_B_0 (VTA_INSN_MEM_A_1 + 1)
/*! Load/Store instruction end position of the y_pad_0 field */ /*! Load/Store instruction end position of the y_pad_0 field */
#define INSN_MEM_B_1 (INSN_MEM_B_0+MEMOP_PAD_BIT_WIDTH-1) #define VTA_INSN_MEM_B_1 (VTA_INSN_MEM_B_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the y_pad_1 field */ /*! Load/Store instruction start position of the y_pad_1 field */
#define INSN_MEM_C_0 (INSN_MEM_B_1+1) #define VTA_INSN_MEM_C_0 (VTA_INSN_MEM_B_1 + 1)
/*! Load/Store instruction end position of the y_pad_1 field */ /*! Load/Store instruction end position of the y_pad_1 field */
#define INSN_MEM_C_1 (INSN_MEM_C_0+MEMOP_PAD_BIT_WIDTH-1) #define VTA_INSN_MEM_C_1 (VTA_INSN_MEM_C_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the x_pad_0 field */ /*! Load/Store instruction start position of the x_pad_0 field */
#define INSN_MEM_D_0 (INSN_MEM_C_1+1) #define VTA_INSN_MEM_D_0 (VTA_INSN_MEM_C_1 + 1)
/*! Load/Store instruction end position of the x_pad_0 field */ /*! Load/Store instruction end position of the x_pad_0 field */
#define INSN_MEM_D_1 (INSN_MEM_D_0+MEMOP_PAD_BIT_WIDTH-1) #define VTA_INSN_MEM_D_1 (VTA_INSN_MEM_D_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
/*! Load/Store instruction start position of the x_pad_1 field */ /*! Load/Store instruction start position of the x_pad_1 field */
#define INSN_MEM_E_0 (INSN_MEM_D_1+1) #define VTA_INSN_MEM_E_0 (VTA_INSN_MEM_D_1 + 1)
/*! Load/Store instruction end position of the x_pad_1 field */ /*! Load/Store instruction end position of the x_pad_1 field */
#define INSN_MEM_E_1 (INSN_MEM_E_0+MEMOP_PAD_BIT_WIDTH-1) #define VTA_INSN_MEM_E_1 (VTA_INSN_MEM_E_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
/*! GEMM instruction start position of the opcode field */ /*! GEMM instruction start position of the opcode field */
#define INSN_GEM_0_0 0 #define VTA_INSN_GEM_0_0 0
/*! GEMM instruction end position of the opcode field */ /*! GEMM instruction end position of the opcode field */
#define INSN_GEM_0_1 (INSN_GEM_0_0+OPCODE_BIT_WIDTH-1) #define VTA_INSN_GEM_0_1 (VTA_INSN_GEM_0_0 + VTA_OPCODE_BIT_WIDTH - 1)
/*! GEMM instruction position of the pop_prev_dep field */ /*! GEMM instruction position of the pop_prev_dep field */
#define INSN_GEM_1 (INSN_GEM_0_1+1) #define VTA_INSN_GEM_1 (VTA_INSN_GEM_0_1 + 1)
/*! GEMM instruction position of the pop_next_dep field */ /*! GEMM instruction position of the pop_next_dep field */
#define INSN_GEM_2 (INSN_GEM_1+1) #define VTA_INSN_GEM_2 (VTA_INSN_GEM_1 + 1)
/*! GEMM instruction position of the push_prev_dependence field */ /*! GEMM instruction position of the push_prev_dependence field */
#define INSN_GEM_3 (INSN_GEM_2+1) #define VTA_INSN_GEM_3 (VTA_INSN_GEM_2 + 1)
/*! GEMM instruction position of the push_next_dependence field */ /*! GEMM instruction position of the push_next_dependence field */
#define INSN_GEM_4 (INSN_GEM_3+1) #define VTA_INSN_GEM_4 (VTA_INSN_GEM_3 + 1)
/*! GEMM instruction start position of the uop_bgn field */ /*! GEMM instruction start position of the uop_bgn field */
#define INSN_GEM_5_0 (INSN_GEM_4+1) #define VTA_INSN_GEM_5_0 (VTA_INSN_GEM_4 + 1)
/*! GEMM instruction end position of the uop_bgn field */ /*! GEMM instruction end position of the uop_bgn field */
#define INSN_GEM_5_1 (INSN_GEM_5_0+LOG_UOP_BUFF_DEPTH-1) #define VTA_INSN_GEM_5_1 (VTA_INSN_GEM_5_0 + VTA_LOG_UOP_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the uop_end field */ /*! GEMM instruction start position of the uop_end field */
#define INSN_GEM_6_0 (INSN_GEM_5_1+1) #define VTA_INSN_GEM_6_0 (VTA_INSN_GEM_5_1 + 1)
/*! GEMM instruction end position of the uop_end field */ /*! GEMM instruction end position of the uop_end field */
#define INSN_GEM_6_1 (INSN_GEM_6_0+LOG_UOP_BUFF_DEPTH+1-1) #define VTA_INSN_GEM_6_1 (VTA_INSN_GEM_6_0 + VTA_LOG_UOP_BUFF_DEPTH + 1 - 1)
/*! GEMM instruction start position of the iter_out field */ /*! GEMM instruction start position of the iter_out field */
#define INSN_GEM_7_0 (INSN_GEM_6_1+1) #define VTA_INSN_GEM_7_0 (VTA_INSN_GEM_6_1 + 1)
/*! GEMM instruction end position of the iter_out field */ /*! GEMM instruction end position of the iter_out field */
#define INSN_GEM_7_1 (INSN_GEM_7_0+LOOP_ITER_WIDTH-1) #define VTA_INSN_GEM_7_1 (VTA_INSN_GEM_7_0 + VTA_LOOP_ITER_WIDTH - 1)
/*! GEMM instruction start position of the iter_in field */ /*! GEMM instruction start position of the iter_in field */
#define INSN_GEM_8_0 (INSN_GEM_7_1+1) #define VTA_INSN_GEM_8_0 (VTA_INSN_GEM_7_1 + 1)
/*! GEMM instruction end position of the iter_in field */ /*! GEMM instruction end position of the iter_in field */
#define INSN_GEM_8_1 (INSN_GEM_8_0+LOOP_ITER_WIDTH-1) #define VTA_INSN_GEM_8_1 (VTA_INSN_GEM_8_0 + VTA_LOOP_ITER_WIDTH - 1)
/*! GEMM instruction start position of the dst_factor_out field */ /*! GEMM instruction start position of the dst_factor_out field */
#define INSN_GEM_9_0 64 #define VTA_INSN_GEM_9_0 64
/*! GEMM instruction end position of the dst_factor_out field */ /*! GEMM instruction end position of the dst_factor_out field */
#define INSN_GEM_9_1 (INSN_GEM_9_0+LOG_ACC_BUFF_DEPTH-1) #define VTA_INSN_GEM_9_1 (VTA_INSN_GEM_9_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the dst_factor_in field */ /*! GEMM instruction start position of the dst_factor_in field */
#define INSN_GEM_A_0 (INSN_GEM_9_1+1) #define VTA_INSN_GEM_A_0 (VTA_INSN_GEM_9_1 + 1)
/*! GEMM instruction end position of the dst_factor_in field */ /*! GEMM instruction end position of the dst_factor_in field */
#define INSN_GEM_A_1 (INSN_GEM_A_0+LOG_ACC_BUFF_DEPTH-1) #define VTA_INSN_GEM_A_1 (VTA_INSN_GEM_A_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the src_factor_out field */ /*! GEMM instruction start position of the src_factor_out field */
#define INSN_GEM_B_0 (INSN_GEM_A_1+1) #define VTA_INSN_GEM_B_0 (VTA_INSN_GEM_A_1 + 1)
/*! GEMM instruction end position of the src_factor_out field */ /*! GEMM instruction end position of the src_factor_out field */
#define INSN_GEM_B_1 (INSN_GEM_B_0+LOG_ACC_BUFF_DEPTH-1) #define VTA_INSN_GEM_B_1 (VTA_INSN_GEM_B_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the src_factor_in field */ /*! GEMM instruction start position of the src_factor_in field */
#define INSN_GEM_C_0 (INSN_GEM_B_1+1) #define VTA_INSN_GEM_C_0 (VTA_INSN_GEM_B_1 + 1)
/*! GEMM instruction end position of the src_factor_in field */ /*! GEMM instruction end position of the src_factor_in field */
#define INSN_GEM_C_1 (INSN_GEM_C_0+LOG_ACC_BUFF_DEPTH-1) #define VTA_INSN_GEM_C_1 (VTA_INSN_GEM_C_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the wgt_factor_out field */ /*! GEMM instruction start position of the wgt_factor_out field */
#define INSN_GEM_D_0 (INSN_GEM_C_1+1) #define VTA_INSN_GEM_D_0 (VTA_INSN_GEM_C_1 + 1)
/*! GEMM instruction end position of the wgt_factor_out field */ /*! GEMM instruction end position of the wgt_factor_out field */
#define INSN_GEM_D_1 (INSN_GEM_D_0+LOG_WGT_BUFF_DEPTH-1) #define VTA_INSN_GEM_D_1 (VTA_INSN_GEM_D_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)
/*! GEMM instruction start position of the wgt_factor_in field */ /*! GEMM instruction start position of the wgt_factor_in field */
#define INSN_GEM_E_0 (INSN_GEM_D_1+1) #define VTA_INSN_GEM_E_0 (VTA_INSN_GEM_D_1 + 1)
/*! GEMM instruction end position of the wgt_factor_in field */ /*! GEMM instruction end position of the wgt_factor_in field */
#define INSN_GEM_E_1 (INSN_GEM_E_0+LOG_WGT_BUFF_DEPTH-1) #define VTA_INSN_GEM_E_1 (VTA_INSN_GEM_E_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)
/*! ALU instruction start position of the alu_opcode field */ /*! ALU instruction start position of the alu_opcode field */
#define INSN_ALU_D_0 (INSN_GEM_C_1+1) #define VTA_INSN_ALU_D_0 (VTA_INSN_GEM_C_1 + 1)
/*! ALU instruction end position of the alu_opcode field */ /*! ALU instruction end position of the alu_opcode field */
#define INSN_ALU_D_1 (INSN_ALU_D_0+ALU_OPCODE_BIT_WIDTH-1) #define VTA_INSN_ALU_D_1 (VTA_INSN_ALU_D_0 + VTA_ALU_OPCODE_BIT_WIDTH - 1)
/*! ALU instruction position of the use_imm field */ /*! ALU instruction position of the use_imm field */
#define INSN_ALU_E (INSN_ALU_D_1+1) #define VTA_INSN_ALU_E (VTA_INSN_ALU_D_1 + 1)
/*! ALU instruction start position of the immediate field */ /*! ALU instruction start position of the immediate field */
#define INSN_ALU_F_0 (INSN_ALU_E+1) #define VTA_INSN_ALU_F_0 (VTA_INSN_ALU_E + 1)
/*! ALU instruction end position of the immediate field */ /*! ALU instruction end position of the immediate field */
#define INSN_ALU_F_1 (INSN_ALU_F_0+ALUOP_IMM_BIT_WIDTH-1) #define VTA_INSN_ALU_F_1 (VTA_INSN_ALU_F_0 + VTA_ALUOP_IMM_BIT_WIDTH - 1)
/*! GEMM Micro-op position of the reset_out field */ /*! GEMM Micro-op position of the reset_out field */
#define UOP_GEM_0 0 #define VTA_UOP_GEM_0 0
/*! GEMM Micro-op start position of the acc_idx field */ /*! GEMM Micro-op start position of the acc_idx field */
#define UOP_GEM_1_0 (UOP_GEM_0+1) #define VTA_UOP_GEM_1_0 (VTA_UOP_GEM_0 + 1)
/*! GEMM Micro-op end position of the acc_idx field */ /*! GEMM Micro-op end position of the acc_idx field */
#define UOP_GEM_1_1 (UOP_GEM_1_0+LOG_ACC_BUFF_DEPTH-1) #define VTA_UOP_GEM_1_1 (VTA_UOP_GEM_1_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
/*! GEMM Micro-op start position of the inp_idx field */ /*! GEMM Micro-op start position of the inp_idx field */
#define UOP_GEM_2_0 (UOP_GEM_1_1+1) #define VTA_UOP_GEM_2_0 (VTA_UOP_GEM_1_1 + 1)
/*! GEMM Micro-op end position of the inp_idx field */ /*! GEMM Micro-op end position of the inp_idx field */
#define UOP_GEM_2_1 (UOP_GEM_2_0+LOG_ACC_BUFF_DEPTH-1) #define VTA_UOP_GEM_2_1 (VTA_UOP_GEM_2_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
/*! GEMM Micro-op start position of the wgt_idx field */ /*! GEMM Micro-op start position of the wgt_idx field */
#define UOP_GEM_3_0 (UOP_GEM_2_1+1) #define VTA_UOP_GEM_3_0 (VTA_UOP_GEM_2_1 + 1)
/*! GEMM Micro-op end position of the wgt_idx field */ /*! GEMM Micro-op end position of the wgt_idx field */
#define UOP_GEM_3_1 (UOP_GEM_3_0+LOG_WGT_BUFF_DEPTH-1) #define VTA_UOP_GEM_3_1 (VTA_UOP_GEM_3_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)
/*! ALU Micro-op position of the reset_out field */ /*! ALU Micro-op position of the reset_out field */
#define UOP_ALU_0 0 #define VTA_UOP_ALU_0 0
/*! ALU Micro-op start position of the acc_idx field */ /*! ALU Micro-op start position of the acc_idx field */
#define UOP_ALU_1_0 (UOP_ALU_0+1) #define VTA_UOP_ALU_1_0 (VTA_UOP_ALU_0 + 1)
/*! ALU Micro-op end position of the acc_idx field */ /*! ALU Micro-op end position of the acc_idx field */
#define UOP_ALU_1_1 (UOP_ALU_1_0+LOG_ACC_BUFF_DEPTH-1) #define VTA_UOP_ALU_1_1 (VTA_UOP_ALU_1_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
/*! ALU Micro-op start position of the inp_idx field */ /*! ALU Micro-op start position of the inp_idx field */
#define UOP_ALU_2_0 (UOP_ALU_1_1+1) #define VTA_UOP_ALU_2_0 (VTA_UOP_ALU_1_1 + 1)
/*! ALU Micro-op end position of the inp_idx field */ /*! ALU Micro-op end position of the inp_idx field */
#define UOP_ALU_2_1 (UOP_ALU_2_0+LOG_ACC_BUFF_DEPTH-1) #define VTA_UOP_ALU_2_1 (VTA_UOP_ALU_2_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
/*! ALU Micro-op start position of the wgt_idx field */ /*! ALU Micro-op start position of the wgt_idx field */
#define UOP_ALU_3_0 (UOP_ALU_2_1+1) #define VTA_UOP_ALU_3_0 (VTA_UOP_ALU_2_1 + 1)
/*! ALU Micro-op end position of the wgt_idx field */ /*! ALU Micro-op end position of the wgt_idx field */
#define UOP_ALU_3_1 (UOP_ALU_3_0+LOG_WGT_BUFF_DEPTH-1) #define VTA_UOP_ALU_3_1 (VTA_UOP_ALU_3_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)
/*! \brief VTA generic instruction */ /*! \brief VTA generic instruction */
typedef struct { typedef struct {
...@@ -382,7 +385,7 @@ typedef struct { ...@@ -382,7 +385,7 @@ typedef struct {
*/ */
typedef struct { typedef struct {
/*! \brief The instruction opcode */ /*! \brief The instruction opcode */
uint64_t opcode : OPCODE_BIT_WIDTH; uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
/*! \brief Unused in this instruction */ /*! \brief Unused in this instruction */
uint64_t pop_prev_dep : 1; uint64_t pop_prev_dep : 1;
/*! \brief Pop dependence token from GEMM stage */ /*! \brief Pop dependence token from GEMM stage */
...@@ -392,25 +395,25 @@ typedef struct { ...@@ -392,25 +395,25 @@ typedef struct {
/*! \brief Push dependence token to GEMM stage */ /*! \brief Push dependence token to GEMM stage */
uint64_t push_next_dep : 1; uint64_t push_next_dep : 1;
/*! \brief Source/destination SRAM for store/load instruction */ /*! \brief Source/destination SRAM for store/load instruction */
uint64_t memory_type : MEMOP_ID_BIT_WIDTH; uint64_t memory_type : VTA_MEMOP_ID_BIT_WIDTH;
/*! \brief SRAM base address (pointer to memory elem type) */ /*! \brief SRAM base address (pointer to memory elem type) */
uint64_t sram_base : MEMOP_SRAM_ADDR_BIT_WIDTH; uint64_t sram_base : VTA_MEMOP_SRAM_ADDR_BIT_WIDTH;
/*! \brief DRAM base address (pointer to memory elem type) */ /*! \brief DRAM base address (pointer to memory elem type) */
uint64_t dram_base : MEMOP_DRAM_ADDR_BIT_WIDTH; uint64_t dram_base : VTA_MEMOP_DRAM_ADDR_BIT_WIDTH;
/*! \brief 2D access pattern: y-size */ /*! \brief 2D access pattern: y-size */
uint64_t y_size : MEMOP_SIZE_BIT_WIDTH; uint64_t y_size : VTA_MEMOP_SIZE_BIT_WIDTH;
/*! \brief 2D access pattern: x-size (in terms of memory elements) */ /*! \brief 2D access pattern: x-size (in terms of memory elements) */
uint64_t x_size : MEMOP_SIZE_BIT_WIDTH; uint64_t x_size : VTA_MEMOP_SIZE_BIT_WIDTH;
/*! \brief 2D access pattern: x-stride (in terms of memory elements) */ /*! \brief 2D access pattern: x-stride (in terms of memory elements) */
uint64_t x_stride : MEMOP_STRIDE_BIT_WIDTH; uint64_t x_stride : VTA_MEMOP_STRIDE_BIT_WIDTH;
/*! \brief 2D access pattern: start padding along y dimension */ /*! \brief 2D access pattern: start padding along y dimension */
uint64_t y_pad_0 : MEMOP_PAD_BIT_WIDTH; uint64_t y_pad_0 : VTA_MEMOP_PAD_BIT_WIDTH;
/*! \brief 2D access pattern: end padding along y dimension */ /*! \brief 2D access pattern: end padding along y dimension */
uint64_t y_pad_1 : MEMOP_PAD_BIT_WIDTH; uint64_t y_pad_1 : VTA_MEMOP_PAD_BIT_WIDTH;
/*! \brief 2D access pattern: start padding along x dimension */ /*! \brief 2D access pattern: start padding along x dimension */
uint64_t x_pad_0 : MEMOP_PAD_BIT_WIDTH; uint64_t x_pad_0 : VTA_MEMOP_PAD_BIT_WIDTH;
/*! \brief 2D access pattern: end padding along x dimension */ /*! \brief 2D access pattern: end padding along x dimension */
uint64_t x_pad_1 : MEMOP_PAD_BIT_WIDTH; uint64_t x_pad_1 : VTA_MEMOP_PAD_BIT_WIDTH;
} VTAMemInsn; } VTAMemInsn;
/*! \brief VTA GEMM instruction /*! \brief VTA GEMM instruction
...@@ -442,7 +445,7 @@ typedef struct { ...@@ -442,7 +445,7 @@ typedef struct {
*/ */
typedef struct { typedef struct {
/*! \brief The instruction opcode */ /*! \brief The instruction opcode */
uint64_t opcode : OPCODE_BIT_WIDTH; uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
/*! \brief Pop dependence token from load stage */ /*! \brief Pop dependence token from load stage */
uint64_t pop_prev_dep : 1; uint64_t pop_prev_dep : 1;
/*! \brief Pop dependence token from store stage */ /*! \brief Pop dependence token from store stage */
...@@ -452,25 +455,25 @@ typedef struct { ...@@ -452,25 +455,25 @@ typedef struct {
/*! \brief Push dependence token to store stage */ /*! \brief Push dependence token to store stage */
uint64_t push_next_dep : 1; uint64_t push_next_dep : 1;
/*! \brief Micro-op begin address */ /*! \brief Micro-op begin address */
uint64_t uop_bgn : LOG_UOP_BUFF_DEPTH; uint64_t uop_bgn : VTA_LOG_UOP_BUFF_DEPTH;
/*! \brief Micro-op end address */ /*! \brief Micro-op end address */
uint64_t uop_end : LOG_UOP_BUFF_DEPTH+1; uint64_t uop_end : VTA_LOG_UOP_BUFF_DEPTH+1;
/*! \brief Iterations in the outer uop execution loop */ /*! \brief Iterations in the outer uop execution loop */
uint64_t iter_out : LOOP_ITER_WIDTH; uint64_t iter_out : VTA_LOOP_ITER_WIDTH;
/*! \brief Iterations in the inner uop execution loop */ /*! \brief Iterations in the inner uop execution loop */
uint64_t iter_in : LOOP_ITER_WIDTH; uint64_t iter_in : VTA_LOOP_ITER_WIDTH;
/*! \brief Outer loop accumulator memory index factor */ /*! \brief Outer loop accumulator memory index factor */
uint64_t dst_factor_out : LOG_ACC_BUFF_DEPTH; uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Inner loop accumulator memory index factor */ /*! \brief Inner loop accumulator memory index factor */
uint64_t dst_factor_in : LOG_ACC_BUFF_DEPTH; uint64_t dst_factor_in : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Outer loop input memory index factor */ /*! \brief Outer loop input memory index factor */
uint64_t src_factor_out : LOG_ACC_BUFF_DEPTH; uint64_t src_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Inner loop input memory index factor */ /*! \brief Inner loop input memory index factor */
uint64_t src_factor_in : LOG_ACC_BUFF_DEPTH; uint64_t src_factor_in : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Outer loop weight memory index factor */ /*! \brief Outer loop weight memory index factor */
uint64_t wgt_factor_out : LOG_WGT_BUFF_DEPTH; uint64_t wgt_factor_out : VTA_LOG_WGT_BUFF_DEPTH;
/*! \brief Inner loop weight memory index factor */ /*! \brief Inner loop weight memory index factor */
uint64_t wgt_factor_in : LOG_WGT_BUFF_DEPTH; uint64_t wgt_factor_in : VTA_LOG_WGT_BUFF_DEPTH;
} VTAGemInsn; } VTAGemInsn;
/*! \brief VTA ALU instruction /*! \brief VTA ALU instruction
...@@ -504,7 +507,7 @@ typedef struct { ...@@ -504,7 +507,7 @@ typedef struct {
*/ */
typedef struct { typedef struct {
/*! \brief The instruction opcode */ /*! \brief The instruction opcode */
uint64_t opcode : OPCODE_BIT_WIDTH; uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
/*! \brief Pop dependence token from load stage */ /*! \brief Pop dependence token from load stage */
uint64_t pop_prev_dep : 1; uint64_t pop_prev_dep : 1;
/*! \brief Pop dependence token from store stage */ /*! \brief Pop dependence token from store stage */
...@@ -514,27 +517,27 @@ typedef struct { ...@@ -514,27 +517,27 @@ typedef struct {
/*! \brief Push dependence token to store stage */ /*! \brief Push dependence token to store stage */
uint64_t push_next_dep : 1; uint64_t push_next_dep : 1;
/*! \brief Micro-op begin address */ /*! \brief Micro-op begin address */
uint64_t uop_bgn : LOG_UOP_BUFF_DEPTH; uint64_t uop_bgn : VTA_LOG_UOP_BUFF_DEPTH;
/*! \brief Micro-op end address */ /*! \brief Micro-op end address */
uint64_t uop_end : LOG_UOP_BUFF_DEPTH+1; uint64_t uop_end : VTA_LOG_UOP_BUFF_DEPTH+1;
/*! \brief Iterations in the outer uop execution loop */ /*! \brief Iterations in the outer uop execution loop */
uint64_t iter_out : LOOP_ITER_WIDTH; uint64_t iter_out : VTA_LOOP_ITER_WIDTH;
/*! \brief Iterations in the inner uop execution loop */ /*! \brief Iterations in the inner uop execution loop */
uint64_t iter_in : LOOP_ITER_WIDTH; uint64_t iter_in : VTA_LOOP_ITER_WIDTH;
/*! \brief Outer loop accumulator memory destination index factor */ /*! \brief Outer loop accumulator memory destination index factor */
uint64_t dst_factor_out : LOG_ACC_BUFF_DEPTH; uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Inner loop accumulator memory destination index factor */ /*! \brief Inner loop accumulator memory destination index factor */
uint64_t dst_factor_in : LOG_ACC_BUFF_DEPTH; uint64_t dst_factor_in : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Outer loop accumulator memory source index factor */ /*! \brief Outer loop accumulator memory source index factor */
uint64_t src_factor_out : LOG_ACC_BUFF_DEPTH; uint64_t src_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Inner loop accumulator memory source index factor */ /*! \brief Inner loop accumulator memory source index factor */
uint64_t src_factor_in : LOG_ACC_BUFF_DEPTH; uint64_t src_factor_in : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief ALU opcode */ /*! \brief ALU opcode */
uint64_t alu_opcode : ALU_OPCODE_BIT_WIDTH; uint64_t alu_opcode : VTA_ALU_OPCODE_BIT_WIDTH;
/*! \brief Use immediate is true */ /*! \brief Use immediate is true */
uint64_t use_imm : 1; uint64_t use_imm : 1;
/*! \brief Immediate value */ /*! \brief Immediate value */
uint64_t imm : ALUOP_IMM_BIT_WIDTH; uint64_t imm : VTA_ALUOP_IMM_BIT_WIDTH;
} VTAAluInsn; } VTAAluInsn;
/*! \brief VTA ALU instruction converter */ /*! \brief VTA ALU instruction converter */
...@@ -554,11 +557,11 @@ typedef struct { ...@@ -554,11 +557,11 @@ typedef struct {
/*! \brief Initialize acc_mem at index dst_idx to 0*/ /*! \brief Initialize acc_mem at index dst_idx to 0*/
uint32_t reset_out : 1; uint32_t reset_out : 1;
/*! \brief Destination index (indexes accum buffer) */ /*! \brief Destination index (indexes accum buffer) */
uint32_t dst_idx : LOG_ACC_BUFF_DEPTH; uint32_t dst_idx : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Source index (indexes input buffer for GEMM or accum buffer for ALU) */ /*! \brief Source index (indexes input buffer for GEMM or accum buffer for ALU) */
uint32_t src_idx : LOG_ACC_BUFF_DEPTH; uint32_t src_idx : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Weight index (indexes weight buffer) */ /*! \brief Weight index (indexes weight buffer) */
uint32_t wgt_idx : LOG_WGT_BUFF_DEPTH; uint32_t wgt_idx : VTA_LOG_WGT_BUFF_DEPTH;
} VTAUop; } VTAUop;
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -27,70 +27,72 @@ ADD_LDFLAGS= ...@@ -27,70 +27,72 @@ ADD_LDFLAGS=
ADD_CFLAGS= ADD_CFLAGS=
# the hardware target # the hardware target
TARGET=PYNQ_TARGET TARGET = VTA_PYNQ_TARGET
#--------------------- #---------------------
# VTA hardware parameters # VTA hardware parameters
#-------------------- #--------------------
# Log of input/activation width in bits (default 3 -> 8 bits) # Log of input/activation width in bits (default 3 -> 8 bits)
LOG_INP_WIDTH = 3 VTA_LOG_INP_WIDTH = 3
# Log of kernel weight width in bits (default 3 -> 8 bits) # Log of kernel weight width in bits (default 3 -> 8 bits)
LOG_WGT_WIDTH = 3 VTA_LOG_WGT_WIDTH = 3
# Log of accum width in bits (default 5 -> 32 bits) # Log of accum width in bits (default 5 -> 32 bits)
LOG_ACC_WIDTH = 5 VTA_LOG_ACC_WIDTH = 5
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication) # Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0 VTA_LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication) # Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_BLOCK_IN = 4 VTA_LOG_BLOCK_IN = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication) # Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_BLOCK_OUT = 4 VTA_LOG_BLOCK_OUT = 4
# Log of uop buffer size in Bytes # Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15 VTA_LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes # Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15 VTA_LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes # Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15 VTA_LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes # Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17 VTA_LOG_ACC_BUFF_SIZE = 17
#--------------------- #---------------------
# Derived VTA hardware parameters # Derived VTA hardware parameters
#-------------------- #--------------------
# Input width in bits # Input width in bits
INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" ) VTA_INP_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_INP_WIDTH) ))" )
# Weight width in bits # Weight width in bits
WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" ) VTA_WGT_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_WGT_WIDTH) ))" )
# Log of output width in bits # Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH) VTA_LOG_OUT_WIDTH = $(VTA_LOG_INP_WIDTH)
# Output width in bits # Output width in bits
OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" ) VTA_OUT_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_OUT_WIDTH) ))" )
# Tensor batch size # Tensor batch size
BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" ) VTA_BATCH = $(shell echo "$$(( 1 << $(VTA_LOG_BATCH) ))" )
# Tensor inner block size # Tensor inner block size
IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_IN) ))" ) VTA_IN_BLOCK = $(shell echo "$$(( 1 << $(VTA_LOG_BLOCK_IN) ))" )
# Tensor outer block size # Tensor outer block size
OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_OUT) ))" ) VTA_OUT_BLOCK = $(shell echo "$$(( 1 << $(VTA_LOG_BLOCK_OUT) ))" )
# Uop buffer size in Bytes # Uop buffer size in Bytes
UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" ) VTA_UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_UOP_BUFF_SIZE) ))" )
# Inp buffer size in Bytes # Inp buffer size in Bytes
INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" ) VTA_INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_INP_BUFF_SIZE) ))" )
# Wgt buffer size in Bytes # Wgt buffer size in Bytes
WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" ) VTA_WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_WGT_BUFF_SIZE) ))" )
# Acc buffer size in Bytes # Acc buffer size in Bytes
ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" ) VTA_ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_ACC_BUFF_SIZE) ))" )
# Log of out buffer size in Bytes # Log of out buffer size in Bytes
LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" ) VTA_LOG_OUT_BUFF_SIZE = \
$(shell echo "$$(( $(VTA_LOG_ACC_BUFF_SIZE) + $(VTA_LOG_OUT_WIDTH) - $(VTA_LOG_ACC_WIDTH) ))" )
# Out buffer size in Bytes # Out buffer size in Bytes
OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" ) VTA_OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_OUT_BUFF_SIZE) ))" )
# Update ADD_CFLAGS # Update ADD_CFLAGS
ADD_CFLAGS += \ ADD_CFLAGS += \
-D$(TARGET) \ -D$(TARGET) \
-DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \ -DVTA_LOG_WGT_WIDTH=$(VTA_LOG_WGT_WIDTH) -DVTA_LOG_INP_WIDTH=$(VTA_LOG_INP_WIDTH) \
-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \ -DVTA_LOG_ACC_WIDTH=$(VTA_LOG_ACC_WIDTH) -DVTA_LOG_OUT_WIDTH=$(VTA_LOG_OUT_WIDTH) \
-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_BLOCK_IN) -DLOG_BLOCK_OUT=$(LOG_BLOCK_OUT) \ -DVTA_LOG_BATCH=$(VTA_LOG_BATCH) \
-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \ -DVTA_LOG_BLOCK_IN=$(VTA_LOG_BLOCK_IN) -DVTA_LOG_BLOCK_OUT=$(VTA_LOG_BLOCK_OUT) \
-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \ -DVTA_LOG_UOP_BUFF_SIZE=$(VTA_LOG_UOP_BUFF_SIZE) -DVTA_LOG_INP_BUFF_SIZE=$(VTA_LOG_INP_BUFF_SIZE) \
-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE) -DVTA_LOG_WGT_BUFF_SIZE=$(VTA_LOG_WGT_BUFF_SIZE) -DVTA_LOG_ACC_BUFF_SIZE=$(VTA_LOG_ACC_BUFF_SIZE) \
\ No newline at end of file -DVTA_LOG_OUT_BUFF_SIZE=$(VTA_LOG_OUT_BUFF_SIZE)
...@@ -29,65 +29,61 @@ void VTAInvalidateCache(void* buf, int size) { ...@@ -29,65 +29,61 @@ void VTAInvalidateCache(void* buf, int size) {
} }
void *VTAMapRegister(uint32_t addr, size_t length) { void *VTAMapRegister(uint32_t addr, size_t length) {
// Align the base address with the pages // Align the base address with the pages
uint32_t virt_base = addr & ~(getpagesize() - 1); uint32_t virt_base = addr & ~(getpagesize() - 1);
// Calculate the offset of addr w.r.t. the page-aligned base address // Calculate the offset of addr w.r.t. the page-aligned base address
uint32_t virt_offset = addr - virt_base; uint32_t virt_offset = addr - virt_base;
// Open file and mmap // Open file and mmap
uint32_t mmap_file = open(DEV_MEM_PATH, O_RDWR|O_SYNC); uint32_t mmap_file = open(VTA_PYNQ_DEV_MEM_PATH, O_RDWR|O_SYNC);
return mmap(NULL,
return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base); (length+virt_offset),
PROT_READ|PROT_WRITE,
MAP_SHARED,
mmap_file,
virt_base);
} }
void VTAUnmapRegister(void *vta, size_t length) { void VTAUnmapRegister(void *vta, size_t length) {
// Unmap memory // Unmap memory
int status = munmap(vta, length); int status = munmap(vta, length);
assert(status==0); assert(status == 0);
} }
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) { void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
*((volatile uint32_t *) (((char *) base_addr) + offset)) = val; *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset)) = val;
} }
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) { uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
return *((volatile uint32_t *) (((char *) base_addr) + offset)); return *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset));
} }
void VTAProgram(const char* bitstream) { void VTAProgram(const char* bitstream) {
int elem; int elem;
FILE *src, *dst, *partial; FILE *src, *dst, *partial;
partial = fopen(VTA_PYNQ_BS_IS_PARTIAL, "w");
partial = fopen(BS_IS_PARTIAL, "w");
if (partial == NULL) { if (partial == NULL) {
printf("Cannot open partial config file %s\n", BS_IS_PARTIAL); printf("Cannot open partial config file %s\n", VTA_PYNQ_BS_IS_PARTIAL);
fclose(partial); fclose(partial);
exit(1); exit(1);
} }
fputc('0', partial); fputc('0', partial);
fclose(partial); fclose(partial);
src = fopen(bitstream, "rb"); src = fopen(bitstream, "rb");
if (src == NULL) { if (src == NULL) {
printf("Cannot open bitstream %s\n", bitstream); printf("Cannot open bitstream %s\n", bitstream);
exit(1); exit(1);
} }
dst = fopen(VTA_PYNQ_BS_XDEVCFG, "wb");
dst = fopen(BS_XDEVCFG, "wb");
if (dst == NULL) { if (dst == NULL) {
printf("Cannot open device file %s\n", BS_XDEVCFG); printf("Cannot open device file %s\n", VTA_PYNQ_BS_XDEVCFG);
fclose(dst); fclose(dst);
exit(1); exit(1);
} }
elem = fgetc(src); elem = fgetc(src);
while (elem != EOF) { while (elem != EOF) {
fputc(elem, dst); fputc(elem, dst);
elem = fgetc(src); elem = fgetc(src);
} }
fclose(src); fclose(src);
fclose(dst); fclose(dst);
} }
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
* \brief VTA driver for Pynq board. * \brief VTA driver for Pynq board.
*/ */
#ifndef VTA_PYNQ_DRIVER_H_ #ifndef VTA_PYNQ_PYNQ_DRIVER_H_
#define VTA_PYNQ_DRIVER_H_ #define VTA_PYNQ_PYNQ_DRIVER_H_
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
...@@ -32,17 +32,20 @@ void xlnkFlushCache(void* buf, int size); ...@@ -32,17 +32,20 @@ void xlnkFlushCache(void* buf, int size);
void xlnkInvalidateCache(void* buf, int size); void xlnkInvalidateCache(void* buf, int size);
#endif #endif
/*! \brief partial bitstream status file path */ /*! \brief (Pynq only) Partial bitstream status file path */
#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream" #define VTA_PYNQ_BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
/*! \brief bitstream destination file path */ /*! \brief (Pynq only) Bitstream destination file path */
#define BS_XDEVCFG "/dev/xdevcfg" #define VTA_PYNQ_BS_XDEVCFG "/dev/xdevcfg"
/*! \brief Path to /dev/mem */ /*! \brief (Pynq only) Path to /dev/mem */
#define DEV_MEM_PATH "/dev/mem" #define VTA_PYNQ_DEV_MEM_PATH "/dev/mem"
/*! \brief MMIO driver constant */ /*! \brief (Pynq only) MMIO driver constant */
#define MMIO_WORD_LENGTH 4 #define VTA_PYNQ_MMIO_WORD_LENGTH 4
/*! \brief MMIO driver constant */ /*! \brief (Pynq only) MMIO driver constant */
#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1)) #define VTA_PYNQ_MMIO_WORD_MASK (~(VTA_PYNQ_MMIO_WORD_LENGTH - 1))
/*! \brief Physically contiguous buffer size limit */
#define VTA_MAX_XFER (1<<22)
/*! \brief VTA configuration register address range */ /*! \brief VTA configuration register address range */
#define VTA_RANGE 0x100 #define VTA_RANGE 0x100
...@@ -74,10 +77,7 @@ void xlnkInvalidateCache(void* buf, int size); ...@@ -74,10 +77,7 @@ void xlnkInvalidateCache(void* buf, int size);
*/ */
#define VTA_STORE_ADDR 0x43C30000 #define VTA_STORE_ADDR 0x43C30000
/*! \brief Buffer size limit */
#define MAX_XFER (1<<22)
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif // VTA_PYNQ_DRIVER_H_ #endif // VTA_PYNQ_PYNQ_DRIVER_H_
\ No newline at end of file \ No newline at end of file
...@@ -4,19 +4,20 @@ ...@@ -4,19 +4,20 @@
* \brief VTA runtime for PYNQ in C++11 * \brief VTA runtime for PYNQ in C++11
*/ */
#ifdef VTA_PYNQ_TARGET
#include "./pynq/pynq_driver.h"
#endif // VTA_PYNQ_TARGET
#include <vta/driver.h>
#include <vta/hw_spec.h>
#include <vta/runtime.h>
#include <cassert> #include <cassert>
#include <cstring> #include <cstring>
#include <vector> #include <vector>
#include <thread> #include <thread>
#include <memory> #include <memory>
#include <atomic> #include <atomic>
#include <vta/driver.h>
#include <vta/hw_spec.h>
#include <vta/runtime.h>
#ifdef PYNQ_TARGET
#include "./pynq/pynq_driver.h"
#endif //PYNQ_TARGET
namespace vta { namespace vta {
...@@ -193,21 +194,21 @@ class UopKernel { ...@@ -193,21 +194,21 @@ class UopKernel {
op.wgt_idx = wgt_index; op.wgt_idx = wgt_index;
seq_.push_back(op); seq_.push_back(op);
// Ensure that mode is consistent if set // Ensure that mode is consistent if set
if (mode_==0xFFFFFFFF) { if (mode_ == 0xFFFFFFFF) {
mode_ = mode; mode_ = mode;
} else { } else {
assert(mode_==mode); assert(mode_ == mode);
} }
// Check kernel op and imm/imm_val in ALU mode // Check kernel op and imm/imm_val in ALU mode
if (mode==1) { if (mode == 1) {
if (opcode_==0xFFFFFFFF) { if (opcode_ == 0xFFFFFFFF) {
opcode_=opcode; opcode_ = opcode;
use_imm_=use_imm; use_imm_ = use_imm;
imm_val_=imm_val; imm_val_ = imm_val;
} else { } else {
assert(opcode_==opcode); assert(opcode_ == opcode);
assert(use_imm_==use_imm); assert(use_imm_ == use_imm);
assert(imm_val_==imm_val); assert(imm_val_ == imm_val);
} }
} }
} }
...@@ -222,7 +223,6 @@ class UopKernel { ...@@ -222,7 +223,6 @@ class UopKernel {
seq_[i].src_idx, seq_[i].src_idx,
seq_[i].wgt_idx, seq_[i].wgt_idx,
seq_[i].reset_out); seq_[i].reset_out);
} }
printf("\n"); printf("\n");
} }
...@@ -233,6 +233,7 @@ class UopKernel { ...@@ -233,6 +233,7 @@ class UopKernel {
uint32_t opcode_{0xFFFFFFFF}; uint32_t opcode_{0xFFFFFFFF};
bool use_imm_{false}; bool use_imm_{false};
uint16_t imm_val_{0}; uint16_t imm_val_{0};
private: private:
// Verify that we don't write to the same acc_mem index two cycles in a row // Verify that we don't write to the same acc_mem index two cycles in a row
void VerifyDep(uint32_t dst_index) { void VerifyDep(uint32_t dst_index) {
...@@ -375,7 +376,7 @@ class UopQueue : public BaseQueue { ...@@ -375,7 +376,7 @@ class UopQueue : public BaseQueue {
} }
// Simple eviction policy // Simple eviction policy
uint32_t evict_begin = cache_ptr_; uint32_t evict_begin = cache_ptr_;
for (;cache_ptr_ < cache_.size(); ++cache_ptr_) { for (; cache_ptr_ < cache_.size(); ++cache_ptr_) {
if (cache_[cache_ptr_]->sram_begin_ >= sram_end_) break; if (cache_[cache_ptr_]->sram_begin_ >= sram_end_) break;
cache_[cache_ptr_]->sram_begin_ = 0; cache_[cache_ptr_]->sram_begin_ = 0;
cache_[cache_ptr_]->sram_end_ = 0; cache_[cache_ptr_]->sram_end_ = 0;
...@@ -395,7 +396,7 @@ class UopQueue : public BaseQueue { ...@@ -395,7 +396,7 @@ class UopQueue : public BaseQueue {
void FlushUopLoad(VTAMemInsn* insn) { void FlushUopLoad(VTAMemInsn* insn) {
if (sram_begin_ != sram_end_) { if (sram_begin_ != sram_end_) {
assert((dram_end_ - dram_begin_) == (sram_end_ - sram_begin_)); assert((dram_end_ - dram_begin_) == (sram_end_ - sram_begin_));
insn->memory_type = MEM_ID_UOP; insn->memory_type = VTA_MEM_ID_UOP;
insn->sram_base = sram_begin_; insn->sram_base = sram_begin_;
insn->dram_base = dram_phy_addr_ / kElemBytes + dram_begin_; insn->dram_base = dram_phy_addr_ / kElemBytes + dram_begin_;
insn->y_size = 1; insn->y_size = 1;
...@@ -418,7 +419,7 @@ class UopQueue : public BaseQueue { ...@@ -418,7 +419,7 @@ class UopQueue : public BaseQueue {
std::vector<UopKernel*> cache_; std::vector<UopKernel*> cache_;
// Constants // Constants
static constexpr int kElemBytes = sizeof(VTAUop); static constexpr int kElemBytes = sizeof(VTAUop);
static constexpr int kMaxNumUop = UOP_BUFF_DEPTH; static constexpr int kMaxNumUop = VTA_UOP_BUFF_DEPTH;
static constexpr int kMaxElems = kMaxBytes / kElemBytes; static constexpr int kMaxElems = kMaxBytes / kElemBytes;
}; };
...@@ -541,22 +542,22 @@ class InsnQueue : public BaseQueue { ...@@ -541,22 +542,22 @@ class InsnQueue : public BaseQueue {
for (int i = 1; i < insn_count; ++i) { for (int i = 1; i < insn_count; ++i) {
PipelineStage prev = GetPipelineStage(mem_ptr + i - 1); PipelineStage prev = GetPipelineStage(mem_ptr + i - 1);
PipelineStage now = GetPipelineStage(mem_ptr + i); PipelineStage now = GetPipelineStage(mem_ptr + i);
if (prev==kLoadStage && now==kComputeStage) { if (prev == kLoadStage && now == kComputeStage) {
mem_ptr[i - 1].push_prev_dep = false; mem_ptr[i - 1].push_prev_dep = false;
mem_ptr[i - 1].push_next_dep = true; mem_ptr[i - 1].push_next_dep = true;
mem_ptr[i].pop_prev_dep = true; mem_ptr[i].pop_prev_dep = true;
mem_ptr[i].pop_next_dep = false; mem_ptr[i].pop_next_dep = false;
} else if (prev==kComputeStage && now==kLoadStage) { } else if (prev == kComputeStage && now == kLoadStage) {
mem_ptr[i - 1].push_prev_dep = true; mem_ptr[i - 1].push_prev_dep = true;
mem_ptr[i - 1].push_next_dep = false; mem_ptr[i - 1].push_next_dep = false;
mem_ptr[i].pop_prev_dep = false; mem_ptr[i].pop_prev_dep = false;
mem_ptr[i].pop_next_dep = true; mem_ptr[i].pop_next_dep = true;
} else if (prev==kStoreStage && now==kComputeStage) { } else if (prev == kStoreStage && now == kComputeStage) {
mem_ptr[i - 1].push_prev_dep = true; mem_ptr[i - 1].push_prev_dep = true;
mem_ptr[i - 1].push_next_dep = false; mem_ptr[i - 1].push_next_dep = false;
mem_ptr[i].pop_prev_dep = false; mem_ptr[i].pop_prev_dep = false;
mem_ptr[i].pop_next_dep = true; mem_ptr[i].pop_next_dep = true;
} else if (prev==kComputeStage && now==kStoreStage) { } else if (prev == kComputeStage && now == kStoreStage) {
mem_ptr[i - 1].push_prev_dep = false; mem_ptr[i - 1].push_prev_dep = false;
mem_ptr[i - 1].push_next_dep = true; mem_ptr[i - 1].push_next_dep = true;
mem_ptr[i].pop_prev_dep = true; mem_ptr[i].pop_prev_dep = true;
...@@ -573,39 +574,39 @@ class InsnQueue : public BaseQueue { ...@@ -573,39 +574,39 @@ class InsnQueue : public BaseQueue {
// Helper function: Get Opcode string // Helper function: Get Opcode string
const char* getOpcodeString(int opcode, bool use_imm) { const char* getOpcodeString(int opcode, bool use_imm) {
// The string name // The string name
if (opcode==ALU_OPCODE_MIN) { if (opcode == VTA_ALU_OPCODE_MIN) {
if (use_imm) { if (use_imm) {
return "min imm"; return "min imm";
} else { } else {
return "min"; return "min";
} }
} else if (opcode==ALU_OPCODE_MAX) { } else if (opcode == VTA_ALU_OPCODE_MAX) {
if (use_imm) { if (use_imm) {
return "max imm"; return "max imm";
} else { } else {
return "max"; return "max";
} }
} else if (opcode==ALU_OPCODE_ADD) { } else if (opcode == VTA_ALU_OPCODE_ADD) {
if (use_imm) { if (use_imm) {
return "add imm"; return "add imm";
} else { } else {
return "add"; return "add";
} }
} else if (opcode==ALU_OPCODE_SUB) { } else if (opcode == VTA_ALU_OPCODE_SUB) {
if (use_imm) { if (use_imm) {
return "sub imm"; return "sub imm";
} else { } else {
return "sub"; return "sub";
} }
} else if (opcode==ALU_OPCODE_MUL) { } else if (opcode == VTA_ALU_OPCODE_MUL) {
if (use_imm) { if (use_imm) {
return "mul imm"; return "mul imm";
} else { } else {
return "mul"; return "mul";
} }
} else if (opcode==ALU_OPCODE_SHL) { } else if (opcode == VTA_ALU_OPCODE_SHL) {
return "shl"; return "shl";
} else if (opcode==ALU_OPCODE_SHR) { } else if (opcode == VTA_ALU_OPCODE_SHR) {
return "shr"; return "shr";
} }
...@@ -629,12 +630,11 @@ class InsnQueue : public BaseQueue { ...@@ -629,12 +630,11 @@ class InsnQueue : public BaseQueue {
// Fetch instruction and decode opcode // Fetch instruction and decode opcode
c.generic = insn[i]; c.generic = insn[i];
printf("INSTRUCTION %u: ", i); printf("INSTRUCTION %u: ", i);
if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) { if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
if (c.mem.x_size == 0) { if (c.mem.x_size == 0) {
if (c.mem.opcode == OPCODE_STORE) { if (c.mem.opcode == VTA_OPCODE_STORE) {
printf("NOP-STORE-STAGE\n"); printf("NOP-STORE-STAGE\n");
} } else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) {
else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) {
printf("NOP-COMPUTE-STAGE\n"); printf("NOP-COMPUTE-STAGE\n");
} else { } else {
printf("NOP-MEMORY-STAGE\n"); printf("NOP-MEMORY-STAGE\n");
...@@ -645,15 +645,15 @@ class InsnQueue : public BaseQueue { ...@@ -645,15 +645,15 @@ class InsnQueue : public BaseQueue {
static_cast<int>(c.mem.push_prev_dep), static_cast<int>(c.mem.push_prev_dep),
static_cast<int>(c.mem.push_next_dep)); static_cast<int>(c.mem.push_next_dep));
// Count status in queues // Count status in queues
if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) { if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
if (c.mem.opcode == OPCODE_STORE) { if (c.mem.opcode == VTA_OPCODE_STORE) {
assert(c.mem.pop_next_dep == false); assert(c.mem.pop_next_dep == false);
assert(c.mem.push_next_dep == false); assert(c.mem.push_next_dep == false);
if (c.mem.pop_prev_dep) g2s_queue--; if (c.mem.pop_prev_dep) g2s_queue--;
if (c.mem.push_prev_dep) s2g_queue++; if (c.mem.push_prev_dep) s2g_queue++;
} else if (c.mem.opcode == OPCODE_LOAD && } else if (c.mem.opcode == VTA_OPCODE_LOAD &&
(c.mem.memory_type == MEM_ID_INP || (c.mem.memory_type == VTA_MEM_ID_INP ||
c.mem.memory_type == MEM_ID_WGT) ) { c.mem.memory_type == VTA_MEM_ID_WGT) ) {
assert(c.mem.pop_prev_dep == false); assert(c.mem.pop_prev_dep == false);
assert(c.mem.push_prev_dep == false); assert(c.mem.push_prev_dep == false);
if (c.mem.pop_next_dep) g2l_queue--; if (c.mem.pop_next_dep) g2l_queue--;
...@@ -664,7 +664,7 @@ class InsnQueue : public BaseQueue { ...@@ -664,7 +664,7 @@ class InsnQueue : public BaseQueue {
if (c.mem.pop_next_dep) s2g_queue--; if (c.mem.pop_next_dep) s2g_queue--;
if (c.mem.push_next_dep) g2s_queue++; if (c.mem.push_next_dep) g2s_queue++;
} }
} else if (c.mem.opcode == OPCODE_GEMM) { } else if (c.mem.opcode == VTA_OPCODE_GEMM) {
// Print instruction field information // Print instruction field information
if (c.gemm.pop_prev_dep) l2g_queue--; if (c.gemm.pop_prev_dep) l2g_queue--;
if (c.gemm.push_prev_dep) g2l_queue++; if (c.gemm.push_prev_dep) g2l_queue++;
...@@ -676,14 +676,14 @@ class InsnQueue : public BaseQueue { ...@@ -676,14 +676,14 @@ class InsnQueue : public BaseQueue {
continue; continue;
} }
// Print instruction field information // Print instruction field information
if (c.mem.opcode==OPCODE_LOAD) { if (c.mem.opcode == VTA_OPCODE_LOAD) {
printf("LOAD "); printf("LOAD ");
if (c.mem.memory_type == MEM_ID_UOP) printf("UOP\n"); if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n");
if (c.mem.memory_type == MEM_ID_WGT) printf("WGT\n"); if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n");
if (c.mem.memory_type == MEM_ID_INP) printf("INP\n"); if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n");
if (c.mem.memory_type == MEM_ID_ACC) printf("ACC\n"); if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n");
} }
if (c.mem.opcode==OPCODE_STORE) { if (c.mem.opcode == VTA_OPCODE_STORE) {
printf("STORE\n"); printf("STORE\n");
} }
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
...@@ -703,7 +703,7 @@ class InsnQueue : public BaseQueue { ...@@ -703,7 +703,7 @@ class InsnQueue : public BaseQueue {
static_cast<int>(c.mem.x_stride), static_cast<int>(c.mem.x_stride),
static_cast<int>(c.mem.x_pad_0), static_cast<int>(c.mem.x_pad_0),
static_cast<int>(c.mem.x_pad_1)); static_cast<int>(c.mem.x_pad_1));
} else if (c.mem.opcode==OPCODE_GEMM) { } else if (c.mem.opcode == VTA_OPCODE_GEMM) {
// Print instruction field information // Print instruction field information
printf("GEMM\n"); printf("GEMM\n");
...@@ -725,7 +725,7 @@ class InsnQueue : public BaseQueue { ...@@ -725,7 +725,7 @@ class InsnQueue : public BaseQueue {
static_cast<int>(c.gemm.wgt_factor_in), static_cast<int>(c.gemm.wgt_factor_in),
static_cast<int>(c.gemm.src_factor_in), static_cast<int>(c.gemm.src_factor_in),
static_cast<int>(c.gemm.dst_factor_in)); static_cast<int>(c.gemm.dst_factor_in));
} else if (c.mem.opcode == OPCODE_ALU) { } else if (c.mem.opcode == VTA_OPCODE_ALU) {
// Print instruction field information // Print instruction field information
printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm)); printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
...@@ -744,20 +744,20 @@ class InsnQueue : public BaseQueue { ...@@ -744,20 +744,20 @@ class InsnQueue : public BaseQueue {
static_cast<int>(c.alu.iter_in), static_cast<int>(c.alu.iter_in),
static_cast<int>(c.alu.dst_factor_in), static_cast<int>(c.alu.dst_factor_in),
static_cast<int>(c.alu.src_factor_in)); static_cast<int>(c.alu.src_factor_in));
} else if (c.mem.opcode == OPCODE_FINISH) { } else if (c.mem.opcode == VTA_OPCODE_FINISH) {
printf("FINISH\n"); printf("FINISH\n");
} }
// Count status in queues // Count status in queues
if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) { if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
if (c.mem.opcode == OPCODE_STORE) { if (c.mem.opcode == VTA_OPCODE_STORE) {
assert(c.mem.pop_next_dep == false); assert(c.mem.pop_next_dep == false);
assert(c.mem.push_next_dep == false); assert(c.mem.push_next_dep == false);
if (c.mem.pop_prev_dep) g2s_queue--; if (c.mem.pop_prev_dep) g2s_queue--;
if (c.mem.push_prev_dep) s2g_queue++; if (c.mem.push_prev_dep) s2g_queue++;
} else if (c.mem.opcode == OPCODE_LOAD && } else if (c.mem.opcode == VTA_OPCODE_LOAD &&
(c.mem.memory_type == MEM_ID_INP || (c.mem.memory_type == VTA_MEM_ID_INP ||
c.mem.memory_type == MEM_ID_WGT) ) { c.mem.memory_type == VTA_MEM_ID_WGT) ) {
assert(c.mem.pop_prev_dep == false); assert(c.mem.pop_prev_dep == false);
assert(c.mem.push_prev_dep == false); assert(c.mem.push_prev_dep == false);
if (c.mem.pop_next_dep) g2l_queue--; if (c.mem.pop_next_dep) g2l_queue--;
...@@ -768,8 +768,8 @@ class InsnQueue : public BaseQueue { ...@@ -768,8 +768,8 @@ class InsnQueue : public BaseQueue {
if (c.mem.pop_next_dep) s2g_queue--; if (c.mem.pop_next_dep) s2g_queue--;
if (c.mem.push_next_dep) g2s_queue++; if (c.mem.push_next_dep) g2s_queue++;
} }
} else if (c.mem.opcode == OPCODE_GEMM || } else if (c.mem.opcode == VTA_OPCODE_GEMM ||
c.mem.opcode == OPCODE_ALU) { c.mem.opcode == VTA_OPCODE_ALU) {
// Print instruction field information // Print instruction field information
if (c.gemm.pop_prev_dep) l2g_queue--; if (c.gemm.pop_prev_dep) l2g_queue--;
if (c.gemm.push_prev_dep) g2l_queue++; if (c.gemm.push_prev_dep) g2l_queue++;
...@@ -832,23 +832,24 @@ class InsnQueue : public BaseQueue { ...@@ -832,23 +832,24 @@ class InsnQueue : public BaseQueue {
} }
// Get stage of the memory // Get stage of the memory
static PipelineStage GetMemPipelineStage(int memory_type) { static PipelineStage GetMemPipelineStage(int memory_type) {
if (memory_type == MEM_ID_ACC) return kComputeStage; if (memory_type == VTA_MEM_ID_ACC) return kComputeStage;
if (memory_type == MEM_ID_UOP) return kComputeStage; if (memory_type == VTA_MEM_ID_UOP) return kComputeStage;
return kLoadStage; return kLoadStage;
} }
// Get stage of the computation // Get stage of the computation
static PipelineStage GetPipelineStage(VTAMemInsn* insn) { static PipelineStage GetPipelineStage(VTAMemInsn* insn) {
if (insn->opcode == OPCODE_GEMM) return kComputeStage; if (insn->opcode == VTA_OPCODE_GEMM) return kComputeStage;
if (insn->opcode == OPCODE_ALU) return kComputeStage; if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage;
if (insn->opcode == OPCODE_LOAD) { if (insn->opcode == VTA_OPCODE_LOAD) {
if (insn->x_size == 0) return kNoneStage; if (insn->x_size == 0) return kNoneStage;
if (insn->memory_type == MEM_ID_ACC) return kComputeStage; if (insn->memory_type == VTA_MEM_ID_ACC) return kComputeStage;
if (insn->memory_type == MEM_ID_UOP) return kComputeStage; if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage;
return kLoadStage; return kLoadStage;
} }
if (insn->opcode == OPCODE_STORE) { if (insn->opcode == VTA_OPCODE_STORE) {
// FIXME: Right now memory_type is a 2-bit field which means that MEM_ID_OUT will appear as 0 // FIXME: Right now memory_type is a 2-bit field which means that
// For now we'll refrain from checking the memory_type to avoid an assertion error... // VTA_MEM_ID_OUT will appear as 0. For now we'll refrain from
// checking the memory_type to avoid an assertion error...
return kStoreStage; return kStoreStage;
} }
assert(false); assert(false);
...@@ -859,7 +860,7 @@ class InsnQueue : public BaseQueue { ...@@ -859,7 +860,7 @@ class InsnQueue : public BaseQueue {
bool push_prev_dep, bool push_next_dep, bool push_prev_dep, bool push_next_dep,
bool pop_prev_dep, bool pop_next_dep) { bool pop_prev_dep, bool pop_next_dep) {
VTAMemInsn* insn = reinterpret_cast<VTAMemInsn*>(NextInsn()); VTAMemInsn* insn = reinterpret_cast<VTAMemInsn*>(NextInsn());
insn->opcode = (stage==kStoreStage ? OPCODE_STORE : OPCODE_LOAD); insn->opcode = (stage == kStoreStage ? VTA_OPCODE_STORE : VTA_OPCODE_LOAD);
insn->push_prev_dep = push_prev_dep; insn->push_prev_dep = push_prev_dep;
insn->push_next_dep = push_next_dep; insn->push_next_dep = push_next_dep;
insn->pop_prev_dep = pop_prev_dep; insn->pop_prev_dep = pop_prev_dep;
...@@ -873,7 +874,7 @@ class InsnQueue : public BaseQueue { ...@@ -873,7 +874,7 @@ class InsnQueue : public BaseQueue {
insn->y_pad_1 = 0; insn->y_pad_1 = 0;
insn->x_pad_0 = 0; insn->x_pad_0 = 0;
insn->x_pad_1 = 0; insn->x_pad_1 = 0;
insn->memory_type = (stage == kLoadStage ? MEM_ID_INP : MEM_ID_UOP); insn->memory_type = (stage == kLoadStage ? VTA_MEM_ID_INP : VTA_MEM_ID_UOP);
} }
private: private:
...@@ -913,12 +914,12 @@ class CommandQueue { ...@@ -913,12 +914,12 @@ class CommandQueue {
} }
uint32_t GetElemBytes(uint32_t memory_id) { uint32_t GetElemBytes(uint32_t memory_id) {
switch (memory_id){ switch (memory_id) {
case MEM_ID_UOP: return UOP_ELEM_BYTES; case VTA_MEM_ID_UOP: return VTA_UOP_ELEM_BYTES;
case MEM_ID_INP: return INP_ELEM_BYTES; case VTA_MEM_ID_INP: return VTA_INP_ELEM_BYTES;
case MEM_ID_WGT: return WGT_ELEM_BYTES; case VTA_MEM_ID_WGT: return VTA_WGT_ELEM_BYTES;
case MEM_ID_ACC: return ACC_ELEM_BYTES; case VTA_MEM_ID_ACC: return VTA_ACC_ELEM_BYTES;
case MEM_ID_OUT: return INP_ELEM_BYTES; case VTA_MEM_ID_OUT: return VTA_INP_ELEM_BYTES;
default: break; default: break;
} }
printf("Memory id not recognized: %d\n", memory_id); printf("Memory id not recognized: %d\n", memory_id);
...@@ -938,7 +939,7 @@ class CommandQueue { ...@@ -938,7 +939,7 @@ class CommandQueue {
uint32_t dst_sram_index, uint32_t dst_sram_index,
uint32_t dst_memory_type) { uint32_t dst_memory_type) {
VTAMemInsn* insn = insn_queue_.CreateMemInsn(dst_memory_type); VTAMemInsn* insn = insn_queue_.CreateMemInsn(dst_memory_type);
insn->opcode = OPCODE_LOAD; insn->opcode = VTA_OPCODE_LOAD;
insn->memory_type = dst_memory_type; insn->memory_type = dst_memory_type;
insn->sram_base = dst_sram_index; insn->sram_base = dst_sram_index;
DataBuffer* src = DataBuffer::FromHandle(src_dram_addr); DataBuffer* src = DataBuffer::FromHandle(src_dram_addr);
...@@ -961,7 +962,7 @@ class CommandQueue { ...@@ -961,7 +962,7 @@ class CommandQueue {
uint32_t y_size, uint32_t y_size,
uint32_t x_stride) { uint32_t x_stride) {
VTAMemInsn* insn = insn_queue_.CreateStoreInsn(); VTAMemInsn* insn = insn_queue_.CreateStoreInsn();
insn->opcode = OPCODE_STORE; insn->opcode = VTA_OPCODE_STORE;
insn->memory_type = src_memory_type; insn->memory_type = src_memory_type;
insn->sram_base = src_sram_index; insn->sram_base = src_sram_index;
DataBuffer* dst = DataBuffer::FromHandle(dst_dram_addr); DataBuffer* dst = DataBuffer::FromHandle(dst_dram_addr);
...@@ -1013,7 +1014,7 @@ class CommandQueue { ...@@ -1013,7 +1014,7 @@ class CommandQueue {
insn_queue_.CommitPendingPop(kComputeStage); insn_queue_.CommitPendingPop(kComputeStage);
// NOTE: FINISH cannot contain pop // NOTE: FINISH cannot contain pop
VTAGemInsn* insn = insn_queue_.CreateGemInsn(); VTAGemInsn* insn = insn_queue_.CreateGemInsn();
insn->opcode = OPCODE_FINISH; insn->opcode = VTA_OPCODE_FINISH;
assert(!insn_queue_.PendingPop()); assert(!insn_queue_.PendingPop());
// Check if there are no instructions to execute at all // Check if there are no instructions to execute at all
if (insn_queue_.count() == 0) return; if (insn_queue_.count() == 0) return;
...@@ -1026,11 +1027,11 @@ class CommandQueue { ...@@ -1026,11 +1027,11 @@ class CommandQueue {
} }
// Make sure that the last instruction is a finish instruction // Make sure that the last instruction is a finish instruction
assert(reinterpret_cast<VTAMemInsn*>( assert(reinterpret_cast<VTAMemInsn*>(
insn_queue_.data())[insn_queue_.count()-1].opcode == OPCODE_FINISH); insn_queue_.data())[insn_queue_.count()-1].opcode == VTA_OPCODE_FINISH);
#ifdef PYNQ_TARGET #ifdef VTA_PYNQ_TARGET
// Make sure that we don't exceed contiguous physical memory limits // Make sure that we don't exceed contiguous physical memory limits
assert(insn_queue_.count() < MAX_XFER); assert(insn_queue_.count() < VTA_MAX_XFER);
// NOTE: Register address map is derived from the auto-generated // NOTE: Register address map is derived from the auto-generated
// driver files available under hardware/build/vivado/<design>/export/driver // driver files available under hardware/build/vivado/<design>/export/driver
...@@ -1064,7 +1065,7 @@ class CommandQueue { ...@@ -1064,7 +1065,7 @@ class CommandQueue {
} }
// Report error if timeout // Report error if timeout
assert(t < wait_cycles); assert(t < wait_cycles);
#endif //PYNQ_TARGET #endif // VTA_PYNQ_TARGET
// Reset buffers // Reset buffers
uop_queue_.Reset(); uop_queue_.Reset();
...@@ -1142,12 +1143,12 @@ class CommandQueue { ...@@ -1142,12 +1143,12 @@ class CommandQueue {
uop_queue_.Push(kernel, uop_queue_.Push(kernel,
[this]() { this->AutoSync(); }); [this]() { this->AutoSync(); });
if (uop_queue_.pending()) { if (uop_queue_.pending()) {
VTAMemInsn* insn = insn_queue_.CreateMemInsn(MEM_ID_UOP); VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP);
insn->opcode = OPCODE_LOAD; insn->opcode = VTA_OPCODE_LOAD;
uop_queue_.FlushUopLoad(insn); uop_queue_.FlushUopLoad(insn);
} }
VTAGemInsn* insn = insn_queue_.CreateGemInsn(); VTAGemInsn* insn = insn_queue_.CreateGemInsn();
insn->opcode = OPCODE_GEMM; insn->opcode = VTA_OPCODE_GEMM;
insn->uop_bgn = kernel->sram_begin_; insn->uop_bgn = kernel->sram_begin_;
insn->uop_end = kernel->sram_end_; insn->uop_end = kernel->sram_end_;
const std::vector<UopKernel::LoopEntry> &loop = kernel->loop(); const std::vector<UopKernel::LoopEntry> &loop = kernel->loop();
...@@ -1180,12 +1181,12 @@ class CommandQueue { ...@@ -1180,12 +1181,12 @@ class CommandQueue {
uop_queue_.Push(kernel, uop_queue_.Push(kernel,
[this]() { this->AutoSync(); }); [this]() { this->AutoSync(); });
if (uop_queue_.pending()) { if (uop_queue_.pending()) {
VTAMemInsn* insn = insn_queue_.CreateMemInsn(MEM_ID_UOP); VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP);
insn->opcode = OPCODE_LOAD; insn->opcode = VTA_OPCODE_LOAD;
uop_queue_.FlushUopLoad(insn); uop_queue_.FlushUopLoad(insn);
} }
VTAAluInsn* insn = insn_queue_.CreateAluInsn(); VTAAluInsn* insn = insn_queue_.CreateAluInsn();
insn->opcode = OPCODE_ALU; insn->opcode = VTA_OPCODE_ALU;
insn->uop_bgn = kernel->sram_begin_; insn->uop_bgn = kernel->sram_begin_;
insn->uop_end = kernel->sram_end_; insn->uop_end = kernel->sram_end_;
insn->alu_opcode = kernel->opcode_; insn->alu_opcode = kernel->opcode_;
...@@ -1219,7 +1220,7 @@ class CommandQueue { ...@@ -1219,7 +1220,7 @@ class CommandQueue {
void CheckInsnOverFlow() { void CheckInsnOverFlow() {
// At each API call, we can at most commit: // At each API call, we can at most commit:
// one pending store, one pending load, and one uop // one pending store, one pending load, and one uop
if (insn_queue_.count() >= MAX_XFER) { if (insn_queue_.count() >= VTA_MAX_XFER) {
this->AutoSync(); this->AutoSync();
} }
} }
...@@ -1237,9 +1238,9 @@ class CommandQueue { ...@@ -1237,9 +1238,9 @@ class CommandQueue {
// The kernel we currently recording // The kernel we currently recording
UopKernel* record_kernel_{nullptr}; UopKernel* record_kernel_{nullptr};
// Micro op queue // Micro op queue
UopQueue<MAX_XFER, true, true> uop_queue_; UopQueue<VTA_MAX_XFER, true, true> uop_queue_;
// instruction queue // instruction queue
InsnQueue<MAX_XFER, true, true> insn_queue_; InsnQueue<VTA_MAX_XFER, true, true> insn_queue_;
}; };
} // namespace vta } // namespace vta
......
// simply include the driver for now. /*!
* Copyright (c) 2018 by Contributors
* \file vta_device_api.cc
* \brief VTA device API for TVM
*/
#include <tvm/runtime/registry.h> #include <tvm/runtime/registry.h>
#include <dmlc/thread_local.h> #include <dmlc/thread_local.h>
#include <vta/runtime.h> #include <vta/runtime.h>
#include "../../tvm/src/runtime/workspace_pool.h"
#include "../../nnvm/tvm/src/runtime/workspace_pool.h"
namespace tvm { namespace tvm {
namespace runtime { namespace runtime {
......
...@@ -6,41 +6,43 @@ ...@@ -6,41 +6,43 @@
#include "./test_lib.h" #include "./test_lib.h"
uint32_t globalSeed;
const char* getOpcodeString(int opcode, bool use_imm) { const char* getOpcodeString(int opcode, bool use_imm) {
// Returns string name // Returns string name
if (opcode == ALU_OPCODE_MIN) { if (opcode == VTA_ALU_OPCODE_MIN) {
if (use_imm) { if (use_imm) {
return "min imm"; return "min imm";
} else { } else {
return "min"; return "min";
} }
} else if (opcode == ALU_OPCODE_MAX) { } else if (opcode == VTA_ALU_OPCODE_MAX) {
if (use_imm) { if (use_imm) {
return "max imm"; return "max imm";
} else { } else {
return "max"; return "max";
} }
} else if (opcode == ALU_OPCODE_ADD) { } else if (opcode == VTA_ALU_OPCODE_ADD) {
if (use_imm) { if (use_imm) {
return "add imm"; return "add imm";
} else { } else {
return "add"; return "add";
} }
} else if (opcode == ALU_OPCODE_SUB) { } else if (opcode == VTA_ALU_OPCODE_SUB) {
if (use_imm) { if (use_imm) {
return "sub imm"; return "sub imm";
} else { } else {
return "sub"; return "sub";
} }
} else if (opcode == ALU_OPCODE_MUL) { } else if (opcode == VTA_ALU_OPCODE_MUL) {
if (use_imm) { if (use_imm) {
return "mul imm"; return "mul imm";
} else { } else {
return "mul"; return "mul";
} }
} else if (opcode == ALU_OPCODE_SHL) { } else if (opcode == VTA_ALU_OPCODE_SHL) {
return "shl"; return "shl";
} else if (opcode == ALU_OPCODE_SHR) { } else if (opcode == VTA_ALU_OPCODE_SHR) {
return "shr"; return "shr";
} }
return "unknown op"; return "unknown op";
...@@ -49,20 +51,20 @@ const char* getOpcodeString(int opcode, bool use_imm) { ...@@ -49,20 +51,20 @@ const char* getOpcodeString(int opcode, bool use_imm) {
template <typename T, int T_WIDTH> template <typename T, int T_WIDTH>
void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block) { void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block) {
int buffer_idx = 0; int buffer_idx = 0;
for(int i = 0; i < y_size / y_block; i ++) { for (int i = 0; i < y_size / y_block; i++) {
for(int j = 0; j < x_size / x_block; j ++) { for (int j = 0; j < x_size / x_block; j++) {
for(int k = 0; k < y_block; k ++) { for (int k = 0; k < y_block; k++) {
if (T_WIDTH < 8) { if (T_WIDTH < 8) {
for (int l = 0; l < x_block; l += 8 / T_WIDTH) { for (int l = 0; l < x_block; l += 8 / T_WIDTH) {
dst[buffer_idx] = 0; dst[buffer_idx] = 0;
for (int m = 0; m < 8 / T_WIDTH; m ++) { for (int m = 0; m < 8 / T_WIDTH; m++) {
dst[buffer_idx] |= (src[i * y_block + k][j * x_block + l + m] & dst[buffer_idx] |= (src[i * y_block + k][j * x_block + l + m] &
((1ULL << T_WIDTH) - 1)) << (m * T_WIDTH); ((1ULL << T_WIDTH) - 1)) << (m * T_WIDTH);
} }
buffer_idx ++; buffer_idx++;
} }
} else { } else {
for (int l = 0; l < x_block; l ++) { for (int l = 0; l < x_block; l++) {
dst[buffer_idx++] = src[i * y_block + k][j * x_block + l]; dst[buffer_idx++] = src[i * y_block + k][j * x_block + l];
} }
} }
...@@ -74,20 +76,20 @@ void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_bloc ...@@ -74,20 +76,20 @@ void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_bloc
template <typename T, int T_WIDTH> template <typename T, int T_WIDTH>
void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block) { void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block) {
int buffer_idx = 0; int buffer_idx = 0;
for(int i = 0; i < y_size / y_block; i ++) { for (int i = 0; i < y_size / y_block; i++) {
for(int j = 0; j < x_size / x_block; j ++) { for (int j = 0; j < x_size / x_block; j++) {
for(int k = 0; k < y_block; k ++) { for (int k = 0; k < y_block; k++) {
if (T_WIDTH < 8) { if (T_WIDTH < 8) {
for (int l = 0; l < x_block; l += 8 / T_WIDTH) { for (int l = 0; l < x_block; l += 8 / T_WIDTH) {
for (int m = 0; m < 8 / T_WIDTH; m ++) { for (int m = 0; m < 8 / T_WIDTH; m++) {
dst[i * y_block + k][j * x_block + l + m] = (src[buffer_idx] >> (m * T_WIDTH)) dst[i * y_block + k][j * x_block + l + m] = (src[buffer_idx] >> (m * T_WIDTH))
& ((1 << T_WIDTH) - 1); & ((1 << T_WIDTH) - 1);
} }
buffer_idx ++; buffer_idx++;
} }
} else { } else {
for (int l = 0; l < x_block; l ++) { for (int l = 0; l < x_block; l++) {
dst[i * y_block + k][j * x_block + l] = src[buffer_idx ++]; dst[i * y_block + k][j * x_block + l] = src[buffer_idx++];
} }
} }
} }
...@@ -98,14 +100,15 @@ void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_bl ...@@ -98,14 +100,15 @@ void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_bl
template <typename T, int T_WIDTH> template <typename T, int T_WIDTH>
T ** allocInit2dArray(int rows, int cols) { T ** allocInit2dArray(int rows, int cols) {
// Allocate // Allocate
T **array = (T **) malloc(sizeof(T *) * rows); T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
for (int i = 0; i < rows; i ++) { for (int i = 0; i < rows; i++) {
array[i] = (T *) malloc(sizeof(T) * cols); array[i] = static_cast<T *>(malloc(sizeof(T) * cols));
} }
// Init // Init
for (int i = 0; i < rows; i ++) { for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j ++) { for (int j = 0; j < cols; j++) {
array[i][j] = (T) (rand() % (1LL << (T_WIDTH - 1)) - (1LL << (T_WIDTH - 2))); array[i][j] =
static_cast<T>(rand_r(&globalSeed) % (1LL << (T_WIDTH - 1)) - (1LL << (T_WIDTH - 2)));
} }
} }
return array; return array;
...@@ -113,16 +116,16 @@ T ** allocInit2dArray(int rows, int cols) { ...@@ -113,16 +116,16 @@ T ** allocInit2dArray(int rows, int cols) {
/*!
 * \brief Allocate an uninitialized 2D array as a table of row pointers.
 *
 * The table holds `rows` pointers, each pointing at a separately malloc'ed
 * buffer of `cols` elements of type T. Element contents are NOT initialized.
 * The result must be released by calling free() on every row and then on the
 * table itself (free2dArray in this file does exactly that).
 *
 * \param rows Number of rows (row pointers) to allocate.
 * \param cols Number of elements of type T in each row.
 * \return The row-pointer table, or nullptr if a dimension is negative or any
 *         allocation fails. On failure nothing is leaked: rows that were
 *         already allocated are freed before returning.
 */
template <typename T>
T ** alloc2dArray(int rows, int cols) {
  // A negative int would wrap to a huge size_t inside the size computation.
  if (rows < 0 || cols < 0) {
    return nullptr;
  }
  T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
  if (array == nullptr) {
    return nullptr;
  }
  for (int i = 0; i < rows; i++) {
    array[i] = static_cast<T *>(malloc(sizeof(T) * cols));
    // malloc(0) may legitimately return NULL, so only treat NULL as a failure
    // when a non-empty row was requested.
    if (array[i] == nullptr && cols > 0) {
      // Roll back the rows allocated so far so the caller never receives a
      // partially built table.
      for (int j = 0; j < i; j++) {
        free(array[j]);
      }
      free(array);
      return nullptr;
    }
  }
  return array;
}
template <typename T> template <typename T>
void free2dArray(T **array, int rows, int cols) { void free2dArray(T **array, int rows, int cols) {
for (int i = 0; i < rows; i ++) { for (int i = 0; i < rows; i++) {
free(array[i]); free(array[i]);
} }
free(array); free(array);
...@@ -130,11 +133,11 @@ void free2dArray(T **array, int rows, int cols) { ...@@ -130,11 +133,11 @@ void free2dArray(T **array, int rows, int cols) {
template <typename T> template <typename T>
T *** alloc3dArray(int rows, int cols, int depth) { T *** alloc3dArray(int rows, int cols, int depth) {
T ***array = (T ***) malloc(sizeof(T **) * rows); T ***array = static_cast<T ***>(malloc(sizeof(T **) * rows));
for (int i = 0; i < rows; i ++) { for (int i = 0; i < rows; i++) {
array[i] = (T **) malloc(sizeof(T *) * cols); array[i] = static_cast<T **>(malloc(sizeof(T *) * cols));
for (int j = 0; j < cols; j ++) { for (int j = 0; j < cols; j++) {
array[i][j] = (T*) malloc(sizeof(T) * depth); array[i][j] = static_cast<T*>(malloc(sizeof(T) * depth));
} }
} }
return array; return array;
...@@ -142,8 +145,8 @@ T *** alloc3dArray(int rows, int cols, int depth) { ...@@ -142,8 +145,8 @@ T *** alloc3dArray(int rows, int cols, int depth) {
template <typename T> template <typename T>
void free3dArray(T *** array, int rows, int cols, int depth) { void free3dArray(T *** array, int rows, int cols, int depth) {
for (int i = 0; i < rows; i ++) { for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j ++) { for (int j = 0; j < cols; j++) {
free(array[i][j]); free(array[i][j]);
} }
free(array[i]); free(array[i]);
...@@ -153,7 +156,7 @@ void free3dArray(T *** array, int rows, int cols, int depth) { ...@@ -153,7 +156,7 @@ void free3dArray(T *** array, int rows, int cols, int depth) {
void * allocBuffer(size_t num_bytes) { void * allocBuffer(size_t num_bytes) {
#ifdef NO_SIM #ifdef NO_SIM
return VTAMemAlloc(num_bytes, CACHED); return VTAMemAlloc(num_bytes, VTA_CACHED);
#else #else
return malloc(num_bytes); return malloc(num_bytes);
#endif #endif
...@@ -173,7 +176,7 @@ VTAGenericInsn reset2DInsn(int type, int sram_offset, int y_size, int x_size, in ...@@ -173,7 +176,7 @@ VTAGenericInsn reset2DInsn(int type, int sram_offset, int y_size, int x_size, in
union VTAInsn converter; union VTAInsn converter;
// Memory instruction initialization // Memory instruction initialization
VTAMemInsn insn = {}; VTAMemInsn insn = {};
insn.opcode = OPCODE_LOAD; insn.opcode = VTA_OPCODE_LOAD;
insn.pop_prev_dep = pop_prev_dep; insn.pop_prev_dep = pop_prev_dep;
insn.pop_next_dep = pop_next_dep; insn.pop_next_dep = pop_next_dep;
insn.push_prev_dep = push_prev_dep; insn.push_prev_dep = push_prev_dep;
...@@ -250,7 +253,7 @@ VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat, ...@@ -250,7 +253,7 @@ VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat,
union VTAInsn converter; union VTAInsn converter;
// GEVM instruction initialization // GEVM instruction initialization
VTAGemInsn insn; VTAGemInsn insn;
insn.opcode = OPCODE_GEMM; insn.opcode = VTA_OPCODE_GEMM;
insn.pop_prev_dep = pop_prev_dep; insn.pop_prev_dep = pop_prev_dep;
insn.pop_next_dep = pop_next_dep; insn.pop_next_dep = pop_next_dep;
insn.push_prev_dep = push_prev_dep; insn.push_prev_dep = push_prev_dep;
...@@ -288,7 +291,7 @@ VTAGenericInsn getALUInsn(int opcode, int vector_size, bool use_imm, int imm, bo ...@@ -288,7 +291,7 @@ VTAGenericInsn getALUInsn(int opcode, int vector_size, bool use_imm, int imm, bo
union VTAInsn converter; union VTAInsn converter;
// Memory instruction initialization // Memory instruction initialization
VTAAluInsn insn = {}; VTAAluInsn insn = {};
insn.opcode = OPCODE_ALU; insn.opcode = VTA_OPCODE_ALU;
insn.pop_prev_dep = pop_prev_dep; insn.pop_prev_dep = pop_prev_dep;
insn.pop_next_dep = pop_next_dep; insn.pop_next_dep = pop_next_dep;
insn.push_prev_dep = push_prev_dep; insn.push_prev_dep = push_prev_dep;
...@@ -327,7 +330,7 @@ VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) { ...@@ -327,7 +330,7 @@ VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) {
union VTAInsn converter; union VTAInsn converter;
// GEVM instruction initialization // GEVM instruction initialization
VTAGemInsn insn; VTAGemInsn insn;
insn.opcode = OPCODE_FINISH; insn.opcode = VTA_OPCODE_FINISH;
insn.pop_prev_dep = pop_prev; insn.pop_prev_dep = pop_prev;
insn.pop_next_dep = pop_next; insn.pop_next_dep = pop_next;
insn.push_prev_dep = 0; insn.push_prev_dep = 0;
...@@ -347,21 +350,20 @@ VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) { ...@@ -347,21 +350,20 @@ VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) {
} }
VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) { VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {
// Derive the total uop size // Derive the total uop size
int uop_size = (uop_compression) ? 1 : y_size * x_size; int uop_size = (uop_compression) ? 1 : y_size * x_size;
// Allocate buffer // Allocate buffer
#ifdef NO_SIM #ifdef NO_SIM
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED); VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
#else #else
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size); VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
#endif #endif
if (!uop_compression) { if (!uop_compression) {
int uop_idx = 0; int uop_idx = 0;
for (int i = 0; i < y_size; i ++) { for (int i = 0; i < y_size; i++) {
for (int j = 0; j < x_size; j ++) { for (int j = 0; j < x_size; j++) {
uop_buf[uop_idx].reset_out = false; uop_buf[uop_idx].reset_out = false;
uop_buf[uop_idx].dst_idx = i * x_size + j; uop_buf[uop_idx].dst_idx = i * x_size + j;
uop_buf[uop_idx].src_idx = 0; uop_buf[uop_idx].src_idx = 0;
...@@ -381,23 +383,22 @@ VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) { ...@@ -381,23 +383,22 @@ VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {
VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression, VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
bool multi_threaded) { bool multi_threaded) {
// Derive the total uop size // Derive the total uop size
int uop_size = (uop_compression) ? batch : batch * in_feat * out_feat; int uop_size = (uop_compression) ? batch : batch * in_feat * out_feat;
if (multi_threaded) uop_size *= 2; if (multi_threaded) uop_size *= 2;
// Allocate buffer // Allocate buffer
#ifdef NO_SIM #ifdef NO_SIM
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED); VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
#else #else
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size); VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
#endif #endif
if (!uop_compression) { if (!uop_compression) {
int uop_idx = 0; int uop_idx = 0;
for (int i = 0; i < batch; i ++) { for (int i = 0; i < batch; i++) {
for (int j = 0; j < in_feat; j ++) { for (int j = 0; j < in_feat; j++) {
for (int k = 0; k < out_feat; k ++) { for (int k = 0; k < out_feat; k++) {
uop_buf[uop_idx].reset_out = false; uop_buf[uop_idx].reset_out = false;
uop_buf[uop_idx].dst_idx = i * out_feat + k; uop_buf[uop_idx].dst_idx = i * out_feat + k;
uop_buf[uop_idx].src_idx = i * in_feat + j; uop_buf[uop_idx].src_idx = i * in_feat + j;
...@@ -407,7 +408,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression, ...@@ -407,7 +408,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
} }
} }
} else { } else {
for (int i = 0; i < batch; i ++) { for (int i = 0; i < batch; i++) {
uop_buf[i].reset_out = false; uop_buf[i].reset_out = false;
uop_buf[i].dst_idx = i * out_feat; uop_buf[i].dst_idx = i * out_feat;
uop_buf[i].src_idx = i * in_feat; uop_buf[i].src_idx = i * in_feat;
...@@ -418,9 +419,9 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression, ...@@ -418,9 +419,9 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
if (multi_threaded) { if (multi_threaded) {
if (!uop_compression) { if (!uop_compression) {
int uop_idx = uop_size / 2; int uop_idx = uop_size / 2;
for (int i = 0; i < batch; i ++) { for (int i = 0; i < batch; i++) {
for (int j = 0; j < in_feat; j ++) { for (int j = 0; j < in_feat; j++) {
for (int k = 0; k < out_feat; k ++) { for (int k = 0; k < out_feat; k++) {
uop_buf[uop_idx].reset_out = false; uop_buf[uop_idx].reset_out = false;
uop_buf[uop_idx].dst_idx = i * out_feat + k; uop_buf[uop_idx].dst_idx = i * out_feat + k;
uop_buf[uop_idx].src_idx = batch * in_feat + i * in_feat + j; uop_buf[uop_idx].src_idx = batch * in_feat + i * in_feat + j;
...@@ -430,7 +431,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression, ...@@ -430,7 +431,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
} }
} }
} else { } else {
for (int i = 0; i < batch; i ++) { for (int i = 0; i < batch; i++) {
uop_buf[batch+i].reset_out = false; uop_buf[batch+i].reset_out = false;
uop_buf[batch+i].dst_idx = i * out_feat; uop_buf[batch+i].dst_idx = i * out_feat;
uop_buf[batch+i].src_idx = batch * in_feat + i * in_feat; uop_buf[batch+i].src_idx = batch * in_feat + i * in_feat;
...@@ -443,19 +444,18 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression, ...@@ -443,19 +444,18 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
} }
VTAUop * getMapALUUops(int vector_size, bool uop_compression) { VTAUop * getMapALUUops(int vector_size, bool uop_compression) {
// Derive the total uop size // Derive the total uop size
int uop_size = (uop_compression) ? 1 : vector_size; int uop_size = (uop_compression) ? 1 : vector_size;
// Allocate buffer // Allocate buffer
#ifdef NO_SIM #ifdef NO_SIM
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED); VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
#else #else
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size); VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
#endif #endif
if (!uop_compression) { if (!uop_compression) {
for (int i = 0; i < vector_size; i ++) { for (int i = 0; i < vector_size; i++) {
uop_buf[i].reset_out = 0; uop_buf[i].reset_out = 0;
uop_buf[i].dst_idx = i; uop_buf[i].dst_idx = i;
uop_buf[i].src_idx = vector_size + i; uop_buf[i].src_idx = vector_size + i;
...@@ -473,65 +473,65 @@ void printParameters() { ...@@ -473,65 +473,65 @@ void printParameters() {
// Some debugging code // Some debugging code
printf("Size of VTAInsn: %d\n", sizeof(VTAGenericInsn)); printf("Size of VTAInsn: %d\n", sizeof(VTAGenericInsn));
printf("Size of VTAUop: %d\n", sizeof(VTAUop)); printf("Size of VTAUop: %d\n", sizeof(VTAUop));
printf("UOP_BUFF_DEPTH: %d\n", UOP_BUFF_DEPTH); printf("VTA_UOP_BUFF_DEPTH: %d\n", VTA_UOP_BUFF_DEPTH);
printf("LOG_UOP_BUFF_DEPTH: %d\n", LOG_UOP_BUFF_DEPTH); printf("VTA_LOG_UOP_BUFF_DEPTH: %d\n", VTA_LOG_UOP_BUFF_DEPTH);
printf("WGT_BUFF_DEPTH: %d\n", WGT_BUFF_DEPTH); printf("VTA_WGT_BUFF_DEPTH: %d\n", VTA_WGT_BUFF_DEPTH);
printf("LOG_WGT_BUFF_DEPTH: %d\n", LOG_WGT_BUFF_DEPTH); printf("VTA_LOG_WGT_BUFF_DEPTH: %d\n", VTA_LOG_WGT_BUFF_DEPTH);
printf("INP_BUFF_DEPTH: %d\n", INP_BUFF_DEPTH); printf("VTA_INP_BUFF_DEPTH: %d\n", VTA_INP_BUFF_DEPTH);
printf("LOG_INP_BUFF_DEPTH: %d\n", LOG_INP_BUFF_DEPTH); printf("VTA_LOG_INP_BUFF_DEPTH: %d\n", VTA_LOG_INP_BUFF_DEPTH);
printf("ACC_BUFF_DEPTH: %d\n", ACC_BUFF_DEPTH); printf("VTA_ACC_BUFF_DEPTH: %d\n", VTA_ACC_BUFF_DEPTH);
printf("LOG_ACC_BUFF_DEPTH: %d\n", LOG_ACC_BUFF_DEPTH); printf("VTA_LOG_ACC_BUFF_DEPTH: %d\n", VTA_LOG_ACC_BUFF_DEPTH);
printf("WGT_WORDS: %d\n", WGT_BUFF_DEPTH*BLOCK_IN*BLOCK_OUT); printf("VTA_WGT_WORDS: %d\n", VTA_WGT_BUFF_DEPTH*VTA_BLOCK_IN*VTA_BLOCK_OUT);
printf("INP_WORDS: %d\n", INP_BUFF_DEPTH*BLOCK_IN); printf("VTA_INP_WORDS: %d\n", VTA_INP_BUFF_DEPTH*VTA_BLOCK_IN);
printf("ACC_WORDS: %d\n", ACC_BUFF_DEPTH*BLOCK_OUT); printf("VTA_ACC_WORDS: %d\n", VTA_ACC_BUFF_DEPTH*VTA_BLOCK_OUT);
printf("INS_ELEM_BYTES: %d\n", INS_ELEM_BYTES); printf("VTA_INS_ELEM_BYTES: %d\n", VTA_INS_ELEM_BYTES);
printf("UOP_ELEM_BYTES: %d\n", UOP_ELEM_BYTES); printf("VTA_UOP_ELEM_BYTES: %d\n", VTA_UOP_ELEM_BYTES);
printf("INP_ELEM_BYTES: %d\n", INP_ELEM_BYTES); printf("VTA_INP_ELEM_BYTES: %d\n", VTA_INP_ELEM_BYTES);
printf("WGT_ELEM_BYTES: %d\n", WGT_ELEM_BYTES); printf("VTA_WGT_ELEM_BYTES: %d\n", VTA_WGT_ELEM_BYTES);
printf("ACC_ELEM_BYTES: %d\n", ACC_ELEM_BYTES); printf("VTA_ACC_ELEM_BYTES: %d\n", VTA_ACC_ELEM_BYTES);
printf("BLOCK_IN: %d\n", BLOCK_IN); printf("VTA_BLOCK_IN: %d\n", VTA_BLOCK_IN);
printf("BLOCK_OUT: %d\n", BLOCK_OUT); printf("VTA_BLOCK_OUT: %d\n", VTA_BLOCK_OUT);
printf("INSN_MEM_0 [%d-%d]\n", INSN_MEM_0_0, INSN_MEM_0_1); printf("VTA_INSN_MEM_0 [%d-%d]\n", VTA_INSN_MEM_0_0, VTA_INSN_MEM_0_1);
printf("INSN_MEM_1 [%d]\n", INSN_MEM_1); printf("VTA_INSN_MEM_1 [%d]\n", VTA_INSN_MEM_1);
printf("INSN_MEM_2 [%d]\n", INSN_MEM_2); printf("VTA_INSN_MEM_2 [%d]\n", VTA_INSN_MEM_2);
printf("INSN_MEM_3 [%d]\n", INSN_MEM_3); printf("VTA_INSN_MEM_3 [%d]\n", VTA_INSN_MEM_3);
printf("INSN_MEM_4 [%d]\n", INSN_MEM_4); printf("VTA_INSN_MEM_4 [%d]\n", VTA_INSN_MEM_4);
printf("INSN_MEM_5 [%d-%d]\n", INSN_MEM_5_0, INSN_MEM_5_1); printf("VTA_INSN_MEM_5 [%d-%d]\n", VTA_INSN_MEM_5_0, VTA_INSN_MEM_5_1);
printf("INSN_MEM_6 [%d-%d]\n", INSN_MEM_6_0, INSN_MEM_6_1); printf("VTA_INSN_MEM_6 [%d-%d]\n", VTA_INSN_MEM_6_0, VTA_INSN_MEM_6_1);
printf("INSN_MEM_7 [%d-%d]\n", INSN_MEM_7_0, INSN_MEM_7_1); printf("VTA_INSN_MEM_7 [%d-%d]\n", VTA_INSN_MEM_7_0, VTA_INSN_MEM_7_1);
printf("INSN_MEM_8 [%d-%d]\n", INSN_MEM_8_0, INSN_MEM_8_1); printf("VTA_INSN_MEM_8 [%d-%d]\n", VTA_INSN_MEM_8_0, VTA_INSN_MEM_8_1);
printf("INSN_MEM_9 [%d-%d]\n", INSN_MEM_9_0, INSN_MEM_9_1); printf("VTA_INSN_MEM_9 [%d-%d]\n", VTA_INSN_MEM_9_0, VTA_INSN_MEM_9_1);
printf("INSN_MEM_A [%d-%d]\n", INSN_MEM_A_0, INSN_MEM_A_1); printf("VTA_INSN_MEM_A [%d-%d]\n", VTA_INSN_MEM_A_0, VTA_INSN_MEM_A_1);
printf("INSN_MEM_B [%d-%d]\n", INSN_MEM_B_0, INSN_MEM_B_1); printf("VTA_INSN_MEM_B [%d-%d]\n", VTA_INSN_MEM_B_0, VTA_INSN_MEM_B_1);
printf("INSN_MEM_C [%d-%d]\n", INSN_MEM_C_0, INSN_MEM_C_1); printf("VTA_INSN_MEM_C [%d-%d]\n", VTA_INSN_MEM_C_0, VTA_INSN_MEM_C_1);
printf("INSN_MEM_D [%d-%d]\n", INSN_MEM_D_0, INSN_MEM_D_1); printf("VTA_INSN_MEM_D [%d-%d]\n", VTA_INSN_MEM_D_0, VTA_INSN_MEM_D_1);
printf("INSN_MEM_E [%d-%d]\n", INSN_MEM_E_0, INSN_MEM_E_1); printf("VTA_INSN_MEM_E [%d-%d]\n", VTA_INSN_MEM_E_0, VTA_INSN_MEM_E_1);
printf("INSN_GEM_0 [%d-%d]\n", INSN_GEM_0_0, INSN_GEM_0_1); printf("VTA_INSN_GEM_0 [%d-%d]\n", VTA_INSN_GEM_0_0, VTA_INSN_GEM_0_1);
printf("INSN_GEM_1 [%d]\n", INSN_GEM_1); printf("VTA_INSN_GEM_1 [%d]\n", VTA_INSN_GEM_1);
printf("INSN_GEM_2 [%d]\n", INSN_GEM_2); printf("VTA_INSN_GEM_2 [%d]\n", VTA_INSN_GEM_2);
printf("INSN_GEM_3 [%d]\n", INSN_GEM_3); printf("VTA_INSN_GEM_3 [%d]\n", VTA_INSN_GEM_3);
printf("INSN_GEM_4 [%d]\n", INSN_GEM_4); printf("VTA_INSN_GEM_4 [%d]\n", VTA_INSN_GEM_4);
printf("INSN_GEM_5 [%d-%d]\n", INSN_GEM_5_0, INSN_GEM_5_1); printf("VTA_INSN_GEM_5 [%d-%d]\n", VTA_INSN_GEM_5_0, VTA_INSN_GEM_5_1);
printf("INSN_GEM_6 [%d-%d]\n", INSN_GEM_6_0, INSN_GEM_6_1); printf("VTA_INSN_GEM_6 [%d-%d]\n", VTA_INSN_GEM_6_0, VTA_INSN_GEM_6_1);
printf("INSN_GEM_7 [%d-%d]\n", INSN_GEM_7_0, INSN_GEM_7_1); printf("VTA_INSN_GEM_7 [%d-%d]\n", VTA_INSN_GEM_7_0, VTA_INSN_GEM_7_1);
printf("INSN_GEM_8 [%d-%d]\n", INSN_GEM_8_0, INSN_GEM_8_1); printf("VTA_INSN_GEM_8 [%d-%d]\n", VTA_INSN_GEM_8_0, VTA_INSN_GEM_8_1);
printf("INSN_GEM_9 [%d-%d]\n", INSN_GEM_9_0, INSN_GEM_9_1); printf("VTA_INSN_GEM_9 [%d-%d]\n", VTA_INSN_GEM_9_0, VTA_INSN_GEM_9_1);
printf("INSN_GEM_A [%d-%d]\n", INSN_GEM_A_0, INSN_GEM_A_1); printf("VTA_INSN_GEM_A [%d-%d]\n", VTA_INSN_GEM_A_0, VTA_INSN_GEM_A_1);
printf("INSN_GEM_B [%d-%d]\n", INSN_GEM_B_0, INSN_GEM_B_1); printf("VTA_INSN_GEM_B [%d-%d]\n", VTA_INSN_GEM_B_0, VTA_INSN_GEM_B_1);
printf("INSN_GEM_C [%d-%d]\n", INSN_GEM_C_0, INSN_GEM_C_1); printf("VTA_INSN_GEM_C [%d-%d]\n", VTA_INSN_GEM_C_0, VTA_INSN_GEM_C_1);
printf("INSN_GEM_D [%d-%d]\n", INSN_GEM_D_0, INSN_GEM_D_1); printf("VTA_INSN_GEM_D [%d-%d]\n", VTA_INSN_GEM_D_0, VTA_INSN_GEM_D_1);
printf("INSN_GEM_E [%d-%d]\n", INSN_GEM_E_0, INSN_GEM_E_1); printf("VTA_INSN_GEM_E [%d-%d]\n", VTA_INSN_GEM_E_0, VTA_INSN_GEM_E_1);
printf("INSN_ALU_D [%d-%d]\n", INSN_ALU_D_0, INSN_ALU_D_1); printf("VTA_INSN_ALU_D [%d-%d]\n", VTA_INSN_ALU_D_0, VTA_INSN_ALU_D_1);
printf("INSN_ALU_E [%d]\n", INSN_ALU_E); printf("VTA_INSN_ALU_E [%d]\n", VTA_INSN_ALU_E);
printf("INSN_ALU_F [%d-%d]\n", INSN_ALU_F_0, INSN_ALU_F_1); printf("VTA_INSN_ALU_F [%d-%d]\n", VTA_INSN_ALU_F_0, VTA_INSN_ALU_F_1);
printf("UOP_GEM_0 [%d]\n", UOP_GEM_0); printf("VTA_UOP_GEM_0 [%d]\n", VTA_UOP_GEM_0);
printf("UOP_GEM_1 [%d-%d]\n", UOP_GEM_1_0, UOP_GEM_1_1); printf("VTA_UOP_GEM_1 [%d-%d]\n", VTA_UOP_GEM_1_0, VTA_UOP_GEM_1_1);
printf("UOP_GEM_2 [%d-%d]\n", UOP_GEM_2_0, UOP_GEM_2_1); printf("VTA_UOP_GEM_2 [%d-%d]\n", VTA_UOP_GEM_2_0, VTA_UOP_GEM_2_1);
printf("UOP_GEM_3 [%d-%d]\n", UOP_GEM_3_0, UOP_GEM_3_1); printf("VTA_UOP_GEM_3 [%d-%d]\n", VTA_UOP_GEM_3_0, VTA_UOP_GEM_3_1);
printf("UOP_ALU_0 [%d]\n", UOP_ALU_0); printf("VTA_UOP_ALU_0 [%d]\n", VTA_UOP_ALU_0);
printf("UOP_ALU_1 [%d-%d]\n", UOP_ALU_1_0, UOP_ALU_1_1); printf("VTA_UOP_ALU_1 [%d-%d]\n", VTA_UOP_ALU_1_0, VTA_UOP_ALU_1_1);
printf("UOP_ALU_2 [%d-%d]\n", UOP_ALU_2_0, UOP_ALU_2_1); printf("VTA_UOP_ALU_2 [%d-%d]\n", VTA_UOP_ALU_2_0, VTA_UOP_ALU_2_1);
printf("UOP_ALU_3 [%d-%d]\n", UOP_ALU_3_0, UOP_ALU_3_1); printf("VTA_UOP_ALU_3 [%d-%d]\n", VTA_UOP_ALU_3_0, VTA_UOP_ALU_3_1);
} }
void printInstruction(int num_insn, VTAGenericInsn *insns) { void printInstruction(int num_insn, VTAGenericInsn *insns) {
...@@ -544,84 +544,111 @@ void printInstruction(int num_insn, VTAGenericInsn *insns) { ...@@ -544,84 +544,111 @@ void printInstruction(int num_insn, VTAGenericInsn *insns) {
union VTAInsn c; union VTAInsn c;
// Iterate over all instructions // Iterate over all instructions
printf("DEBUG - There are %u instructions\n", num_insn); printf("DEBUG - There are %u instructions\n", num_insn);
for (int i = 0; i < num_insn; i ++) { for (int i = 0; i < num_insn; i++) {
// Fetch instruction and decode opcode // Fetch instruction and decode opcode
c.generic = insns[i]; c.generic = insns[i];
printf("DEBUG - INSTRUCTION %u: ", i); printf("DEBUG - INSTRUCTION %u: ", i);
if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) { if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
// Print instruction field information // Print instruction field information
if (c.mem.opcode == OPCODE_LOAD) { if (c.mem.opcode == VTA_OPCODE_LOAD) {
printf("LOAD "); printf("LOAD ");
if (c.mem.memory_type == MEM_ID_UOP) printf("UOP\n"); if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n");
if (c.mem.memory_type == MEM_ID_WGT) printf("WGT\n"); if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n");
if (c.mem.memory_type == MEM_ID_INP) printf("INP\n"); if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n");
if (c.mem.memory_type == MEM_ID_ACC) printf("ACC\n"); if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n");
} }
if (c.mem.opcode == OPCODE_STORE) { if (c.mem.opcode == VTA_OPCODE_STORE) {
printf("STORE ACC\n"); printf("STORE ACC\n");
} }
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
(int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep, static_cast<int>(c.mem.pop_prev_dep),
(int) c.mem.push_prev_dep, (int) c.mem.push_next_dep); static_cast<int>(c.mem.pop_next_dep),
printf("\tDRAM: 0x%08x, SRAM:0x%04x\n", (int) c.mem.dram_base, (int) c.mem.sram_base); static_cast<int>(c.mem.push_prev_dep),
printf("\ty: size=%d, pad=[%d, %d]\n", (int) c.mem.y_size, (int) c.mem.y_pad_0, static_cast<int>(c.mem.push_next_dep));
(int) c.mem.y_pad_1); printf("\tDRAM: 0x%08x, SRAM:0x%04x\n",
printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n", (int) c.mem.x_size, (int) c.mem.x_stride, static_cast<int>(c.mem.dram_base),
(int) c.mem.x_pad_0, (int) c.mem.x_pad_1); static_cast<int>(c.mem.sram_base));
if (c.mem.opcode == OPCODE_STORE) { printf("\ty: size=%d, pad=[%d, %d]\n",
if (c.mem.pop_prev_dep) g2s_queue --; static_cast<int>(c.mem.y_size),
if (c.mem.push_prev_dep) s2g_queue ++; static_cast<int>(c.mem.y_pad_0),
} else if (c.mem.opcode == OPCODE_LOAD && static_cast<int>(c.mem.y_pad_1));
(c.mem.memory_type == MEM_ID_INP || c.mem.memory_type == MEM_ID_WGT)) { printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n",
if (c.mem.pop_next_dep) g2l_queue --; static_cast<int>(c.mem.x_size),
if (c.mem.push_next_dep) l2g_queue ++; static_cast<int>(c.mem.x_stride),
static_cast<int>(c.mem.x_pad_0),
static_cast<int>(c.mem.x_pad_1));
if (c.mem.opcode == VTA_OPCODE_STORE) {
if (c.mem.pop_prev_dep) g2s_queue--;
if (c.mem.push_prev_dep) s2g_queue++;
} else if (c.mem.opcode == VTA_OPCODE_LOAD &&
(c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT)) {
if (c.mem.pop_next_dep) g2l_queue--;
if (c.mem.push_next_dep) l2g_queue++;
} else { } else {
if (c.mem.pop_prev_dep) l2g_queue --; if (c.mem.pop_prev_dep) l2g_queue--;
if (c.mem.push_prev_dep) g2l_queue ++; if (c.mem.push_prev_dep) g2l_queue++;
if (c.mem.pop_next_dep) s2g_queue --; if (c.mem.pop_next_dep) s2g_queue--;
if (c.mem.push_next_dep) g2s_queue ++; if (c.mem.push_next_dep) g2s_queue++;
} }
} else if (c.mem.opcode == OPCODE_GEMM) { } else if (c.mem.opcode == VTA_OPCODE_GEMM) {
// Print instruction field information // Print instruction field information
printf("GEVM\n"); printf("GEVM\n");
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
(int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep, static_cast<int>(c.mem.pop_prev_dep),
(int) c.mem.push_prev_dep, (int) c.mem.push_next_dep); static_cast<int>(c.mem.pop_next_dep),
printf("\trange (%d, %d)\n", (int) c.gemm.uop_bgn, (int) c.gemm.uop_end); static_cast<int>(c.mem.push_prev_dep),
printf("\touter loop - iter: %d, acc: %d, inp: %d, wgt: %d\n", (int) c.gemm.iter_out, static_cast<int>(c.mem.push_next_dep));
(int) c.gemm.dst_factor_out, (int) c.gemm.src_factor_out, printf("\trange (%d, %d)\n",
(int) c.gemm.wgt_factor_out); static_cast<int>(c.gemm.uop_bgn),
printf("\tinner loop - iter: %d, acc: %d, inp: %d, wgt: %d\n", (int) c.gemm.iter_in, static_cast<int>(c.gemm.uop_end));
(int) c.gemm.dst_factor_in, (int) c.gemm.src_factor_in, printf("\touter loop - iter: %d, acc: %d, inp: %d, wgt: %d\n",
(int) c.gemm.wgt_factor_in); static_cast<int>(c.gemm.iter_out),
if (c.gemm.pop_prev_dep) l2g_queue --; static_cast<int>(c.gemm.dst_factor_out),
if (c.gemm.push_prev_dep) g2l_queue ++; static_cast<int>(c.gemm.src_factor_out),
if (c.gemm.pop_next_dep) s2g_queue --; static_cast<int>(c.gemm.wgt_factor_out));
if (c.gemm.push_next_dep) g2s_queue ++; printf("\tinner loop - iter: %d, acc: %d, inp: %d, wgt: %d\n",
} else if (c.mem.opcode == OPCODE_FINISH) { static_cast<int>(c.gemm.iter_in),
static_cast<int>(c.gemm.dst_factor_in),
static_cast<int>(c.gemm.src_factor_in),
static_cast<int>(c.gemm.wgt_factor_in));
if (c.gemm.pop_prev_dep) l2g_queue--;
if (c.gemm.push_prev_dep) g2l_queue++;
if (c.gemm.pop_next_dep) s2g_queue--;
if (c.gemm.push_next_dep) g2s_queue++;
} else if (c.mem.opcode == VTA_OPCODE_FINISH) {
printf("FINISH\n"); printf("FINISH\n");
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
(int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep, static_cast<int>(c.mem.pop_prev_dep),
(int) c.mem.push_prev_dep, (int) c.mem.push_next_dep); static_cast<int>(c.mem.pop_next_dep),
if (c.gemm.pop_prev_dep) l2g_queue --; static_cast<int>(c.mem.push_prev_dep),
if (c.gemm.push_prev_dep) g2l_queue ++; static_cast<int>(c.mem.push_next_dep));
if (c.gemm.pop_next_dep) s2g_queue --; if (c.gemm.pop_prev_dep) l2g_queue--;
if (c.gemm.push_next_dep) g2s_queue ++; if (c.gemm.push_prev_dep) g2l_queue++;
} else if (c.mem.opcode == OPCODE_ALU) { if (c.gemm.pop_next_dep) s2g_queue--;
if (c.gemm.push_next_dep) g2s_queue++;
} else if (c.mem.opcode == VTA_OPCODE_ALU) {
// Print instruction field information // Print instruction field information
printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm)); printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
(int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep, static_cast<int>(c.mem.pop_prev_dep),
(int) c.mem.push_prev_dep, (int) c.mem.push_next_dep); static_cast<int>(c.mem.pop_next_dep),
printf("\trange (%d, %d)\n", (int) c.alu.uop_bgn, (int) c.alu.uop_end); static_cast<int>(c.mem.push_prev_dep),
printf("\touter loop - iter: %d, dst: %d, src: %d\n", (int) c.alu.iter_out, static_cast<int>(c.mem.push_next_dep));
(int) c.alu.dst_factor_out, (int) c.alu.src_factor_out); printf("\trange (%d, %d)\n",
printf("\tinner loop - iter: %d, dst: %d, src: %d\n", (int) c.alu.iter_in, static_cast<int>(c.alu.uop_bgn),
(int) c.alu.dst_factor_in, (int) c.alu.src_factor_in); static_cast<int>(c.alu.uop_end));
if (c.alu.pop_prev_dep) l2g_queue --; printf("\touter loop - iter: %d, dst: %d, src: %d\n",
if (c.alu.push_prev_dep) g2l_queue ++; static_cast<int>(c.alu.iter_out),
if (c.alu.pop_next_dep) s2g_queue --; static_cast<int>(c.alu.dst_factor_out),
if (c.alu.push_next_dep) g2s_queue ++; static_cast<int>(c.alu.src_factor_out));
printf("\tinner loop - iter: %d, dst: %d, src: %d\n",
static_cast<int>(c.alu.iter_in),
static_cast<int>(c.alu.dst_factor_in),
static_cast<int>(c.alu.src_factor_in));
if (c.alu.pop_prev_dep) l2g_queue--;
if (c.alu.push_prev_dep) g2l_queue++;
if (c.alu.pop_next_dep) s2g_queue--;
if (c.alu.push_next_dep) g2s_queue++;
} }
} }
printf("DEBUG - l2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); printf("DEBUG - l2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
...@@ -632,68 +659,73 @@ void printInstruction(int num_insn, VTAGenericInsn *insns) { ...@@ -632,68 +659,73 @@ void printInstruction(int num_insn, VTAGenericInsn *insns) {
void printMicroOp(int num_uop, VTAUop *uops) { void printMicroOp(int num_uop, VTAUop *uops) {
// Iterate over all micro ops // Iterate over all micro ops
printf("DEBUG - There are %u micro-ops\n", num_uop); printf("DEBUG - There are %u micro-ops\n", num_uop);
for (int i = 0; i < num_uop; i ++) { for (int i = 0; i < num_uop; i++) {
// Read micro-op // Read micro-op
printf("DEBUG - UOP %u: ", i); printf("DEBUG - UOP %u: ", i);
printf("rst_out=%u, acc=%u, inp= %u, wgt=%u\n", uops[i].reset_out, uops[i].dst_idx, printf("rst_out=%u, acc=%u, inp= %u, wgt=%u\n", uops[i].reset_out, uops[i].dst_idx,
uops[i].src_idx, uops[i].wgt_idx); uops[i].src_idx, uops[i].wgt_idx);
} }
} }
int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) { int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) {
// Some assertions
assert(batch % BATCH == 0); assert(batch % VTA_BATCH == 0);
assert(vector_size % BLOCK_OUT == 0); assert(vector_size % VTA_BLOCK_OUT == 0);
assert(!(opcode == ALU_OPCODE_SHL && !use_imm)); assert(!(opcode == VTA_ALU_OPCODE_SHL && !use_imm));
assert(!(opcode == ALU_OPCODE_SHR && !use_imm)); assert(!(opcode == VTA_ALU_OPCODE_SHR && !use_imm));
printf("=====================================================================================\n"); printf("=====================================================================================\n");
printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n", printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n",
getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression); getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression);
// Instruction count // Instruction count
int ins_size = 3 * batch / BATCH + 2; int ins_size = 3 * batch / VTA_BATCH + 2;
// Micro op count // Micro op count
int uop_size = uop_compression ? 1 : vector_size / BLOCK_OUT; int uop_size = uop_compression ? 1 : vector_size / VTA_BLOCK_OUT;
// Input/output elements in each transfer // Input/output elements in each transfer
int tx_size = vector_size / BLOCK_OUT; int tx_size = vector_size / VTA_BLOCK_OUT;
// Number of input sets to be generated // Number of input sets to be generated
int input_sets = (use_imm) ? 1 : 2; int input_sets = (use_imm) ? 1 : 2;
// Make sure we don't exceed buffer bounds // Make sure we don't exceed buffer bounds
assert(uop_size <= UOP_BUFF_DEPTH); assert(uop_size <= VTA_UOP_BUFF_DEPTH);
assert(tx_size * input_sets <= ACC_BUFF_DEPTH); assert(tx_size * input_sets <= VTA_ACC_BUFF_DEPTH);
// Immediate values // Immediate values
acc_T *immediate = (acc_T *) malloc(sizeof(acc_T) * batch / BATCH); acc_T *immediate = static_cast<acc_T *>(malloc(sizeof(acc_T) * batch / VTA_BATCH));
for (int b = 0; b < batch / BATCH; b ++) { for (int b = 0; b < batch / VTA_BATCH; b++) {
if (opcode == ALU_OPCODE_MIN) { if (opcode == VTA_ALU_OPCODE_MIN) {
immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); immediate[b] = static_cast<acc_T>(
} else if (opcode == ALU_OPCODE_MAX) { rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); } else if (opcode == VTA_ALU_OPCODE_MAX) {
} else if (opcode == ALU_OPCODE_ADD) { immediate[b] = static_cast<acc_T>(
immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
} else if (opcode == ALU_OPCODE_SUB) { } else if (opcode == VTA_ALU_OPCODE_ADD) {
immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); immediate[b] = static_cast<acc_T>(
} else if (opcode == ALU_OPCODE_MUL) { rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); } else if (opcode == VTA_ALU_OPCODE_SUB) {
} else if (opcode == ALU_OPCODE_SHL) { immediate[b] = static_cast<acc_T>(
immediate[b] = (acc_T) (rand() % (INP_WIDTH + 1)); rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
} else if (opcode == ALU_OPCODE_SHR) { } else if (opcode == VTA_ALU_OPCODE_MUL) {
immediate[b] = (acc_T) (rand() % (INP_WIDTH + 1)); immediate[b] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
} else if (opcode == VTA_ALU_OPCODE_SHL) {
immediate[b] = static_cast<acc_T>(rand_r(&globalSeed) % (VTA_INP_WIDTH + 1));
} else if (opcode == VTA_ALU_OPCODE_SHR) {
immediate[b] = static_cast<acc_T>(rand_r(&globalSeed) % (VTA_INP_WIDTH + 1));
} }
} }
// Initialize instructions // Initialize instructions
VTAGenericInsn *insn_buf = (VTAGenericInsn *) allocBuffer(sizeof(VTAGenericInsn) * ins_size); VTAGenericInsn *insn_buf =
static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
int insn_idx = 0; int insn_idx = 0;
insn_buf[insn_idx ++] = get1DLoadStoreInsn(OPCODE_LOAD, MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0); insn_buf[insn_idx++] =
for (int b = 0; b < batch; b += BATCH) { get1DLoadStoreInsn(VTA_OPCODE_LOAD, VTA_MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0);
insn_buf[insn_idx ++] = get2DLoadStoreInsn( for (int b = 0; b < batch; b += VTA_BATCH) {
OPCODE_LOAD, // opcode insn_buf[insn_idx++] = get2DLoadStoreInsn(
MEM_ID_ACC, // vector size VTA_OPCODE_LOAD, // opcode
VTA_MEM_ID_ACC, // vector size
0, // sram offset 0, // sram offset
b / BATCH * tx_size * input_sets, // dram offset b / VTA_BATCH * tx_size * input_sets, // dram offset
1, // y size 1, // y size
tx_size * input_sets, // x size tx_size * input_sets, // x size
tx_size * input_sets, // x stride tx_size * input_sets, // x stride
...@@ -703,21 +735,21 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp ...@@ -703,21 +735,21 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
b > 0, // pop next dep b > 0, // pop next dep
0, // push prev dep 0, // push prev dep
0); // push next dep 0); // push next dep
insn_buf[insn_idx ++] = getALUInsn( insn_buf[insn_idx++] = getALUInsn(
opcode, // opcode opcode, // opcode
tx_size, // vector size tx_size, // vector size
use_imm, // use imm use_imm, // use imm
immediate[b / BATCH], // imm immediate[b / VTA_BATCH], // imm
uop_compression, // uop compression uop_compression, // uop compression
0, // pop prev dep 0, // pop prev dep
0, // pop next dep 0, // pop next dep
0, // push prev dep 0, // push prev dep
1); // push next dep 1); // push next dep
insn_buf[insn_idx ++] = get2DLoadStoreInsn( insn_buf[insn_idx++] = get2DLoadStoreInsn(
OPCODE_STORE, // opcode VTA_OPCODE_STORE, // opcode
MEM_ID_OUT, // vector size VTA_MEM_ID_OUT, // vector size
0, // sram offset 0, // sram offset
b / BATCH * tx_size, // dram offset b / VTA_BATCH * tx_size, // dram offset
1, // y size 1, // y size
tx_size, // x size tx_size, // x size
tx_size, // x stride tx_size, // x stride
...@@ -729,77 +761,91 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp ...@@ -729,77 +761,91 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
0); // push next dep 0); // push next dep
} }
// Finish // Finish
insn_buf[insn_idx ++] = getFinishInsn(0, 1); insn_buf[insn_idx++] = getFinishInsn(0, 1);
// Prepare the uop buffer // Prepare the uop buffer
VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression); VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression);
#if DEBUG==1 #if VTA_DEBUG == 1
printInstruction(ins_size, insn_buf); printInstruction(ins_size, insn_buf);
printMicroOp(uop_size, uop_buf); printMicroOp(uop_size, uop_buf);
#endif #endif
// Initialize the input/output data // Initialize the input/output data
acc_T **inputs = alloc2dArray<acc_T>(batch, vector_size * input_sets); acc_T **inputs = alloc2dArray<acc_T>(batch, vector_size * input_sets);
for (int i = 0; i < batch; i ++) { for (int i = 0; i < batch; i++) {
for (int j = 0; j < vector_size * input_sets; j ++) { for (int j = 0; j < vector_size * input_sets; j++) {
if (opcode == ALU_OPCODE_MIN) { if (opcode == VTA_ALU_OPCODE_MIN) {
inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); inputs[i][j] = static_cast<acc_T>(
} else if (opcode == ALU_OPCODE_MAX) { rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); } else if (opcode == VTA_ALU_OPCODE_MAX) {
} else if (opcode == ALU_OPCODE_ADD) { inputs[i][j] = static_cast<acc_T>(
inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
} else if (opcode == ALU_OPCODE_SUB) { } else if (opcode == VTA_ALU_OPCODE_ADD) {
inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); inputs[i][j] = static_cast<acc_T>(
} else if (opcode == ALU_OPCODE_MUL) { rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1))); } else if (opcode == VTA_ALU_OPCODE_SUB) {
} else if (opcode == ALU_OPCODE_SHL) { inputs[i][j] = static_cast<acc_T>(
inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
} else if (opcode == ALU_OPCODE_SHR) { } else if (opcode == VTA_ALU_OPCODE_MUL) {
inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2))); inputs[i][j] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
} else if (opcode == VTA_ALU_OPCODE_SHL) {
inputs[i][j] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
} else if (opcode == VTA_ALU_OPCODE_SHR) {
inputs[i][j] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
} }
} }
} }
// Compute reference output // Compute reference output
out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size); out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
for (int i = 0; i < batch; i ++) { for (int i = 0; i < batch; i++) {
for (int j = 0; j < vector_size; j ++) { for (int j = 0; j < vector_size; j++) {
acc_T tmp = 0; acc_T tmp = 0;
if (opcode == ALU_OPCODE_MIN) { if (opcode == VTA_ALU_OPCODE_MIN) {
if (!use_imm) { if (!use_imm) {
tmp = inputs[i][j] < inputs[i][j + vector_size] ? inputs[i][j] : inputs[i][j + vector_size]; tmp = inputs[i][j] < inputs[i][j + vector_size] ?
inputs[i][j] :
inputs[i][j + vector_size];
} else { } else {
tmp = inputs[i][j] < immediate[i / BATCH] ? inputs[i][j] : immediate[i / BATCH]; tmp = inputs[i][j] < immediate[i / VTA_BATCH] ?
inputs[i][j] :
immediate[i / VTA_BATCH];
} }
} else if (opcode == ALU_OPCODE_MAX) { } else if (opcode == VTA_ALU_OPCODE_MAX) {
if (!use_imm) { if (!use_imm) {
tmp = inputs[i][j] > inputs[i][j + vector_size] ? inputs[i][j] : inputs[i][j + vector_size]; tmp = inputs[i][j] > inputs[i][j + vector_size] ?
inputs[i][j] :
inputs[i][j + vector_size];
} else { } else {
tmp = inputs[i][j] > immediate[i / BATCH] ? inputs[i][j] : immediate[i / BATCH]; tmp = inputs[i][j] > immediate[i / VTA_BATCH] ?
inputs[i][j] :
immediate[i / VTA_BATCH];
} }
} else if (opcode == ALU_OPCODE_ADD) { } else if (opcode == VTA_ALU_OPCODE_ADD) {
if (!use_imm) { if (!use_imm) {
tmp = inputs[i][j] + inputs[i][j + vector_size]; tmp = inputs[i][j] + inputs[i][j + vector_size];
} else { } else {
tmp = inputs[i][j] + immediate[i / BATCH]; tmp = inputs[i][j] + immediate[i / VTA_BATCH];
} }
} else if (opcode == ALU_OPCODE_SUB) { } else if (opcode == VTA_ALU_OPCODE_SUB) {
if (!use_imm) { if (!use_imm) {
tmp = inputs[i][j] - inputs[i][j + vector_size]; tmp = inputs[i][j] - inputs[i][j + vector_size];
} else { } else {
tmp = inputs[i][j] - immediate[i / BATCH]; tmp = inputs[i][j] - immediate[i / VTA_BATCH];
} }
} else if (opcode == ALU_OPCODE_MUL) { } else if (opcode == VTA_ALU_OPCODE_MUL) {
if (!use_imm) { if (!use_imm) {
tmp = inputs[i][j] * inputs[i][j + vector_size]; tmp = inputs[i][j] * inputs[i][j + vector_size];
} else { } else {
tmp = inputs[i][j] * immediate[i / BATCH]; tmp = inputs[i][j] * immediate[i / VTA_BATCH];
} }
} else if (opcode == ALU_OPCODE_SHL) { } else if (opcode == VTA_ALU_OPCODE_SHL) {
tmp = inputs[i][j] << immediate[i / BATCH]; tmp = inputs[i][j] << immediate[i / VTA_BATCH];
} else if (opcode == ALU_OPCODE_SHR) { } else if (opcode == VTA_ALU_OPCODE_SHR) {
tmp = inputs[i][j] >> immediate[i / BATCH]; tmp = inputs[i][j] >> immediate[i / VTA_BATCH];
} }
// Set // Set
outputs_ref[i][j] = (out_T) tmp; outputs_ref[i][j] = (out_T) tmp;
...@@ -807,44 +853,51 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp ...@@ -807,44 +853,51 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
} }
// Pack input buffer // Pack input buffer
acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * batch * tx_size * input_sets); acc_T *bias_buf =
packBuffer<acc_T, ACC_WIDTH>(bias_buf, inputs, batch, vector_size * input_sets, BATCH, BLOCK_OUT); static_cast<acc_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets));
packBuffer<acc_T, VTA_ACC_WIDTH>(
bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT);
// Prepare output buffer // Prepare output buffer
out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets); out_T *output_buf =
static_cast<out_T *>(allocBuffer(VTA_INP_ELEM_BYTES * batch * tx_size * input_sets));
#ifdef NO_SIM #ifdef NO_SIM
// Invoke the VTA // Invoke the VTA
uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf); uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf);
// Report on timing // Report on timing
printf("INFO - Synchronization time: %.3lfms\n", (double) t_fpga / 1E6); printf("INFO - Synchronization time: %.3fms\n", static_cast<float>(t_fpga) / 1E6);
printf("INFO - Throughput: %.3lfGOps/s\n", (double) vector_size * batch / t_fpga); printf("INFO - Throughput: %.3fGOps/s\n", static_cast<float>(vector_size * batch) / t_fpga);
#else #else
// Invoke the VTA // Invoke the VTA
vta( vta(ins_size,
ins_size,
(volatile insn_T *) insn_buf, (volatile insn_T *) insn_buf,
(volatile uop_T *) uop_buf, (volatile uop_T *) uop_buf,
(volatile inp_vec_T *) NULL, (volatile inp_vec_T *) NULL,
(volatile wgt_vec_T *) NULL, (volatile wgt_vec_T *) NULL,
(volatile acc_vec_T *) bias_buf, (volatile acc_vec_T *) bias_buf,
(volatile inp_vec_T *) output_buf (volatile inp_vec_T *) output_buf);
);
#endif #endif
// Unpack output buffer // Unpack output buffer
out_T **outputs = alloc2dArray<out_T>(batch, vector_size); out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT); unpackBuffer<out_T, VTA_OUT_WIDTH>(outputs,
output_buf,
batch,
vector_size,
VTA_BATCH,
VTA_BLOCK_OUT);
// Correctness checks // Correctness checks
int err = 0; int err = 0;
for (int i = 0; i < batch; i ++) { for (int i = 0; i < batch; i++) {
for (int j = 0; j < vector_size; j ++) { for (int j = 0; j < vector_size; j++) {
if (outputs_ref[i][j] != outputs[i][j]) { if (outputs_ref[i][j] != outputs[i][j]) {
err++; err++;
#if DEBUG==1 #if VTA_DEBUG == 1
printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, (int) outputs_ref[i][j], printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
(int) outputs[i][j]); static_cast<int>(outputs_ref[i][j]),
static_cast<int>(outputs[i][j]));
#endif #endif
} }
} }
...@@ -867,21 +920,19 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp ...@@ -867,21 +920,19 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
printf("INFO - ALU test failed, got %d errors!\n", err); printf("INFO - ALU test failed, got %d errors!\n", err);
return -1; return -1;
} }
} }
int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
int virtual_threads) { int virtual_threads) {
// Some assertions
assert(block % BLOCK_IN == 0); assert(block % VTA_BLOCK_IN == 0);
assert(block % BLOCK_OUT == 0); assert(block % VTA_BLOCK_OUT == 0);
assert(block % BATCH == 0); assert(block % VTA_BATCH == 0);
assert(channels % block == 0); assert(channels % block == 0);
assert(batch % block == 0); assert(batch % block == 0);
printf("=====================================================================================\n"); printf("=====================================================================================\n");
printf("INFO - Blocked GEMM test: batch=%d, channels=%d, block=%d, uop_compression=%d, \ printf("INFO - Blocked GEMM test: batch=%d, channels=%d, block=%d, uop_comp=%d, vt=%d\n",
virtual_threads=%d\n",
batch, channels, block, uop_compression, virtual_threads); batch, channels, block, uop_compression, virtual_threads);
// Input/output channels // Input/output channels
...@@ -889,40 +940,50 @@ virtual_threads=%d\n", ...@@ -889,40 +940,50 @@ virtual_threads=%d\n",
int out_feat = channels; int out_feat = channels;
// Derive number of elements that need to be loaded/stored // Derive number of elements that need to be loaded/stored
int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2; int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2;
int uop_size = uop_compression ? block / BATCH * virtual_threads : int uop_size = uop_compression ?
block / BATCH * block / BLOCK_IN * block / BLOCK_OUT * virtual_threads; block / VTA_BATCH * virtual_threads :
int inp_size = batch / BATCH * in_feat / BLOCK_IN; block / VTA_BATCH * block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT * virtual_threads;
int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT; int inp_size = batch / VTA_BATCH * in_feat / VTA_BLOCK_IN;
int out_size = batch / BATCH * out_feat / BLOCK_OUT; int wgt_size = in_feat / VTA_BLOCK_IN * out_feat / VTA_BLOCK_OUT;
int out_size = batch / VTA_BATCH * out_feat / VTA_BLOCK_OUT;
// Blocked buffer sizes (in terms of elements) // Blocked buffer sizes (in terms of elements)
int inp_block_size = block / BATCH * block / BLOCK_IN; int inp_block_size = block / VTA_BATCH * block / VTA_BLOCK_IN;
int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT; int wgt_block_size = block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT;
int out_block_size = block / BATCH * block / BLOCK_OUT; int out_block_size = block / VTA_BATCH * block / VTA_BLOCK_OUT;
// Make sure we don't exceed buffer bounds // Make sure we don't exceed buffer bounds
assert(uop_size <= UOP_BUFF_DEPTH); assert(uop_size <= VTA_UOP_BUFF_DEPTH);
assert(inp_block_size <= INP_BUFF_DEPTH); assert(inp_block_size <= VTA_INP_BUFF_DEPTH);
assert(wgt_block_size <= WGT_BUFF_DEPTH); assert(wgt_block_size <= VTA_WGT_BUFF_DEPTH);
assert(out_block_size <= ACC_BUFF_DEPTH); assert(out_block_size <= VTA_ACC_BUFF_DEPTH);
// Initialize instruction buffer // Initialize instruction buffer
VTAGenericInsn *insn_buf = (VTAGenericInsn *) allocBuffer(sizeof(VTAGenericInsn) * ins_size); VTAGenericInsn *insn_buf =
static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
int insn_idx = 0; int insn_idx = 0;
// Load uops // Load uops
insn_buf[insn_idx ++] = get1DLoadStoreInsn(OPCODE_LOAD, MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0); insn_buf[insn_idx++] = get1DLoadStoreInsn(VTA_OPCODE_LOAD,
VTA_MEM_ID_UOP,
0,
0,
uop_size,
0,
0,
0,
0);
// Iterate over batch blocks // Iterate over batch blocks
for (int i = 0; i < batch; i += block) { for (int i = 0; i < batch; i += block) {
// Iterate over output channel blocks // Iterate over output channel blocks
for (int j = 0; j < out_feat; j += block) { for (int j = 0; j < out_feat; j += block) {
// Load bias block (pop next if not first, push prev) // Load bias block (pop next if not first, push prev)
insn_buf[insn_idx ++] = get2DLoadStoreInsn( insn_buf[insn_idx++] = get2DLoadStoreInsn(
OPCODE_LOAD, // opcode VTA_OPCODE_LOAD, // opcode
MEM_ID_ACC, // type VTA_MEM_ID_ACC, // type
0, // sram offset 0, // sram offset
(i / BATCH * out_feat + j) / BLOCK_OUT, // dram offset (i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT, // dram offset
block / BATCH, // y size block / VTA_BATCH, // y size
block / BLOCK_OUT, // x size block / VTA_BLOCK_OUT, // x size
out_feat / BLOCK_OUT, // x stride out_feat / VTA_BLOCK_OUT, // x stride
0, // y pad 0, // y pad
0, // x pad 0, // x pad
0, // pop prev dep 0, // pop prev dep
...@@ -942,18 +1003,17 @@ virtual_threads=%d\n", ...@@ -942,18 +1003,17 @@ virtual_threads=%d\n",
( (
(k + l != in_feat - block) || (k + l != in_feat - block) ||
(j != out_feat - block) || (j != out_feat - block) ||
(i != batch - block) (i != batch - block));
);
bool push_next = (k + l == in_feat - block); bool push_next = (k + l == in_feat - block);
// Load weight block (pop next) // Load weight block (pop next)
insn_buf[insn_idx ++] = get2DLoadStoreInsn( insn_buf[insn_idx++] = get2DLoadStoreInsn(
OPCODE_LOAD, // opcode VTA_OPCODE_LOAD, // opcode
MEM_ID_WGT, // type VTA_MEM_ID_WGT, // type
l / BLOCK_IN * block / BLOCK_OUT, // sram offset l / VTA_BLOCK_IN * block / VTA_BLOCK_OUT, // sram offset
(j / BLOCK_OUT * in_feat + k + l) / BLOCK_IN, // dram offset (j / VTA_BLOCK_OUT * in_feat + k + l) / VTA_BLOCK_IN, // dram offset
block / BLOCK_OUT, // y size block / VTA_BLOCK_OUT, // y size
block / BLOCK_IN, // x size block / VTA_BLOCK_IN, // x size
in_feat / BLOCK_IN, // x stride in_feat / VTA_BLOCK_IN, // x stride
0, // y pad 0, // y pad
0, // x pad 0, // x pad
0, // pop prev dep 0, // pop prev dep
...@@ -961,14 +1021,14 @@ virtual_threads=%d\n", ...@@ -961,14 +1021,14 @@ virtual_threads=%d\n",
0, // push prev dep 0, // push prev dep
0); // push next dep 0); // push next dep
// Load input block (push next) // Load input block (push next)
insn_buf[insn_idx ++] = get2DLoadStoreInsn( insn_buf[insn_idx++] = get2DLoadStoreInsn(
OPCODE_LOAD, // opcode VTA_OPCODE_LOAD, // opcode
MEM_ID_INP, // type VTA_MEM_ID_INP, // type
l / BLOCK_IN * block / BATCH, // sram offset l / VTA_BLOCK_IN * block / VTA_BATCH, // sram offset
(i / BATCH * in_feat + k + l) / BLOCK_IN, // dram offset (i / VTA_BATCH * in_feat + k + l) / VTA_BLOCK_IN, // dram offset
block / BATCH, // y size block / VTA_BATCH, // y size
block / BLOCK_IN, // x size block / VTA_BLOCK_IN, // x size
in_feat / BLOCK_IN, // x stride in_feat / VTA_BLOCK_IN, // x stride
0, // y pad 0, // y pad
0, // x pad 0, // x pad
0, // pop prev dep 0, // pop prev dep
...@@ -976,11 +1036,11 @@ virtual_threads=%d\n", ...@@ -976,11 +1036,11 @@ virtual_threads=%d\n",
0, // push prev dep 0, // push prev dep
1); // push next dep 1); // push next dep
// Perform GEMM (pop prev, push prev if not last, push next if last) // Perform GEMM (pop prev, push prev if not last, push next if last)
insn_buf[insn_idx ++] = getGEMMInsn( insn_buf[insn_idx++] = getGEMMInsn(
l / block * uop_size / virtual_threads, // uop offset l / block * uop_size / virtual_threads, // uop offset
block / BATCH, // batch block / VTA_BATCH, // batch
block / BLOCK_IN, // in_feat block / VTA_BLOCK_IN, // in_feat
block / BLOCK_OUT, // out_feat block / VTA_BLOCK_OUT, // out_feat
uop_compression, // uop_compression uop_compression, // uop_compression
1, // pop_prev_dep 1, // pop_prev_dep
0, // pop_next_dep 0, // pop_next_dep
...@@ -989,14 +1049,14 @@ virtual_threads=%d\n", ...@@ -989,14 +1049,14 @@ virtual_threads=%d\n",
} }
} }
// Store output block (pop prev, push prev if not last) // Store output block (pop prev, push prev if not last)
insn_buf[insn_idx ++] = get2DLoadStoreInsn( insn_buf[insn_idx++] = get2DLoadStoreInsn(
OPCODE_STORE, // opcode VTA_OPCODE_STORE, // opcode
MEM_ID_OUT, // type VTA_MEM_ID_OUT, // type
0, // sram offset 0, // sram offset
(i / BATCH * out_feat + j) / BLOCK_OUT, // dram offset (i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT, // dram offset
block / BATCH, // y size block / VTA_BATCH, // y size
block / BLOCK_OUT, // x size block / VTA_BLOCK_OUT, // x size
out_feat / BLOCK_OUT, // x stride out_feat / VTA_BLOCK_OUT, // x stride
0, // y pad 0, // y pad
0, // x pad 0, // x pad
1, // pop prev dep 1, // pop prev dep
...@@ -1006,30 +1066,34 @@ virtual_threads=%d\n", ...@@ -1006,30 +1066,34 @@ virtual_threads=%d\n",
} }
} }
// Finish // Finish
insn_buf[insn_idx ++] = getFinishInsn(0, 1); insn_buf[insn_idx++] = getFinishInsn(0, 1);
// Prepare the uop buffer // Prepare the uop buffer
VTAUop * uop_buf = getGEMMUops(block / BATCH, block / BLOCK_IN, block / BLOCK_OUT, uop_compression, VTAUop * uop_buf = getGEMMUops(
block / VTA_BATCH,
block / VTA_BLOCK_IN,
block / VTA_BLOCK_OUT,
uop_compression,
virtual_threads > 1); virtual_threads > 1);
#if DEBUG==1 #if VTA_DEBUG == 1
printInstruction(ins_size, insn_buf); printInstruction(ins_size, insn_buf);
printMicroOp(uop_size, uop_buf); printMicroOp(uop_size, uop_buf);
#endif #endif
// Initialize inputs // Initialize inputs
inp_T **inputs = allocInit2dArray<inp_T, INP_WIDTH>(batch, in_feat); inp_T **inputs = allocInit2dArray<inp_T, VTA_INP_WIDTH>(batch, in_feat);
// Initialize weights // Initialize weights
wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat); wgt_T **weights = allocInit2dArray<wgt_T, VTA_WGT_WIDTH>(out_feat, in_feat);
// Initialize biases // Initialize biases
acc_T **biases = allocInit2dArray<acc_T, ACC_WIDTH>(batch, out_feat); acc_T **biases = allocInit2dArray<acc_T, VTA_ACC_WIDTH>(batch, out_feat);
// Reference GEMM implementation // Reference GEMM implementation
out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat); out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat);
for (int i = 0; i < batch; i ++) { for (int i = 0; i < batch; i++) {
for (int j = 0; j < out_feat; j ++) { for (int j = 0; j < out_feat; j++) {
acc_T sum = biases[i][j]; acc_T sum = biases[i][j];
for (int k = 0; k < in_feat; k ++) { for (int k = 0; k < in_feat; k++) {
sum += (acc_T) (inputs[i][k] * weights[j][k]); sum += (acc_T) (inputs[i][k] * weights[j][k]);
} }
// Set // Set
...@@ -1038,49 +1102,75 @@ virtual_threads=%d\n", ...@@ -1038,49 +1102,75 @@ virtual_threads=%d\n",
} }
// Prepare the input buffer // Prepare the input buffer
inp_T *input_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * inp_size); inp_T *input_buf = static_cast<inp_T *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
packBuffer<inp_T, INP_WIDTH>(input_buf, inputs, batch, in_feat, BATCH, BLOCK_IN); packBuffer<inp_T, VTA_INP_WIDTH>(input_buf,
inputs,
batch,
in_feat,
VTA_BATCH,
VTA_BLOCK_IN);
// Prepare the weight buffer // Prepare the weight buffer
wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size); wgt_T *weight_buf = static_cast<wgt_T *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN); packBuffer<wgt_T, VTA_WGT_WIDTH>(weight_buf,
weights,
out_feat,
in_feat,
VTA_BLOCK_OUT,
VTA_BLOCK_IN);
// Prepare the bias buffer // Prepare the bias buffer
acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * out_size); acc_T *bias_buf = static_cast<acc_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
packBuffer<acc_T, ACC_WIDTH>(bias_buf, biases, batch, out_feat, BATCH, BLOCK_OUT); packBuffer<acc_T, VTA_ACC_WIDTH>(bias_buf,
biases,
batch,
out_feat,
VTA_BATCH,
VTA_BLOCK_OUT);
// Prepare the output buffer // Prepare the output buffer
out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * out_size); out_T *output_buf = static_cast<out_T *>(allocBuffer(VTA_INP_ELEM_BYTES * out_size));
#ifdef NO_SIM #ifdef NO_SIM
// Invoke the VTA // Invoke the VTA
uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, input_buf, weight_buf, bias_buf, output_buf); uint64_t t_fpga = vta(ins_size,
insn_buf,
uop_buf,
input_buf,
weight_buf,
bias_buf,
output_buf);
// Report on timing // Report on timing
printf("INFO - Synchronization time: %.3lfms\n", (double) t_fpga / 1E6); printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
printf("INFO - Throughput: %.3lfGOPs/s\n", (double) batch * in_feat * out_feat * 2 / t_fpga); printf("INFO - Throughput: %.3lfGOPs/s\n",
static_cast<float>(batch) * in_feat * out_feat * 2 / t_fpga);
#else #else
// Invoke the VTA // Invoke the VTA
vta( vta(ins_size,
ins_size,
(volatile insn_T *) insn_buf, (volatile insn_T *) insn_buf,
(volatile uop_T *) uop_buf, (volatile uop_T *) uop_buf,
(volatile inp_vec_T *) input_buf, (volatile inp_vec_T *) input_buf,
(volatile wgt_vec_T *) weight_buf, (volatile wgt_vec_T *) weight_buf,
(volatile acc_vec_T *) bias_buf, (volatile acc_vec_T *) bias_buf,
(volatile inp_vec_T *) output_buf (volatile inp_vec_T *) output_buf);
);
#endif #endif
// Unpack output data // Unpack output data
out_T **outputs = alloc2dArray<out_T>(batch, out_feat); out_T **outputs = alloc2dArray<out_T>(batch, out_feat);
unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT); unpackBuffer<out_T, VTA_OUT_WIDTH>(outputs,
output_buf,
batch,
out_feat,
VTA_BATCH,
VTA_BLOCK_OUT);
// Correctness checks // Correctness checks
int err = 0; int err = 0;
for (int i = 0; i < batch; i ++) { for (int i = 0; i < batch; i++) {
for (int j = 0; j < out_feat; j ++) { for (int j = 0; j < out_feat; j++) {
if (outputs_ref[i][j] != outputs[i][j]) { if (outputs_ref[i][j] != outputs[i][j]) {
err++; err++;
#if DEBUG==1 #if VTA_DEBUG == 1
printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, (int) outputs_ref[i][j], printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
(int) outputs[i][j]); static_cast<int>(outputs_ref[i][j]),
static_cast<int>(outputs[i][j]));
#endif #endif
} }
} }
...@@ -1092,12 +1182,12 @@ virtual_threads=%d\n", ...@@ -1092,12 +1182,12 @@ virtual_threads=%d\n",
free2dArray<acc_T>(biases, batch, out_feat); free2dArray<acc_T>(biases, batch, out_feat);
free2dArray<out_T>(outputs_ref, batch, out_feat); free2dArray<out_T>(outputs_ref, batch, out_feat);
free2dArray<out_T>(outputs, batch, out_feat); free2dArray<out_T>(outputs, batch, out_feat);
freeBuffer((void *) insn_buf); freeBuffer(insn_buf);
freeBuffer((void *) uop_buf); freeBuffer(uop_buf);
freeBuffer((void *) input_buf); freeBuffer(input_buf);
freeBuffer((void *) weight_buf); freeBuffer(weight_buf);
freeBuffer((void *) bias_buf); freeBuffer(bias_buf);
freeBuffer((void *) output_buf); freeBuffer(output_buf);
if (err == 0) { if (err == 0) {
printf("INFO - Blocked GEMM test successful!\n"); printf("INFO - Blocked GEMM test successful!\n");
...@@ -1106,5 +1196,4 @@ virtual_threads=%d\n", ...@@ -1106,5 +1196,4 @@ virtual_threads=%d\n",
printf("INFO - Blocked GEMM test failed, got %d errors!\n", err); printf("INFO - Blocked GEMM test failed, got %d errors!\n", err);
return -1; return -1;
} }
} }
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
* \brief Test library for the VTA design simulation and driver tests. * \brief Test library for the VTA design simulation and driver tests.
*/ */
#ifndef VTA_TESTLIB_H_ #ifndef TESTS_HARDWARE_COMMON_TEST_LIB_H_
#define VTA_TESTLIB_H_ #define TESTS_HARDWARE_COMMON_TEST_LIB_H_
#include <assert.h> #include <assert.h>
#include <stdint.h> #include <stdint.h>
...@@ -17,9 +17,9 @@ ...@@ -17,9 +17,9 @@
#include <vta/driver.h> #include <vta/driver.h>
#ifdef PYNQ_TARGET #ifdef VTA_PYNQ_TARGET
#include "../../../src/pynq/pynq_driver.h" #include "../../../src/pynq/pynq_driver.h"
#endif //PYNQ_TARGET #endif // VTA_PYNQ_TARGET
typedef uint64_t axi_T; typedef uint64_t axi_T;
typedef uint32_t uop_T; typedef uint32_t uop_T;
...@@ -28,7 +28,7 @@ typedef int8_t inp_T; ...@@ -28,7 +28,7 @@ typedef int8_t inp_T;
typedef int8_t out_T; typedef int8_t out_T;
typedef int32_t acc_T; typedef int32_t acc_T;
uint64_t vta ( uint64_t vta(
uint32_t insn_count, uint32_t insn_count,
VTAGenericInsn *insns, VTAGenericInsn *insns,
VTAUop *uops, VTAUop *uops,
...@@ -37,11 +37,11 @@ uint64_t vta ( ...@@ -37,11 +37,11 @@ uint64_t vta (
acc_T *biases, acc_T *biases,
inp_T *outputs); inp_T *outputs);
#else //NO_SIM #else // NO_SIM
#include "../../../hardware/vivado/src/vta.h" #include "../../../hardware/vivado/src/vta.h"
#endif //NO_SIM #endif // NO_SIM
/*! /*!
* \brief Returns opcode string. * \brief Returns opcode string.
...@@ -300,4 +300,4 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp ...@@ -300,4 +300,4 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
int virtual_threads); int virtual_threads);
#endif // VTA_TESTLIB_H_ #endif // TESTS_HARDWARE_COMMON_TEST_LIB_H_
\ No newline at end of file
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "../common/test_lib.h" #include "../common/test_lib.h"
// VTA invocation (present the same abstraction as in the simulation tests) // VTA invocation (present the same abstraction as in the simulation tests)
uint64_t vta ( uint64_t vta(
uint32_t insn_count, uint32_t insn_count,
VTAGenericInsn *insns, VTAGenericInsn *insns,
VTAUop *uops, VTAUop *uops,
...@@ -22,24 +22,23 @@ uint64_t vta ( ...@@ -22,24 +22,23 @@ uint64_t vta (
wgt_T *weights, wgt_T *weights,
acc_T *biases, acc_T *biases,
inp_T *outputs) { inp_T *outputs) {
// Performance counter variables // Performance counter variables
uint64_t t_fpga; uint64_t t_fpga;
struct timespec start, stop; struct timespec start, stop;
// Derive bitstream file // Derive bitstream file
char bitstream[64]; char bitstream[128];
char str_batch_size[4]; char str_batch_size[4];
char str_block_out_size[4]; char str_block_out_size[4];
char str_block_in_size[4]; char str_block_in_size[4];
char str_block_bit_width[4]; char str_block_bit_width[4];
sprintf(str_batch_size, "%d", BATCH); snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH);
sprintf(str_block_out_size, "%d", BLOCK_OUT); snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT);
sprintf(str_block_in_size, "%d", BLOCK_IN); snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN);
sprintf(str_block_bit_width, "%d", WGT_WIDTH); snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH);
strcpy(bitstream, "vta.bit"); snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
#if DEBUG==1 #if VTA_DEBUG == 1
printf("INFO - Programming FPGA: %s!\n", bitstream); printf("INFO - Programming FPGA: %s!\n", bitstream);
#endif #endif
...@@ -59,7 +58,7 @@ uint64_t vta ( ...@@ -59,7 +58,7 @@ uint64_t vta (
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0; uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0; uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
#if DEBUG==1 #if VTA_DEBUG == 1
printf("INFO - Starting FPGA!\n"); printf("INFO - Starting FPGA!\n");
#endif #endif
...@@ -92,14 +91,13 @@ uint64_t vta ( ...@@ -92,14 +91,13 @@ uint64_t vta (
if (flag & VTA_DONE) break; if (flag & VTA_DONE) break;
} }
if (t==10000000) { if (t == 10000000) {
printf("\tWARNING: VTA TIMEOUT!!!!\n"); printf("\tWARNING: VTA TIMEOUT!!!!\n");
} #if VTA_DEBUG == 1
#if DEBUG==1 } else {
else {
printf("INFO - FPGA Finished!\n"); printf("INFO - FPGA Finished!\n");
}
#endif #endif
}
clock_gettime(CLOCK_REALTIME, &stop); clock_gettime(CLOCK_REALTIME, &stop);
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec); t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
...@@ -111,43 +109,40 @@ uint64_t vta ( ...@@ -111,43 +109,40 @@ uint64_t vta (
VTAUnmapRegister(vta_store_handle, VTA_RANGE); VTAUnmapRegister(vta_store_handle, VTA_RANGE);
return t_fpga; return t_fpga;
}; }
int main(void)
{
#if DEBUG==1 int main(void) {
#if VTA_DEBUG == 1
printParameters(); printParameters();
#endif #endif
int status = 0; int status = 0;
// Run ALU test (vector-scalar operators) // Run ALU test (vector-scalar operators)
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, false);
// Run ALU test (vector-vector operators) // Run ALU test (vector-vector operators)
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true); status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false); status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, false);
// Run blocked GEMM test // Run blocked GEMM test
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
if (status==0) { if (status == 0) {
printf("\nINFO - Unit tests successful!\n"); printf("\nINFO - Unit tests successful!\n");
} else { } else {
printf("\nINTO - Unit tests failed!\n"); printf("\nINTO - Unit tests failed!\n");
} }
return status; return status;
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment