Commit dae77cdb by Thierry Moreau Committed by Tianqi Chen

[HARDWARE, TEST] Fixed hardware generation flow (#34)

parent 9f0e8ffe
#!/bin/bash
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/tvm/python:/home/xilinx/vta/python
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/vta/nnvm/tvm/python:/home/xilinx/vta/python
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/
python -m vta.exec.rpc_server
......@@ -13,8 +13,10 @@ VIVADO_HLS = vivado_hls
VIVADO = vivado
HSI = hsi
# HLS Mode
MODE = all
# HLS mode
MODE = skip_sim
# Debug flag
DEBUG = false
# SLURM
SLURM = false
# Prevent generation of DSP
......@@ -22,15 +24,26 @@ NO_DSP = false
# Prevent generation of ALU
NO_ALU = false
# Include top-level config file
ifndef config
ifneq ("$(wildcard ../../config.mk)", "")
config = ../../config.mk
else
config = ../../make/config.mk
endif
endif
include $(config)
# Process VTA JSON config
VTA_CONFIG = python $(CURDIR)/../../make/vta_config.py
CFLAGS := $(shell ${VTA_CONFIG} --cflags)
VTA_TARGET := $(shell ${VTA_CONFIG} --target)
#---------------------
# VTA Parameters
#--------------------
VTA_INP_WIDTH := $(shell ${VTA_CONFIG} --get-inpwidth)
VTA_WGT_WIDTH := $(shell ${VTA_CONFIG} --get-wgtwidth)
VTA_ACC_WIDTH := $(shell ${VTA_CONFIG} --get-accwidth)
VTA_OUT_WIDTH := $(shell ${VTA_CONFIG} --get-outwidth)
VTA_BATCH := $(shell ${VTA_CONFIG} --get-batch)
VTA_IN_BLOCK := $(shell ${VTA_CONFIG} --get-blockin)
VTA_OUT_BLOCK := $(shell ${VTA_CONFIG} --get-blockout)
VTA_UOP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-uopbuffsize)
VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize)
VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize)
VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize)
VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize)
#---------------------
# Compilation parameters
......@@ -50,8 +63,8 @@ TARGET_PER = \
$(shell echo "$$(( (1000 + $(VTA_HW_COMP_CLOCK_FREQ) - 1) / $(VTA_HW_COMP_CLOCK_FREQ) - $(VTA_HW_COMP_TIMING_COMP)))" )
# Derive config name
CONF = \
$(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_LOG_UOP_BUFF_SIZE)_$(VTA_LOG_INP_BUFF_SIZE)_$(VTA_LOG_WGT_BUFF_SIZE)_$(VTA_LOG_ACC_BUFF_SIZE)_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
CONF_ROOT = $(shell ${VTA_CONFIG} --cfg-str)
CONF = $(CONF_ROOT)_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
......@@ -60,26 +73,34 @@ ifeq ($(SLURM), true)
HW_BUILD_PATH = /scratch/vivado/$(CONF)
endif
.PHONY: all ip bit driver clean clean_all
# IP file path
IP_PATH = $(BUILD_DIR)/hls/$(CONF)/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip
# Bitstream file path
BIT_PATH = $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit
all: bit
.PHONY: all ip bit bsp clean clean_all
ip:
all: bsp
ip: $(IP_PATH)
bit: $(BIT_PATH)
$(IP_PATH): $(SRC_DIR)/*
mkdir -p $(IP_BUILD_PATH)
cd $(IP_BUILD_PATH) && \
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
$(VTA_LOG_INP_WIDTH) $(VTA_LOG_WGT_WIDTH) $(VTA_LOG_ACC_WIDTH) $(VTA_LOG_OUT_WIDTH) \
$(VTA_LOG_BATCH) $(VTA_LOG_BLOCK_OUT) $(VTA_LOG_BLOCK_IN) \
$(VTA_LOG_UOP_BUFF_SIZE) $(VTA_LOG_INP_BUFF_SIZE) $(VTA_LOG_WGT_BUFF_SIZE) \
$(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE) \
$(MODE) $(NO_DSP) $(NO_ALU)
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
$(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(TARGET_PER) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
$(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
ifeq ($(SLURM), true)
mkdir -p $(BUILD_DIR)/hls
mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/.
endif
bit: ip
$(BIT_PATH): $(IP_PATH)
mkdir -p $(HW_BUILD_PATH)
cd $(HW_BUILD_PATH) && \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
......@@ -92,12 +113,12 @@ ifeq ($(SLURM), true)
mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/.
endif
driver: bit
bsp: $(BIT_PATH)
cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
cd $(HW_BUILD_PATH)/bsp && make
clean:
rm -rf *.out *.log *.sb figures
clean_all: clean
cleanall: clean
rm -rf $(BUILD_DIR)
......@@ -9,65 +9,69 @@
# Arg 2: path to sim sources
# Arg 3: path to test sources
# Arg 4: path to include sources
# Arg 5: target clock period
# Arg 6: input type width (log)
# Arg 7: weight type width (log)
# Arg 8: accum type width (log)
# Arg 9: output type width (log)
# Arg 10: batch size (log)
# Arg 11: in block size (log)
# Arg 12: out block size (log)
# Arg 13: uop buffer size in B (log)
# Arg 14: inp buffer size in B (log)
# Arg 15: wgt buffer size in B (log)
# Arg 16: acc buffer size in B (log)
# Arg 17: out buffer size in B (log)
# Arg 18: mode
# Arg 19: no_dsp
# Arg 20: no_alu
# Arg 5: mode
# Arg 6: debug
# Arg 7: no_dsp
# Arg 8: no_alu
# Arg 9: target clock period
# Arg 10: input type width (log)
# Arg 11: weight type width (log)
# Arg 12: accum type width (log)
# Arg 13: output type width (log)
# Arg 14: batch size (log)
# Arg 15: in block size (log)
# Arg 16: out block size (log)
# Arg 17: uop buffer size in B (log)
# Arg 18: inp buffer size in B (log)
# Arg 19: wgt buffer size in B (log)
# Arg 20: acc buffer size in B (log)
# Arg 21: out buffer size in B (log)
if { [llength $argv] eq 22 } {
if { [llength $argv] eq 23 } {
set src_dir [lindex $argv 2]
set sim_dir [lindex $argv 3]
set test_dir [lindex $argv 4]
set include_dir [lindex $argv 5]
set target_period [lindex $argv 6]
set inp_width [lindex $argv 7]
set wgt_width [lindex $argv 8]
set acc_width [lindex $argv 9]
set out_width [lindex $argv 10]
set batch [lindex $argv 11]
set block_in [lindex $argv 12]
set block_out [lindex $argv 13]
set uop_buff_size [lindex $argv 14]
set inp_buff_size [lindex $argv 15]
set wgt_buff_size [lindex $argv 16]
set acc_buff_size [lindex $argv 17]
set out_buff_size [lindex $argv 18]
set mode [lindex $argv 19]
set no_dsp [lindex $argv 20]
set no_alu [lindex $argv 21]
set mode [lindex $argv 6]
set debug [lindex $argv 7]
set no_dsp [lindex $argv 8]
set no_alu [lindex $argv 9]
set target_period [lindex $argv 10]
set inp_width [lindex $argv 11]
set wgt_width [lindex $argv 12]
set acc_width [lindex $argv 13]
set out_width [lindex $argv 14]
set batch [lindex $argv 15]
set block_in [lindex $argv 16]
set block_out [lindex $argv 17]
set uop_buff_size [lindex $argv 18]
set inp_buff_size [lindex $argv 19]
set wgt_buff_size [lindex $argv 20]
set acc_buff_size [lindex $argv 21]
set out_buff_size [lindex $argv 22]
} else {
set src_dir "../src"
set sim_dir "../sim"
set test_dir "../../src/test"
set include_dir "../../include"
set mode "all"
set debug "false"
set no_dsp "true"
set no_alu "false"
set target_period 10
set inp_width 3
set wgt_width 3
set acc_width 5
set out_width 3
set batch 1
set block_out 4
set block_in 4
set block_out 4
set uop_buff_size 15
set inp_buff_size 15
set wgt_buff_size 15
set acc_buff_size 17
set out_buff_size 15
set mode "all"
set no_dsp "true"
set no_alu "false"
exit
}
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
......@@ -124,12 +128,15 @@ proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
# C define flags to pass to compiler
set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
-DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
-DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \
-DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
if {$debug=="true"} {
append cflags " -DVTA_DEBUG=1"
}
if {$no_dsp=="true"} {
append cflags " -DNO_DSP"
}
......
......@@ -26,15 +26,15 @@ if { [llength $argv] eq 12 } {
set ip_path [lindex $argv 0]
set num_threads [lindex $argv 1]
set clock_freq [lindex $argv 2]
set inp_width [lindex $argv 3]
set wgt_width [lindex $argv 4]
set out_width [lindex $argv 5]
set batch [lindex $argv 6]
set out_block [lindex $argv 7]
set in_block [lindex $argv 8]
set inp_mem_size [lindex $argv 9]
set wgt_mem_size [lindex $argv 10]
set out_mem_size [lindex $argv 11]
set inp_width [expr 1 << [lindex $argv 3]]
set wgt_width [expr 1 << [lindex $argv 4]]
set out_width [expr 1 << [lindex $argv 5]]
set batch [expr 1 << [lindex $argv 6]]
set out_block [expr 1 << [lindex $argv 7]]
set in_block [expr 1 << [lindex $argv 8]]
set inp_mem_size [expr 1 << [lindex $argv 9]]
set wgt_mem_size [expr 1 << [lindex $argv 10]]
set out_mem_size [expr 1 << [lindex $argv 11]]
if {$clock_freq eq 100} {
set clock_id 0
puts "Setting clock frequency to 100MHz"
......
......@@ -53,5 +53,8 @@ int main(void) {
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
// Simple GEMM unit test
status |= gemm_test(64, 64, 64, true);
return status;
}
......@@ -7,8 +7,8 @@
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_UOP_BUFF_SIZE" : 14,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}
......@@ -7,8 +7,8 @@
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_UOP_BUFF_SIZE" : 14,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}
......@@ -28,6 +28,32 @@ def main():
help="print all the config json")
parser.add_argument("--target", action="store_true",
help="print the target")
parser.add_argument("--cfg-str", action="store_true",
help="print the configuration string")
parser.add_argument("--get-inpwidth", action="store_true",
help="returns log of input bitwidth")
parser.add_argument("--get-wgtwidth", action="store_true",
help="returns log of weight bitwidth")
parser.add_argument("--get-accwidth", action="store_true",
help="returns log of accum bitwidth")
parser.add_argument("--get-outwidth", action="store_true",
help="returns log of output bitwidth")
parser.add_argument("--get-batch", action="store_true",
help="returns log of tensor batch dimension")
parser.add_argument("--get-blockin", action="store_true",
help="returns log of tensor block in dimension")
parser.add_argument("--get-blockout", action="store_true",
help="returns log of tensor block out dimension")
parser.add_argument("--get-uopbuffsize", action="store_true",
help="returns log of micro-op buffer size in B")
parser.add_argument("--get-inpbuffsize", action="store_true",
help="returns log of input buffer size in B")
parser.add_argument("--get-wgtbuffsize", action="store_true",
help="returns log of weight buffer size in B")
parser.add_argument("--get-accbuffsize", action="store_true",
help="returns log of accum buffer size in B")
parser.add_argument("--get-outbuffsize", action="store_true",
help="returns log of output buffer size in B")
args = parser.parse_args()
if len(sys.argv) == 1:
......@@ -46,13 +72,17 @@ def main():
raise RuntimeError("Cannot find config in %s" % str(path_list))
cfg = json.load(open(ok_path_list[0]))
cfg["LOG_OUT_WIDTH"] = cfg["LOG_INP_WIDTH"]
cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] + cfg["LOG_ACC_WIDTH"] - cfg["LOG_OUT_WIDTH"]
pkg = get_pkg_config(cfg)
if args.target:
print(pkg.target)
if args.cflags:
print(" ".join(pkg.cflags))
cflags_str = " ".join(pkg.cflags)
if cfg["TARGET"] == "pynq":
cflags_str += " -DVTA_TARGET_PYNQ"
print(cflags_str)
if args.ldflags:
print(" ".join(pkg.ldflags))
......@@ -60,6 +90,54 @@ def main():
if args.cfg_json:
print(pkg.cfg_json)
if args.cfg_str:
cfg_str = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}".format(
(1 << cfg["LOG_BATCH"]),
(1 << cfg["LOG_BLOCK_IN"]),
(1 << cfg["LOG_BLOCK_OUT"]),
(1 << cfg["LOG_INP_WIDTH"]),
(1 << cfg["LOG_WGT_WIDTH"]),
cfg["LOG_UOP_BUFF_SIZE"],
cfg["LOG_INP_BUFF_SIZE"],
cfg["LOG_WGT_BUFF_SIZE"],
cfg["LOG_ACC_BUFF_SIZE"])
print cfg_str
if args.get_inpwidth:
print(cfg["LOG_INP_WIDTH"])
if args.get_wgtwidth:
print(cfg["LOG_WGT_WIDTH"])
if args.get_accwidth:
print(cfg["LOG_ACC_WIDTH"])
if args.get_outwidth:
print(cfg["LOG_OUT_WIDTH"])
if args.get_batch:
print(cfg["LOG_BATCH"])
if args.get_blockin:
print(cfg["LOG_BLOCK_IN"])
if args.get_blockout:
print(cfg["LOG_BLOCK_OUT"])
if args.get_uopbuffsize:
print(cfg["LOG_UOP_BUFF_SIZE"])
if args.get_inpbuffsize:
print(cfg["LOG_INP_BUFF_SIZE"])
if args.get_wgtbuffsize:
print(cfg["LOG_WGT_BUFF_SIZE"])
if args.get_outbuffsize:
print(cfg["LOG_OUT_BUFF_SIZE"])
if args.get_accbuffsize:
print(cfg["LOG_ACC_BUFF_SIZE"])
if __name__ == "__main__":
main()
......@@ -130,11 +130,15 @@ class Environment(object):
self.BLOCK_IN *
self.WGT_WIDTH)
self.ACC_ELEM_BITS = (self.BATCH *
self.BLOCK_IN *
self.BLOCK_OUT *
self.ACC_WIDTH)
self.OUT_ELEM_BITS = (self.BATCH *
self.BLOCK_OUT *
self.OUT_WIDTH)
self.INP_ELEM_BYTES = self.INP_ELEM_BITS // 8
self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8
self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8
self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8
# dtypes
self.acc_dtype = "int%d" % self.ACC_WIDTH
self.inp_dtype = "int%d" % self.INP_WIDTH
......
......@@ -339,7 +339,7 @@ def inject_dma_intrin(stmt_in):
base = 0
for i in range(1, ndim + 1):
if not util.equal_const_int(buf.strides[ndim - i] - x_size, 0):
raise RuntimeError("scope %s need need to have block=%d" % (scope, elem_block))
raise RuntimeError("scope %s needs to have block=%d" % (scope, elem_block))
x_size = x_size * buf.shape[ndim - i]
if util.equal_const_int(x_size - elem_block, 0):
base = i + 1
......@@ -469,10 +469,10 @@ def inject_dma_intrin(stmt_in):
if pad_before or pad_after:
raise RuntimeError("Do not support copy into DRAM with pad")
if src.scope == env.acc_scope:
elem_width = env.INP_WIDTH # output compression to inp type
elem_bytes = env.INP_ELEM_BYTES # output compression to inp type
elem_width = env.OUT_WIDTH
elem_bytes = env.OUT_ELEM_BYTES
mem_type = env.dev.MEM_ID_OUT
data_type = "int%d" % env.INP_WIDTH
data_type = "int%d" % env.OUT_WIDTH
task_qid = env.dev.QID_STORE_OUT
else:
raise RuntimeError("Do not support copy %s->dram" % (src.scope))
......
/*!
* Copyright (c) 2018 by Contributors
* \file vta_test_lib.cpp
* \file test_lib.cpp
* \brief Test library for the VTA design simulation and driver tests.
*/
......@@ -17,9 +17,9 @@
#include <vta/driver.h>
#ifdef VTA_PYNQ_TARGET
#ifdef VTA_TARGET_PYNQ
#include "../../../src/pynq/pynq_driver.h"
#endif // VTA_PYNQ_TARGET
#endif // VTA_TARGET_PYNQ
typedef uint64_t axi_T;
typedef uint32_t uop_T;
......@@ -300,4 +300,14 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
int virtual_threads);
/*!
* \brief VTA GEMM unit test.
* \param batch Batch size.
* \param in_channels Input channels.
* \param out_channels Output channels.
* \param uop_compression Apply micro-op compression.
* \return Number of errors from the test run.
*/
int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression);
#endif // TESTS_HARDWARE_COMMON_TEST_LIB_H_
CC ?= g++
CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
LDFLAGS = -L/usr/lib -L/opt/python3.6/lib/python3.6/site-packages/pynq/lib/
LIBS = -l:libsds_lib.so -l:libdma.so
LIBS = -l:libsds_lib.so -l:libdma.so -lstdc++
INCLUDE_DIR = ../../../include
DRIVER_DIR = ../../../src/pynq
TESTLIB_DIR = ../common
......@@ -10,19 +10,14 @@ SOURCES = pynq_driver.cc test_lib.cc
OBJECTS = pynq_driver.o test_lib.o metal_test.o
EXECUTABLE = vta
# Include top-level config file
ifndef config
ifneq ("$(wildcard ../../../config.mk)", "")
config = ../../../config.mk
else
config = ../../../make/config.mk
endif
endif
include $(config)
# Include VTA config
VTA_CONFIG = python ../../../make/vta_config.py
CFLAGS += `${VTA_CONFIG} --cflags`
LDFLAGS += `${VTA_CONFIG} --ldflags`
VTA_TARGET := $(shell ${VTA_CONFIG} --target)
# Define flags
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DDEBUG=0
CFLAGS += $(ADD_CFLAGS)
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DVTA_DEBUG=0
# All Target
all: $(EXECUTABLE)
......
/*!
* Copyright (c) 2018 by Contributors
* \file driver_test.cpp
* \file metal_test.cpp
* \brief Bare-metal test to test driver and VTA design.
*/
......@@ -13,104 +13,6 @@
#include "../../../src/pynq/pynq_driver.h"
#include "../common/test_lib.h"
// VTA invocation (present the same abstraction as in the simulation tests)
uint64_t vta(
uint32_t insn_count,
VTAGenericInsn *insns,
VTAUop *uops,
inp_T *inputs,
wgt_T *weights,
acc_T *biases,
inp_T *outputs) {
// Performance counter variables
uint64_t t_fpga;
struct timespec start, stop;
// Derive bitstream file
char bitstream[128];
char str_batch_size[4];
char str_block_out_size[4];
char str_block_in_size[4];
char str_block_bit_width[4];
snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH);
snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT);
snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN);
snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH);
snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
#if VTA_DEBUG == 1
printf("INFO - Programming FPGA: %s!\n", bitstream);
#endif
// Program VTA
VTAProgram(bitstream);
// Get VTA handles
VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
// Physical address pointers
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
#if VTA_DEBUG == 1
printf("INFO - Starting FPGA!\n");
#endif
clock_gettime(CLOCK_REALTIME, &start);
// FETCH @ 0x10 : Data signal of insn_count_V
VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
// FETCH @ 0x18 : Data signal of insns_V
if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
// LOAD @ 0x10 : Data signal of inputs_V
if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
// LOAD @ 0x18 : Data signal of weight_V
if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
// COMPUTE @ 0x20 : Data signal of uops_V
if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
// COMPUTE @ 0x28 : Data signal of biases_V
if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
// STORE @ 0x10 : Data signal of outputs_V
if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
// VTA start
VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
int flag = 0, t = 0;
for (t = 0; t < 10000000; ++t) {
flag = VTAReadMappedReg(vta_compute_handle, 0x18);
if (flag & VTA_DONE) break;
}
if (t == 10000000) {
printf("\tWARNING: VTA TIMEOUT!!!!\n");
#if VTA_DEBUG == 1
} else {
printf("INFO - FPGA Finished!\n");
#endif
}
clock_gettime(CLOCK_REALTIME, &stop);
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
// Unmap VTA register
VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
VTAUnmapRegister(vta_load_handle, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
VTAUnmapRegister(vta_store_handle, VTA_RANGE);
return t_fpga;
}
int main(void) {
#if VTA_DEBUG == 1
printParameters();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment