Commit 47001850 by Thierry Moreau Committed by Tianqi Chen

hardware compilation flow, and driver tests

parent b8d8e5b6
# vta
Open Hardware/Software Stack for Vertical Deep Learning System Optimization
==============================================
[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
VTA is an open hardware/software co-design stack for deep learning systems.
It provides a customizable hardware accelerator template for deep learning inference workloads,
combined with a fully functional compiler stack built with TVM.
License
-------
© Contributors, 2018. Licensed under an [Apache-2.0](https://github.com/tmoreau89/vta/blob/master/LICENSE) license.
This source diff could not be displayed because it is too large. You can view the blob instead.
# Directories
ROOTDIR = $(CURDIR)
BUILD_DIR = $(ROOTDIR)/build
SCRIPT_DIR = $(ROOTDIR)/scripts
SRC_DIR = $(ROOTDIR)/../../src/hardware/hls
SIM_DIR = $(ROOTDIR)/sim
TEST_DIR = $(ROOTDIR)/../../src/test
INCLUDE_DIR = $(ROOTDIR)/../../include

# Executables
VIVADO_HLS = vivado_hls
VIVADO = vivado
HSI = hsi

# Build parameters (user-tunable knobs; override on the command line,
# e.g. `make LOG_BATCH=1`).
# Number of threads during compilation
NUM_THREADS = 8
# Target frequency in MHz
CLOCK_FREQ = 100
# Log of input width in bits
LOG_INP_WIDTH = 3
# Log of weight width in bits
LOG_WGT_WIDTH = 3
# Log of accum width in bits
LOG_ACC_WIDTH = 5
# Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_IN_BLOCK = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_OUT_BLOCK = 4
# Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17
# Log of out buffer size in Bytes
# (acc buffer size scaled by the out/acc element width ratio)
LOG_OUT_BUFF_SIZE := $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )

# Derived parameters.
# These use `:=` so each $(shell ...) runs once at parse time instead of on
# every expansion of the variable.
# Input width in bits
INP_WIDTH := $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
# Weight width in bits
WGT_WIDTH := $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
# Output width in bits
OUT_WIDTH := $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
# Tensor batch size
BATCH := $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
# Tensor inner block size
IN_BLOCK := $(shell echo "$$(( 1 << $(LOG_IN_BLOCK) ))" )
# Tensor outer block size
OUT_BLOCK := $(shell echo "$$(( 1 << $(LOG_OUT_BLOCK) ))" )
# Uop buffer size in Bytes
UOP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
# Inp buffer size in Bytes
INP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
# Wgt buffer size in Bytes
WGT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
# Acc buffer size in Bytes
ACC_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
# Out buffer size in Bytes
OUT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
# Derive clock target period in ns: ceil(1000 / CLOCK_FREQ)
TARGET_PER := $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" )

# Derive config name; it keys the per-configuration build directories below.
CONF := $(BATCH)x$(IN_BLOCK)x$(OUT_BLOCK)_$(INP_WIDTH)bx$(WGT_WIDTH)b_$(CLOCK_FREQ)MHz_$(TARGET_PER)ns
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
.PHONY: all ip bit driver clean

# Default goal: run the whole flow (HLS IP -> bitstream -> driver BSP).
all: driver

# Generate the HLS IP cores. The positional -tclargs must follow the order
# documented at the top of scripts/hls.tcl; in particular the in-block size
# comes BEFORE the out-block size.
ip:
	mkdir -p $(IP_BUILD_PATH)
	cd $(IP_BUILD_PATH) && \
		$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
		-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
		$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
		$(LOG_BATCH) $(LOG_IN_BLOCK) $(LOG_OUT_BLOCK) \
		$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
		$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)

# Run Vivado on the generated IP to produce the hardware design.
bit: ip
	mkdir -p $(HW_BUILD_PATH)
	cd $(HW_BUILD_PATH) && \
		$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
		-tclargs $(IP_BUILD_PATH) $(NUM_THREADS) $(CLOCK_FREQ) \
		$(INP_WIDTH) $(WGT_WIDTH) $(OUT_WIDTH) \
		$(BATCH) $(IN_BLOCK) $(OUT_BLOCK) \
		$(INP_BUFF_SIZE) $(WGT_BUFF_SIZE) $(OUT_BUFF_SIZE)

# Generate the board support package with HSI and build it.
# $(MAKE) (not literal `make`) so -j / -n and the jobserver propagate.
driver: bit
	cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
	cd $(HW_BUILD_PATH)/bsp && $(MAKE)

# Remove every build product; BUILD_DIR is the only place this file writes to.
clean:
	rm -rf $(BUILD_DIR)
\ No newline at end of file
#
# Copyright (c) 2018 by Contributors
# file: hls.tcl
# brief: HLS generation script.
#
# Command line arguments:
# Arg 1: path to design sources
# Arg 2: path to sim sources
# Arg 3: path to test sources
# Arg 4: path to include sources
# Arg 5: target clock period
# Arg 6: input type width (log)
# Arg 7: weight type width (log)
# Arg 8: accum type width (log)
# Arg 9: output type width (log)
# Arg 10: batch size (log)
# Arg 11: in block size (log)
# Arg 12: out block size (log)
# Arg 13: uop buffer size in B (log)
# Arg 14: inp buffer size in B (log)
# Arg 15: wgt buffer size in B (log)
# Arg 16: acc buffer size in B (log)
# Arg 17: out buffer size in B (log)
# Read the build configuration from the command line. A full invocation yields
# 19 entries in $argv: the 17 documented arguments, preceded by two leading
# entries that are skipped here (presumably the "-f <script>" pair given to
# vivado_hls -- TODO confirm). Any other count falls back to built-in defaults.
# The count is compared numerically (==) rather than as a string (eq).
if { [llength $argv] == 19 } {
	set src_dir [lindex $argv 2]
	set sim_dir [lindex $argv 3]
	set test_dir [lindex $argv 4]
	set include_dir [lindex $argv 5]
	set target_period [lindex $argv 6]
	set inp_width [lindex $argv 7]
	set wgt_width [lindex $argv 8]
	set acc_width [lindex $argv 9]
	set out_width [lindex $argv 10]
	set batch [lindex $argv 11]
	set block_in [lindex $argv 12]
	set block_out [lindex $argv 13]
	set uop_buff_size [lindex $argv 14]
	set inp_buff_size [lindex $argv 15]
	set wgt_buff_size [lindex $argv 16]
	set acc_buff_size [lindex $argv 17]
	set out_buff_size [lindex $argv 18]
} else {
	# Defaults. All sizes/widths below are log2 values, like the arguments.
	set src_dir "../src/"
	set sim_dir "../sim/"
	set test_dir "../../src/test/"
	set include_dir "../../include"
	set target_period 10
	set inp_width 3
	set wgt_width 3
	set acc_width 5
	set out_width 3
	# log2(batch): 0 means a batch of 1, matching the Makefile's LOG_BATCH default
	set batch 0
	set block_out 4
	set block_in 4
	set uop_buff_size 15
	set inp_buff_size 15
	set wgt_buff_size 15
	set acc_buff_size 17
	set out_buff_size 15
}
# C define flags to pass to compiler.
# Adds the include directories and forwards every (log2) design parameter
# parsed above as a -DLOG_* preprocessor define; DEBUG is forced to 0.
set cflags "-I $include_dir -I $include_dir/hardware/hls \
-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
-DLOG_UOP_BUFF_SIZE=$uop_buff_size -DLOG_INP_BUFF_SIZE=$inp_buff_size \
-DLOG_WGT_BUFF_SIZE=$wgt_buff_size -DLOG_ACC_BUFF_SIZE=$acc_buff_size \
-DLOG_OUT_BUFF_SIZE=$out_buff_size"
# Applies the dim-2 array directives for one on-chip memory.
#   mem            array name (inp_mem, wgt_mem or out_mem)
#   stages         functions that see the array, in the order directives are issued
#   vec_log_bits   log2 of the bit width of one vector element of the array
#   total_log_bits log2 of the bit width of one full row (vector * second dim)
# If a full row is narrower than 1024 bits the row is reshaped completely;
# otherwise it is block-partitioned into (row bits / 1024) pieces and each
# piece is reshaped by (1024 / vector bits).
proc vta_shape_mem {mem stages vec_log_bits total_log_bits} {
	set partition_factor [expr {(1 << $total_log_bits) / 1024}]
	if {$partition_factor == 0} {
		foreach stage $stages {
			set_directive_array_reshape -type complete -dim 2 $stage $mem
		}
		return
	}
	set reshape_factor [expr {1024 / (1 << $vec_log_bits)}]
	foreach stage $stages {
		set_directive_array_partition -type block -factor $partition_factor -dim 2 $stage $mem
	}
	foreach stage $stages {
		set_directive_array_reshape -type block -factor $reshape_factor -dim 2 $stage $mem
	}
}

# Initializes the HLS design and sets HLS pragmas for memory partitioning.
# The partitioning works around a Vivado restriction that doesn't allow for
# buses wider than 1024 bits. All width/size arguments are log2 values.
proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
	# Target device
	set_part {xc7z020clg484-1}
	# Clock constraint (period $per)
	create_clock -period $per -name default

	# Input buffer: rows of BATCH input vectors, seen by load and compute
	set inp_vec [expr {$inp_width + $block_in}]
	vta_shape_mem inp_mem {load compute} $inp_vec [expr {$inp_vec + $batch}]

	# Weight buffer: rows of BLOCK_OUT weight vectors, seen by load and compute
	set wgt_vec [expr {$wgt_width + $block_in}]
	vta_shape_mem wgt_mem {load compute} $wgt_vec [expr {$wgt_vec + $block_out}]

	# Output buffer: rows of BATCH output vectors, seen by compute and store
	set out_vec [expr {$out_width + $block_out}]
	vta_shape_mem out_mem {compute store} $out_vec [expr {$out_vec + $batch}]
}
# Behavioral (C) simulation of the complete design, with the test benches.
open_project vta_sim
set_top vta
add_files $src_dir/vta.cc -cflags $cflags
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csim_design -clean
close_project

# Synthesize and export one IP core per pipeline stage. Each stage gets its
# own project (vta_<stage>) whose top-level function is the stage itself.
foreach stage {fetch load compute store} {
	open_project vta_$stage
	set_top $stage
	add_files $src_dir/vta.cc -cflags $cflags
	open_solution "solution0"
	init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
	csynth_design
	export_design -format ip_catalog
	close_project
}

exit
#
# Copyright (c) 2018 by Contributors
# file: hsi.tcl
# brief: Driver generation script for ARMv7 driver libraries.
#
# Hardware description to load (path relative to the working directory)
set hw_handoff export/vta.hdf
# Directory the board support package is generated into
set bsp_dir bsp

# Load the hardware design, create a standalone software design for the
# ps7_cortexa9_0 processor, then emit the BSP.
open_hw_design $hw_handoff
create_sw_design swdesign -proc ps7_cortexa9_0 -os standalone
generate_bsp -dir $bsp_dir
exit
/*!
* Copyright (c) 2018 by Contributors
* \file vta_test.cc
* \brief Simulation tests for the VTA design.
*/
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include "vta.h"
#include "vta_test_lib.h"
/*!
 * \brief Simulation test entry point.
 * Checks compile-time design parameters with assertions, then runs the ALU
 * and blocked GEMM tests.
 * \return Bitwise OR of the status codes returned by all tests
 * (presumably 0 when every test passes -- verify against vta_test_lib).
 */
int main(void)
{
#if DEBUG==1
printParameters();
#endif
// Sanity checks on the design parameters (abort on violation)
// Buffer indexing
assert(LOG_ACC_BUFF_DEPTH>=LOG_INP_BUFF_DEPTH);
// Micro op bound
assert(UOP_GEM_3_1<UOP_WIDTH);
assert(UOP_ALU_3_1<UOP_WIDTH);
// Instruction alignment checks
assert(INSN_MEM_7_1<INSN_MEM_8_0);
assert(INSN_GEM_8_1<INSN_GEM_9_0);
// Instruction bounds
assert(INSN_MEM_E_1<INS_WIDTH);
assert(INSN_GEM_E_1<INS_WIDTH);
assert(INSN_ALU_F_1<INS_WIDTH);
// Accumulated test status; each test ORs its result in
int status = 0;
// Run ALU test (vector-scalar operators)
// Each opcode is run twice, differing only in the last argument
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
// Run ALU test (vector-vector operators)
// Note: SHR is only exercised in the vector-scalar group above
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
// Run blocked GEMM test
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);
return status;
}
\ No newline at end of file
/*!
* Copyright (c) 2018 by Contributors
* \file vta.h
* \brief Type definitions and prototype for VTA HLS design.
*/
#ifndef VTA_MAIN_H_
#define VTA_MAIN_H_
#include <assert.h>
#include <ap_axi_sdata.h>
#include <ap_int.h>
#include <hls_stream.h>
#include "vta_typedefs.h"
#include "vta_params.h"
/*!
* \brief Fetch module.
* Reads in \a insn_count instructions via DMA and pushes them to the
* appropriate load, gemm or store queue.
* \param insn_count Total instruction count. AXI-lite memory mapped register.
* \param insns Instruction data base address in DRAM. AXI-4 master port.
* \param load_queue Load instruction queue. AXI-stream FIFO.
* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
* \param store_queue Store instruction queue. AXI-stream FIFO.
*/
void fetch (
uint32_t insn_count,
volatile insn_T *insns,
hls::stream<insn_T> &load_queue,
hls::stream<insn_T> &gemm_queue,
hls::stream<insn_T> &store_queue);
/*!
* \brief Load module.
* Reads in load instructions from the load queue, and performs appropriate
* DMA load operation to the \a wgt_mem and \a inp_mem SRAM buffers from DRAM.
* Updates dependence queues accordingly.
* \param inputs Input data base address in DRAM. AXI-4 master port.
* \param weights Weight data base address in DRAM. AXI-4 master port.
* \param load_queue Load instruction queue. AXI-stream FIFO.
* \param g2l_dep_queue Dependence queue from GEMM to load stage.
* AXI-stream FIFO.
* \param l2g_dep_queue Dependence queue from load to GEMM stage.
* AXI-stream FIFO.
* \param inp_mem Local input SRAM buffer, INP_BUFF_DEPTH x BATCH input vectors.
* Write only single port BRAM.
* \param wgt_mem Local weight SRAM buffer, WGT_BUFF_DEPTH x BLOCK_OUT weight
* vectors. Write only single port BRAM.
*/
void load (
volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights,
hls::stream<insn_T> &load_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &l2g_dep_queue,
inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]
);
/*!
* \brief Compute module.
* Reads in GEMM instructions from the gemm queue, and performs appropriate
* GEMM/ALU instructions. Reads in data from the \a wgt_mem and \a inp_mem,
* and writes computation results into the \a out_mem. Updates dependence
* queues accordingly.
* \param done Signal that indicates that VTA is done. AXI-lite memory mapped
* register.
* \param uops Micro-op data base address in DRAM. AXI-4 master port.
* \param biases Bias data base address in DRAM. AXI-4 master port.
* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
* \param l2g_dep_queue Dependence queue from load to gemm stage.
* AXI-stream FIFO.
* \param s2g_dep_queue Dependence queue from store to gemm stage.
* AXI-stream FIFO.
* \param g2l_dep_queue Dependence queue from gemm to load stage.
* AXI-stream FIFO.
* \param g2s_dep_queue Dependence queue from gemm to store stage.
* AXI-stream FIFO.
* \param inp_mem Local input SRAM buffer. Read only single port BRAM.
* \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
* \param out_mem Local output SRAM buffer. Write only single port BRAM.
*/
// NOTE(review): inp_mem is declared with element type out_vec_T here, whereas
// load() declares the same buffer as inp_vec_T. The two types only have the
// same width when INP_WIDTH*BLOCK_IN == OUT_WIDTH*BLOCK_OUT -- confirm against
// the definition in vta.cc whether this is intentional.
void compute (
volatile uint32_t &done,
volatile uop_T *uops,
volatile acc_vec_T *biases,
hls::stream<insn_T> &gemm_queue,
hls::stream<bool> &l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue,
out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT],
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
);
/*!
* \brief Store module.
* Reads in store instructions from the store queue, and performs appropriate
* store instructions from the output buffer in SRAM to DRAM. Updates dependence
* queues accordingly.
* \param outputs Output data base address in DRAM. AXI-4 master port.
* \param store_queue Store instruction queue. AXI-stream FIFO.
* \param g2s_dep_queue Dependence queue from gemm to store stage.
* AXI-stream FIFO.
* \param s2g_dep_queue Dependence queue from store to gemm stage.
* AXI-stream FIFO.
* \param out_mem Local output SRAM buffer, ACC_BUFF_DEPTH x BATCH output
* vectors. Read only single port BRAM.
*/
void store (
volatile out_vec_T *outputs,
hls::stream<insn_T> &store_queue,
hls::stream<bool> &g2s_dep_queue,
hls::stream<bool> &s2g_dep_queue,
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
);
/*!
* \brief VTA wrapper for simulation purpose only.
* Orchestrates dataflow execution of the fetch, load, GEMM and store stages.
* Only the DRAM-side pointers and the instruction count are exposed; the
* instruction/dependence queues and SRAM buffers taken by the individual
* stages do not appear in this interface.
* \param insn_count Total instruction count. AXI-lite memory mapped register.
* \param insns Instruction data base address in DRAM. AXI-4 master port.
* \param uops Micro-op data base address in DRAM. AXI-4 master port.
* \param inputs Input data base address in DRAM. AXI-4 master port.
* \param weights Weight data base address in DRAM. AXI-4 master port.
* \param biases Bias data base address in DRAM. AXI-4 master port.
* \param outputs Output data base address in DRAM. AXI-4 master port.
*/
void vta (
uint32_t insn_count,
volatile insn_T *insns,
volatile uop_T *uops,
volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights,
volatile acc_vec_T *biases,
volatile out_vec_T *outputs);
#endif // VTA_MAIN_H_
\ No newline at end of file
/*!
* Copyright (c) 2018 by Contributors
* \file vta_typedefs.h
* \brief Type definitions for VTA HLS design.
*/
#ifndef VTA_TYPEDEFS_H_
#define VTA_TYPEDEFS_H_
#include <assert.h>
#include <ap_axi_sdata.h>
#include <ap_int.h>
#include <hls_stream.h>
#include "vta_params.h"
/* Scalar datatypes: widths come from the *_WIDTH parameters. */
/* \typedef uop_T Micro-op datatype*/
typedef ap_uint<UOP_WIDTH> uop_T;
/* \typedef inp_T Input datatype*/
typedef ap_int<INP_WIDTH> inp_T;
/* \typedef wgt_T Weight datatype*/
typedef ap_int<WGT_WIDTH> wgt_T;
/* \typedef out_T Output datatype*/
typedef ap_int<OUT_WIDTH> out_T;
/* \typedef acc_T Accumulator datatype*/
typedef ap_int<ACC_WIDTH> acc_T;
/* \typedef mul_T Multiplier output datatype (weight + input bits, plus one)*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype (mul_T widened by LOG_BLOCK_IN bits)*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
/* Vector datatypes: BLOCK_IN or BLOCK_OUT scalars packed into one unsigned word. */
/* \typedef inp_vec_T Input vector datatype*/
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
/* \typedef wgt_vec_T Weight vector datatype*/
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
/* \typedef acc_vec_T Accumulator vector datatype*/
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
/* \typedef out_vec_T Output vector datatype*/
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
/* SRAM index datatypes: one bit wider than the log2 buffer depth. */
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
/* \typedef inp_idx_T Input SRAM index datatype*/
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
/* \typedef wgt_idx_T Weight SRAM index datatype*/
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
/* \typedef acc_idx_T Accumulator SRAM index datatype*/
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
/* Instruction field datatypes. */
/* \typedef opcode_T Opcode datatype*/
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
/* \typedef insn_T Instruction datatype*/
typedef ap_uint<INS_WIDTH> insn_T;
/* \typedef loop_T Loop bound datatype*/
typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
/* \typedef memop_id_T Memory operation ID datatype*/
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
/* \typedef memop_sram_T Memory operation SRAM index datatype*/
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
/* \typedef memop_dram_T Memory operation DRAM index datatype*/
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
/* \typedef memop_size_T Memory operation range datatype*/
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
/* \typedef memop_stride_T Memory operation stride datatype*/
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
/* \typedef memop_pad_T Memory operation pad width datatype*/
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_imm_T ALU operation immediate datatype (signed)*/
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
#endif // VTA_TYPEDEFS_H_
/*!
* Copyright (c) 2018 by Contributors
* \file vta_pynq_driver.h
* \brief VTA driver for Pynq board.
*/
#ifndef VTA_PYNQ_DRIVER_H_
#define VTA_PYNQ_DRIVER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <assert.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
/* On ARM builds the contiguous-memory (CMA) and cache maintenance functions
 * come from libxlnk_cma.h. On other targets the same functions are only
 * declared here -- presumably so host code can compile against stub
 * implementations; verify where they are defined. */
#ifdef __arm__
#include "libxlnk_cma.h"
#else
void* cma_alloc(size_t size, int cached);
void cma_free(void* buf);
uint32_t cma_get_phy_addr(void* buf);
void xlnkFlushCache(void* buf, int size);
void xlnkInvalidateCache(void* buf, int size);
#endif
/*! \brief VTA command handle (opaque pointer; the register accessors declared
 * in this header take it as their \a vta_base argument) */
typedef void * VTAHandle;
/*! \brief DMA command handle */
typedef struct {
/*! \brief Register map to the AXI DMA control registers*/
void *dma_register_map;
/*! \brief Transmit data descriptor*/
void *mm2s_descriptor_register_map;
/*! \brief Receive data descriptor*/
void *s2mm_descriptor_register_map;
/*! \brief Transmit data descriptor physical address*/
uint32_t mm2s_descriptor_phy;
/*! \brief Receive data descriptor physical address*/
uint32_t s2mm_descriptor_phy;
/*! \brief Descriptor size */
uint32_t descriptor_size;
/*! \brief Transaction count for tx channel */
uint32_t mm2s_count;
/*! \brief Transaction count for rx channel */
uint32_t s2mm_count;
/*! \brief Multi-channel mode enable */
int multichannel_en;
} DMAHandle;
/*! \brief partial bitstream status file path */
#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
/*! \brief bitstream destination file path */
#define BS_XDEVCFG "/dev/xdevcfg"
/*! \brief Path to /dev/mem */
#define DEV_MEM_PATH "/dev/mem"
/*! \brief MMIO driver constant: size of one register word in bytes */
#define MMIO_WORD_LENGTH 4
/*! \brief MMIO driver constant: mask that clears the sub-word address bits */
#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
/*! \brief VTA configuration register address range */
#define VTA_RANGE 0x100
/*! \brief VTA configuration register start value */
#define VTA_START 0x1
/*! \brief VTA configuration register auto-restart value */
#define VTA_AUTORESTART 0x81
/*! \brief VTA configuration register done value */
#define VTA_DONE 0x1
/*! \brief VTA fetch stage configuration register address
* from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_FETCH_ADDR 0x43C00000
/*! \brief VTA compute stage configuration register address
* from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_COMPUTE_ADDR 0x43C10000
/*! \brief VTA load stage configuration register address
* from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_LOAD_ADDR 0x43C20000
/*! \brief VTA store stage configuration register address
* from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_STORE_ADDR 0x43C30000
/*! \brief Memory management constants with libxlnk_cma */
#define CACHED 1
/*! \brief Memory management constants with libxlnk_cma */
#define NOT_CACHED 0
/*! \brief log2 of SDS buffer size limit */
#define LOG_MAX_XFER 22
/*! \brief SDS buffer size limit */
#define MAX_XFER (1<<LOG_MAX_XFER)
/*!
 * \brief Returns a memory map to FPGA configuration registers.
 * \param addr The base physical address of the configuration registers.
 * \param length The size of the memory mapped region in bytes.
 * \return A pointer to the memory mapped region.
 */
void *MapRegister(uint32_t addr, size_t length);
/*!
 * \brief Deletes the configuration register memory map.
 * \param vta The memory mapped region.
 * \param length The size of the memory mapped region in bytes.
 */
void UnmapRegister(void *vta, size_t length);
/*!
 * \brief Writes to a memory mapped configuration register.
 * \param vta_base The handle to the memory mapped configuration registers.
 * \param offset The byte offset of the register to write to.
 * \param val The 32-bit value to be written to the memory mapped register.
 */
void WriteMappedReg(VTAHandle vta_base, uint32_t offset, uint32_t val);
/*!
 * \brief Reads from the memory mapped configuration register.
 * \param vta_base The handle to the memory mapped configuration registers.
 * \param offset The byte offset of the register to read from.
 * \return The 32-bit value read from the memory mapped register.
 */
uint32_t ReadMappedReg(VTAHandle vta_base, uint32_t offset);
/*!
 * \brief Programming the bit stream on the FPGA.
 * Exits the process with status 1 if any of the involved files cannot be
 * opened.
 * \param bitstream The path to the bit stream file.
 */
void ProgramVTA(const char* bitstream);
#ifdef __cplusplus
}
#endif
#endif // VTA_PYNQ_DRIVER_H_
\ No newline at end of file
/*!
* Copyright (c) 2018 by Contributors
* \file vta_pynq_driver.c
* \brief VTA driver for Pynq board.
*/
#ifdef __cplusplus
extern "C" {
#endif
#include "vta_pynq_driver.h"
#ifdef __cplusplus
}
#endif
/*!
 * \brief Maps a physical register range into this process through DEV_MEM_PATH.
 * The mapping starts at the page that contains \a addr, so the returned
 * pointer refers to that page boundary (it equals the register base only when
 * \a addr is page aligned).
 * \param addr The base physical address of the configuration registers.
 * \param length The size of the memory mapped region in bytes.
 * \return The mapped region, or MAP_FAILED if the device file cannot be
 * opened or the mapping fails.
 */
void *MapRegister(uint32_t addr, size_t length) {
  // Align the base address with the pages
  uint32_t virt_base = addr & ~(getpagesize() - 1);
  // Calculate base address offset w.r.t the base address
  uint32_t virt_offset = addr - virt_base;
  // Open file and mmap; the descriptor is a signed int so failure (-1) is detectable
  int mmap_file = open(DEV_MEM_PATH, O_RDWR|O_SYNC);
  if (mmap_file < 0) {
    return MAP_FAILED;
  }
  void *mapping = mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED,
                       mmap_file, virt_base);
  // The mapping stays valid after the descriptor is closed; closing it here
  // avoids leaking one descriptor per call.
  close(mmap_file);
  return mapping;
}
void UnmapRegister(void *vta, size_t length) {
// Unmap memory
int status = munmap(vta, length);
assert(status==0);
}
/*!
 * \brief Stores a 32-bit value into a memory mapped register.
 * \param base_addr Start of the mapped register region.
 * \param offset Byte offset of the register inside the region.
 * \param val Value to store.
 */
void WriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
  // Byte-wise pointer arithmetic, then a single volatile 32-bit store
  char *reg_bytes = (char *) base_addr + offset;
  volatile uint32_t *reg = (volatile uint32_t *) reg_bytes;
  *reg = val;
}
/*!
 * \brief Loads a 32-bit value from a memory mapped register.
 * \param base_addr Start of the mapped register region.
 * \param offset Byte offset of the register inside the region.
 * \return The value currently held by the register.
 */
uint32_t ReadMappedReg(void* base_addr, uint32_t offset) {
  // Byte-wise pointer arithmetic, then a single volatile 32-bit load
  char *reg_bytes = (char *) base_addr + offset;
  volatile uint32_t *reg = (volatile uint32_t *) reg_bytes;
  return *reg;
}
/*!
 * \brief Programs the FPGA with a full bitstream.
 * Writes '0' to BS_IS_PARTIAL (marking the bitstream as not partial), then
 * copies the bitstream file byte by byte into BS_XDEVCFG.
 * Prints a message and exits the process with status 1 when any of the three
 * files cannot be opened.
 * \param bitstream The path to the bit stream file.
 */
void ProgramVTA(const char* bitstream) {
  int elem;
  FILE *src, *dst, *partial;

  partial = fopen(BS_IS_PARTIAL, "w");
  if (partial == NULL) {
    // Nothing to close here: calling fclose() on a null stream is undefined
    printf("Cannot open partial config file %s\n", BS_IS_PARTIAL);
    exit(1);
  }
  fputc('0', partial);
  fclose(partial);

  src = fopen(bitstream, "rb");
  if (src == NULL) {
    printf("Cannot open bitstream %s\n", bitstream);
    exit(1);
  }
  dst = fopen(BS_XDEVCFG, "wb");
  if (dst == NULL) {
    printf("Cannot open device file %s\n", BS_XDEVCFG);
    // Release the stream that did open; dst itself is null and must not be closed
    fclose(src);
    exit(1);
  }

  // Stream the bitstream into the configuration device
  elem = fgetc(src);
  while (elem != EOF) {
    fputc(elem, dst);
    elem = fgetc(src);
  }
  fclose(src);
  fclose(dst);
}
\ No newline at end of file
# Compiler. GNU Make predefines CC (as "cc"), so `CC ?= g++` would never take
# effect. Replace only Make's built-in default; a CC supplied on the command
# line or through the environment is kept.
ifeq ($(origin CC),default)
CC = g++
endif
CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
LIBS = -l:libsds_lib.so -l:libdma.so

# Source locations (relative to this directory)
SRC_DIR = ../../src
INCLUDE_DIR = ../../include
DRIVER_DIR = $(SRC_DIR)/driver/pynq
TESTLIB_DIR = $(SRC_DIR)/test
# Let make find the driver and test-library sources outside this directory
VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)

SOURCES = vta_pynq_driver.c vta_test_lib.cc
OBJECTS = vta_pynq_driver.o vta_test_lib.o driver_test.o
EXECUTABLE = vta
# VTA Parameters
# These must match the values the hardware was built with.
# Log of input width in bits
LOG_INP_WIDTH = 3
# Log of weight width in bits
LOG_WGT_WIDTH = 3
# Log of accum width in bits
LOG_ACC_WIDTH = 5
# Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_IN_BLOCK = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_OUT_BLOCK = 4
# Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17
# Log of out buffer size in Bytes.
# `:=` so the shell runs once here rather than on every expansion of CFLAGS
# (i.e. once per compiled object).
LOG_OUT_BUFF_SIZE := $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )

# Define flags: forward every parameter to the sources as a -DLOG_* define
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM \
	-DDEBUG=0 -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
	-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
	-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_IN_BLOCK) -DLOG_BLOCK_OUT=$(LOG_OUT_BLOCK) \
	-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
	-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
	-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
# `all` and `clean` are commands, not files
.PHONY: all clean

# All Target
all: $(EXECUTABLE)

# C++ sources. Every object also lists $(SOURCES) as a prerequisite, so a
# change to any shared source rebuilds all C++ objects.
%.o: %.cc $(SOURCES)
	$(CC) -c -o $@ $< $(CFLAGS)

# The Pynq driver is a .c file; spell out its rule instead of relying on
# make's built-in %.o: %.c rule, so it is visibly built with the same
# compiler and flags as the rest.
%.o: %.c
	$(CC) -c -o $@ $< $(CFLAGS)

$(EXECUTABLE): $(OBJECTS)
	$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)

clean:
	rm -rf *.o $(EXECUTABLE)
/*!
* Copyright (c) 2018 by Contributors
* \file driver_test.cc
* \brief Bare-metal test to test driver and VTA design.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "vta_test_lib.h"
#include "vta_pynq_driver.h"
// VTA invocation (present the same abstraction as in the simulation tests)
uint64_t vta (
uint32_t insn_count,
VTAGenericInsn *insns,
VTAUop *uops,
inp_T *inputs,
wgt_T *weights,
acc_T *biases,
inp_T *outputs) {
// Performance counter variables
uint64_t t_fpga;
struct timespec start, stop;
// Derive bitstream file
char bitstream[64];
char str_batch_size[4];
char str_block_out_size[4];
char str_block_in_size[4];
char str_block_bit_width[4];
sprintf(str_batch_size, "%d", BATCH);
sprintf(str_block_out_size, "%d", BLOCK_OUT);
sprintf(str_block_in_size, "%d", BLOCK_IN);
sprintf(str_block_bit_width, "%d", WGT_WIDTH);
strcpy(bitstream, "vta.bit");
#if DEBUG==1
printf("INFO - Programming FPGA: %s!\n", bitstream);
#endif
// Program VTA
ProgramVTA(bitstream);
// Get VTA handles
VTAHandle vta_fetch_handle = MapRegister(VTA_FETCH_ADDR, VTA_RANGE);
VTAHandle vta_load_handle = MapRegister(VTA_LOAD_ADDR, VTA_RANGE);
VTAHandle vta_compute_handle = MapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
VTAHandle vta_store_handle = MapRegister(VTA_STORE_ADDR, VTA_RANGE);
// Physical address pointers
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
#if DEBUG==1
printf("INFO - Starting FPGA!\n");
#endif
clock_gettime(CLOCK_REALTIME, &start);
// FETCH @ 0x10 : Data signal of insn_count_V
WriteMappedReg(vta_fetch_handle, 0x10, insn_count);
// FETCH @ 0x18 : Data signal of insns_V
if (insns) WriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
// LOAD @ 0x10 : Data signal of weight_V
if (weights) WriteMappedReg(vta_load_handle, 0x10, weight_phy);
// LOAD @ 0x18 : Data signal of inputs_V
if (inputs) WriteMappedReg(vta_load_handle, 0x18, input_phy);
// COMPUTE @ 0x20 : Data signal of uops_V
if (uops) WriteMappedReg(vta_compute_handle, 0x20, uop_phy);
// COMPUTE @ 0x28 : Data signal of biases_V
if (biases) WriteMappedReg(vta_compute_handle, 0x28, bias_phy);
// STORE @ 0x10 : Data signal of outputs_V
if (outputs) WriteMappedReg(vta_store_handle, 0x10, output_phy);
// VTA start
WriteMappedReg(vta_fetch_handle, 0x0, 0x1);
WriteMappedReg(vta_load_handle, 0x0, 0x81);
WriteMappedReg(vta_compute_handle, 0x0, 0x81);
WriteMappedReg(vta_store_handle, 0x0, 0x81);
int flag = 0, t = 0;
for (t = 0; t < 10000000; ++t) {
flag = ReadMappedReg(vta_compute_handle, 0x18);
if (flag & VTA_DONE) break;
}
if (t==10000000) {
printf("\tWARNING: VTA TIMEOUT!!!!\n");
}
#if DEBUG==1
else {
printf("INFO - FPGA Finished!\n");
}
#endif
clock_gettime(CLOCK_REALTIME, &stop);
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
// Unmap VTA register
UnmapRegister(vta_fetch_handle, VTA_RANGE);
UnmapRegister(vta_load_handle, VTA_RANGE);
UnmapRegister(vta_compute_handle, VTA_RANGE);
UnmapRegister(vta_store_handle, VTA_RANGE);
return t_fpga;
};
/*!
 * \brief Driver test entry point.
 * Runs the ALU and blocked GEMM tests and reports whether all of them
 * returned 0.
 * \return Bitwise OR of the status codes returned by all tests.
 */
int main(void)
{
#if DEBUG==1
  printParameters();
#endif

  // Accumulated test status; each test ORs its result in
  int status = 0;

  // Run ALU test (vector-scalar operators)
  status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
  status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
  status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
  status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
  status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
  status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);

  // Run ALU test (vector-vector operators)
  status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
  status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
  status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
  status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);

  // Run blocked GEMM test
  status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
  status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
  status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
  status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);

  if (status==0) {
    printf("\nINFO - Unit tests successful!\n");
  } else {
    // Same "INFO" tag as the success message (was misspelled "INTO")
    printf("\nINFO - Unit tests failed!\n");
  }

  return status;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment