hardware compilation flow, and driver tests

47001850 · Thierry Moreau · Tianqi Chen · b8d8e5b6 · 47001850 · 47001850
Commit 47001850 authored Mar 16, 2018 by Thierry Moreau Committed by Tianqi Chen Jul 11, 2018
19 changed files
--- a/vta/README.md
+++ b/vta/README.md
-# vta
 Open Hardware/Software Stack for Vertical Deep Learning System Optimization
+==============================================
+[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
+VTA is an open hardware/software co-design stack for deep learning systems.
+It provides a customizable hardware accelerator template for deep learning inference workloads,
+combined with a fully functional compiler stack built with TVM.
+License
+-------
+© Contributors, 2018. Licensed under an [Apache-2.0](https://github.com/tmoreau89/vta/blob/master/LICENSE) license.
--- a/vta/docs/.gitignore
+++ b/vta/docs/.gitignore
+doxygen
--- a/vta/docs/Doxyfile
+++ b/vta/docs/Doxyfile
--- a/vta/hardware/vivado/.gitignore
+++ b/vta/hardware/vivado/.gitignore
+build
--- a/vta/hardware/vivado/Makefile
+++ b/vta/hardware/vivado/Makefile
+# Directories
+ROOTDIR := $(CURDIR)
+BUILD_DIR := $(ROOTDIR)/build
+SCRIPT_DIR := $(ROOTDIR)/scripts
+SRC_DIR := $(ROOTDIR)/../../src/hardware/hls
+SIM_DIR := $(ROOTDIR)/sim
+TEST_DIR := $(ROOTDIR)/../../src/test
+INCLUDE_DIR := $(ROOTDIR)/../../include
+# Executables
+VIVADO_HLS := vivado_hls
+VIVADO := vivado
+HSI := hsi
+# Build parameters (all can be overridden on the command line, e.g. `make LOG_BATCH=1`):
+#  Number of threads during compilation
+NUM_THREADS := 8
+#  Target Frequency (used in MHz in the config name below)
+CLOCK_FREQ := 100
+#  Log of input width in bits
+LOG_INP_WIDTH := 3
+#  Log of weight width in bits
+LOG_WGT_WIDTH := 3
+#  Log of accum width in bits
+LOG_ACC_WIDTH := 5
+#  Log of output width in bits
+LOG_OUT_WIDTH := $(LOG_INP_WIDTH)
+#  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
+LOG_BATCH := 0
+#  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
+LOG_IN_BLOCK := 4
+#  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
+LOG_OUT_BLOCK := 4
+#  Log of uop buffer size in Bytes
+LOG_UOP_BUFF_SIZE := 15
+#  Log of inp buffer size in Bytes
+LOG_INP_BUFF_SIZE := 15
+#  Log of wgt buffer size in Bytes
+LOG_WGT_BUFF_SIZE := 15
+#  Log of acc buffer size in Bytes
+LOG_ACC_BUFF_SIZE := 17
+#  Log of out buffer size in Bytes (acc buffer size scaled by out width / acc width)
+LOG_OUT_BUFF_SIZE := $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))")
+# Helper: $(call pow2,N) expands to 2^N, computed by the shell.
+# It must stay recursively expanded (=) so that $(1) is bound by $(call).
+pow2 = $(shell echo "$$(( 1 << $(1) ))")
+# Derived parameters. Simple assignment (:=) is used so that each shell
+# command runs once at parse time instead of on every expansion.
+#  Input width in bits
+INP_WIDTH := $(call pow2,$(LOG_INP_WIDTH))
+#  Weight width in bits
+WGT_WIDTH := $(call pow2,$(LOG_WGT_WIDTH))
+#  Output width in bits
+OUT_WIDTH := $(call pow2,$(LOG_OUT_WIDTH))
+#  Tensor batch size
+BATCH := $(call pow2,$(LOG_BATCH))
+#  Tensor inner block size
+IN_BLOCK := $(call pow2,$(LOG_IN_BLOCK))
+#  Tensor outer block size
+OUT_BLOCK := $(call pow2,$(LOG_OUT_BLOCK))
+#  Uop buffer size in Bytes
+UOP_BUFF_SIZE := $(call pow2,$(LOG_UOP_BUFF_SIZE))
+#  Inp buffer size in Bytes
+INP_BUFF_SIZE := $(call pow2,$(LOG_INP_BUFF_SIZE))
+#  Wgt buffer size in Bytes
+WGT_BUFF_SIZE := $(call pow2,$(LOG_WGT_BUFF_SIZE))
+#  Acc buffer size in Bytes
+ACC_BUFF_SIZE := $(call pow2,$(LOG_ACC_BUFF_SIZE))
+#  Out buffer size in Bytes
+OUT_BUFF_SIZE := $(call pow2,$(LOG_OUT_BUFF_SIZE))
+# Derive clock target period: ceil(1000 / CLOCK_FREQ); the trailing "- 0" is a
+# no-op slack term kept from the original expression
+TARGET_PER := $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))")
+# Derive config name, e.g. 1x16x16_8bx8b_100MHz_10ns with the defaults above
+CONF := $(BATCH)x$(IN_BLOCK)x$(OUT_BLOCK)_$(INP_WIDTH)bx$(WGT_WIDTH)b_$(CLOCK_FREQ)MHz_$(TARGET_PER)ns
+# Per-configuration build directories for the HLS IPs and the Vivado project
+IP_BUILD_PATH := $(BUILD_DIR)/hls/$(CONF)
+HW_BUILD_PATH := $(BUILD_DIR)/vivado/$(CONF)
+.PHONY: all ip bit driver clean
+# Default goal: run the whole flow (ip -> bit -> driver)
+all: driver
+# Run the HLS script in the per-configuration IP build directory.
+# The -tclargs order must match the "Arg N" list documented in scripts/hls.tcl:
+# in particular the in-block size (Arg 11) comes before the out-block size (Arg 12).
+ip:
+	mkdir -p $(IP_BUILD_PATH)
+	cd $(IP_BUILD_PATH) && \
+		$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
+			-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
+			$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
+			$(LOG_BATCH) $(LOG_IN_BLOCK) $(LOG_OUT_BLOCK) \
+			$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
+			$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)
+# Run the Vivado script in the per-configuration hardware build directory
+bit: ip
+	mkdir -p $(HW_BUILD_PATH)
+	cd $(HW_BUILD_PATH) && \
+		$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
+		-tclargs $(IP_BUILD_PATH) $(NUM_THREADS) $(CLOCK_FREQ) \
+		$(INP_WIDTH) $(WGT_WIDTH) $(OUT_WIDTH) \
+		$(BATCH) $(IN_BLOCK) $(OUT_BLOCK) \
+		$(INP_BUFF_SIZE) $(WGT_BUFF_SIZE) $(OUT_BUFF_SIZE)
+# Generate the BSP with hsi, then build it.
+# $(MAKE) (not literal `make`) propagates -j/-n and the jobserver to the sub-make.
+driver: bit
+	cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
+	cd $(HW_BUILD_PATH)/bsp && $(MAKE)
+# Remove the build tree regardless of the directory make was invoked with (-C/-f)
+clean:
+	rm -rf $(BUILD_DIR)
\ No newline at end of file
--- a/vta/hardware/vivado/scripts/hls.tcl
+++ b/vta/hardware/vivado/scripts/hls.tcl
+#
+#  Copyright (c) 2018 by Contributors
+#  file: hls.tcl
+#  brief: HLS generation script.
+#
+# Command line arguments:
+# Arg 1: path to design sources
+# Arg 2: path to sim sources
+# Arg 3: path to test sources
+# Arg 4: path to include sources
+# Arg 5: target clock period
+# Arg 6: input type width (log)
+# Arg 7: weight type width (log)
+# Arg 8: accum type width (log)
+# Arg 9: output type width (log)
+# Arg 10: batch size (log)
+# Arg 11: in block size (log)
+# Arg 12: out block size (log)
+# Arg 13: uop buffer size in B (log)
+# Arg 14: inp buffer size in B (log)
+# Arg 15: wgt buffer size in B (log)
+# Arg 16: acc buffer size in B (log)
+# Arg 17: out buffer size in B (log)
+#
+# The 17 arguments are read from argv indices 2..18, i.e. argv is expected to
+# hold two leading entries before them (presumably the launcher's
+# "-f <script>" options -- TODO confirm). Any other argv length selects the
+# built-in defaults below.
+if { [llength $argv] == 19 } {
+	set src_dir [lindex $argv 2]
+	set sim_dir [lindex $argv 3]
+	set test_dir [lindex $argv 4]
+	set include_dir [lindex $argv 5]
+	set target_period [lindex $argv 6]
+	set inp_width [lindex $argv 7]
+	set wgt_width [lindex $argv 8]
+	set acc_width [lindex $argv 9]
+	set out_width [lindex $argv 10]
+	set batch [lindex $argv 11]
+	set block_in [lindex $argv 12]
+	set block_out [lindex $argv 13]
+	set uop_buff_size [lindex $argv 14]
+	set inp_buff_size [lindex $argv 15]
+	set wgt_buff_size [lindex $argv 16]
+	set acc_buff_size [lindex $argv 17]
+	set out_buff_size [lindex $argv 18]
+} else {
+	# Fallback defaults. All width/size values are log2 values, like the
+	# command line arguments.
+	set src_dir "../src/"
+	set sim_dir "../sim/"
+	set test_dir "../../src/test/"
+	set include_dir "../../include"
+	set target_period 10
+	set inp_width 3
+	set wgt_width 3
+	set acc_width 5
+	set out_width 3
+	# log2(batch) = 0, i.e. a batch of 1 (was 1, which selected a batch of 2)
+	set batch 0
+	set block_out 4
+	set block_in 4
+	set uop_buff_size 15
+	set inp_buff_size 15
+	set wgt_buff_size 15
+	set acc_buff_size 17
+	set out_buff_size 15
+}
+# C define flags to pass to compiler
+set cflags "-I $include_dir -I $include_dir/hardware/hls \
+	-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
+	-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
+	-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
+	-DLOG_UOP_BUFF_SIZE=$uop_buff_size -DLOG_INP_BUFF_SIZE=$inp_buff_size \
+	-DLOG_WGT_BUFF_SIZE=$wgt_buff_size -DLOG_ACC_BUFF_SIZE=$acc_buff_size \
+	-DLOG_OUT_BUFF_SIZE=$out_buff_size"
+# Applies the memory layout directives for one on-chip buffer.
+#   mem       name of the buffer array
+#   stages    the functions whose copy of the buffer receives the directives
+#   vec_log   log2 of the width in bits of one buffer element
+#   rows_log  log2 of the second array dimension
+# When a full row (2^(vec_log + rows_log) bits) is narrower than 1024 bits,
+# dimension 2 is reshaped completely. Otherwise it is block-partitioned by
+# (row bits / 1024) and block-reshaped by (1024 / element bits).
+proc vta_layout_buffer {mem stages vec_log rows_log} {
+	set partition_factor [expr {(1 << ($vec_log + $rows_log)) / 1024}]
+	if {$partition_factor == 0} {
+		foreach stage $stages {
+			set_directive_array_reshape -type complete -dim 2 $stage $mem
+		}
+		return
+	}
+	set reshape_factor [expr {1024 / (1 << $vec_log)}]
+	foreach stage $stages {
+		set_directive_array_partition -type block -factor $partition_factor -dim 2 $stage $mem
+	}
+	foreach stage $stages {
+		set_directive_array_reshape -type block -factor $reshape_factor -dim 2 $stage $mem
+	}
+}
+# Initializes the HLS design and sets HLS pragmas for memory partitioning.
+# This is necessary because of a Vivado restriction that doesn't allow for
+# buses wider than 1024 bits.
+# All width/size arguments except per are log2 values.
+proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
+	# Select the target device
+	set_part {xc7z020clg484-1}
+	# Set the target clock period
+	create_clock -period $per -name default
+	# Input buffer: shared by the load and compute stages
+	vta_layout_buffer inp_mem {load compute} [expr {$inp_width + $block_in}] $batch
+	# Weight buffer: shared by the load and compute stages
+	vta_layout_buffer wgt_mem {load compute} [expr {$wgt_width + $block_in}] $block_out
+	# Output buffer: shared by the compute and store stages
+	vta_layout_buffer out_mem {compute store} [expr {$out_width + $block_out}] $batch
+}
+# HLS behavioral sim: the vta wrapper plus its testbench sources
+open_project vta_sim
+set_top vta
+add_files $src_dir/vta.cc -cflags $cflags
+add_files -tb $sim_dir/vta_test.cc -cflags $cflags
+add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags
+open_solution "solution0"
+init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
+csim_design -clean
+close_project
+# Generate the fetch, load, compute and store stages, in that order.
+# Each stage gets its own project (vta_<stage>) with <stage> as top function,
+# and is synthesized and exported in ip_catalog format.
+foreach stage {fetch load compute store} {
+	open_project vta_$stage
+	set_top $stage
+	add_files $src_dir/vta.cc -cflags $cflags
+	open_solution "solution0"
+	init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
+	csynth_design
+	export_design -format ip_catalog
+	close_project
+}
+exit
--- a/vta/hardware/vivado/scripts/hsi.tcl
+++ b/vta/hardware/vivado/scripts/hsi.tcl
+#
+#  Copyright (c) 2018 by Contributors
+#  file: hsi.tcl
+#  brief: Driver generation script for ARMv7 driver libraries.
+#
+# All paths below are relative to the directory the tool is started in.
+# Open the exported hardware description
+open_hw_design export/vta.hdf
+# Create a software design named "swdesign" for processor ps7_cortexa9_0
+# using the "standalone" OS
+create_sw_design swdesign -proc ps7_cortexa9_0 -os standalone
+# Write the generated board support package into the bsp directory
+generate_bsp -dir bsp
+exit
--- a/vta/hardware/vivado/scripts/vivado.tcl
+++ b/vta/hardware/vivado/scripts/vivado.tcl
--- a/vta/hardware/vivado/sim/vta_test.cc
+++ b/vta/hardware/vivado/sim/vta_test.cc
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_test.cc
+ * \brief Simulation tests for the VTA design.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include "vta.h"
+#include "vta_test_lib.h"
+/*!
+ * \brief Entry point of the simulation tests.
+ *   Checks the compile-time configuration with assertions, then runs the ALU
+ *   and blocked GEMM tests.
+ * \return The bitwise OR of all test return values (0 only if every test
+ *   returned 0).
+ */
+int main(void)
+{
+#if DEBUG==1
+    printParameters();
+#endif
+    // Buffer indexing
+    assert(LOG_ACC_BUFF_DEPTH >= LOG_INP_BUFF_DEPTH);
+    // Micro op bound
+    assert(UOP_GEM_3_1 < UOP_WIDTH);
+    assert(UOP_ALU_3_1 < UOP_WIDTH);
+    // Instruction alignment checks
+    assert(INSN_MEM_7_1 < INSN_MEM_8_0);
+    assert(INSN_GEM_8_1 < INSN_GEM_9_0);
+    // Instruction bounds
+    assert(INSN_MEM_E_1 < INS_WIDTH);
+    assert(INSN_GEM_E_1 < INS_WIDTH);
+    assert(INSN_ALU_F_1 < INS_WIDTH);
+
+    // Accumulated result of every test run below
+    int failures = 0;
+
+    // Run ALU test (vector-scalar operators)
+    failures |= alu_test(ALU_OPCODE_MIN, true, 16, 128, true);
+    failures |= alu_test(ALU_OPCODE_MIN, true, 16, 128, false);
+    failures |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
+    failures |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
+    failures |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
+    failures |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
+    failures |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
+    failures |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
+
+    // Run ALU test (vector-vector operators)
+    failures |= alu_test(ALU_OPCODE_MIN, false, 16, 128, true);
+    failures |= alu_test(ALU_OPCODE_MIN, false, 16, 128, false);
+    failures |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
+    failures |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
+    failures |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
+    failures |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
+
+    // Run blocked GEMM test: last argument 2 first, then 1, each with the
+    // boolean argument set to true and then false
+    for (int last_arg = 2; last_arg >= 1; last_arg--) {
+        failures |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, last_arg);
+        failures |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, last_arg);
+    }
+
+    return failures;
+}
\ No newline at end of file
--- a/vta/include/hardware/hls/vta.h
+++ b/vta/include/hardware/hls/vta.h
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta.h
+ * \brief Type definitions and prototype for VTA HLS design.
+ */
+#ifndef VTA_MAIN_H_
+#define VTA_MAIN_H_
+#include <assert.h>
+#include <ap_axi_sdata.h>
+#include <ap_int.h>
+#include <hls_stream.h>
+#include "vta_typedefs.h"
+#include "vta_params.h"
+/*!
+* \brief Fetch module.
+*   Reads in \a insn_count instructions via DMA and pushes them to the
+*   appropriate load, gemm or store queue.
+* \param insn_count Total instruction count. AXI-lite memory mapped register.
+* \param insns Instruction data base address in DRAM. AXI-4 master port.
+* \param load_queue Load instruction queue. AXI-stream FIFO.
+* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
+* \param store_queue Store instruction queue. AXI-stream FIFO.
+*/
+void fetch (
+  uint32_t insn_count,
+  volatile insn_T *insns,
+  hls::stream<insn_T> &load_queue,
+  hls::stream<insn_T> &gemm_queue,
+  hls::stream<insn_T> &store_queue);
+/*!
+* \brief Load module.
+*   Reads in load instructions from the load queue, and performs appropriate
+*   DMA load operation to the \a wgt_mem and \a inp_mem SRAM buffers from DRAM.
+*   Updates dependence queues accordingly.
+* \param inputs Input data base address in DRAM. AXI-4 master port.
+* \param weights Weight data base address in DRAM. AXI-4 master port.
+* \param load_queue Load instruction queue. AXI-stream FIFO.
+* \param g2l_dep_queue Dependence queue from GEMM to load stage.
+*   AXI-stream FIFO.
+* \param l2g_dep_queue Dependence queue from load to GEMM stage.
+*   AXI-stream FIFO.
+* \param inp_mem Local input SRAM buffer. Write only single port BRAM.
+* \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
+*/
+void load (
+  volatile inp_vec_T *inputs,
+  volatile wgt_vec_T *weights,
+  hls::stream<insn_T> &load_queue,
+  hls::stream<bool> &g2l_dep_queue,
+  hls::stream<bool> &l2g_dep_queue,
+  inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
+  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]
+  );
+/*!
+* \brief Compute module.
+*   Reads in GEMM instructions from the gemm queue, and performs appropriate
+*   GEMM/ALU instructions. Reads in data from the \a wgt_mem and \a inp_mem,
+*   and writes computation results into the \a out_mem. Updates dependence
+*   queues accordingly.
+* \param done Signal that indicates that VTA is done.  AXI-lite memory mapped
+*   register.
+* \param uops Micro-op data base address in DRAM. AXI-4 master port.
+* \param biases Bias data base address in DRAM. AXI-4 master port.
+* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
+* \param l2g_dep_queue Dependence queue from load to gemm stage.
+*   AXI-stream FIFO.
+* \param s2g_dep_queue Dependence queue from store to gemm stage.
+*   AXI-stream FIFO.
+* \param g2l_dep_queue Dependence queue from gemm to load stage.
+*   AXI-stream FIFO.
+* \param g2s_dep_queue Dependence queue from gemm to store stage.
+*   AXI-stream FIFO.
+* \param inp_mem Local input SRAM buffer. Read only single port BRAM.
+* \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
+* \param out_mem Local output SRAM buffer. Write only single port BRAM.
+*/
+void compute (
+  volatile uint32_t &done,
+  volatile uop_T *uops,
+  volatile acc_vec_T *biases,
+  hls::stream<insn_T> &gemm_queue,
+  hls::stream<bool> &l2g_dep_queue,
+  hls::stream<bool> &s2g_dep_queue,
+  hls::stream<bool> &g2l_dep_queue,
+  hls::stream<bool> &g2s_dep_queue,
+  // NOTE(review): inp_mem is declared with element type out_vec_T here, while
+  // load() declares the same buffer as inp_vec_T. The two types only coincide
+  // when OUT_WIDTH*BLOCK_OUT == INP_WIDTH*BLOCK_IN. This looks like it should
+  // be inp_vec_T -- confirm against the definition before changing it.
+  out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
+  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT],
+  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
+  );
+/*!
+* \brief Store module.
+*   Reads in store instructions from the store queue, and performs appropriate
+*   store instructions from the output buffer in SRAM to DRAM. Updates dependence
+*   queues accordingly.
+* \param outputs Output data base address in DRAM. AXI-4 master port.
+* \param store_queue Store instruction queue. AXI-stream FIFO.
+* \param g2s_dep_queue Dependence queue from gemm to store stage.
+*   AXI-stream FIFO.
+* \param s2g_dep_queue Dependence queue from store to gemm stage.
+*   AXI-stream FIFO.
+* \param out_mem Local output SRAM buffer. Read only single port BRAM.
+*/
+void store (
+  volatile out_vec_T *outputs,
+  hls::stream<insn_T> &store_queue,
+  hls::stream<bool> &g2s_dep_queue,
+  hls::stream<bool> &s2g_dep_queue,
+  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
+  );
+/*!
+* \brief VTA wrapper for simulation purpose only.
+*   Orchestrates dataflow execution of the fetch, load, GEMM and store stages.
+* \param insn_count Total instruction count. AXI-lite memory mapped register.
+* \param insns Instruction data base address in DRAM. AXI-4 master port.
+* \param uops Micro-op data base address in DRAM. AXI-4 master port.
+* \param inputs Input data base address in DRAM. AXI-4 master port.
+* \param weights Weight data base address in DRAM. AXI-4 master port.
+* \param biases Bias data base address in DRAM. AXI-4 master port.
+* \param outputs Output data base address in DRAM. AXI-4 master port.
+*/
+void vta (
+  uint32_t insn_count,
+  volatile insn_T *insns,
+  volatile uop_T *uops,
+  volatile inp_vec_T *inputs,
+  volatile wgt_vec_T *weights,
+  volatile acc_vec_T *biases,
+  volatile out_vec_T *outputs);
+#endif  // VTA_MAIN_H_
\ No newline at end of file
--- a/vta/include/hardware/hls/vta_typedefs.h
+++ b/vta/include/hardware/hls/vta_typedefs.h
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_typedefs.h
+ * \brief Type definitions for VTA HLS design.
+ */
+#ifndef VTA_TYPEDEFS_H_
+#define VTA_TYPEDEFS_H_
+#include <assert.h>
+#include <ap_axi_sdata.h>
+#include <ap_int.h>
+#include <hls_stream.h>
+#include "vta_params.h"
+/* \typedef uop_T Micro-op datatype*/
+typedef ap_uint<UOP_WIDTH> uop_T;
+/* \typedef inp_T Input datatype*/
+typedef ap_int<INP_WIDTH> inp_T;
+/* \typedef wgt_T Weight datatype*/
+typedef ap_int<WGT_WIDTH> wgt_T;
+/* \typedef out_T Output datatype*/
+typedef ap_int<OUT_WIDTH> out_T;
+/* \typedef acc_T Accumulator datatype*/
+typedef ap_int<ACC_WIDTH> acc_T;
+/* \typedef mul_T Multiplier output datatype*/
+typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
+/* \typedef sum_T GEMM accumulator datatype*/
+typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
+/* \typedef inp_vec_T Input vector datatype*/
+typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
+/* \typedef wgt_vec_T Weight vector datatype*/
+typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
+/* \typedef acc_vec_T Accumulator vector datatype*/
+typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
+/* \typedef out_vec_T Output vector datatype*/
+typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
+/* \typedef uop_idx_T Micro-op SRAM index datatype*/
+typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
+/* \typedef inp_idx_T Input SRAM index datatype*/
+typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
+/* \typedef wgt_idx_T Weight SRAM index datatype*/
+typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
+/* \typedef acc_idx_T Accumulator SRAM index datatype*/
+typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
+/* \typedef opcode_T Opcode datatype*/
+typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
+/* \typedef insn_T Instruction datatype*/
+typedef ap_uint<INS_WIDTH> insn_T;
+/* \typedef loop_T Loop bound datatype*/
+typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
+/* \typedef memop_id_T Memory operation ID datatype*/
+typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
+/* \typedef memop_sram_T Memory operation SRAM index datatype*/
+typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
+/* \typedef memop_dram_T Memory operation DRAM index datatype*/
+typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
+/* \typedef memop_size_T Memory operation range datatype*/
+typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
+/* \typedef memop_stride_T Memory operation stride datatype*/
+typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
+/* \typedef memop_pad_T Memory operation pad width datatype*/
+typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
+/* \typedef aluop_opcode_T ALU operation opcode datatype*/
+typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
+/* \typedef aluop_imm_T ALU operation immediate datatype*/
+typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
+/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
+typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
+#endif // VTA_TYPEDEFS_H_
--- a/vta/include/vta_params.h
+++ b/vta/include/vta_params.h
--- a/vta/include/vta_pynq_driver.h
+++ b/vta/include/vta_pynq_driver.h
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_pynq_driver.h
+ * \brief VTA driver for Pynq board.
+ */
+#ifndef VTA_PYNQ_DRIVER_H_
+#define VTA_PYNQ_DRIVER_H_
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <assert.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#ifdef __arm__
+#include "libxlnk_cma.h"
+#else
+void* cma_alloc(size_t size, int cached);
+void cma_free(void* buf);
+uint32_t cma_get_phy_addr(void* buf);
+void xlnkFlushCache(void* buf, int size);
+void xlnkInvalidateCache(void* buf, int size);
+#endif
+/*! \brief VTA command handle */
+typedef void * VTAHandle;
+/*! \brief DMA command handle */
+typedef struct {
+  /*! \brief Register map to the AXI DMA control registers*/
+  void *dma_register_map;
+  /*! \brief Transmit data descriptor*/
+  void *mm2s_descriptor_register_map;
+  /*! \brief Receive data descriptor*/
+  void *s2mm_descriptor_register_map;
+  /*! \brief Transmit data descriptor physical address*/
+  uint32_t mm2s_descriptor_phy;
+  /*! \brief Receive data descriptor physical address*/
+  uint32_t s2mm_descriptor_phy;
+  /*! \brief Descriptor size */
+  uint32_t descriptor_size;
+  /*! \brief Transaction count for tx channel */
+  uint32_t mm2s_count;
+  /*! \brief Transaction count for rx channel */
+  uint32_t s2mm_count;
+  /*! \brief Multi-channel mode enable */
+  int multichannel_en;
+} DMAHandle;
+/*! \brief partial bitstream status file path */
+#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
+/*! \brief bitstream destination file path */
+#define BS_XDEVCFG "/dev/xdevcfg"
+/*! \brief Path to /dev/mem */
+#define DEV_MEM_PATH "/dev/mem"
+/*! \brief MMIO driver constant */
+#define MMIO_WORD_LENGTH 4
+/*! \brief MMIO driver constant */
+#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
+/*! \brief VTA configuration register address range */
+#define VTA_RANGE 0x100
+/*! \brief VTA configuration register start value */
+#define VTA_START 0x1
+/*! \brief VTA configuration register auto-restart value */
+#define VTA_AUTORESTART 0x81
+/*! \brief VTA configuration register done value */
+#define VTA_DONE 0x1
+/*! \brief VTA fetch stage configuration register address
+*   from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_FETCH_ADDR    0x43C00000
+/*! \brief VTA compute stage configuration register address
+*   from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_COMPUTE_ADDR  0x43C10000
+/*! \brief VTA load stage configuration register address
+*   from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_LOAD_ADDR     0x43C20000
+/*! \brief VTA store stage configuration register address
+*   from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_STORE_ADDR    0x43C30000
+/*! \brief Memory management constants with libxlnk_cma */
+#define CACHED 1
+/*! \brief Memory management constants with libxlnk_cma */
+#define NOT_CACHED 0
+/*! \brief log2 of SDS buffer size limit */
+#define LOG_MAX_XFER 22
+/*! \brief SDS buffer size limit */
+#define MAX_XFER (1<<LOG_MAX_XFER)
+/*!
+ * \brief Returns a memory map to FPGA configuration registers.
+ * \param addr The base physical address of the configuration registers.
+ * \param length The size of the memory mapped region in bytes.
+ * \return A pointer to the memory mapped region.
+ */
+void *MapRegister(unsigned addr, size_t length);
+/*!
+ * \brief Deletes the configuration register memory map.
+ * \param vta The memory mapped region.
+ * \param length The size of the memory mapped region in bytes.
+ */
+void UnmapRegister(void *vta, size_t length);
+/*!
+ * \brief Writes to a memory mapped configuration register.
+ * \param vta_base The handle to the memory mapped configuration registers.
+ * \param offset The offset of the register to write to.
+ * \param val The value to be written to the memory mapped register.
+ */
+void WriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
+/*!
+ * \brief Reads from the memory mapped configuration register.
+ * \param vta_base The handle to the memory mapped configuration registers.
+ * \param offset The offset of the register to read from.
+ * \return The value read from the memory mapped register.
+ */
+unsigned ReadMappedReg(VTAHandle vta_base, unsigned offset);
+/*!
+ * \brief Programming the bit stream on the FPGA.
+ * \param bitstream The path to the bit stream file.
+ */
+void ProgramVTA(const char* bitstream);
+#ifdef __cplusplus
+}
+#endif
+#endif  // VTA_PYNQ_DRIVER_H_
\ No newline at end of file
--- a/vta/include/vta_test_lib.h
+++ b/vta/include/vta_test_lib.h
--- a/vta/src/driver/pynq/vta_pynq_driver.c
+++ b/vta/src/driver/pynq/vta_pynq_driver.c
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_pynq_driver.c
+ * \brief VTA driver for Pynq board.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "vta_pynq_driver.h"
+#ifdef __cplusplus
+}
+#endif
+/*
+ * Maps a physical register range into the process through DEV_MEM_PATH.
+ *
+ * addr:   physical base address of the registers.
+ * length: size of the register range in bytes.
+ *
+ * The mapping starts at the page containing addr and covers length plus the
+ * offset of addr within that page. The returned pointer is the start of that
+ * page-aligned mapping (identical to the registers when addr is page aligned).
+ * Returns MAP_FAILED when the device cannot be opened or mapped.
+ */
+void *MapRegister(uint32_t addr, size_t length) {
+  // Align the base address with the pages
+  uint32_t virt_base = addr & ~(getpagesize() - 1);
+  // Calculate base address offset w.r.t the base address
+  uint32_t virt_offset = addr - virt_base;
+  // open() returns a signed descriptor, -1 on failure
+  int mmap_file = open(DEV_MEM_PATH, O_RDWR|O_SYNC);
+  if (mmap_file < 0) {
+    // Same result the mmap call below would give for an invalid descriptor
+    return MAP_FAILED;
+  }
+  void *mapping = mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base);
+  // The mapping stays valid after the descriptor is closed; closing it here
+  // avoids leaking one descriptor per call
+  close(mmap_file);
+  return mapping;
+}
+/*
+ * Removes a mapping of length bytes starting at vta.
+ * A failing munmap trips the assertion (when assertions are enabled).
+ */
+void UnmapRegister(void *vta, size_t length) {
+  // Keep the call outside assert() so it still runs when NDEBUG is defined
+  const int rc = munmap(vta, length);
+  assert(rc == 0);
+}
+/*
+ * Stores the 32-bit value val at byte offset `offset` from base_addr.
+ * The store goes through a volatile pointer so it is not optimized away.
+ */
+void WriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
+  char *reg_bytes = (char *) base_addr;
+  volatile uint32_t *reg = (volatile uint32_t *) (reg_bytes + offset);
+  *reg = val;
+}
+/*
+ * Loads and returns the 32-bit value at byte offset `offset` from base_addr.
+ * The load goes through a volatile pointer so it is performed on every call.
+ */
+uint32_t ReadMappedReg(void* base_addr, uint32_t offset) {
+  char *reg_bytes = (char *) base_addr;
+  volatile uint32_t *reg = (volatile uint32_t *) (reg_bytes + offset);
+  return *reg;
+}
+/*
+ * Programs the FPGA with the bitstream file at the given path.
+ *
+ * Writes '0' to BS_IS_PARTIAL, then copies the bitstream file byte by byte
+ * into BS_XDEVCFG. If any of the three files cannot be opened, a message is
+ * printed and the process exits with status 1.
+ */
+void ProgramVTA(const char* bitstream) {
+    int elem;
+    FILE *src, *dst, *partial;
+    partial = fopen(BS_IS_PARTIAL, "w");
+    if (partial == NULL) {
+        // Nothing to close here: calling fclose on a NULL stream is undefined
+        printf("Cannot open partial config file %s\n", BS_IS_PARTIAL);
+        exit(1);
+    }
+    fputc('0', partial);
+    fclose(partial);
+    src = fopen(bitstream, "rb");
+    if (src == NULL) {
+        printf("Cannot open bitstream %s\n", bitstream);
+        exit(1);
+    }
+    dst = fopen(BS_XDEVCFG, "wb");
+    if (dst == NULL) {
+        printf("Cannot open device file %s\n", BS_XDEVCFG);
+        // Only the source stream is open at this point
+        fclose(src);
+        exit(1);
+    }
+    // Copy the bitstream to the device until end of file
+    while ((elem = fgetc(src)) != EOF) {
+        fputc(elem, dst);
+    }
+    fclose(src);
+    fclose(dst);
+}
\ No newline at end of file
--- a/vta/src/hardware/hls/vta.cc
+++ b/vta/src/hardware/hls/vta.cc
--- a/vta/src/test/vta_test_lib.cc
+++ b/vta/src/test/vta_test_lib.cc
--- a/vta/tests/driver/Makefile
+++ b/vta/tests/driver/Makefile
+# make predefines CC (as `cc`), so `CC ?= g++` never takes effect.
+# Replace only that built-in default; a CC given on the command line or in the
+# environment is still honoured.
+ifeq ($(origin CC),default)
+CC = g++
+endif
+CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
+LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
+LIBS = -l:libsds_lib.so -l:libdma.so
+SRC_DIR = ../../src
+INCLUDE_DIR = ../../include
+DRIVER_DIR = $(SRC_DIR)/driver/pynq
+TESTLIB_DIR = $(SRC_DIR)/test
+# Sources outside this directory are located through VPATH
+VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
+SOURCES = vta_pynq_driver.c vta_test_lib.cc
+OBJECTS = vta_pynq_driver.o vta_test_lib.o driver_test.o
+EXECUTABLE = vta
+# VTA Parameters
+#  Log of input width in bits
+LOG_INP_WIDTH = 3
+#  Log of weight width in bits
+LOG_WGT_WIDTH = 3
+#  Log of accum width in bits
+LOG_ACC_WIDTH = 5
+#  Log of output width in bits
+LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
+#  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
+LOG_BATCH = 0
+#  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
+LOG_IN_BLOCK = 4
+#  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
+LOG_OUT_BLOCK = 4
+#  Log of uop buffer size in Bytes
+LOG_UOP_BUFF_SIZE = 15
+#  Log of inp buffer size in Bytes
+LOG_INP_BUFF_SIZE = 15
+#  Log of wgt buffer size in Bytes
+LOG_WGT_BUFF_SIZE = 15
+#  Log of acc buffer size in Bytes
+LOG_ACC_BUFF_SIZE = 17
+#  Log of out buffer size in Bytes.
+#  Simple assignment (:=) so the shell command runs once at parse time rather
+#  than on every expansion of CFLAGS.
+LOG_OUT_BUFF_SIZE := $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))")
+# Define flags
+CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM \
+	-DDEBUG=0 -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
+	-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
+	-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_IN_BLOCK) -DLOG_BLOCK_OUT=$(LOG_OUT_BLOCK) \
+	-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
+	-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
+	-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
+# all and clean are commands, not files
+.PHONY: all clean
+# All Target
+all: $(EXECUTABLE)
+# Compile a C++ source (found in this directory or through VPATH).
+# Each object depends only on its own source file.
+%.o: %.cc
+	$(CC) -c -o $@ $< $(CFLAGS)
+# Compile the C driver source with the same compiler and flags, instead of
+# relying on make's built-in %.o: %.c rule
+%.o: %.c
+	$(CC) -c -o $@ $< $(CFLAGS)
+# Link the test executable from all objects
+$(EXECUTABLE): $(OBJECTS)
+	$(CC) $(LDFLAGS) $^ -o $@ $(LIBS)
+# Remove the objects and the executable
+clean:
+	rm -rf *.o $(EXECUTABLE)
--- a/vta/tests/driver/driver_test.cc
+++ b/vta/tests/driver/driver_test.cc
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file driver_test.cc
+ * \brief Bare-metal test to test driver and VTA design.
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "vta_test_lib.h"
+#include "vta_pynq_driver.h"
+/*!
+ * \brief VTA invocation (presents the same abstraction as in the simulation tests).
+ *
+ * Programs the FPGA with "vta.bit", maps the fetch/load/compute/store register
+ * ranges, writes the physical address of every non-NULL buffer to the matching
+ * register, starts the four modules and polls the compute module until it
+ * reports VTA_DONE or the poll budget is exhausted.
+ *
+ * \param insn_count Number of instructions, written to the fetch module.
+ * \param insns Instruction buffer (may be NULL).
+ * \param uops Micro-op buffer (may be NULL).
+ * \param inputs Input buffer (may be NULL).
+ * \param weights Weight buffer (may be NULL).
+ * \param biases Bias buffer (may be NULL).
+ * \param outputs Output buffer (may be NULL).
+ * \return Elapsed time in nanoseconds between programming the registers and
+ *         the end of polling. A timeout only prints a warning; the elapsed
+ *         time is still returned.
+ */
+uint64_t vta (
+    uint32_t insn_count,
+    VTAGenericInsn *insns,
+    VTAUop *uops,
+    inp_T *inputs,
+    wgt_T *weights,
+    acc_T *biases,
+    inp_T *outputs) {
+    // Upper bound on status-register polls before the run is declared timed out
+    const int kMaxPolls = 10000000;
+    // Performance counter variables
+    uint64_t t_fpga;
+    struct timespec start, stop;
+    // Bitstream file (kept in a writable buffer for ProgramVTA)
+    char bitstream[64] = "vta.bit";
+#if DEBUG==1
+    printf("INFO - Programming FPGA: %s!\n", bitstream);
+#endif
+    // Program VTA
+    ProgramVTA(bitstream);
+    // Get VTA handles
+    VTAHandle vta_fetch_handle = MapRegister(VTA_FETCH_ADDR, VTA_RANGE);
+    VTAHandle vta_load_handle = MapRegister(VTA_LOAD_ADDR, VTA_RANGE);
+    VTAHandle vta_compute_handle = MapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
+    VTAHandle vta_store_handle = MapRegister(VTA_STORE_ADDR, VTA_RANGE);
+    // Physical address pointers (0 when the corresponding buffer is absent)
+    uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
+    uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
+    uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
+    uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
+    uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
+    uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
+#if DEBUG==1
+    printf("INFO - Starting FPGA!\n");
+#endif
+    // Use the monotonic clock: an interval measured with CLOCK_REALTIME is
+    // distorted if the wall clock is adjusted during the run.
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    // FETCH @ 0x10 : Data signal of insn_count_V
+    WriteMappedReg(vta_fetch_handle, 0x10, insn_count);
+    // FETCH @ 0x18 : Data signal of insns_V
+    if (insns) WriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
+    // LOAD @ 0x10 : Data signal of weight_V
+    if (weights) WriteMappedReg(vta_load_handle, 0x10, weight_phy);
+    // LOAD @ 0x18 : Data signal of inputs_V
+    if (inputs) WriteMappedReg(vta_load_handle, 0x18, input_phy);
+    // COMPUTE @ 0x20 : Data signal of uops_V
+    if (uops) WriteMappedReg(vta_compute_handle, 0x20, uop_phy);
+    // COMPUTE @ 0x28 : Data signal of biases_V
+    if (biases) WriteMappedReg(vta_compute_handle, 0x28, bias_phy);
+    // STORE @ 0x10 : Data signal of outputs_V
+    if (outputs) WriteMappedReg(vta_store_handle, 0x10, output_phy);
+    // VTA start
+    WriteMappedReg(vta_fetch_handle, 0x0, 0x1);
+    WriteMappedReg(vta_load_handle, 0x0, 0x81);
+    WriteMappedReg(vta_compute_handle, 0x0, 0x81);
+    WriteMappedReg(vta_store_handle, 0x0, 0x81);
+    // Poll the compute module's register at 0x18 until VTA_DONE is set
+    int flag = 0, t = 0;
+    for (t = 0; t < kMaxPolls; ++t) {
+      flag = ReadMappedReg(vta_compute_handle, 0x18);
+      if (flag & VTA_DONE) break;
+    }
+    // Take the stop timestamp before any console output so that printing
+    // is not counted as accelerator time.
+    clock_gettime(CLOCK_MONOTONIC, &stop);
+    if (t == kMaxPolls) {
+        printf("\tWARNING: VTA TIMEOUT!!!!\n");
+    }
+#if DEBUG==1
+    else {
+        printf("INFO - FPGA Finished!\n");
+    }
+#endif
+    t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
+    // Unmap VTA register
+    UnmapRegister(vta_fetch_handle, VTA_RANGE);
+    UnmapRegister(vta_load_handle, VTA_RANGE);
+    UnmapRegister(vta_compute_handle, VTA_RANGE);
+    UnmapRegister(vta_store_handle, VTA_RANGE);
+    return t_fpga;
+}
+/*!
+ * \brief Runs the ALU and blocked GEMM driver tests and reports the outcome.
+ * \return 0 when every test returned 0, 1 otherwise.
+ */
+int main(void)
+{
+#if DEBUG==1
+    printParameters();
+#endif
+    // Accumulates the results of all tests; stays 0 only if each one returns 0
+    int status = 0;
+    // Run ALU test (vector-scalar operators)
+    status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
+    status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
+    status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
+    status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
+    status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
+    status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
+    // Run ALU test (vector-vector operators)
+    status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
+    status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
+    status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
+    status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
+    // Run blocked GEMM test
+    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
+    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
+    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
+    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);
+    if (status==0) {
+        printf("\nINFO - Unit tests successful!\n");
+    } else {
+        printf("\nINFO - Unit tests failed!\n");
+    }
+    // Only the low 8 bits of the exit status reach the parent process, so
+    // collapse any non-zero status to 1 instead of risking a truncated 0.
+    return status == 0 ? 0 : 1;
+}