[HARDWARE, TEST] Fixed hardware generation flow (#34)

dae77cdb · Thierry Moreau · Tianqi Chen · 9f0e8ffe · dae77cdb · dae77cdb
Commit dae77cdb authored May 02, 2018 by Thierry Moreau Committed by Tianqi Chen Jul 11, 2018
14 changed files
--- a/vta/apps/pynq_rpc/start_rpc_server.sh
+++ b/vta/apps/pynq_rpc/start_rpc_server.sh
 #!/bin/bash
-export PYTHONPATH=${PYTHONPATH}:/home/xilinx/tvm/python:/home/xilinx/vta/python
+export PYTHONPATH=${PYTHONPATH}:/home/xilinx/vta/nnvm/tvm/python:/home/xilinx/vta/python
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/
 python -m vta.exec.rpc_server
--- a/vta/hardware/xilinx/Makefile
+++ b/vta/hardware/xilinx/Makefile
@@ -13,8 +13,10 @@ VIVADO_HLS = vivado_hls
 VIVADO = vivado
 HSI = hsi

-# HLS Mode
-MODE = all
+# HLS mode
+MODE = skip_sim
+# Debug flag
+DEBUG = false
 # SLURM
 SLURM = false
 # Prevent generation of DSP
@@ -22,15 +24,26 @@ NO_DSP = false
 # Prevent generation of ALU
 NO_ALU = false

-# Include top-level config file
-ifndef config
-ifneq ("$(wildcard ../../config.mk)", "")
-	config = ../../config.mk
-else
-	config = ../../make/config.mk
-endif
-endif
-include $(config)
+# Process VTA JSON config
+VTA_CONFIG = python $(CURDIR)/../../make/vta_config.py
+CFLAGS := $(shell ${VTA_CONFIG} --cflags)
+VTA_TARGET := $(shell ${VTA_CONFIG} --target)
+
+#---------------------
+# VTA Parameters
+#--------------------
+VTA_INP_WIDTH := $(shell ${VTA_CONFIG} --get-inpwidth)
+VTA_WGT_WIDTH := $(shell ${VTA_CONFIG} --get-wgtwidth)
+VTA_ACC_WIDTH := $(shell ${VTA_CONFIG} --get-accwidth)
+VTA_OUT_WIDTH := $(shell ${VTA_CONFIG} --get-outwidth)
+VTA_BATCH := $(shell ${VTA_CONFIG} --get-batch)
+VTA_IN_BLOCK := $(shell ${VTA_CONFIG} --get-blockin)
+VTA_OUT_BLOCK := $(shell ${VTA_CONFIG} --get-blockout)
+VTA_UOP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-uopbuffsize)
+VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize)
+VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize)
+VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize)
+VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize)

 #---------------------
 # Compilation parameters
@@ -50,8 +63,8 @@ TARGET_PER = \
 $(shell echo "$$(( (1000 + $(VTA_HW_COMP_CLOCK_FREQ) - 1) / $(VTA_HW_COMP_CLOCK_FREQ) - $(VTA_HW_COMP_TIMING_COMP)))" )

 # Derive config name
-CONF = \
-$(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_LOG_UOP_BUFF_SIZE)_$(VTA_LOG_INP_BUFF_SIZE)_$(VTA_LOG_WGT_BUFF_SIZE)_$(VTA_LOG_ACC_BUFF_SIZE)_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
+# Query the configuration string once at parse time (simple expansion) so the
+# Python helper is not re-run on every expansion of $(CONF).
+CONF_ROOT := $(shell ${VTA_CONFIG} --cfg-str)
+# Full build name: config string plus clock frequency and HLS target period.
+CONF = $(CONF_ROOT)_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
 IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
 HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)

@@ -60,26 +73,34 @@ ifeq ($(SLURM), true)
 	HW_BUILD_PATH = /scratch/vivado/$(CONF)
 endif

-.PHONY: all ip bit driver clean clean_all
+# IP file path
+IP_PATH = $(BUILD_DIR)/hls/$(CONF)/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip
+
+# Bitstream file path
+BIT_PATH = $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit

-all: bit
+# Command-style targets; the names must match the rules below (the full clean
+# rule is spelled "cleanall").
+.PHONY: all ip bit bsp clean cleanall

-ip: 
+all: bsp
+ip: $(IP_PATH)
+bit: $(BIT_PATH)
+
+$(IP_PATH): $(SRC_DIR)/*
 	mkdir -p $(IP_BUILD_PATH)
 	cd $(IP_BUILD_PATH) && \
 		$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-			-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
-			$(VTA_LOG_INP_WIDTH) $(VTA_LOG_WGT_WIDTH) $(VTA_LOG_ACC_WIDTH) $(VTA_LOG_OUT_WIDTH) \
-			$(VTA_LOG_BATCH) $(VTA_LOG_BLOCK_OUT) $(VTA_LOG_BLOCK_IN) \
-			$(VTA_LOG_UOP_BUFF_SIZE) $(VTA_LOG_INP_BUFF_SIZE) $(VTA_LOG_WGT_BUFF_SIZE) \
-			$(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE) \
-			$(MODE) $(NO_DSP) $(NO_ALU)
+		-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
+		$(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(TARGET_PER) \
+		$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
+		$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
+		$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
+		$(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
 ifeq ($(SLURM), true)
 	mkdir -p $(BUILD_DIR)/hls
 	mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/.
 endif

-bit: ip
+$(BIT_PATH): $(IP_PATH)
 	mkdir -p $(HW_BUILD_PATH)
 	cd $(HW_BUILD_PATH) && \
 		$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
@@ -92,12 +113,12 @@ ifeq ($(SLURM), true)
 	mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/.
 endif

-driver: bit
+# Run the HSI script in the hardware build directory, then build the generated
+# bsp directory. $(MAKE) is used so flags and the jobserver reach the sub-make.
+bsp: $(BIT_PATH)
 	cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
 	cd $(HW_BUILD_PATH)/bsp && $(MAKE)

 clean:
 	rm -rf *.out *.log *.sb figures

-clean_all: clean
+cleanall: clean
 	rm -rf $(BUILD_DIR)
--- a/vta/hardware/xilinx/scripts/hls.tcl
+++ b/vta/hardware/xilinx/scripts/hls.tcl
@@ -9,65 +9,69 @@
 # Arg 2: path to sim sources
 # Arg 3: path to test sources
 # Arg 4: path to include sources
-# Arg 5: target clock period
-# Arg 6: input type width (log)
-# Arg 7: weight type width (log)
-# Arg 8: accum type width (log)
-# Arg 9: output type width (log)
-# Arg 10: batch size (log)
-# Arg 11: in block size (log)
-# Arg 12: out block size (log)
-# Arg 13: uop buffer size in B (log)
-# Arg 14: inp buffer size in B (log)
-# Arg 15: wgt buffer size in B (log)
-# Arg 16: acc buffer size in B (log)
-# Arg 17: out buffer size in B (log)
-# Arg 18: mode
-# Arg 19: no_dsp
-# Arg 20: no_alu
+# Arg 5: mode
+# Arg 6: debug
+# Arg 7: no_dsp
+# Arg 8: no_alu
+# Arg 9: target clock period
+# Arg 10: input type width (log)
+# Arg 11: weight type width (log)
+# Arg 12: accum type width (log)
+# Arg 13: output type width (log)
+# Arg 14: batch size (log)
+# Arg 15: in block size (log)
+# Arg 16: out block size (log)
+# Arg 17: uop buffer size in B (log)
+# Arg 18: inp buffer size in B (log)
+# Arg 19: wgt buffer size in B (log)
+# Arg 20: acc buffer size in B (log)
+# Arg 21: out buffer size in B (log)

-if { [llength $argv] eq 22 } {
+if { [llength $argv] eq 23 } {
 	set src_dir [lindex $argv 2]
 	set sim_dir [lindex $argv 3]
 	set test_dir [lindex $argv 4]
 	set include_dir [lindex $argv 5]
-	set target_period [lindex $argv 6]
-	set inp_width [lindex $argv 7]
-	set wgt_width [lindex $argv 8]
-	set acc_width [lindex $argv 9]
-	set out_width [lindex $argv 10]
-	set batch [lindex $argv 11]
-	set block_in [lindex $argv 12]
-	set block_out [lindex $argv 13]
-	set uop_buff_size [lindex $argv 14]
-	set inp_buff_size [lindex $argv 15]
-	set wgt_buff_size [lindex $argv 16]
-	set acc_buff_size [lindex $argv 17]
-	set out_buff_size [lindex $argv 18]
-	set mode [lindex $argv 19]
-	set no_dsp [lindex $argv 20]
-	set no_alu [lindex $argv 21]
+	set mode [lindex $argv 6]
+	set debug [lindex $argv 7]
+	set no_dsp [lindex $argv 8]
+	set no_alu [lindex $argv 9]
+	set target_period [lindex $argv 10]
+	set inp_width [lindex $argv 11]
+	set wgt_width [lindex $argv 12]
+	set acc_width [lindex $argv 13]
+	set out_width [lindex $argv 14]
+	set batch [lindex $argv 15]
+	set block_in [lindex $argv 16]
+	set block_out [lindex $argv 17]
+	set uop_buff_size [lindex $argv 18]
+	set inp_buff_size [lindex $argv 19]
+	set wgt_buff_size [lindex $argv 20]
+	set acc_buff_size [lindex $argv 21]
+	set out_buff_size [lindex $argv 22]
 } else {
 	set src_dir "../src"
 	set sim_dir "../sim"
 	set test_dir "../../src/test"
 	set include_dir "../../include"
+	set mode "all"
+	set debug "false"
+	set no_dsp "true"
+	set no_alu "false"
 	set target_period 10
 	set inp_width 3
 	set wgt_width 3
 	set acc_width 5
 	set out_width 3
 	set batch 1
-	set block_out 4
 	set block_in 4
+	set block_out 4
 	set uop_buff_size 15
 	set inp_buff_size 15
 	set wgt_buff_size 15
 	set acc_buff_size 17
 	set out_buff_size 15
-	set mode "all"
-	set no_dsp "true"
-	set no_alu "false"
+	exit
 }

 # Initializes the HLS design and sets HLS pragmas for memory partitioning.
@@ -124,12 +128,15 @@ proc init_design {per inp_width wgt_width out_width batch block_in block_out} {

 # C define flags to pass to compiler
 set cflags "-I $include_dir -I $src_dir -I $test_dir \
-	-DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
+	-DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
 	-DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \
 	-DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
 	-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
 	-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
 	-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
+if {$debug=="true"} {
+	append cflags " -DVTA_DEBUG=1"
+}
 if {$no_dsp=="true"} {
 	append cflags " -DNO_DSP"
 }

--- a/vta/hardware/xilinx/scripts/vivado.tcl
+++ b/vta/hardware/xilinx/scripts/vivado.tcl
@@ -26,15 +26,15 @@ if { [llength $argv] eq 12 } {
  set ip_path [lindex $argv 0]
  set num_threads [lindex $argv 1]
  set clock_freq [lindex $argv 2]
-  set inp_width [lindex $argv 3]
-  set wgt_width [lindex $argv 4]
-  set out_width [lindex $argv 5]
-  set batch [lindex $argv 6]
-  set out_block [lindex $argv 7]
-  set in_block [lindex $argv 8]
-  set inp_mem_size [lindex $argv 9]
-  set wgt_mem_size [lindex $argv 10]
-  set out_mem_size [lindex $argv 11]
+  set inp_width [expr 1 << [lindex $argv 3]]
+  set wgt_width [expr 1 << [lindex $argv 4]]
+  set out_width [expr 1 << [lindex $argv 5]]
+  set batch [expr 1 << [lindex $argv 6]]
+  set out_block [expr 1 << [lindex $argv 7]]
+  set in_block [expr 1 << [lindex $argv 8]]
+  set inp_mem_size [expr 1 << [lindex $argv 9]]
+  set wgt_mem_size [expr 1 << [lindex $argv 10]]
+  set out_mem_size [expr 1 << [lindex $argv 11]]
  if {$clock_freq eq 100} {
    set clock_id 0
    puts "Setting clock frequency to 100MHz"

--- a/vta/hardware/xilinx/sim/vta_test.cc
+++ b/vta/hardware/xilinx/sim/vta_test.cc
@@ -53,5 +53,8 @@ int main(void) {
    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);

+    // Simple GEMM unit test
+    status |= gemm_test(64, 64, 64, true);
+
    return status;
 }
--- a/vta/make/config.json
+++ b/vta/make/config.json
@@ -7,8 +7,8 @@
  "LOG_BATCH" : 0,
  "LOG_BLOCK_IN" : 4,
  "LOG_BLOCK_OUT" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
+  "LOG_UOP_BUFF_SIZE" : 14,
  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 15,
+  "LOG_WGT_BUFF_SIZE" : 18,
  "LOG_ACC_BUFF_SIZE" : 17
 }
--- a/vta/make/sim_sample.json
+++ b/vta/make/sim_sample.json
@@ -7,8 +7,8 @@
  "LOG_BATCH" : 0,
  "LOG_BLOCK_IN" : 4,
  "LOG_BLOCK_OUT" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
+  "LOG_UOP_BUFF_SIZE" : 14,
  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 15,
+  "LOG_WGT_BUFF_SIZE" : 18,
  "LOG_ACC_BUFF_SIZE" : 17
 }
--- a/vta/make/vta_config.py
+++ b/vta/make/vta_config.py
@@ -28,6 +28,32 @@ def main():
                        help="print all the config json")
    parser.add_argument("--target", action="store_true",
                        help="print the target")
+    parser.add_argument("--cfg-str", action="store_true",
+                        help="print the configuration string")
+    parser.add_argument("--get-inpwidth", action="store_true",
+                        help="returns log of input bitwidth")
+    parser.add_argument("--get-wgtwidth", action="store_true",
+                        help="returns log of weight bitwidth")
+    parser.add_argument("--get-accwidth", action="store_true",
+                        help="returns log of accum bitwidth")
+    parser.add_argument("--get-outwidth", action="store_true",
+                        help="returns log of output bitwidth")
+    parser.add_argument("--get-batch", action="store_true",
+                        help="returns log of tensor batch dimension")
+    parser.add_argument("--get-blockin", action="store_true",
+                        help="returns log of tensor block in dimension")
+    parser.add_argument("--get-blockout", action="store_true",
+                        help="returns log of tensor block out dimension")
+    parser.add_argument("--get-uopbuffsize", action="store_true",
+                        help="returns log of micro-op buffer size in B")
+    parser.add_argument("--get-inpbuffsize", action="store_true",
+                        help="returns log of input buffer size in B")
+    parser.add_argument("--get-wgtbuffsize", action="store_true",
+                        help="returns log of weight buffer size in B")
+    parser.add_argument("--get-accbuffsize", action="store_true",
+                        help="returns log of accum buffer size in B")
+    parser.add_argument("--get-outbuffsize", action="store_true",
+                        help="returns log of output buffer size in B")
    args = parser.parse_args()

    if len(sys.argv) == 1:
@@ -46,13 +72,17 @@ def main():
        raise RuntimeError("Cannot find config in %s" % str(path_list))
    cfg = json.load(open(ok_path_list[0]))
    cfg["LOG_OUT_WIDTH"] = cfg["LOG_INP_WIDTH"]
+    cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] + cfg["LOG_ACC_WIDTH"] - cfg["LOG_OUT_WIDTH"]
    pkg = get_pkg_config(cfg)

    if args.target:
        print(pkg.target)

    if args.cflags:
-        print(" ".join(pkg.cflags))
+        cflags_str = " ".join(pkg.cflags)
+        if cfg["TARGET"] == "pynq":
+            cflags_str += " -DVTA_TARGET_PYNQ"
+        print(cflags_str)

    if args.ldflags:
        print(" ".join(pkg.ldflags))
@@ -60,6 +90,54 @@ def main():
    if args.cfg_json:
        print(pkg.cfg_json)

+    if args.cfg_str:
+        # Configuration identifier: BATCHxBLOCK_INxBLOCK_OUT, input/weight bit
+        # widths, then the log2 sizes of the uop/inp/wgt/acc buffers.
+        cfg_str = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}".format(
+            (1 << cfg["LOG_BATCH"]),
+            (1 << cfg["LOG_BLOCK_IN"]),
+            (1 << cfg["LOG_BLOCK_OUT"]),
+            (1 << cfg["LOG_INP_WIDTH"]),
+            (1 << cfg["LOG_WGT_WIDTH"]),
+            cfg["LOG_UOP_BUFF_SIZE"],
+            cfg["LOG_INP_BUFF_SIZE"],
+            cfg["LOG_WGT_BUFF_SIZE"],
+            cfg["LOG_ACC_BUFF_SIZE"])
+        # Use the print() call form, as the rest of this script does, so the
+        # script also runs under Python 3.
+        print(cfg_str)
+
+    if args.get_inpwidth:
+        print(cfg["LOG_INP_WIDTH"])
+
+    if args.get_wgtwidth:
+        print(cfg["LOG_WGT_WIDTH"])
+
+    if args.get_accwidth:
+        print(cfg["LOG_ACC_WIDTH"])
+
+    if args.get_outwidth:
+        print(cfg["LOG_OUT_WIDTH"])
+
+    if args.get_batch:
+        print(cfg["LOG_BATCH"])
+
+    if args.get_blockin:
+        print(cfg["LOG_BLOCK_IN"])
+
+    if args.get_blockout:
+        print(cfg["LOG_BLOCK_OUT"])
+
+    if args.get_uopbuffsize:
+        print(cfg["LOG_UOP_BUFF_SIZE"])
+
+    if args.get_inpbuffsize:
+        print(cfg["LOG_INP_BUFF_SIZE"])
+
+    if args.get_wgtbuffsize:
+        print(cfg["LOG_WGT_BUFF_SIZE"])
+
+    if args.get_outbuffsize:
+        print(cfg["LOG_OUT_BUFF_SIZE"])
+
+    if args.get_accbuffsize:
+        print(cfg["LOG_ACC_BUFF_SIZE"])

 if __name__ == "__main__":
    main()
--- a/vta/python/vta/environment.py
+++ b/vta/python/vta/environment.py
@@ -130,11 +130,15 @@ class Environment(object):
                              self.BLOCK_IN *
                              self.WGT_WIDTH)
        self.ACC_ELEM_BITS = (self.BATCH *
-                              self.BLOCK_IN *
+                              self.BLOCK_OUT *
                              self.ACC_WIDTH)
+        self.OUT_ELEM_BITS = (self.BATCH *
+                              self.BLOCK_OUT *
+                              self.OUT_WIDTH)
        self.INP_ELEM_BYTES = self.INP_ELEM_BITS // 8
        self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8
        self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8
+        self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8
        # dtypes
        self.acc_dtype = "int%d" % self.ACC_WIDTH
        self.inp_dtype = "int%d" % self.INP_WIDTH

--- a/vta/python/vta/ir_pass.py
+++ b/vta/python/vta/ir_pass.py
@@ -339,7 +339,7 @@ def inject_dma_intrin(stmt_in):
        base = 0
        for i in range(1, ndim + 1):
            if not util.equal_const_int(buf.strides[ndim - i] - x_size, 0):
-                raise RuntimeError("scope %s need need to have block=%d" % (scope, elem_block))
+                raise RuntimeError("scope %s needs to have block=%d" % (scope, elem_block))
            x_size = x_size * buf.shape[ndim - i]
            if util.equal_const_int(x_size - elem_block, 0):
                base = i + 1
@@ -469,10 +469,10 @@ def inject_dma_intrin(stmt_in):
            if pad_before or pad_after:
                raise RuntimeError("Do not support copy into DRAM with pad")
            if src.scope == env.acc_scope:
-                elem_width = env.INP_WIDTH # output compression to inp type
-                elem_bytes = env.INP_ELEM_BYTES # output compression to inp type
+                elem_width = env.OUT_WIDTH
+                elem_bytes = env.OUT_ELEM_BYTES
                mem_type = env.dev.MEM_ID_OUT
-                data_type = "int%d" % env.INP_WIDTH
+                data_type = "int%d" % env.OUT_WIDTH
                task_qid = env.dev.QID_STORE_OUT
            else:
                raise RuntimeError("Do not support copy %s->dram" % (src.scope))

--- a/vta/tests/hardware/common/test_lib.cc
+++ b/vta/tests/hardware/common/test_lib.cc
 /*!
 *  Copyright (c) 2018 by Contributors
- * \file vta_test_lib.cpp
+ * \file test_lib.cc
 * \brief Test library for the VTA design simulation and driver tests.
 */

 #include "./test_lib.h"

+#ifdef NO_SIM
+#ifdef VTA_TARGET_PYNQ
+
+/*!
+ * \brief Runs an instruction stream on the VTA FPGA design and times it.
+ *
+ * Programs the FPGA with "vta.bit", maps the fetch/load/compute/store control
+ * registers, writes the physical address of every non-null buffer, starts the
+ * four modules and polls the compute module until VTA_DONE is reported or the
+ * poll budget is exhausted.
+ *
+ * \param insn_count Number of instructions in \p insns.
+ * \param insns Instruction buffer (may be null).
+ * \param uops Micro-op buffer (may be null).
+ * \param inputs Input buffer (may be null).
+ * \param weights Weight buffer (may be null).
+ * \param biases Bias buffer (may be null).
+ * \param outputs Output buffer (may be null).
+ * \return Elapsed time in nanoseconds from the first register write until
+ *         polling ended (on completion or on timeout).
+ *
+ * NOTE(review): buffers are passed to cma_get_phy_addr(), so they presumably
+ * must come from the CMA allocator -- confirm against the driver.
+ */
+uint64_t vta(
+  uint32_t insn_count,
+  VTAGenericInsn *insns,
+  VTAUop *uops,
+  inp_T *inputs,
+  wgt_T *weights,
+  acc_T *biases,
+  inp_T *outputs) {
+  // Performance counter variables
+  uint64_t t_fpga;
+  struct timespec start, stop;
+
+  // Upper bound on the number of status polls before declaring a timeout
+  const int max_poll_iters = 10000000;
+
+  // Bitstream file (fixed name; no per-configuration suffix is derived)
+  char bitstream[128];
+  snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
+
+#if VTA_DEBUG == 1
+  printf("INFO - Programming FPGA: %s!\n", bitstream);
+#endif
+
+  // Program VTA
+  VTAProgram(bitstream);
+  // Get VTA handles
+  void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
+  void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
+  void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
+  void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
+
+  // Physical address pointers (0 for buffers that were not supplied)
+  uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
+  uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
+  uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
+  uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
+  uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
+  uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
+
+#if VTA_DEBUG == 1
+  printf("INFO - Starting FPGA!\n");
+#endif
+
+  // Use the monotonic clock: this is an interval measurement and must not be
+  // affected by wall-clock adjustments.
+  clock_gettime(CLOCK_MONOTONIC, &start);
+
+  // FETCH @ 0x10 : Data signal of insn_count_V
+  VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
+  // FETCH @ 0x18 : Data signal of insns_V
+  if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
+  // LOAD @ 0x10 : Data signal of inputs_V
+  if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
+  // LOAD @ 0x18 : Data signal of weight_V
+  if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
+  // COMPUTE @ 0x20 : Data signal of uops_V
+  if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
+  // COMPUTE @ 0x28 : Data signal of biases_V
+  if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
+  // STORE @ 0x10 : Data signal of outputs_V
+  if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
+
+  // VTA start
+  VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
+  VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
+  VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
+  VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
+
+  // Poll the compute module's status register until it reports completion
+  int flag = 0, t = 0;
+  for (t = 0; t < max_poll_iters; ++t) {
+    flag = VTAReadMappedReg(vta_compute_handle, 0x18);
+    if (flag & VTA_DONE) break;
+  }
+
+  // Stop the clock before any console output so printing is not timed
+  clock_gettime(CLOCK_MONOTONIC, &stop);
+
+  if (t == max_poll_iters) {
+    printf("\tWARNING: VTA TIMEOUT!!!!\n");
+#if VTA_DEBUG == 1
+  } else {
+    printf("INFO - FPGA Finished!\n");
+#endif
+  }
+
+  t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
+
+  // Unmap VTA register
+  VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
+  VTAUnmapRegister(vta_load_handle, VTA_RANGE);
+  VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
+  VTAUnmapRegister(vta_store_handle, VTA_RANGE);
+
+  return t_fpga;
+}
+
+#endif  // VTA_TARGET_PYNQ
+#endif  // NO_SIM
+
 uint32_t globalSeed;

 const char* getOpcodeString(int opcode, bool use_imm) {
@@ -1122,3 +1225,232 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
    return -1;
  }
 }
+
+
+/*!
+ * \brief Single-pass GEMM unit test.
+ *
+ * Loads micro-ops, biases, weights and inputs in one shot, issues a single
+ * GEMM instruction, stores the result and compares it element by element
+ * against a software reference.
+ *
+ * \param batch Batch size (must be a multiple of VTA_BATCH).
+ * \param in_channels Input channels (must be a multiple of VTA_BLOCK_IN).
+ * \param out_channels Output channels (must be a multiple of VTA_BLOCK_OUT).
+ * \param uop_compression Apply micro-op compression.
+ * \return 0 if every output matches the reference, -1 otherwise.
+ */
+int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression) {
+  // Some assertions
+  assert(batch % VTA_BATCH == 0);
+  assert(in_channels % VTA_BLOCK_IN == 0);
+  assert(out_channels % VTA_BLOCK_OUT == 0);
+
+  printf("=====================================================================================\n");
+  printf("INFO - GEMM test: batch=%d, in_channels=%d, out_channels=%d, uop_comp=%d\n",
+         batch, in_channels, out_channels, uop_compression);
+
+  // Derive number of elements that need to be loaded/stored
+  int ins_size = 7;
+  int uop_size = uop_compression ?
+      batch / VTA_BATCH :
+      batch / VTA_BATCH * in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
+  int inp_size = batch / VTA_BATCH * in_channels / VTA_BLOCK_IN;
+  int wgt_size = in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
+  int out_size = batch / VTA_BATCH * out_channels / VTA_BLOCK_OUT;
+  // Make sure we don't exceed buffer bounds
+  assert(uop_size <= VTA_UOP_BUFF_DEPTH);
+  assert(inp_size <= VTA_INP_BUFF_DEPTH);
+  assert(wgt_size <= VTA_WGT_BUFF_DEPTH);
+  assert(out_size <= VTA_ACC_BUFF_DEPTH);
+
+  // Initialize instruction buffer
+  VTAGenericInsn *insn_buf =
+      static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
+  int insn_idx = 0;
+
+  // Load uops
+  insn_buf[insn_idx++] = get1DLoadStoreInsn(
+      VTA_OPCODE_LOAD,                                    // opcode
+      VTA_MEM_ID_UOP,                                     // type
+      0,                                                  // sram offset
+      0,                                                  // dram offset
+      uop_size,                                           // size
+      0,                                                  // pop prev dep
+      0,                                                  // pop next dep
+      0,                                                  // push prev dep
+      0);                                                 // push next dep
+  // Load bias (push prev)
+  insn_buf[insn_idx++] = get1DLoadStoreInsn(
+      VTA_OPCODE_LOAD,                                    // opcode
+      VTA_MEM_ID_ACC,                                     // type
+      0,                                                  // sram offset
+      0,                                                  // dram offset
+      out_size,                                           // size
+      0,                                                  // pop prev dep
+      0,                                                  // pop next dep
+      1,                                                  // push prev dep
+      0);                                                 // push next dep
+  // Load weight block (pop next)
+  insn_buf[insn_idx++] = get1DLoadStoreInsn(
+      VTA_OPCODE_LOAD,                                    // opcode
+      VTA_MEM_ID_WGT,                                     // type
+      0,                                                  // sram offset
+      0,                                                  // dram offset
+      wgt_size,                                           // size
+      0,                                                  // pop prev dep
+      1,                                                  // pop next dep
+      0,                                                  // push prev dep
+      0);                                                 // push next dep
+  // Load input block (push next)
+  insn_buf[insn_idx++] = get1DLoadStoreInsn(
+      VTA_OPCODE_LOAD,                                    // opcode
+      VTA_MEM_ID_INP,                                     // type
+      0,                                                  // sram offset
+      0,                                                  // dram offset
+      inp_size,                                           // size
+      0,                                                  // pop prev dep
+      0,                                                  // pop next dep
+      0,                                                  // push prev dep
+      1);                                                 // push next dep
+  // Perform GEMM (pop prev, push next)
+  insn_buf[insn_idx++] = getGEMMInsn(
+      0,                                                  // uop offset
+      batch / VTA_BATCH,                                  // batch
+      in_channels / VTA_BLOCK_IN,                         // in_channels
+      out_channels / VTA_BLOCK_OUT,                       // out_channels
+      uop_compression,                                    // uop_compression
+      1,                                                  // pop_prev_dep
+      0,                                                  // pop_next_dep
+      0,                                                  // push prev dep
+      1);                                                 // push_next_dep
+  // Store output block (pop prev, push prev)
+  insn_buf[insn_idx++] = get1DLoadStoreInsn(
+      VTA_OPCODE_STORE,                                   // opcode
+      VTA_MEM_ID_OUT,                                     // type
+      0,                                                  // sram offset
+      0,                                                  // dram offset
+      out_size,                                           // size
+      1,                                                  // pop prev dep
+      0,                                                  // pop next dep
+      1,                                                  // push prev dep
+      0);                                                 // push next dep
+  // Finish
+  insn_buf[insn_idx++] = getFinishInsn(0, 1);
+  // The instruction buffer was sized for exactly ins_size entries
+  assert(insn_idx == ins_size);
+
+  // Prepare the uop buffer
+  VTAUop * uop_buf = getGEMMUops(
+      batch / VTA_BATCH,
+      in_channels / VTA_BLOCK_IN,
+      out_channels / VTA_BLOCK_OUT,
+      uop_compression,
+      0);
+
+#if VTA_DEBUG == 1
+  printInstruction(ins_size, insn_buf);
+  printMicroOp(uop_size, uop_buf);
+#endif
+
+  // Initialize inputs
+  inp_T **inputs = allocInit2dArray<inp_T, VTA_INP_WIDTH>(batch, in_channels);
+  // Initialize weights
+  wgt_T **weights = allocInit2dArray<wgt_T, VTA_WGT_WIDTH>(out_channels, in_channels);
+  // Initialize biases
+  acc_T **biases = allocInit2dArray<acc_T, VTA_ACC_WIDTH>(batch, out_channels);
+
+  // Reference GEMM implementation
+  out_T **outputs_ref = alloc2dArray<out_T>(batch, out_channels);
+  for (int i = 0; i < batch; i++) {
+    for (int j = 0; j < out_channels; j++) {
+      acc_T sum = biases[i][j];
+      for (int k = 0; k < in_channels; k++) {
+        sum += (acc_T) (inputs[i][k] * weights[j][k]);
+      }
+      // Narrow the accumulator to the output type
+      outputs_ref[i][j] = (out_T) sum;
+    }
+  }
+
+  // Prepare the input buffer
+  inp_T *input_buf = static_cast<inp_T *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
+  packBuffer<inp_T, VTA_INP_WIDTH>(input_buf,
+                                   inputs,
+                                   batch,
+                                   in_channels,
+                                   VTA_BATCH,
+                                   VTA_BLOCK_IN);
+  // Prepare the weight buffer
+  wgt_T *weight_buf = static_cast<wgt_T *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
+  packBuffer<wgt_T, VTA_WGT_WIDTH>(weight_buf,
+                                   weights,
+                                   out_channels,
+                                   in_channels,
+                                   VTA_BLOCK_OUT,
+                                   VTA_BLOCK_IN);
+  // Prepare the bias buffer
+  acc_T *bias_buf = static_cast<acc_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
+  packBuffer<acc_T, VTA_ACC_WIDTH>(bias_buf,
+                                   biases,
+                                   batch,
+                                   out_channels,
+                                   VTA_BATCH,
+                                   VTA_BLOCK_OUT);
+  // Prepare the output buffer. Each output element is a VTA_BATCH x
+  // VTA_BLOCK_OUT tile of VTA_OUT_WIDTH-bit values, so size it from those
+  // parameters; the input element size depends on VTA_BLOCK_IN and would
+  // under-allocate whenever VTA_BLOCK_OUT > VTA_BLOCK_IN.
+  // NOTE(review): assumes VTA_OUT_WIDTH is expressed in bits -- confirm.
+  const int out_elem_bytes = VTA_BATCH * VTA_BLOCK_OUT * VTA_OUT_WIDTH / 8;
+  out_T *output_buf = static_cast<out_T *>(allocBuffer(out_elem_bytes * out_size));
+
+#ifdef NO_SIM
+  // Invoke the VTA
+  uint64_t t_fpga = vta(ins_size,
+                        insn_buf,
+                        uop_buf,
+                        input_buf,
+                        weight_buf,
+                        bias_buf,
+                        output_buf);
+  // Report on timing
+  printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
+  printf("INFO - Throughput: %.3lfGOPs/s\n",
+         static_cast<float>(batch) * in_channels * out_channels * 2 / t_fpga);
+#else
+  // Invoke the VTA
+  vta(ins_size,
+      (volatile insn_T *) insn_buf,
+      (volatile uop_T *) uop_buf,
+      (volatile inp_vec_T *) input_buf,
+      (volatile wgt_vec_T *) weight_buf,
+      (volatile acc_vec_T *) bias_buf,
+      (volatile out_vec_T *) output_buf);
+#endif
+
+  // Unpack output data
+  out_T **outputs = alloc2dArray<out_T>(batch, out_channels);
+  unpackBuffer<out_T, VTA_OUT_WIDTH>(outputs,
+                                     output_buf,
+                                     batch,
+                                     out_channels,
+                                     VTA_BATCH,
+                                     VTA_BLOCK_OUT);
+
+  // Correctness checks
+  int err = 0;
+  for (int i = 0; i < batch; i++) {
+    for (int j = 0; j < out_channels; j++) {
+      if (outputs_ref[i][j] != outputs[i][j]) {
+        err++;
+#if VTA_DEBUG == 1
+        printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
+               static_cast<int>(outputs_ref[i][j]),
+               static_cast<int>(outputs[i][j]));
+#endif
+      }
+    }
+  }
+
+  // Free all allocated arrays
+  free2dArray<inp_T>(inputs, batch, in_channels);
+  free2dArray<wgt_T>(weights, out_channels, in_channels);
+  free2dArray<acc_T>(biases, batch, out_channels);
+  free2dArray<out_T>(outputs_ref, batch, out_channels);
+  free2dArray<out_T>(outputs, batch, out_channels);
+  freeBuffer(insn_buf);
+  freeBuffer(uop_buf);
+  freeBuffer(input_buf);
+  freeBuffer(weight_buf);
+  freeBuffer(bias_buf);
+  freeBuffer(output_buf);
+
+  if (err == 0) {
+    printf("INFO - GEMM test successful!\n");
+    return 0;
+  } else {
+    printf("INFO - GEMM test failed, got %d errors!\n", err);
+    return -1;
+  }
+}
\ No newline at end of file
--- a/vta/tests/hardware/common/test_lib.h
+++ b/vta/tests/hardware/common/test_lib.h
 /*!
 *  Copyright (c) 2018 by Contributors
- * \file vta_test_lib.cpp
+ * \file test_lib.h
 * \brief Test library for the VTA design simulation and driver tests.
 */

@@ -17,9 +17,9 @@

 #include <vta/driver.h>

-#ifdef VTA_PYNQ_TARGET
+#ifdef VTA_TARGET_PYNQ
 #include "../../../src/pynq/pynq_driver.h"
-#endif  // VTA_PYNQ_TARGET
+#endif  // VTA_TARGET_PYNQ

 typedef uint64_t axi_T;
 typedef uint32_t uop_T;
@@ -300,4 +300,14 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
 int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
  int virtual_threads);

+/*!
+* \brief VTA GEMM unit test.
+* \param batch Batch size.
+* \param in_channels Input channels.
+* \param out_channels Output channels.
+* \param uop_compression Apply micro-op compression.
+* \return 0 if the test passes, -1 if any output differs from the reference.
+*/
+int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression);
+
 #endif  //  TESTS_HARDWARE_COMMON_TEST_LIB_H_
--- a/vta/tests/hardware/pynq/Makefile
+++ b/vta/tests/hardware/pynq/Makefile
 CC ?= g++
 CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
 LDFLAGS = -L/usr/lib -L/opt/python3.6/lib/python3.6/site-packages/pynq/lib/
-LIBS = -l:libsds_lib.so -l:libdma.so
+LIBS = -l:libsds_lib.so -l:libdma.so -lstdc++
 INCLUDE_DIR = ../../../include
 DRIVER_DIR = ../../../src/pynq
 TESTLIB_DIR = ../common
@@ -10,19 +10,14 @@ SOURCES = pynq_driver.cc test_lib.cc
 OBJECTS = pynq_driver.o test_lib.o metal_test.o
 EXECUTABLE = vta

-# Include top-level config file
-ifndef config
-ifneq ("$(wildcard ../../../config.mk)", "")
-	config = ../../../config.mk
-else
-	config = ../../../make/config.mk
-endif
-endif
-include $(config)
+# Include VTA config: query vta_config.py through make's $(shell ...) (as VTA_TARGET does) rather than shell backticks, so the flags are plain text wherever expanded
+VTA_CONFIG = python ../../../make/vta_config.py
+CFLAGS += $(shell ${VTA_CONFIG} --cflags)
+LDFLAGS += $(shell ${VTA_CONFIG} --ldflags)
+VTA_TARGET := $(shell ${VTA_CONFIG} --target)

 # Define flags
-CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DDEBUG=0
-CFLAGS += $(ADD_CFLAGS)
+CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DVTA_DEBUG=0

 # All Target
 all: $(EXECUTABLE)

--- a/vta/tests/hardware/pynq/metal_test.cc
+++ b/vta/tests/hardware/pynq/metal_test.cc
 /*!
 *  Copyright (c) 2018 by Contributors
- * \file driver_test.cpp
+ * \file metal_test.cc
 * \brief Bare-metal test to test driver and VTA design.
 */

@@ -13,104 +13,6 @@
 #include "../../../src/pynq/pynq_driver.h"
 #include "../common/test_lib.h"

-// VTA invocation (present the same abstraction as in the simulation tests)
-uint64_t vta(
-  uint32_t insn_count,
-  VTAGenericInsn *insns,
-  VTAUop *uops,
-  inp_T *inputs,
-  wgt_T *weights,
-  acc_T *biases,
-  inp_T *outputs) {
-  // Performance counter variables
-  uint64_t t_fpga;
-  struct timespec start, stop;
-
-  // Derive bitstream file
-  char bitstream[128];
-  char str_batch_size[4];
-  char str_block_out_size[4];
-  char str_block_in_size[4];
-  char str_block_bit_width[4];
-  snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH);
-  snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT);
-  snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN);
-  snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH);
-  snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
-
-#if VTA_DEBUG == 1
-  printf("INFO - Programming FPGA: %s!\n", bitstream);
-#endif
-
-  // Program VTA
-  VTAProgram(bitstream);
-  // Get VTA handles
-  VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
-  VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
-  VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
-  VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
-
-  // Physical address pointers
-  uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
-  uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
-  uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
-  uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
-  uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
-  uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
-
-#if VTA_DEBUG == 1
-  printf("INFO - Starting FPGA!\n");
-#endif
-
-  clock_gettime(CLOCK_REALTIME, &start);
-
-  // FETCH @ 0x10 : Data signal of insn_count_V
-  VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
-  // FETCH @ 0x18 : Data signal of insns_V
-  if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
-  // LOAD @ 0x10 : Data signal of inputs_V
-  if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
-  // LOAD @ 0x18 : Data signal of weight_V
-  if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
-  // COMPUTE @ 0x20 : Data signal of uops_V
-  if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
-  // COMPUTE @ 0x28 : Data signal of biases_V
-  if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
-  // STORE @ 0x10 : Data signal of outputs_V
-  if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
-
-  // VTA start
-  VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
-  VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
-  VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
-  VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
-
-  int flag = 0, t = 0;
-  for (t = 0; t < 10000000; ++t) {
-    flag = VTAReadMappedReg(vta_compute_handle, 0x18);
-    if (flag & VTA_DONE) break;
-  }
-
-  if (t == 10000000) {
-    printf("\tWARNING: VTA TIMEOUT!!!!\n");
-#if VTA_DEBUG == 1
-  } else {
-    printf("INFO - FPGA Finished!\n");
-#endif
-  }
-
-  clock_gettime(CLOCK_REALTIME, &stop);
-  t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
-
-  // Unmap VTA register
-  VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
-  VTAUnmapRegister(vta_load_handle, VTA_RANGE);
-  VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
-  VTAUnmapRegister(vta_store_handle, VTA_RANGE);
-
-  return t_fpga;
-}
-
 int main(void) {
 #if VTA_DEBUG == 1
  printParameters();