[REFACTOR] Code base refactoring (#5)

28a10b69 · Thierry Moreau · Tianqi Chen · 0979e9aa · 28a10b69 · 28a10b69
Commit 28a10b69 authored Mar 18, 2018 by Thierry Moreau Committed by Tianqi Chen Jul 11, 2018
21 changed files
--- a/vta/Makefile
+++ b/vta/Makefile
@@ -54,9 +54,13 @@ endif

 all: lib/libvta.$(SHARED_LIBRARY_SUFFIX)

-SRC = $(wildcard src/*.cc src/*.cc)
-ALL_OBJ = $(patsubst %.cc, build/%.o, $(SRC))
-ALL_DEP = $(ALL_OBJ)
+VTA_LIB_SRC = $(wildcard src/*.cc src/tvm/*.cc)
+ifeq ($(TARGET), PYNQ_TARGET)
+	VTA_LIB_SRC += $(wildcard src/pynq/*.cc)
+	LDFLAGS += -L/usr/lib -lsds_lib
+	LDFLAGS += -L/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/ -l:libdma.so
+endif
+VTA_LIB_OBJ = $(patsubst %.cc, build/%.o, $(VTA_LIB_SRC))

 test: $(TEST)

@@ -65,7 +69,7 @@ build/src/%.o: src/%.cc
 	$(CXX) $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d
 	$(CXX) -c $(CFLAGS) -c $< -o $@

-lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(ALL_DEP)
+lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ)
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o, $^) $(LDFLAGS)


--- a/vta/apps/pynq_rpc/start_rpc_server.sh
+++ b/vta/apps/pynq_rpc/start_rpc_server.sh
+#!/bin/bash
+export PYTHONPATH=${PYTHONPATH}:/home/xilinx/tvm/python
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/
+python -m  tvm.exec.rpc_server --load-library /home/xilinx/vta/lib/libvta.so
--- a/vta/hardware/vivado/Makefile
+++ b/vta/hardware/vivado/Makefile
@@ -2,9 +2,9 @@
 ROOTDIR = $(CURDIR)
 BUILD_DIR = $(ROOTDIR)/build
 SCRIPT_DIR = $(ROOTDIR)/scripts
-SRC_DIR = $(ROOTDIR)/../../src/hardware/hls
+SRC_DIR = $(ROOTDIR)/src
 SIM_DIR = $(ROOTDIR)/sim
-TEST_DIR = $(ROOTDIR)/../../src/test
+TEST_DIR = $(ROOTDIR)/../../tests/hardware/common
 INCLUDE_DIR = $(ROOTDIR)/../../include

 # Executables
@@ -12,59 +12,28 @@ VIVADO_HLS = vivado_hls
 VIVADO = vivado
 HSI = hsi

-# Build parameters:
+# Include top-level config file
+ifndef config
+ifneq ("$(wildcard ../../config.mk)", "")
+	config = ../../config.mk
+else
+	config = ../../make/config.mk
+endif
+endif
+include $(config)
+
+#---------------------
+# Compilation parameters
+#--------------------
+
 #  Number of threads during compilation
 NUM_THREADS = 8
+
 #  Target Frequency
 CLOCK_FREQ = 100
-#  Log of input width in bits
-LOG_INP_WIDTH = 3
-#  Log of weight width in bits
-LOG_WGT_WIDTH = 3
-#  Log of accum width in bits
-LOG_ACC_WIDTH = 5
-#  Log of output width in bits
-LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
-#  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
-LOG_BATCH = 0
-#  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
-LOG_IN_BLOCK = 4
-#  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
-LOG_OUT_BLOCK = 4
-#  Log of uop buffer size in Bytes
-LOG_UOP_BUFF_SIZE = 15
-#  Log of inp buffer size in Bytes
-LOG_INP_BUFF_SIZE = 15
-#  Log of wgt buffer size in Bytes
-LOG_WGT_BUFF_SIZE = 15
-#  Log of acc buffer size in Bytes
-LOG_ACC_BUFF_SIZE = 17
-#  Log of out buffer size in Bytes
-LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )

-# Derived parameter
-#  Input width in bits
-INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
-#  Weight width in bits
-WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
-#  Output width in bits
-OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
-#  Tensor batch size
-BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
-#  Tensor outer block size
-IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_IN_BLOCK) ))" )
-#  Tensor inner block size
-OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_OUT_BLOCK) ))" )
-#  Uop buffer size in Bytes
-UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
-#  Inp buffer size in Bytes
-INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
-#  Wgt buffer size in Bytes
-WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
-#  Acc buffer size in Bytes
-ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
-#  Out buffer size in Bytes
-OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
+#  Timing closure compensation (0 for none, 3 for highest)
+TIMING_CLOSURE_COMP = 0

 # Derive clock target period
 TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" )
@@ -85,7 +54,7 @@ ip:
 		$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
 			-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
 			$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
-			$(LOG_BATCH) $(LOG_OUT_BLOCK) $(LOG_IN_BLOCK) \
+			$(LOG_BATCH) $(LOG_BLOCK_OUT) $(LOG_BLOCK_IN) \
 			$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
 			$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)


--- a/vta/hardware/vivado/scripts/hls.tcl
+++ b/vta/hardware/vivado/scripts/hls.tcl
@@ -62,7 +62,7 @@ if { [llength $argv] eq 19 } {
 }

 # C define flags to pass to compiler
-set cflags "-I $include_dir -I $include_dir/hardware/hls \
+set cflags "-I $include_dir -I $src_dir -I $test_dir \
 	-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
 	-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
 	-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
@@ -127,7 +127,7 @@ open_project vta_sim
 set_top vta
 add_files $src_dir/vta.cc -cflags $cflags
 add_files -tb $sim_dir/vta_test.cc -cflags $cflags
-add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags
+add_files -tb $test_dir/test_lib.cc -cflags $cflags
 open_solution "solution0"
 init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
 csim_design -clean

--- a/vta/hardware/vivado/sim/vta_test.cc
+++ b/vta/hardware/vivado/sim/vta_test.cc
@@ -8,8 +8,8 @@
 #include <stdlib.h>
 #include <iostream>

-#include "vta.h"
-#include "vta_test_lib.h"
+#include "../src/vta.h"
+#include "../../../tests/hardware/common/test_lib.h"

 int main(void)
 {

--- a/vta/src/hardware/hls/vta.cc
+++ b/vta/src/hardware/hls/vta.cc
@@ -8,7 +8,7 @@
 #include <stdlib.h>
 #include <string.h>

-#include "vta.h"
+#include "./vta.h"

 void fetch (
  uint32_t insn_count,

--- a/vta/include/hardware/hls/vta.h
+++ b/vta/include/hardware/hls/vta.h
@@ -11,8 +11,88 @@
 #include <ap_int.h>
 #include <hls_stream.h>

-#include "vta_typedefs.h"
-#include "vta_params.h"
+#include <vta/hw_spec.h>
+
+/* \typedef uop_T Micro-op datatype*/
+typedef ap_uint<UOP_WIDTH> uop_T;
+
+/* \typedef inp_T Input datatype*/
+typedef ap_int<INP_WIDTH> inp_T;
+
+/* \typedef wgt_T Weight datatype*/
+typedef ap_int<WGT_WIDTH> wgt_T;
+
+/* \typedef out_T Output datatype*/
+typedef ap_int<OUT_WIDTH> out_T;
+
+/* \typedef acc_T Accumulator datatype*/
+typedef ap_int<ACC_WIDTH> acc_T;
+
+/* \typedef mul_T Multiplier output datatype*/
+typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
+
+/* \typedef sum_T GEMM accumulator datatype*/
+typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
+
+/* \typedef inp_vec_T Input vector datatype*/
+typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
+
+/* \typedef wgt_vec_T Weight vector datatype*/
+typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
+
+/* \typedef acc_vec_T Accumulator vector datatype*/
+typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
+
+/* \typedef out_vec_T Output vector datatype*/
+typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
+
+/* \typedef uop_idx_T Micro-op SRAM index datatype*/
+typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
+
+/* \typedef inp_idx_T Input SRAM index datatype*/
+typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
+
+/* \typedef wgt_idx_T Weight SRAM index datatype*/
+typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
+
+/* \typedef acc_idx_T Accumulator SRAM index datatype*/
+typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
+
+/* \typedef opcode_T Opcode datatype*/
+typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
+
+/* \typedef insn_T Instruction datatype*/
+typedef ap_uint<INS_WIDTH> insn_T;
+
+/* \typedef loop_T Loop bound datatype*/
+typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
+
+/* \typedef memop_id_T Memory operation ID datatype*/
+typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
+
+/* \typedef memop_sram_T Memory operation SRAM index datatype*/
+typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
+
+/* \typedef memop_dram_T Memory operation DRAM index datatype*/
+typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
+
+/* \typedef memop_size_T Memory operation range datatype*/
+typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
+
+/* \typedef memop_stride_T Memory operation stride datatype*/
+typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
+
+/* \typedef memop_pad_T Memory operation pad width datatype*/
+typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
+
+/* \typedef aluop_opcode_T ALU operation opcode datatype*/
+typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
+
+/* \typedef aluop_imm_T ALU operation immediate datatype*/
+typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
+
+/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
+typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;

 /*!
 * \brief Fetch module.

--- a/vta/include/hardware/hls/vta_typedefs.h
+++ b/vta/include/hardware/hls/vta_typedefs.h
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file vta_typedefs.h
- * \brief Type definitions for VTA HLS design.
- */
-#ifndef VTA_TYPEDEFS_H_
-#define VTA_TYPEDEFS_H_
-
-#include <assert.h>
-#include <ap_axi_sdata.h>
-#include <ap_int.h>
-#include <hls_stream.h>
-
-#include "vta_params.h"
-
-/* \typedef uop_T Micro-op datatype*/
-typedef ap_uint<UOP_WIDTH> uop_T;
-
-/* \typedef inp_T Input datatype*/
-typedef ap_int<INP_WIDTH> inp_T;
-
-/* \typedef wgt_T Weight datatype*/
-typedef ap_int<WGT_WIDTH> wgt_T;
-
-/* \typedef out_T Output datatype*/
-typedef ap_int<OUT_WIDTH> out_T;
-
-/* \typedef acc_T Accumulator datatype*/
-typedef ap_int<ACC_WIDTH> acc_T;
-
-/* \typedef mul_T Multiplier output datatype*/
-typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
-
-/* \typedef sum_T GEMM accumulator datatype*/
-typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
-
-/* \typedef inp_vec_T Input vector datatype*/
-typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
-
-/* \typedef wgt_vec_T Weight vector datatype*/
-typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
-
-/* \typedef acc_vec_T Accumulator vector datatype*/
-typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
-
-/* \typedef out_vec_T Output vector datatype*/
-typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
-
-/* \typedef uop_idx_T Micro-op SRAM index datatype*/
-typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
-
-/* \typedef inp_idx_T Input SRAM index datatype*/
-typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
-
-/* \typedef wgt_idx_T Weight SRAM index datatype*/
-typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
-
-/* \typedef acc_idx_T Accumulator SRAM index datatype*/
-typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
-
-/* \typedef opcode_T Opcode datatype*/
-typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
-
-/* \typedef insn_T Instruction datatype*/
-typedef ap_uint<INS_WIDTH> insn_T;
-
-/* \typedef loop_T Loop bound datatype*/
-typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
-
-/* \typedef memop_id_T Memory operation ID datatype*/
-typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
-
-/* \typedef memop_sram_T Memory operation SRAM index datatype*/
-typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
-
-/* \typedef memop_dram_T Memory operation DRAM index datatype*/
-typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
-
-/* \typedef memop_size_T Memory operation range datatype*/
-typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
-
-/* \typedef memop_stride_T Memory operation stride datatype*/
-typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
-
-/* \typedef memop_pad_T Memory operation pad width datatype*/
-typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
-
-/* \typedef aluop_opcode_T ALU operation opcode datatype*/
-typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
-
-/* \typedef aluop_opcode_T ALU operation immediate datatype*/
-typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
-
-/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/
-typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
-
-#endif // VTA_TYPEDEFS_H_
--- a/vta/include/vta/driver.h
+++ b/vta/include/vta/driver.h
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file driver.h
+ * \brief General driver interface.
+ */
+
+#ifndef VTA_DRIVER_H_
+#define VTA_DRIVER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdlib.h>
+#include <stdint.h>
+
+/*! \brief Memory management constants with libxlnk_cma */
+#define CACHED 1
+/*! \brief Memory management constants with libxlnk_cma */
+#define NOT_CACHED 0
+
+/*! \brief VTA command handle */
+typedef void * VTAHandle;
+
+/*!
+ * \brief Allocates physically contiguous region in memory (limited by MAX_XFER).
+ * \param size Size of the region in Bytes.
+ * \param cached Region can be set to not cached (write-back) if set to 0.
+ * \return A pointer to the allocated region.
+ */
+void* VTAMemAlloc(size_t size, int cached);
+
+/*!
+ * \brief Frees a physically contiguous region in memory.
+ * \param buf Buffer to free.
+ */
+void VTAMemFree(void* buf);
+
+/*!
+ * \brief Returns a physical address to the region of memory allocated with VTAMemAlloc.
+ * \param buf Pointer to memory region allocated with VTAMemAlloc.
+ * \return The physical address of the memory region.
+ */
+uint32_t VTAGetMemPhysAddr(void* buf);
+
+/*!
+ * \brief Flushes the region of memory out of the CPU cache to DRAM.
+ * \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed.
+ * \param size Size of the region to flush in Bytes.
+ */
+void VTAFlushCache(void* buf, int size);
+
+/*!
+ * \brief Invalidates the region of memory that is cached.
+ * \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated.
+ * \param size Size of the region to invalidate in Bytes.
+ */
+void VTAInvalidateCache(void* buf, int size);
+
+/*!
+ * \brief Returns a memory map to FPGA configuration registers.
+ * \param addr The base physical address of the configuration registers.
+ * \param length The size of the memory mapped region in bytes.
+ * \return A pointer to the memory mapped region.
+ */
+void *VTAMapRegister(unsigned addr, size_t length);
+
+/*!
+ * \brief Deletes the configuration register memory map.
+ * \param vta The memory mapped region.
+ * \param length The size of the memory mapped region in bytes.
+ */
+void VTAUnmapRegister(void *vta, size_t length);
+
+/*!
+ * \brief Writes to a memory mapped configuration register.
+ * \param vta_base The handle to the memory mapped configuration registers.
+ * \param offset The offset of the register to write to.
+ * \param val The value to be written to the memory mapped register.
+ */
+void VTAWriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
+
+/*!
+ * \brief Reads from the memory mapped configuration register.
+ * \param vta_base The handle to the memory mapped configuration registers.
+ * \param offset The offset of the register to read from.
+ * \return The value read from the memory mapped register.
+ */
+unsigned VTAReadMappedReg(VTAHandle vta_base, unsigned offset);
+
+/*!
+ * \brief Programming the bit stream on the FPGA.
+ * \param bitstream The path to the bit stream file.
+ */
+void VTAProgram(const char* bitstream);
+
+#ifdef __cplusplus
+}
+#endif
+#endif // VTA_DRIVER_H_
--- a/vta/include/vta_params.h
+++ b/vta/include/vta_params.h
@@ -3,8 +3,13 @@
 * \file vta_defines.h
 * \brief Preprocessor definitions for VTA HLS design and runtime.
 */
-#ifndef VTA_DEFINES_H_
-#define VTA_DEFINES_H_
+
+#ifndef VTA_HW_SPEC_H_
+#define VTA_HW_SPEC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif

 #include <stdint.h>

@@ -556,4 +561,7 @@ typedef struct {
  uint32_t wgt_idx    : LOG_WGT_BUFF_DEPTH;
 } VTAUop;

-#endif // VTA_DEFINES_H_
+#ifdef __cplusplus
+}
+#endif
+#endif // VTA_HW_SPEC_H_
--- a/vta/include/vta/runtime.h
+++ b/vta/include/vta/runtime.h
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file runtime.h
+ * \brief VTA runtime library.
+ */
+
+#ifndef VTA_RUNTIME_H_
+#define VTA_RUNTIME_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./driver.h"
+
+#define VTA_MEMCPY_H2D 1
+#define VTA_MEMCPY_D2H 2
+#define VTA_MEMCPY_D2D 3
+
+#define VTA_DEBUG_DUMP_INSN (1 << 1)
+#define VTA_DEBUG_DUMP_UOP (1 << 2)
+#define VTA_DEBUG_SKIP_READ_BARRIER (1 << 3)
+#define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4)
+#define VTA_DEBUG_FORCE_SERIAL (1 << 5)
+
+/*! \brief VTA command handle */
+typedef void * VTACommandHandle;
+
+/*! \brief Shutdown hook of VTA to cleanup resources */
+void VTARuntimeShutdown();
+
+/*!
+ * \brief Get thread local command handle.
+ * \return A thread local command handle.
+ */
+VTACommandHandle VTATLSCommandHandle();
+
+/*!
+ * \brief Allocate data buffer.
+ * \param cmd The VTA command handle.
+ * \param size Buffer size.
+ * \return A pointer to the allocated buffer.
+ */
+void* VTABufferAlloc(VTACommandHandle cmd, size_t size);
+
+/*!
+ * \brief Free data buffer.
+ * \param cmd The VTA command handle.
+ * \param buffer The data buffer to be freed.
+ */
+void VTABufferFree(VTACommandHandle cmd, void* buffer);
+
+/*!
+ * \brief Get the buffer access pointer on CPU.
+ * \param cmd The VTA command handle.
+ * \param buffer The data buffer.
+ * \return The pointer that can be accessed by the CPU.
+ */
+void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer);
+
+/*!
+ * \brief Copy data buffer from one location to another.
+ * \param cmd The VTA command handle.
+ * \param from The source buffer base address.
+ * \param from_offset The offset of the source buffer.
+ * \param to The target buffer base address.
+ * \param to_offset The offset of the target buffer.
+ * \param size Size of copy.
+ * \param kind_mask The memory copy kind.
+ */
+void VTABufferCopy(VTACommandHandle cmd,
+                   const void* from,
+                   size_t from_offset,
+                   void* to,
+                   size_t to_offset,
+                   size_t size,
+                   int kind_mask);
+
+/*!
+ * \brief Set debug mode on the command handle.
+ * \param cmd The VTA command handle.
+ * \param debug_flag The debug flag.
+ */
+void VTASetDebugMode(VTACommandHandle cmd, int debug_flag);
+
+/*!
+ * \brief Perform a write barrier to make a memory region visible to the CPU.
+ * \param cmd The VTA command handle.
+ * \param buffer The head buffer pointer.
+ * \param elem_bits The size in bits of each element.
+ * \param start The start of the region (in elements).
+ * \param extent The end of the region (in elements).
+ */
+void VTAWriteBarrier(VTACommandHandle cmd,
+                     void* buffer, uint32_t elem_bits,
+                     uint32_t start, uint32_t extent);
+
+/*!
+ * \brief Perform a read barrier to a memory region visible to VTA.
+ * \param cmd The VTA command handle.
+ * \param buffer The head buffer pointer.
+ * \param elem_bits The size in bits of each element.
+ * \param start The start of the region (in elements).
+ * \param extent The end of the region (in elements).
+ */
+void VTAReadBarrier(VTACommandHandle cmd,
+                    void* buffer, uint32_t elem_bits,
+                    uint32_t start, uint32_t extent);
+
+/*!
+ * \brief Perform a 2D data load from DRAM.
+ *  Sizes are measured in units of vector elements.
+ * \param cmd The VTA command handle.
+ * \param src_dram_addr Source DRAM address.
+ * \param src_elem_offset The source DRAM offset in number of unit elements.
+ * \param x_size The lowest dimension (x axis) size in number of unit elements.
+ * \param y_size The number of rows (y axis).
+ * \param x_stride The x axis stride.
+ * \param x_pad_before The start padding on x axis.
+ * \param y_pad_before The start padding on y axis.
+ * \param x_pad_after The end padding on x axis.
+ * \param y_pad_after The end padding of y axis.
+ * \param dst_sram_index Destination SRAM index.
+ * \param dst_memory_type Destination memory type.
+ */
+void VTALoadBuffer2D(VTACommandHandle cmd,
+                     void* src_dram_addr,
+                     uint32_t src_elem_offset,
+                     uint32_t x_size,
+                     uint32_t y_size,
+                     uint32_t x_stride,
+                     uint32_t x_pad_before,
+                     uint32_t y_pad_before,
+                     uint32_t x_pad_after,
+                     uint32_t y_pad_after,
+                     uint32_t dst_sram_index,
+                     uint32_t dst_memory_type);
+
+/*!
+ * \brief Perform a 2D data store into DRAM
+ *  Sizes are measured in units of vector elements.
+ * \param cmd The VTA command handle.
+ * \param src_sram_index Source SRAM index.
+ * \param src_memory_type Source memory type.
+ * \param dst_dram_addr Destination DRAM address.
+ * \param dst_elem_offset The destination DRAM offset in number of unit elements.
+ * \param x_size The lowest dimension (x axis) size in number of unit elements.
+ * \param y_size The number of rows.
+ * \param x_stride The x axis stride.
+ */
+void VTAStoreBuffer2D(VTACommandHandle cmd,
+                      uint32_t src_sram_index,
+                      uint32_t src_memory_type,
+                      void* dst_dram_addr,
+                      uint32_t dst_elem_offset,
+                      uint32_t x_size,
+                      uint32_t y_size,
+                      uint32_t x_stride);
+
+/*!
+ * \brief Push uop into kernel buffer.
+ * In GEMM mode, do a blocked GEMM with 2d access pattern.
+ * In ALU mode, do a vectorized ALU operation with 2d access pattern.
+ *
+ *  \code
+ *
+ *   DType accum[INP_BUFF_DEPTH][l][n];
+ *   DType weight[WGT_BUFF_DEPTH][n][m];
+ *   DType input[INP_BUFF_DEPTH][l][m];
+ *   if reset_out == 1
+ *    accum[dst_index] = 0
+ *   elif mode == 0
+ *    accum[dst_index] += GEMM(input[src_index], weight[wgt_index]);
+ *   else
+ *    if (use_imm)
+ *      accum[dst_index] = opcode(accum[dst_index], imm_val);
+ *    else
+ *      accum[dst_index] = opcode(accum[dst_index], accum[src_index]);
+ *
+ *  \endcode
+ *
+ * \param mode Selects GEMM mode if set to 0, ALU mode if set to 1.
+ * \param reset_out Resets the accum to 0.
+ * \param dst_index The accum memory index.
+ * \param src_index The input memory (gemm) / accum memory (alu) index.
+ * \param wgt_index The weight memory index.
+ * \param opcode The ALU opcode.
+ * \param use_imm Use immediate in ALU mode if set to true.
+ * \param imm_val Immediate value in ALU mode.
+ */
+void VTAUopPush(uint32_t mode,
+                uint32_t reset_out,
+                uint32_t dst_index,
+                uint32_t src_index,
+                uint32_t wgt_index,
+                uint32_t opcode,
+                uint32_t use_imm,
+                uint32_t imm_val);
+
+/*!
+ * \brief Mark start of a micro op loop.
+ * \param extent The extent of the loop.
+ * \param dst_factor The accum factor.
+ * \param src_factor The input factor.
+ * \param wgt_factor The weight factor.
+ */
+void VTAUopLoopBegin(uint32_t extent,
+                     uint32_t dst_factor,
+                     uint32_t src_factor,
+                     uint32_t wgt_factor);
+
+/*!
+ * \brief Mark end of a micro op loop.
+ */
+void VTAUopLoopEnd();
+
+/*!
+ * \brief Push GEMM uop kernel into the command handle.
+ * \param uop_handle The uop cache handle.
+ * \param finit The initialization function to initialize uop.
+ * \param signature The closure arguments of the finit.
+ * \param nbytes Number of bytes in the closure arguments.
+ * \return 0 if success.
+ */
+int VTAPushGEMMOp(void** uop_handle,
+                  int (*finit)(void*),
+                  void* signature,
+                  int nbytes);
+
+/*!
+ * \brief Push ALU uop kernel into the command handle.
+ * \param uop_handle The uop cache handle.
+ * \param finit The initialization function to initialize uop.
+ * \param signature The closure arguments of the finit.
+ * \param nbytes Number of bytes in the closure arguments.
+ * \return 0 if success.
+ */
+int VTAPushALUOp(void** uop_handle,
+                 int (*finit)(void*),
+                 void* signature,
+                 int nbytes);
+
+/*!
+ * \brief Push dependence token.
+ * \param cmd The VTA command handle.
+ * \param from_qid The source queue.
+ * \param to_qid The destination queue.
+ * \return 0 if success.
+ */
+int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid);
+
+/*!
+ * \brief Pop dependence signal.
+ * \param cmd The VTA command handle.
+ * \param from_qid The source queue.
+ * \param to_qid The destination queue.
+ * \return 0 if success.
+ */
+int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid);
+
+/*!
+ * \brief Synchronize the command handle.
+ *  Commit all the instructions to VTA and wait until
+ *  the accelerator finishes its job.
+ *  Perform all of the out-of-order DRAM stores.
+ * \param cmd The VTA command handle.
+ * \param wait_cycles The limit of poll cycles.
+ *
+ */
+void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // VTA_RUNTIME_H_
--- a/vta/make/config.mk
+++ b/vta/make/config.mk
@@ -25,3 +25,72 @@ ADD_LDFLAGS=

 # the additional compile flags you want to add
 ADD_CFLAGS=
+
+# the hardware target
+TARGET=PYNQ_TARGET
+
+#---------------------
+# VTA hardware parameters
+#--------------------
+
+#  Log of input/activation width in bits (default 3 -> 8 bits)
+LOG_INP_WIDTH = 3
+#  Log of kernel weight width in bits (default 3 -> 8 bits)
+LOG_WGT_WIDTH = 3
+#  Log of accum width in bits (default 5 -> 32 bits)
+LOG_ACC_WIDTH = 5
+#  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
+LOG_BATCH = 0
+#  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
+LOG_BLOCK_IN = 4
+#  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
+LOG_BLOCK_OUT = 4
+#  Log of uop buffer size in Bytes
+LOG_UOP_BUFF_SIZE = 15
+#  Log of inp buffer size in Bytes
+LOG_INP_BUFF_SIZE = 15
+#  Log of wgt buffer size in Bytes
+LOG_WGT_BUFF_SIZE = 15
+#  Log of acc buffer size in Bytes
+LOG_ACC_BUFF_SIZE = 17
+
+#---------------------
+# Derived VTA hardware parameters
+#--------------------
+
+#  Input width in bits
+INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
+#  Weight width in bits
+WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
+#  Log of output width in bits
+LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
+#  Output width in bits
+OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
+#  Tensor batch size
+BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
+#  Tensor inner block size
+IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_IN) ))" )
+#  Tensor outer block size
+OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_OUT) ))" )
+#  Uop buffer size in Bytes
+UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
+#  Inp buffer size in Bytes
+INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
+#  Wgt buffer size in Bytes
+WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
+#  Acc buffer size in Bytes
+ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
+#  Log of out buffer size in Bytes
+LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
+#  Out buffer size in Bytes
+OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
+
+# Update ADD_CFLAGS
+ADD_CFLAGS += \
+	-D$(TARGET) \
+	-DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
+	-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
+	-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_BLOCK_IN) -DLOG_BLOCK_OUT=$(LOG_BLOCK_OUT) \
+	-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
+	-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
+	-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
\ No newline at end of file
--- a/vta/src/driver/pynq/vta_pynq_driver.c
+++ b/vta/src/driver/pynq/vta_pynq_driver.c
@@ -4,15 +4,31 @@
 * \brief VTA driver for Pynq board.
 */

-#ifdef __cplusplus
-extern "C" {
-#endif
-#include "vta_pynq_driver.h"
-#ifdef __cplusplus
+#include <vta/driver.h>
+#include "./pynq_driver.h"
+
+
+void* VTAMemAlloc(size_t size, int cached) {
+  return cma_alloc(size, cached);
+}
+
+void VTAMemFree(void* buf) {
+  cma_free(buf);
+}
+
+uint32_t VTAGetMemPhysAddr(void* buf) {
+  return cma_get_phy_addr(buf);
+}
+
+void VTAFlushCache(void* buf, int size) {
+  xlnkFlushCache(buf, size);
+}
+
+void VTAInvalidateCache(void* buf, int size) {
+  xlnkInvalidateCache(buf, size);
 }
-#endif

-void *MapRegister(uint32_t addr, size_t length) {
+void *VTAMapRegister(uint32_t addr, size_t length) {

  // Align the base address with the pages
  uint32_t virt_base = addr & ~(getpagesize() - 1);
@@ -24,21 +40,21 @@ void *MapRegister(uint32_t addr, size_t length) {
  return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base);
 }

-void UnmapRegister(void *vta, size_t length) {
+void VTAUnmapRegister(void *vta, size_t length) {
  // Unmap memory
  int status = munmap(vta, length);
  assert(status==0);
 }

-void WriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
+void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
  *((volatile uint32_t *) (((char *) base_addr) + offset)) = val;
 }

-uint32_t ReadMappedReg(void* base_addr, uint32_t offset) {
+uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
  return *((volatile uint32_t *) (((char *) base_addr) + offset));
 }

-void ProgramVTA(const char* bitstream) {
+void VTAProgram(const char* bitstream) {

    int elem;
    FILE *src, *dst, *partial;

--- a/vta/include/vta_pynq_driver.h
+++ b/vta/include/vta_pynq_driver.h
@@ -23,7 +23,7 @@ extern "C" {
 #include <unistd.h>

 #ifdef __arm__
-#include "libxlnk_cma.h"
+#include <libxlnk_cma.h>
 #else
 void* cma_alloc(size_t size, int cached);
 void cma_free(void* buf);
@@ -32,31 +32,6 @@ void xlnkFlushCache(void* buf, int size);
 void xlnkInvalidateCache(void* buf, int size);
 #endif

-/*! \brief VTA command handle */
-typedef void * VTAHandle;
-
-/*! \brief DMA command handle */
-typedef struct {
-  /*! \brief Register map to the AXI DMA control registers*/
-  void *dma_register_map;
-  /*! \brief Transmit data descriptor*/
-  void *mm2s_descriptor_register_map;
-  /*! \brief Receive data descriptor*/
-  void *s2mm_descriptor_register_map;
-  /*! \brief Transmit data descriptor physical address*/
-  uint32_t mm2s_descriptor_phy;
-  /*! \brief Receive data descriptor physical address*/
-  uint32_t s2mm_descriptor_phy;
-  /*! \brief Descriptor size */
-  uint32_t descriptor_size;
-  /*! \brief Transaction count for tx channel */
-  uint32_t mm2s_count;
-  /*! \brief Transaction count for rx channel */
-  uint32_t s2mm_count;
-  /*! \brief Multi-channel mode enable */
-  int multichannel_en;
-} DMAHandle;
-
 /*! \brief partial bitstream status file path */
 #define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
 /*! \brief bitstream destination file path */
@@ -99,52 +74,8 @@ typedef struct {
 */
 #define VTA_STORE_ADDR    0x43C30000

-/*! \brief Memory management constants with libxlnk_cma */
-#define CACHED 1
-/*! \brief Memory management constants with libxlnk_cma */
-#define NOT_CACHED 0
-
-/*! \brief log2 of SDS buffer size limit */
-#define LOG_MAX_XFER 22
-/*! \brief SDS buffer size limit */
-#define MAX_XFER (1<<LOG_MAX_XFER)
-
-/*!
- * \brief Returns a memory map to FPGA configuration registers.
- * \param addr The base physical address of the configuration registers.
- * \param length The size of the memory mapped region in bytes.
- * \return A pointer to the memory mapped region.
- */
-void *MapRegister(unsigned addr, size_t length);
-
-/*!
- * \brief Deletes the configuration register memory map.
- * \param vta The memory mapped region.
- * \param length The size of the memory mapped region in bytes.
- */
-void UnmapRegister(void *vta, size_t length);
-
-/*!
- * \brief Writes to a memory mapped configuration register.
- * \param vta_base The handle to the memory mapped configuration registers.
- * \param offset The offset of the register to write to.
- * \param val The value to be written to the memory mapped register.
- */
-void WriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
-
-/*!
- * \brief Reads from the memory mapped configuration register.
- * \param vta_base The handle to the memory mapped configuration registers.
- * \param offset The offset of the register to read from.
- * \return The value read from the memory mapped register.
- */
-unsigned ReadMappedReg(VTAHandle vta_base, unsigned offset);
-
-/*!
- * \brief Programming the bit stream on the FPGA.
- * \param bitstream The path to the bit stream file.
- */
-void ProgramVTA(const char* bitstream);
+/*! \brief Buffer size limit */
+#define MAX_XFER (1<<22)

 #ifdef __cplusplus
 }

--- a/vta/src/runtime.cc
+++ b/vta/src/runtime.cc
--- a/vta/src/tvm/vta_device_api.cc
+++ b/vta/src/tvm/vta_device_api.cc
+// simply include the driver for now.
+#include <tvm/runtime/registry.h>
+#include <dmlc/thread_local.h>
+#include <vta/runtime.h>
+#include "../../tvm/src/runtime/workspace_pool.h"
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Pass a name through the "tvm.contrib.rpc.server.workpath" global function.
+ * \param name The name handed to the registered function.
+ * \return The value produced by the registered function, converted to std::string.
+ *  Presumably this is a path inside the RPC server's working directory -- TODO confirm.
+ */
+std::string VTARPCGetPath(const std::string& name) {
+  // Function-local static: the registry lookup is performed once and the
+  // resulting pointer is reused by later calls.
+  static const PackedFunc* f =
+      runtime::Registry::Get("tvm.contrib.rpc.server.workpath");
+  // Fail loudly if the global has not been registered.
+  CHECK(f != nullptr) << "require tvm.contrib.rpc.server.workpath";
+  return (*f)(name);
+}
+
+// Global functions that can be called
+// "tvm.contrib.vta.init": args[0] names a bitstream; the name is passed
+// through VTARPCGetPath and the resulting path is handed to VTAProgram.
+// Nothing is written to rv.
+TVM_REGISTER_GLOBAL("tvm.contrib.vta.init")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    std::string path = VTARPCGetPath(args[0]);
+    VTAProgram(path.c_str());
+    // Log message spelling fixed: "bistream" -> "bitstream".
+    LOG(INFO) << "VTA initialization end with bitstream " << path;
+  });
+
+// "tvm.contrib.rpc.server.shutdown": calls VTARuntimeShutdown().
+// Neither the arguments nor the return slot are used.
+TVM_REGISTER_GLOBAL("tvm.contrib.rpc.server.shutdown")
+.set_body([](TVMArgs /* args */, TVMRetValue* /* rv */) {
+    VTARuntimeShutdown();
+  });
+
+/*!
+ * \brief DeviceAPI implementation that forwards data-space management to the
+ *  VTA runtime calls VTABufferAlloc, VTABufferFree and VTABufferCopy, using
+ *  the handle returned by VTATLSCommandHandle().
+ */
+class VTADeviceAPI final : public DeviceAPI {
+ public:
+  /*! \brief No-op. */
+  void SetDevice(TVMContext ctx) final {}
+
+  /*!
+   * \brief Answer attribute queries.
+   *  Only kExist is handled (rv is set to 1); for any other kind rv is left untouched.
+   */
+  void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final {
+    if (kind != kExist) return;
+    *rv = 1;
+  }
+
+  /*!
+   * \brief Allocate a buffer of the given size through VTABufferAlloc.
+   *  ctx, alignment and type_hint are not used.
+   */
+  void* AllocDataSpace(TVMContext ctx,
+                       size_t size,
+                       size_t alignment,
+                       TVMType type_hint) final {
+    return VTABufferAlloc(VTATLSCommandHandle(), size);
+  }
+
+  /*! \brief Release a buffer through VTABufferFree. ctx is not used. */
+  void FreeDataSpace(TVMContext ctx, void* ptr) final {
+    VTABufferFree(VTATLSCommandHandle(), ptr);
+  }
+
+  /*!
+   * \brief Copy size bytes between two buffers through VTABufferCopy.
+   *  The last argument passed on is a mask: 2 is set when the source context
+   *  is not kDLCPU, 1 is set when the destination context is not kDLCPU.
+   *  stream is not used.
+   */
+  void CopyDataFromTo(const void* from,
+                      size_t from_offset,
+                      void* to,
+                      size_t to_offset,
+                      size_t size,
+                      TVMContext ctx_from,
+                      TVMContext ctx_to,
+                      TVMStreamHandle stream) final {
+    const int src_bit = (ctx_from.device_type != kDLCPU) ? 2 : 0;
+    const int dst_bit = (ctx_to.device_type != kDLCPU) ? 1 : 0;
+    VTABufferCopy(VTATLSCommandHandle(),
+                  from, from_offset,
+                  to, to_offset,
+                  size, src_bit | dst_bit);
+  }
+
+  /*! \brief No-op. */
+  void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
+  }
+
+  /*! \brief Defined out of line, after VTAWorkspacePool is declared. */
+  void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
+
+  /*! \brief Defined out of line, after VTAWorkspacePool is declared. */
+  void FreeWorkspace(TVMContext ctx, void* data) final;
+
+  /*! \brief Process-wide instance, created on first use. */
+  static const std::shared_ptr<VTADeviceAPI>& Global() {
+    static std::shared_ptr<VTADeviceAPI> inst = std::make_shared<VTADeviceAPI>();
+    return inst;
+  }
+};
+
+/*! \brief WorkspacePool constructed for device type kExtDev and the global VTADeviceAPI. */
+struct VTAWorkspacePool : public WorkspacePool {
+  VTAWorkspacePool()
+      : WorkspacePool(static_cast<DLDeviceType>(kExtDev), VTADeviceAPI::Global()) {}
+};
+
+/*! \brief Allocate from the calling thread's VTAWorkspacePool; type_hint is not used. */
+void* VTADeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) {
+  VTAWorkspacePool* pool = dmlc::ThreadLocalStore<VTAWorkspacePool>::Get();
+  return pool->AllocWorkspace(ctx, size);
+}
+
+/*! \brief Hand a workspace back to the calling thread's VTAWorkspacePool. */
+void VTADeviceAPI::FreeWorkspace(TVMContext ctx, void* data) {
+  VTAWorkspacePool* pool = dmlc::ThreadLocalStore<VTAWorkspacePool>::Get();
+  pool->FreeWorkspace(ctx, data);
+}
+
+// "device_api.ext_dev": returns the global VTADeviceAPI, upcast to DeviceAPI*
+// and then erased to void*. The arguments are not used.
+TVM_REGISTER_GLOBAL("device_api.ext_dev")
+.set_body([](TVMArgs /* args */, TVMRetValue* rv) {
+    *rv = static_cast<void*>(
+        static_cast<DeviceAPI*>(VTADeviceAPI::Global().get()));
+  });
+}  // namespace runtime
+}  // namespace tvm
--- a/vta/tests/driver/Makefile
+++ b/vta/tests/driver/Makefile
-CC ?= g++
-CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
-LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
-LIBS = -l:libsds_lib.so -l:libdma.so
-SRC_DIR = ../../src
-INCLUDE_DIR = ../../include
-DRIVER_DIR = $(SRC_DIR)/driver/pynq
-TESTLIB_DIR = $(SRC_DIR)/test
-VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
-SOURCES = vta_pynq_driver.c vta_test_lib.cc
-OBJECTS = vta_pynq_driver.o vta_test_lib.o driver_test.o
-EXECUTABLE = vta
-
-# VTA Parameters
-#  Log of input width in bits
-LOG_INP_WIDTH = 3
-#  Log of weight width in bits
-LOG_WGT_WIDTH = 3
-#  Log of accum width in bits
-LOG_ACC_WIDTH = 5
-#  Log of output width in bits
-LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
-#  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
-LOG_BATCH = 0
-#  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
-LOG_IN_BLOCK = 4
-#  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
-LOG_OUT_BLOCK = 4
-#  Log of uop buffer size in Bytes
-LOG_UOP_BUFF_SIZE = 15
-#  Log of inp buffer size in Bytes
-LOG_INP_BUFF_SIZE = 15
-#  Log of wgt buffer size in Bytes
-LOG_WGT_BUFF_SIZE = 15
-#  Log of acc buffer size in Bytes
-LOG_ACC_BUFF_SIZE = 17
-#  Log of out buffer size in Bytes
-LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
-
-# Define flags
-CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM \
-	-DDEBUG=0 -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
-	-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
-	-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_IN_BLOCK) -DLOG_BLOCK_OUT=$(LOG_OUT_BLOCK) \
-	-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
-	-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
-	-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
-
-# All Target
-all: $(EXECUTABLE)
-
-%.o: %.cc $(SOURCES)
-	$(CC) -c -o $@ $< $(CFLAGS)
-
-$(EXECUTABLE): $(OBJECTS)
-	$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)
-
-clean:
-	rm -rf *.o $(EXECUTABLE)
--- a/vta/src/test/vta_test_lib.cc
+++ b/vta/src/test/vta_test_lib.cc
@@ -4,7 +4,7 @@
 * \brief Test library for the VTA design simulation and driver tests.
 */

-#include "vta_test_lib.h"
+#include "./test_lib.h"

 const char* getOpcodeString(int opcode, bool use_imm) {
  // Returns string name
@@ -153,7 +153,7 @@ void free3dArray(T *** array, int rows, int cols, int depth) {

 void * allocBuffer(size_t num_bytes) {
 #ifdef NO_SIM
-  return cma_alloc(num_bytes, CACHED);
+  return VTAMemAlloc(num_bytes, CACHED);
 #else
  return malloc(num_bytes);
 #endif
@@ -161,7 +161,7 @@ void * allocBuffer(size_t num_bytes) {

 void freeBuffer(void * buffer) {
 #ifdef NO_SIM
-  return cma_free(buffer);
+  return VTAMemFree(buffer);
 #else
  return free(buffer);
 #endif
@@ -353,7 +353,7 @@ VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {

  // Allocate buffer
 #ifdef NO_SIM
-  VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
+  VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
 #else
  VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
 #endif
@@ -388,7 +388,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,

  // Allocate buffer
 #ifdef NO_SIM
-  VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
+  VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
 #else
  VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
 #endif
@@ -449,7 +449,7 @@ VTAUop * getMapALUUops(int vector_size, bool uop_compression) {

  // Allocate buffer
 #ifdef NO_SIM
-  VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
+  VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
 #else
  VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
 #endif
@@ -762,7 +762,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
  }

  // Compute reference output
-  inp_T **outputs_ref = alloc2dArray<inp_T>(batch, vector_size);
+  out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
  for (int i = 0; i < batch; i ++) {
    for (int j = 0; j < vector_size; j ++) {
      acc_T tmp = 0;
@@ -802,7 +802,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
        tmp = inputs[i][j] >> immediate[i / BATCH];
      }
      // Set
-      outputs_ref[i][j] = (inp_T) tmp;
+      outputs_ref[i][j] = (out_T) tmp;
    }
  }

@@ -811,7 +811,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
  packBuffer<acc_T, ACC_WIDTH>(bias_buf, inputs, batch, vector_size * input_sets, BATCH, BLOCK_OUT);

  // Prepare output buffer
-  inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets);
+  out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets);

 #ifdef NO_SIM
  // Invoke the VTA
@@ -833,8 +833,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
 #endif

  // Unpack output buffer
-  inp_T **outputs = alloc2dArray<inp_T>(batch, vector_size);
-  unpackBuffer<inp_T, INP_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT);
+  out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
+  unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT);

  // Correctness checks
  int err = 0;
@@ -853,8 +853,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
  // Free all allocated arrays
  free(immediate);
  free2dArray<acc_T>(inputs, batch, vector_size * input_sets);
-  free2dArray<inp_T>(outputs_ref, batch, vector_size);
-  free2dArray<inp_T>(outputs, batch, vector_size);
+  free2dArray<out_T>(outputs_ref, batch, vector_size);
+  free2dArray<out_T>(outputs, batch, vector_size);
  freeBuffer(insn_buf);
  freeBuffer(uop_buf);
  freeBuffer(bias_buf);
@@ -891,17 +891,17 @@ virtual_threads=%d\n",
  int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2;
  int uop_size = uop_compression ? block / BATCH * virtual_threads :
    block / BATCH * block / BLOCK_IN * block / BLOCK_OUT * virtual_threads;
-  int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT;
  int inp_size = batch / BATCH * in_feat / BLOCK_IN;
+  int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT;
  int out_size = batch / BATCH * out_feat / BLOCK_OUT;
  // Blocked buffer sizes (in terms of elements)
-  int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT;
  int inp_block_size = block / BATCH * block / BLOCK_IN;
+  int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT;
  int out_block_size = block / BATCH * block / BLOCK_OUT;
  // Make sure we don't exceed buffer bounds
  assert(uop_size <= UOP_BUFF_DEPTH);
-  assert(wgt_block_size <= WGT_BUFF_DEPTH);
  assert(inp_block_size <= INP_BUFF_DEPTH);
+  assert(wgt_block_size <= WGT_BUFF_DEPTH);
  assert(out_block_size <= ACC_BUFF_DEPTH);

  // Initialize instruction buffer
@@ -1017,15 +1017,15 @@ virtual_threads=%d\n",
  printMicroOp(uop_size, uop_buf);
 #endif

-  // Initialize weights
-  wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat);
  // Initialize inputs
  inp_T **inputs = allocInit2dArray<inp_T, INP_WIDTH>(batch, in_feat);
+  // Initialize weights
+  wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat);
  // Initialize biases
  acc_T **biases = allocInit2dArray<acc_T, ACC_WIDTH>(batch, out_feat);

  // Reference GEMM implementation
-  inp_T **outputs_ref = alloc2dArray<inp_T>(batch, out_feat);
+  out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat);
  for (int i = 0; i < batch; i ++) {
    for (int j = 0; j < out_feat; j ++) {
      acc_T sum = biases[i][j];
@@ -1033,21 +1033,21 @@ virtual_threads=%d\n",
        sum += (acc_T) (inputs[i][k] * weights[j][k]);
      }
      // Set
-      outputs_ref[i][j] = (inp_T) sum;
+      outputs_ref[i][j] = (out_T) sum;
    }
  }

-  // Prepare the weight buffer
-  wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size);
-  packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN);
  // Prepare the input buffer
  inp_T *input_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * inp_size);
  packBuffer<inp_T, INP_WIDTH>(input_buf, inputs, batch, in_feat, BATCH, BLOCK_IN);
+  // Prepare the weight buffer
+  wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size);
+  packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN);
  // Prepare the bias buffer
  acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * out_size);
  packBuffer<acc_T, ACC_WIDTH>(bias_buf, biases, batch, out_feat, BATCH, BLOCK_OUT);
  // Prepare the output buffer
-  inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * out_size);
+  out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * out_size);

 #ifdef NO_SIM
  // Invoke the VTA
@@ -1069,8 +1069,8 @@ virtual_threads=%d\n",
 #endif

  // Unpack output data
-  inp_T **outputs = alloc2dArray<inp_T>(batch, out_feat);
-  unpackBuffer<inp_T, INP_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT);
+  out_T **outputs = alloc2dArray<out_T>(batch, out_feat);
+  unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT);

  // Correctness checks
  int err = 0;
@@ -1087,15 +1087,15 @@ virtual_threads=%d\n",
  }

  // Free all allocated arrays
-  free2dArray<wgt_T>(weights, out_feat, in_feat);
  free2dArray<inp_T>(inputs, batch, in_feat);
+  free2dArray<wgt_T>(weights, out_feat, in_feat);
  free2dArray<acc_T>(biases, batch, out_feat);
-  free2dArray<inp_T>(outputs_ref, batch, out_feat);
-  free2dArray<inp_T>(outputs, batch, out_feat);
+  free2dArray<out_T>(outputs_ref, batch, out_feat);
+  free2dArray<out_T>(outputs, batch, out_feat);
  freeBuffer((void *) insn_buf);
  freeBuffer((void *) uop_buf);
-  freeBuffer((void *) weight_buf);
  freeBuffer((void *) input_buf);
+  freeBuffer((void *) weight_buf);
  freeBuffer((void *) bias_buf);
  freeBuffer((void *) output_buf);


--- a/vta/include/vta_test_lib.h
+++ b/vta/include/vta_test_lib.h
@@ -7,21 +7,25 @@
 #ifndef VTA_TESTLIB_H_
 #define VTA_TESTLIB_H_

-#include "vta_params.h"
-
 #include <assert.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <vta/hw_spec.h>

 #ifdef NO_SIM

-#include "vta_pynq_driver.h"
+#include <vta/driver.h>
+
+#ifdef PYNQ_TARGET
+#include "../../../src/pynq/pynq_driver.h"
+#endif //PYNQ_TARGET

 typedef uint64_t axi_T;
 typedef uint32_t uop_T;
 typedef int8_t wgt_T;
 typedef int8_t inp_T;
+typedef int8_t out_T;
 typedef int32_t acc_T;

 uint64_t vta (
@@ -35,8 +39,7 @@ uint64_t vta (

 #else //NO_SIM

-#include "vta.h"
-#include "vta_typedefs.h"
+#include "../../../hardware/vivado/src/vta.h"

 #endif //NO_SIM


--- a/vta/tests/hardware/pynq/Makefile
+++ b/vta/tests/hardware/pynq/Makefile
+# Toolchain and layout for the Pynq hardware test.
+# GNU make predefines CC (as "cc"), so `CC ?= g++` would never take effect.
+# Replace only that built-in default; a CC supplied by the environment or on
+# the command line is still honoured.
+ifeq ($(origin CC),default)
+CC = g++
+endif
+CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
+LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
+LIBS = -l:libsds_lib.so -l:libdma.so
+INCLUDE_DIR = ../../../include
+DRIVER_DIR = ../../../src/pynq
+TESTLIB_DIR = ../common
+# Sources listed without a directory are searched for in these directories.
+VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
+SOURCES = pynq_driver.cc test_lib.cc
+OBJECTS = pynq_driver.o test_lib.o metal_test.o
+EXECUTABLE = vta
+
+# Include top-level config file.
+# Unless `config` is already set, use ../../../config.mk when it exists and
+# fall back to ../../../make/config.mk otherwise.
+# The assignments are indented with spaces, not a TAB: a TAB-indented line is
+# taken as recipe text whenever a rule precedes it.
+ifndef config
+ifneq ("$(wildcard ../../../config.mk)", "")
+  config = ../../../config.mk
+else
+  config = ../../../make/config.mk
+endif
+endif
+include $(config)
+
+# Define flags
+CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DDEBUG=0
+CFLAGS += $(ADD_CFLAGS)
+
+# `all` and `clean` name commands, not files; without this declaration a file
+# called "all" or "clean" in this directory would silently disable them.
+.PHONY: all clean
+
+# All Target
+all: $(EXECUTABLE)
+
+# Compile one object per .cc source (located through VPATH). Every object also
+# lists $(SOURCES) as prerequisites, so a change to any of them rebuilds all.
+%.o: %.cc $(SOURCES)
+	$(CC) -c -o $@ $< $(CFLAGS)
+
+# Link the test executable; libraries go last on the command line.
+$(EXECUTABLE): $(OBJECTS)
+	$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)
+
+# Remove the objects and the executable built in this directory.
+clean:
+	rm -rf *.o $(EXECUTABLE)
--- a/vta/tests/driver/driver_test.cc
+++ b/vta/tests/driver/driver_test.cc
@@ -9,8 +9,9 @@
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
-#include "vta_test_lib.h"
-#include "vta_pynq_driver.h"
+#include <vta/driver.h>
+#include "../../../src/pynq/pynq_driver.h"
+#include "../common/test_lib.h"

 // VTA invocation (present the same abstraction as in the simulation tests)
 uint64_t vta (
@@ -43,18 +44,18 @@ uint64_t vta (
 #endif

    // Program VTA
-    ProgramVTA(bitstream);
+    VTAProgram(bitstream);
    // Get VTA handles
-    VTAHandle vta_fetch_handle = MapRegister(VTA_FETCH_ADDR, VTA_RANGE);
-    VTAHandle vta_load_handle = MapRegister(VTA_LOAD_ADDR, VTA_RANGE);
-    VTAHandle vta_compute_handle = MapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
-    VTAHandle vta_store_handle = MapRegister(VTA_STORE_ADDR, VTA_RANGE);
+    VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
+    VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
+    VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
+    VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);

    // Physical address pointers
    uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
    uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
-    uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
    uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
+    uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
    uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
    uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;

@@ -65,29 +66,29 @@ uint64_t vta (
    clock_gettime(CLOCK_REALTIME, &start);

    // FETCH @ 0x10 : Data signal of insn_count_V
-    WriteMappedReg(vta_fetch_handle, 0x10, insn_count);
+    VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
    // FETCH @ 0x18 : Data signal of insns_V
-    if (insns) WriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
-    // LOAD @ 0x10 : Data signal of weight_V
-    if (weights) WriteMappedReg(vta_load_handle, 0x10, weight_phy);
-    // LOAD @ 0x18 : Data signal of inputs_V
-    if (inputs) WriteMappedReg(vta_load_handle, 0x18, input_phy);
+    if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
+    // LOAD @ 0x10 : Data signal of inputs_V
+    if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
+    // LOAD @ 0x18 : Data signal of weight_V
+    if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
    // COMPUTE @ 0x20 : Data signal of uops_V
-    if (uops) WriteMappedReg(vta_compute_handle, 0x20, uop_phy);
+    if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
    // COMPUTE @ 0x28 : Data signal of biases_V
-    if (biases) WriteMappedReg(vta_compute_handle, 0x28, bias_phy);
+    if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
    // STORE @ 0x10 : Data signal of outputs_V
-    if (outputs) WriteMappedReg(vta_store_handle, 0x10, output_phy);
+    if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);

    // VTA start
-    WriteMappedReg(vta_fetch_handle, 0x0, 0x1);
-    WriteMappedReg(vta_load_handle, 0x0, 0x81);
-    WriteMappedReg(vta_compute_handle, 0x0, 0x81);
-    WriteMappedReg(vta_store_handle, 0x0, 0x81);
+    VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
+    VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
+    VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
+    VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);

    int flag = 0, t = 0;
    for (t = 0; t < 10000000; ++t) {
-      flag = ReadMappedReg(vta_compute_handle, 0x18);
+      flag = VTAReadMappedReg(vta_compute_handle, 0x18);
      if (flag & VTA_DONE) break;
    }

@@ -104,10 +105,10 @@ uint64_t vta (
    t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);

    // Unmap VTA register
-    UnmapRegister(vta_fetch_handle, VTA_RANGE);
-    UnmapRegister(vta_load_handle, VTA_RANGE);
-    UnmapRegister(vta_compute_handle, VTA_RANGE);
-    UnmapRegister(vta_store_handle, VTA_RANGE);
+    VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
+    VTAUnmapRegister(vta_load_handle, VTA_RANGE);
+    VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
+    VTAUnmapRegister(vta_store_handle, VTA_RANGE);

    return t_fpga;
 };