Commit 28a10b69 by Thierry Moreau Committed by Tianqi Chen

[REFACTOR] Code base refactoring (#5)

parent 0979e9aa
......@@ -54,9 +54,13 @@ endif
all: lib/libvta.$(SHARED_LIBRARY_SUFFIX)
# Legacy object list: every .cc file directly under src/.
# The glob is listed once; naming src/*.cc twice made every source (and
# therefore every object) appear twice in the list.
# `:=` evaluates the wildcard/patsubst once instead of on every reference.
SRC := $(wildcard src/*.cc)
ALL_OBJ := $(patsubst %.cc,build/%.o,$(SRC))
ALL_DEP = $(ALL_OBJ)
# Sources that make up the VTA library: src/ and src/tvm/.
VTA_LIB_SRC := $(wildcard src/*.cc src/tvm/*.cc)
ifeq ($(TARGET), PYNQ_TARGET)
# Pynq target: also build src/pynq/ and link against sds_lib and libdma.
VTA_LIB_SRC += $(wildcard src/pynq/*.cc)
LDFLAGS += -L/usr/lib -lsds_lib
LDFLAGS += -L/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/ -l:libdma.so
endif
# Must stay below the ifeq block so the Pynq sources are included.
VTA_LIB_OBJ := $(patsubst %.cc,build/%.o,$(VTA_LIB_SRC))
test: $(TEST)
......@@ -65,7 +69,7 @@ build/src/%.o: src/%.cc
$(CXX) $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d
$(CXX) -c $(CFLAGS) -c $< -o $@
# Link the VTA shared library.
# Both header lines below name the same target, so make merges their
# prerequisites ($(ALL_DEP) and $(VTA_LIB_OBJ)). The recipe creates the output
# directory first and passes only the .o prerequisites to the linker.
lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(ALL_DEP)
lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ)
@mkdir -p $(@D)
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o, $^) $(LDFLAGS)
......
#!/bin/bash
# Starts the TVM RPC server and asks it to load the VTA library.
#
# ${VAR:+${VAR}:} expands to "<old value>:" only when VAR is set and non-empty.
# Plain ${VAR}:/path would produce ":/path" when VAR is unset; for
# LD_LIBRARY_PATH that empty leading entry makes the dynamic loader search the
# current working directory.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/home/xilinx/tvm/python"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/"
python -m tvm.exec.rpc_server --load-library /home/xilinx/vta/lib/libvta.so
......@@ -2,9 +2,9 @@
# Directory layout, all derived from the directory make is run in.
# NOTE(review): SRC_DIR and TEST_DIR are each assigned twice below; the second
# assignment of each replaces the first, so the effective values are
# $(ROOTDIR)/src and $(ROOTDIR)/../../tests/hardware/common.
ROOTDIR = $(CURDIR)
BUILD_DIR = $(ROOTDIR)/build
SCRIPT_DIR = $(ROOTDIR)/scripts
SRC_DIR = $(ROOTDIR)/../../src/hardware/hls
SRC_DIR = $(ROOTDIR)/src
SIM_DIR = $(ROOTDIR)/sim
TEST_DIR = $(ROOTDIR)/../../src/test
TEST_DIR = $(ROOTDIR)/../../tests/hardware/common
INCLUDE_DIR = $(ROOTDIR)/../../include
# Executables
......@@ -12,59 +12,28 @@ VIVADO_HLS = vivado_hls
VIVADO = vivado
HSI = hsi
# Build parameters:
# Pull in the top-level configuration.
# Unless the caller already provided `config`, use ../../config.mk when it
# exists and fall back to ../../make/config.mk otherwise.
ifndef config
config := $(if $(wildcard ../../config.mk),../../config.mk,../../make/config.mk)
endif
include $(config)
#---------------------
# Compilation parameters
#--------------------
# Number of threads during compilation
NUM_THREADS = 8
# Target Frequency
# NOTE(review): TARGET_PER below computes ceil(1000 / CLOCK_FREQ), which reads
# as MHz in and nanoseconds out - confirm the units.
CLOCK_FREQ = 100
# Log of input width in bits
LOG_INP_WIDTH = 3
# Log of weight width in bits
LOG_WGT_WIDTH = 3
# Log of accum width in bits
LOG_ACC_WIDTH = 5
# Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_IN_BLOCK = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_OUT_BLOCK = 4
# Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17
# Log of out buffer size in Bytes
# (acc buffer size scaled by the output/accumulator width ratio)
# Every $(shell ...) value below uses `:=` so the shell runs once at parse time
# instead of on each expansion; all inputs are defined above this point.
LOG_OUT_BUFF_SIZE := $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
# Derived parameters (each is 1 << the matching LOG_ value)
# Input width in bits
INP_WIDTH := $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
# Weight width in bits
WGT_WIDTH := $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
# Output width in bits
OUT_WIDTH := $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
# Tensor batch size
BATCH := $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
# Tensor inner block size
IN_BLOCK := $(shell echo "$$(( 1 << $(LOG_IN_BLOCK) ))" )
# Tensor outer block size
OUT_BLOCK := $(shell echo "$$(( 1 << $(LOG_OUT_BLOCK) ))" )
# Uop buffer size in Bytes
UOP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
# Inp buffer size in Bytes
INP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
# Wgt buffer size in Bytes
WGT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
# Acc buffer size in Bytes
ACC_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
# Out buffer size in Bytes
OUT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
# Timing closure compensation (0 for none, 3 for highest)
TIMING_CLOSURE_COMP = 0
# Derive clock target period: 1000 / CLOCK_FREQ rounded up.
# NOTE(review): the trailing "- 0" looks like a placeholder for
# TIMING_CLOSURE_COMP, which is not referenced here - confirm before wiring it in.
TARGET_PER := $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" )
......@@ -85,7 +54,7 @@ ip:
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
$(LOG_BATCH) $(LOG_OUT_BLOCK) $(LOG_IN_BLOCK) \
$(LOG_BATCH) $(LOG_BLOCK_OUT) $(LOG_BLOCK_IN) \
$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)
......
......@@ -62,7 +62,7 @@ if { [llength $argv] eq 19 } {
}
# C define flags to pass to compiler
set cflags "-I $include_dir -I $include_dir/hardware/hls \
set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
......@@ -127,7 +127,7 @@ open_project vta_sim
# Simulation project setup: `vta` is the top-level function.
set_top vta
# Design source, compiled with the shared C flags.
add_files $src_dir/vta.cc -cflags $cflags
# Files registered with -tb (test bench): the simulation driver plus the test
# library. NOTE(review): both vta_test_lib.cc and test_lib.cc are added from
# $test_dir - confirm both files are meant to exist.
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags
add_files -tb $test_dir/test_lib.cc -cflags $cflags
open_solution "solution0"
# init_design is not defined in this excerpt; it receives the target period and
# the design parameters.
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csim_design -clean
......
......@@ -8,8 +8,8 @@
#include <stdlib.h>
#include <iostream>
#include "vta.h"
#include "vta_test_lib.h"
#include "../src/vta.h"
#include "../../../tests/hardware/common/test_lib.h"
int main(void)
{
......
......@@ -8,7 +8,7 @@
#include <stdlib.h>
#include <string.h>
#include "vta.h"
#include "./vta.h"
void fetch (
uint32_t insn_count,
......
......@@ -11,8 +11,88 @@
#include <ap_int.h>
#include <hls_stream.h>
#include "vta_typedefs.h"
#include "vta_params.h"
#include <vta/hw_spec.h>
/* \typedef uop_T Micro-op datatype*/
typedef ap_uint<UOP_WIDTH> uop_T;
/* \typedef inp_T Input datatype*/
typedef ap_int<INP_WIDTH> inp_T;
/* \typedef wgt_T Weight datatype*/
typedef ap_int<WGT_WIDTH> wgt_T;
/* \typedef out_T Output datatype*/
typedef ap_int<OUT_WIDTH> out_T;
/* \typedef acc_T Accumulator datatype*/
typedef ap_int<ACC_WIDTH> acc_T;
/* \typedef mul_T Multiplier output datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
/* \typedef inp_vec_T Input vector datatype*/
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
/* \typedef wgt_vec_T Weight vector datatype*/
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
/* \typedef acc_vec_T Accumulator vector datatype*/
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
/* \typedef out_vec_T Output vector datatype*/
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
/* \typedef inp_idx_T Input SRAM index datatype*/
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
/* \typedef wgt_idx_T Weight SRAM index datatype*/
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
/* \typedef acc_idx_T Accumulator SRAM index datatype*/
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
/* \typedef opcode_T Opcode datatype*/
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
/* \typedef insn_T Instruction datatype*/
typedef ap_uint<INS_WIDTH> insn_T;
/* \typedef loop_T Loop bound datatype*/
typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
/* \typedef memop_id_T Memory operation ID datatype*/
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
/* \typedef memop_sram_T Memory operation SRAM index datatype*/
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
/* \typedef memop_dram_T Memory operation DRAM index datatype*/
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
/* \typedef memop_size_T Memory operation range datatype*/
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
/* \typedef memop_stride_T Memory operation stride datatype*/
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
/* \typedef memop_pad_T Memory operation pad width datatype*/
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_imm_T ALU operation immediate datatype*/
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
/*!
* \brief Fetch module.
......
/*!
* Copyright (c) 2018 by Contributors
* \file vta_typedefs.h
* \brief Type definitions for VTA HLS design.
*/
#ifndef VTA_TYPEDEFS_H_
#define VTA_TYPEDEFS_H_
#include <assert.h>
#include <ap_axi_sdata.h>
#include <ap_int.h>
#include <hls_stream.h>
#include "vta_params.h"
/* \typedef uop_T Micro-op datatype*/
typedef ap_uint<UOP_WIDTH> uop_T;
/* \typedef inp_T Input datatype*/
typedef ap_int<INP_WIDTH> inp_T;
/* \typedef wgt_T Weight datatype*/
typedef ap_int<WGT_WIDTH> wgt_T;
/* \typedef out_T Output datatype*/
typedef ap_int<OUT_WIDTH> out_T;
/* \typedef acc_T Accumulator datatype*/
typedef ap_int<ACC_WIDTH> acc_T;
/* \typedef mul_T Multiplier output datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
/* \typedef inp_vec_T Input vector datatype*/
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
/* \typedef wgt_vec_T Weight vector datatype*/
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
/* \typedef acc_vec_T Accumulator vector datatype*/
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
/* \typedef out_vec_T Output vector datatype*/
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
/* \typedef inp_idx_T Input SRAM index datatype*/
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
/* \typedef wgt_idx_T Weight SRAM index datatype*/
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
/* \typedef acc_idx_T Accumulator SRAM index datatype*/
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
/* \typedef opcode_T Opcode datatype*/
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
/* \typedef insn_T Instruction datatype*/
typedef ap_uint<INS_WIDTH> insn_T;
/* \typedef loop_T Loop bound datatype*/
typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
/* \typedef memop_id_T Memory operation ID datatype*/
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
/* \typedef memop_sram_T Memory operation SRAM index datatype*/
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
/* \typedef memop_dram_T Memory operation DRAM index datatype*/
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
/* \typedef memop_size_T Memory operation range datatype*/
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
/* \typedef memop_stride_T Memory operation stride datatype*/
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
/* \typedef memop_pad_T Memory operation pad width datatype*/
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_imm_T ALU operation immediate datatype*/
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
#endif // VTA_TYPEDEFS_H_
/*!
* Copyright (c) 2018 by Contributors
* \file vta_driver.h
* \brief General driver interface.
*/
#ifndef VTA_DRIVER_H_
#define VTA_DRIVER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdlib.h>
#include <stdint.h>
/*! \brief Memory management constants with libxlnk_cma */
#define CACHED 1
/*! \brief Memory management constants with libxlnk_cma */
#define NOT_CACHED 0
/*! \brief VTA command handle */
typedef void * VTAHandle;
/*!
* \brief Allocates physically contiguous region in memory (limited by MAX_XFER).
* \param size Size of the region in Bytes.
* \param cached Region can be set to not cached (write-back) if set to 0.
* \return A pointer to the allocated region.
*/
void* VTAMemAlloc(size_t size, int cached);
/*!
* \brief Frees a physically contiguous region in memory.
* \param buf Buffer to free.
*/
void VTAMemFree(void* buf);
/*!
* \brief Returns a physical address to the region of memory allocated with VTAMemAlloc.
* \param buf Pointer to memory region allocated with VTAMemAlloc.
* \return The physical address of the memory region.
*/
uint32_t VTAGetMemPhysAddr(void* buf);
/*!
* \brief Flushes the region of memory out of the CPU cache to DRAM.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed.
* \param size Size of the region to flush in Bytes.
*/
void VTAFlushCache(void* buf, int size);
/*!
* \brief Invalidates the region of memory that is cached.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated.
* \param size Size of the region to invalidate in Bytes.
*/
void VTAInvalidateCache(void* buf, int size);
/*!
* \brief Returns a memory map to FPGA configuration registers.
* \param addr The base physical address of the configuration registers.
* \param length The size of the memory mapped region in bytes.
* \return A pointer to the memory mapped region.
*/
void *VTAMapRegister(unsigned addr, size_t length);
/*!
* \brief Deletes the configuration register memory map.
* \param vta The memory mapped region.
* \param length The size of the memory mapped region in bytes.
*/
void VTAUnmapRegister(void *vta, size_t length);
/*!
* \brief Writes to a memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to write to.
* \param val The value to be written to the memory mapped register.
*/
void VTAWriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
/*!
* \brief Reads from the memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to read from.
* \return The value read from the memory mapped register.
*/
unsigned VTAReadMappedReg(VTAHandle vta_base, unsigned offset);
/*!
* \brief Programming the bit stream on the FPGA.
* \param bitstream The path to the bit stream file.
*/
void VTAProgram(const char* bitstream);
#ifdef __cplusplus
}
#endif
#endif // VTA_DRIVER_H_
......@@ -3,8 +3,13 @@
* \file vta_defines.h
* \brief Preprocessor definitions for VTA HLS design and runtime.
*/
#ifndef VTA_DEFINES_H_
#define VTA_DEFINES_H_
#ifndef VTA_HW_SPEC_H_
#define VTA_HW_SPEC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
......@@ -556,4 +561,7 @@ typedef struct {
uint32_t wgt_idx : LOG_WGT_BUFF_DEPTH;
} VTAUop;
#endif // VTA_DEFINES_H_
#ifdef __cplusplus
}
#endif
#endif // VTA_HW_SPEC_H_
/*!
* Copyright (c) 2018 by Contributors
* \file runtime.h
* \brief VTA runtime library.
*/
#ifndef VTA_RUNTIME_H_
#define VTA_RUNTIME_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./driver.h"
/*!
 * \brief Memory copy kinds (see the kind_mask parameter of VTABufferCopy).
 * D2D has the same value as H2D | D2H.
 * NOTE(review): the names suggest H2D = host to device, D2H = device to host,
 * D2D = device to device - confirm against the runtime implementation.
 */
#define VTA_MEMCPY_H2D 1
#define VTA_MEMCPY_D2H 2
#define VTA_MEMCPY_D2D 3
/*!
 * \brief Debug flag bits; each value is a distinct power of two (bit 0 is not used).
 * NOTE(review): presumably OR-ed together and passed as debug_flag to
 * VTASetDebugMode - confirm.
 */
#define VTA_DEBUG_DUMP_INSN (1 << 1)
#define VTA_DEBUG_DUMP_UOP (1 << 2)
#define VTA_DEBUG_SKIP_READ_BARRIER (1 << 3)
#define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4)
#define VTA_DEBUG_FORCE_SERIAL (1 << 5)
/*! \brief VTA command handle */
typedef void * VTACommandHandle;
/*! \brief Shutdown hook of VTA to cleanup resources */
void VTARuntimeShutdown();
/*!
* \brief Get thread local command handle.
* \return A thread local command handle.
*/
VTACommandHandle VTATLSCommandHandle();
/*!
* \brief Allocate data buffer.
* \param cmd The VTA command handle.
* \param size Buffer size.
* \return A pointer to the allocated buffer.
*/
void* VTABufferAlloc(VTACommandHandle cmd, size_t size);
/*!
* \brief Free data buffer.
* \param cmd The VTA command handle.
* \param buffer The data buffer to be freed.
*/
void VTABufferFree(VTACommandHandle cmd, void* buffer);
/*!
* \brief Get the buffer access pointer on CPU.
* \param cmd The VTA command handle.
* \param buffer The data buffer.
* \return The pointer that can be accessed by the CPU.
*/
void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer);
/*!
* \brief Copy data buffer from one location to another.
* \param cmd The VTA command handle.
* \param from The source buffer base address.
* \param from_offset The offset of the source buffer.
* \param to The target buffer base address.
* \param to_offset The offset of the target buffer.
* \param size Size of copy.
* \param kind_mask The memory copy kind.
*/
void VTABufferCopy(VTACommandHandle cmd,
const void* from,
size_t from_offset,
void* to,
size_t to_offset,
size_t size,
int kind_mask);
/*!
* \brief Set debug mode on the command handle.
* \param cmd The VTA command handle.
* \param debug_flag The debug flag.
*/
void VTASetDebugMode(VTACommandHandle cmd, int debug_flag);
/*!
* \brief Perform a write barrier to make a memory region visible to the CPU.
* \param cmd The VTA command handle.
* \param buffer The head buffer pointer.
* \param elem_bits The size in bits of each element.
* \param start The start of the region (in elements).
* \param extent The end of the region (in elements).
*/
void VTAWriteBarrier(VTACommandHandle cmd,
void* buffer, uint32_t elem_bits,
uint32_t start, uint32_t extent);
/*!
* \brief Perform a read barrier to a memory region visible to VTA.
* \param cmd The VTA command handle.
* \param buffer The head buffer pointer.
* \param elem_bits The size in bits of each element.
* \param start The start of the region (in elements).
* \param extent The end of the region (in elements).
*/
void VTAReadBarrier(VTACommandHandle cmd,
void* buffer, uint32_t elem_bits,
uint32_t start, uint32_t extent);
/*!
* \brief Perform a 2D data load from DRAM.
* Sizes are measured in units of vector elements.
* \param cmd The VTA command handle.
* \param src_dram_addr Source DRAM address.
* \param src_elem_offset The source DRAM offset in number of unit elements.
* \param x_size The lowest dimension (x axis) size in number of unit elements.
* \param y_size The number of rows (y axis).
* \param x_stride The x axis stride.
* \param x_pad_before The start padding on x axis.
* \param y_pad_before The start padding on y axis.
* \param x_pad_after The end padding on x axis.
* \param y_pad_after The end padding of y axis.
* \param dst_sram_index Destination SRAM index.
* \param dst_memory_type Destination memory type.
*/
void VTALoadBuffer2D(VTACommandHandle cmd,
void* src_dram_addr,
uint32_t src_elem_offset,
uint32_t x_size,
uint32_t y_size,
uint32_t x_stride,
uint32_t x_pad_before,
uint32_t y_pad_before,
uint32_t x_pad_after,
uint32_t y_pad_after,
uint32_t dst_sram_index,
uint32_t dst_memory_type);
/*!
* \brief Perform a 2D data store into DRAM.
* Sizes are measured in units of vector elements.
* \param cmd The VTA command handle.
* \param src_sram_index Source SRAM index.
* \param src_memory_type Source memory type.
* \param dst_dram_addr Destination DRAM address.
* \param dst_elem_offset The destination DRAM offset, presumably in number of
*        unit elements like src_elem_offset of VTALoadBuffer2D (TODO confirm).
* \param x_size The lowest dimension (x axis) size in number of unit elements.
* \param y_size The number of rows.
* \param x_stride The x axis stride.
*/
void VTAStoreBuffer2D(VTACommandHandle cmd,
uint32_t src_sram_index,
uint32_t src_memory_type,
void* dst_dram_addr,
uint32_t dst_elem_offset,
uint32_t x_size,
uint32_t y_size,
uint32_t x_stride);
/*!
* \brief Push uop into kernel buffer.
* In GEMM mode, do a blocked GEMM with 2d access pattern.
* In ALU mode, do a vectorized ALU operation with 2d access pattern.
*
* \code
*
* DType accum[INP_BUFF_DEPTH][l][n];
* DType weight[WGT_BUFF_DEPTH][n][m];
* DType input[INP_BUFF_DEPTH][l][m];
* if reset_out == 1
* accum[dst_index] = 0
* elif mode == 0
* accum[dst_index] += GEMM(input[src_index], weight[wgt_index]);
* else
* if (use_imm)
* accum[dst_index] = opcode(accum[dst_index], imm_val);
* else
* accum[dst_index] = opcode(accum[dst_index], accum[src_index]);
*
* \endcode
*
* \param mode Set to GEMM mode if set to 0, ALU mode is set to 1.
* \param reset_out Resets the accum to 0.
* \param dst_index The accum memory index.
* \param src_index The input memory (gemm) / accum memory (alu) index.
* \param wgt_index The weight memory index.
* \param opcode The ALU opcode.
* \param use_imm Use immediate in ALU mode if set to true.
* \param imm_val Immediate value in ALU mode.
*/
void VTAUopPush(uint32_t mode,
uint32_t reset_out,
uint32_t dst_index,
uint32_t src_index,
uint32_t wgt_index,
uint32_t opcode,
uint32_t use_imm,
uint32_t imm_val);
/*!
* \brief Mark start of a micro op loop.
* \param extent The extent of the loop.
* \param dst_factor The accum factor.
* \param src_factor The input factor.
* \param wgt_factor The weight factor.
*/
void VTAUopLoopBegin(uint32_t extent,
uint32_t dst_factor,
uint32_t src_factor,
uint32_t wgt_factor);
/*!
* \brief Mark end of a micro op loop.
*/
void VTAUopLoopEnd();
/*!
* \brief Push GEMM uop kernel into the command handle.
* \param uop_handle The uop cache handle.
* \param finit The initialization function to initialize uop.
* \param signature The closure arguments of the finit.
* \param nbytes Number of bytes in the closure arguments.
* \return 0 if success.
*/
int VTAPushGEMMOp(void** uop_handle,
int (*finit)(void*),
void* signature,
int nbytes);
/*!
* \brief Push ALU uop kernel into the command handle.
* \param uop_handle The uop cache handle.
* \param finit The initialization function to initialize uop.
* \param signature The closure arguments of the finit.
* \param nbytes Number of bytes in the closure arguments.
* \return 0 if success.
*/
int VTAPushALUOp(void** uop_handle,
int (*finit)(void*),
void* signature,
int nbytes);
/*!
* \brief Push dependence token.
* \param cmd The VTA command handle.
* \param from_qid The source queue.
* \param to_qid The destination queue.
* \return 0 if success.
*/
int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid);
/*!
* \brief Pop dependence signal.
* \param cmd The VTA command handle.
* \param from_qid The source queue.
* \param to_qid The destination queue.
* \return 0 if success.
*/
int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid);
/*!
* \brief Synchronize the command handle.
* Commit all the instructions to VTA and wait until
* the accelerator finishes its job.
* Perform all of the out-of-order DRAM stores.
* \param cmd The VTA command handle.
* \param wait_cycles The limit of poll cycles.
*
*/
void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles);
#ifdef __cplusplus
}
#endif
#endif // VTA_RUNTIME_H_
......@@ -25,3 +25,72 @@ ADD_LDFLAGS=
# the additional compile flags you want to add
ADD_CFLAGS=
# the hardware target
TARGET=PYNQ_TARGET
#---------------------
# VTA hardware parameters
#--------------------
# Log of input/activation width in bits (default 3 -> 8 bits)
LOG_INP_WIDTH = 3
# Log of kernel weight width in bits (default 3 -> 8 bits)
LOG_WGT_WIDTH = 3
# Log of accum width in bits (default 5 -> 32 bits)
LOG_ACC_WIDTH = 5
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_BLOCK_IN = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_BLOCK_OUT = 4
# Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17
#---------------------
# Derived VTA hardware parameters
#--------------------
# Every $(shell ...) value below uses `:=` so the shell runs once at parse time
# instead of on each expansion; each input is defined before it is used.
# Input width in bits
INP_WIDTH := $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
# Weight width in bits
WGT_WIDTH := $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
# Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
# Output width in bits
OUT_WIDTH := $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
# Tensor batch size
BATCH := $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
# Tensor inner block size
IN_BLOCK := $(shell echo "$$(( 1 << $(LOG_BLOCK_IN) ))" )
# Tensor outer block size
OUT_BLOCK := $(shell echo "$$(( 1 << $(LOG_BLOCK_OUT) ))" )
# Uop buffer size in Bytes
UOP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
# Inp buffer size in Bytes
INP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
# Wgt buffer size in Bytes
WGT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
# Acc buffer size in Bytes
ACC_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
# Log of out buffer size in Bytes
# (acc buffer size scaled by the output/accumulator width ratio)
LOG_OUT_BUFF_SIZE := $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
# Out buffer size in Bytes
OUT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
# Update ADD_CFLAGS
# Appends one -D definition for the selected target name and one for each
# LOG_* hardware parameter.
ADD_CFLAGS += \
-D$(TARGET) \
-DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_BLOCK_IN) -DLOG_BLOCK_OUT=$(LOG_BLOCK_OUT) \
-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
\ No newline at end of file
......@@ -4,15 +4,31 @@
* \brief VTA driver for Pynq board.
*/
#ifdef __cplusplus
extern "C" {
#endif
#include "vta_pynq_driver.h"
#ifdef __cplusplus
#include <vta/driver.h>
#include "./pynq_driver.h"
/*! \brief Allocates a buffer by forwarding size and cached to cma_alloc. */
void* VTAMemAlloc(size_t size, int cached) {
return cma_alloc(size, cached);
}
/*! \brief Releases a buffer by forwarding it to cma_free. */
void VTAMemFree(void* buf) {
cma_free(buf);
}
/*! \brief Returns whatever cma_get_phy_addr reports for buf. */
uint32_t VTAGetMemPhysAddr(void* buf) {
return cma_get_phy_addr(buf);
}
/*! \brief Forwards buf and size to xlnkFlushCache. */
void VTAFlushCache(void* buf, int size) {
xlnkFlushCache(buf, size);
}
/*! \brief Forwards buf and size to xlnkInvalidateCache. */
void VTAInvalidateCache(void* buf, int size) {
xlnkInvalidateCache(buf, size);
}
#endif
void *MapRegister(uint32_t addr, size_t length) {
void *VTAMapRegister(uint32_t addr, size_t length) {
// Align the base address with the pages
uint32_t virt_base = addr & ~(getpagesize() - 1);
......@@ -24,21 +40,21 @@ void *MapRegister(uint32_t addr, size_t length) {
return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base);
}
void UnmapRegister(void *vta, size_t length) {
void VTAUnmapRegister(void *vta, size_t length) {
// Unmap memory
int status = munmap(vta, length);
assert(status==0);
}
void WriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
*((volatile uint32_t *) (((char *) base_addr) + offset)) = val;
}
uint32_t ReadMappedReg(void* base_addr, uint32_t offset) {
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
return *((volatile uint32_t *) (((char *) base_addr) + offset));
}
void ProgramVTA(const char* bitstream) {
void VTAProgram(const char* bitstream) {
int elem;
FILE *src, *dst, *partial;
......
......@@ -23,7 +23,7 @@ extern "C" {
#include <unistd.h>
#ifdef __arm__
#include "libxlnk_cma.h"
#include <libxlnk_cma.h>
#else
void* cma_alloc(size_t size, int cached);
void cma_free(void* buf);
......@@ -32,31 +32,6 @@ void xlnkFlushCache(void* buf, int size);
void xlnkInvalidateCache(void* buf, int size);
#endif
/*! \brief VTA command handle */
typedef void * VTAHandle;
/*! \brief DMA command handle */
typedef struct {
/*! \brief Register map to the AXI DMA control registers*/
void *dma_register_map;
/*! \brief Transmit data descriptor*/
void *mm2s_descriptor_register_map;
/*! \brief Receive data descriptor*/
void *s2mm_descriptor_register_map;
/*! \brief Transmit data descriptor physical address*/
uint32_t mm2s_descriptor_phy;
/*! \brief Receive data descriptor physical address*/
uint32_t s2mm_descriptor_phy;
/*! \brief Descriptor size */
uint32_t descriptor_size;
/*! \brief Transaction count for tx channel */
uint32_t mm2s_count;
/*! \brief Transaction count for rx channel */
uint32_t s2mm_count;
/*! \brief Multi-channel mode enable */
int multichannel_en;
} DMAHandle;
/*! \brief partial bitstream status file path */
#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
/*! \brief bitstream destination file path */
......@@ -99,52 +74,8 @@ typedef struct {
*/
#define VTA_STORE_ADDR 0x43C30000
/*! \brief Memory management constants with libxlnk_cma */
#define CACHED 1
/*! \brief Memory management constants with libxlnk_cma */
#define NOT_CACHED 0
/*! \brief log2 of SDS buffer size limit */
#define LOG_MAX_XFER 22
/*! \brief SDS buffer size limit */
#define MAX_XFER (1<<LOG_MAX_XFER)
/*!
* \brief Returns a memory map to FPGA configuration registers.
* \param addr The base physical address of the configuration registers.
* \param length The size of the memory mapped region in bytes.
* \return A pointer to the memory mapped region.
*/
void *MapRegister(unsigned addr, size_t length);
/*!
* \brief Deletes the configuration register memory map.
* \param vta The memory mapped region.
* \param length The size of the memory mapped region in bytes.
*/
void UnmapRegister(void *vta, size_t length);
/*!
* \brief Writes to a memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to write to.
* \param val The value to be written to the memory mapped register.
*/
void WriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
/*!
* \brief Reads from the memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to read from.
* \return The value read from the memory mapped register.
*/
unsigned ReadMappedReg(VTAHandle vta_base, unsigned offset);
/*!
* \brief Programming the bit stream on the FPGA.
* \param bitstream The path to the bit stream file.
*/
void ProgramVTA(const char* bitstream);
/*! \brief Buffer size limit */
#define MAX_XFER (1<<22)
#ifdef __cplusplus
}
......
// simply include the driver for now.
#include <tvm/runtime/registry.h>
#include <dmlc/thread_local.h>
#include <vta/runtime.h>
#include "../../tvm/src/runtime/workspace_pool.h"
namespace tvm {
namespace runtime {
std::string VTARPCGetPath(const std::string& name) {
static const PackedFunc* f =
runtime::Registry::Get("tvm.contrib.rpc.server.workpath");
CHECK(f != nullptr) << "require tvm.contrib.rpc.server.workpath";
return (*f)(name);
}
// Global functions that can be called
/*!
 * "tvm.contrib.vta.init": resolves args[0] through VTARPCGetPath, passes the
 * resulting path to VTAProgram, then logs the path that was used.
 */
TVM_REGISTER_GLOBAL("tvm.contrib.vta.init")
.set_body([](TVMArgs args, TVMRetValue* rv) {
    std::string path = VTARPCGetPath(args[0]);
    VTAProgram(path.c_str());
    // Log text corrected: "bistream" -> "bitstream".
    LOG(INFO) << "VTA initialization end with bitstream " << path;
  });
/*!
 * "tvm.contrib.rpc.server.shutdown": calls VTARuntimeShutdown; the arguments
 * and return slot are not used.
 */
TVM_REGISTER_GLOBAL("tvm.contrib.rpc.server.shutdown")
.set_body([](TVMArgs args, TVMRetValue* rv) {
VTARuntimeShutdown();
});
/*!
 * \brief DeviceAPI implementation that forwards buffer management to the VTA
 *  runtime functions, always using the thread-local command handle.
 */
class VTADeviceAPI final : public DeviceAPI {
 public:
  /*! \brief Nothing to select; intentionally empty. */
  void SetDevice(TVMContext ctx) final {}

  /*! \brief Writes 1 into rv for kExist; any other kind leaves rv untouched. */
  void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final {
    if (kind != kExist) return;
    *rv = 1;
  }

  /*! \brief Allocates size bytes via VTABufferAlloc; alignment and type_hint are ignored. */
  void* AllocDataSpace(TVMContext ctx,
                       size_t size, size_t alignment,
                       TVMType type_hint) final {
    return VTABufferAlloc(VTATLSCommandHandle(), size);
  }

  /*! \brief Returns ptr to the runtime via VTABufferFree. */
  void FreeDataSpace(TVMContext ctx, void* ptr) final {
    VTABufferFree(VTATLSCommandHandle(), ptr);
  }

  /*!
   * \brief Copies size bytes with VTABufferCopy; stream is ignored.
   *  The copy kind is 2 when the source is not the CPU, OR-ed with 1 when
   *  the destination is not the CPU (0 when both are the CPU).
   */
  void CopyDataFromTo(const void* from,
                      size_t from_offset,
                      void* to,
                      size_t to_offset,
                      size_t size,
                      TVMContext ctx_from,
                      TVMContext ctx_to,
                      TVMStreamHandle stream) final {
    const int from_bit = (ctx_from.device_type != kDLCPU) ? 2 : 0;
    const int to_bit = (ctx_to.device_type != kDLCPU) ? 1 : 0;
    VTABufferCopy(VTATLSCommandHandle(),
                  from, from_offset,
                  to, to_offset,
                  size, from_bit | to_bit);
  }

  /*! \brief No-op. */
  void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
  }

  void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
  void FreeWorkspace(TVMContext ctx, void* data) final;

  /*! \brief Returns the shared instance, created on first use. */
  static const std::shared_ptr<VTADeviceAPI>& Global() {
    static std::shared_ptr<VTADeviceAPI> instance =
        std::make_shared<VTADeviceAPI>();
    return instance;
  }
};
struct VTAWorkspacePool : public WorkspacePool {
VTAWorkspacePool() :
WorkspacePool(static_cast<DLDeviceType>(kExtDev),
VTADeviceAPI::Global()) {}
};
void* VTADeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) {
return dmlc::ThreadLocalStore<VTAWorkspacePool>::Get()
->AllocWorkspace(ctx, size);
}
// Hands `data` back to the VTAWorkspacePool instance returned by
// dmlc::ThreadLocalStore.
void VTADeviceAPI::FreeWorkspace(TVMContext ctx, void* data) {
  auto* pool = dmlc::ThreadLocalStore<VTAWorkspacePool>::Get();
  pool->FreeWorkspace(ctx, data);
}
// Packed function "device_api.ext_dev".
// Returns the shared VTADeviceAPI instance as an opaque void*. The pointer is
// first converted to DeviceAPI* so that the value stored is the base-class
// address.
TVM_REGISTER_GLOBAL("device_api.ext_dev")
.set_body([](TVMArgs args, TVMRetValue* rv) {
    *rv = static_cast<void*>(
        static_cast<DeviceAPI*>(VTADeviceAPI::Global().get()));
  });
} // namespace runtime
} // namespace tvm
# Builds the `vta` driver test executable from the PYNQ driver, the test
# library and driver_test.cc.

# GNU make predefines CC (as "cc"), so "CC ?= g++" would never take effect and
# the C++ objects would be linked with the C driver. Replace only the built-in
# default; a CC given on the command line or in the environment is kept.
ifeq ($(origin CC),default)
CC = g++
endif
CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
LIBS = -l:libsds_lib.so -l:libdma.so
SRC_DIR = ../../src
INCLUDE_DIR = ../../include
DRIVER_DIR = $(SRC_DIR)/driver/pynq
TESTLIB_DIR = $(SRC_DIR)/test
# Sources outside this directory are located through VPATH.
VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
SOURCES = vta_pynq_driver.c vta_test_lib.cc
OBJECTS = vta_pynq_driver.o vta_test_lib.o driver_test.o
EXECUTABLE = vta

# VTA Parameters
# Log of input width in bits
LOG_INP_WIDTH = 3
# Log of weight width in bits
LOG_WGT_WIDTH = 3
# Log of accum width in bits
LOG_ACC_WIDTH = 5
# Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_IN_BLOCK = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_OUT_BLOCK = 4
# Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17
# Log of out buffer size in Bytes
# (acc buffer size + out width - acc width). Simple assignment so the shell
# runs once at parse time instead of on every expansion of CFLAGS.
LOG_OUT_BUFF_SIZE := $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )

# Define flags
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM \
	-DDEBUG=0 -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
	-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
	-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_IN_BLOCK) -DLOG_BLOCK_OUT=$(LOG_OUT_BLOCK) \
	-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
	-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
	-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)

# `all` and `clean` are commands, not files.
.PHONY: all clean

# All Target
all: $(EXECUTABLE)

# Every object also depends on the whole SOURCES list, so touching any listed
# source rebuilds all objects compiled by this rule.
%.o: %.cc $(SOURCES)
	$(CC) -c -o $@ $< $(CFLAGS)

$(EXECUTABLE): $(OBJECTS)
	$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)

clean:
	$(RM) -r *.o $(EXECUTABLE)
......@@ -4,7 +4,7 @@
* \brief Test library for the VTA design simulation and driver tests.
*/
#include "vta_test_lib.h"
#include "./test_lib.h"
const char* getOpcodeString(int opcode, bool use_imm) {
// Returns string name
......@@ -153,7 +153,7 @@ void free3dArray(T *** array, int rows, int cols, int depth) {
void * allocBuffer(size_t num_bytes) {
#ifdef NO_SIM
return cma_alloc(num_bytes, CACHED);
return VTAMemAlloc(num_bytes, CACHED);
#else
return malloc(num_bytes);
#endif
......@@ -161,7 +161,7 @@ void * allocBuffer(size_t num_bytes) {
void freeBuffer(void * buffer) {
#ifdef NO_SIM
return cma_free(buffer);
return VTAMemFree(buffer);
#else
return free(buffer);
#endif
......@@ -353,7 +353,7 @@ VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {
// Allocate buffer
#ifdef NO_SIM
VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
#else
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
#endif
......@@ -388,7 +388,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
// Allocate buffer
#ifdef NO_SIM
VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
#else
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
#endif
......@@ -449,7 +449,7 @@ VTAUop * getMapALUUops(int vector_size, bool uop_compression) {
// Allocate buffer
#ifdef NO_SIM
VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
#else
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
#endif
......@@ -762,7 +762,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
}
// Compute reference output
inp_T **outputs_ref = alloc2dArray<inp_T>(batch, vector_size);
out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
for (int i = 0; i < batch; i ++) {
for (int j = 0; j < vector_size; j ++) {
acc_T tmp = 0;
......@@ -802,7 +802,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
tmp = inputs[i][j] >> immediate[i / BATCH];
}
// Set
outputs_ref[i][j] = (inp_T) tmp;
outputs_ref[i][j] = (out_T) tmp;
}
}
......@@ -811,7 +811,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
packBuffer<acc_T, ACC_WIDTH>(bias_buf, inputs, batch, vector_size * input_sets, BATCH, BLOCK_OUT);
// Prepare output buffer
inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets);
out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets);
#ifdef NO_SIM
// Invoke the VTA
......@@ -833,8 +833,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
#endif
// Unpack output buffer
inp_T **outputs = alloc2dArray<inp_T>(batch, vector_size);
unpackBuffer<inp_T, INP_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT);
out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT);
// Correctness checks
int err = 0;
......@@ -853,8 +853,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
// Free all allocated arrays
free(immediate);
free2dArray<acc_T>(inputs, batch, vector_size * input_sets);
free2dArray<inp_T>(outputs_ref, batch, vector_size);
free2dArray<inp_T>(outputs, batch, vector_size);
free2dArray<out_T>(outputs_ref, batch, vector_size);
free2dArray<out_T>(outputs, batch, vector_size);
freeBuffer(insn_buf);
freeBuffer(uop_buf);
freeBuffer(bias_buf);
......@@ -891,17 +891,17 @@ virtual_threads=%d\n",
int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2;
int uop_size = uop_compression ? block / BATCH * virtual_threads :
block / BATCH * block / BLOCK_IN * block / BLOCK_OUT * virtual_threads;
int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT;
int inp_size = batch / BATCH * in_feat / BLOCK_IN;
int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT;
int out_size = batch / BATCH * out_feat / BLOCK_OUT;
// Blocked buffer sizes (in terms of elements)
int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT;
int inp_block_size = block / BATCH * block / BLOCK_IN;
int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT;
int out_block_size = block / BATCH * block / BLOCK_OUT;
// Make sure we don't exceed buffer bounds
assert(uop_size <= UOP_BUFF_DEPTH);
assert(wgt_block_size <= WGT_BUFF_DEPTH);
assert(inp_block_size <= INP_BUFF_DEPTH);
assert(wgt_block_size <= WGT_BUFF_DEPTH);
assert(out_block_size <= ACC_BUFF_DEPTH);
// Initialize instruction buffer
......@@ -1017,15 +1017,15 @@ virtual_threads=%d\n",
printMicroOp(uop_size, uop_buf);
#endif
// Initialize weights
wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat);
// Initialize inputs
inp_T **inputs = allocInit2dArray<inp_T, INP_WIDTH>(batch, in_feat);
// Initialize weights
wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat);
// Initialize biases
acc_T **biases = allocInit2dArray<acc_T, ACC_WIDTH>(batch, out_feat);
// Reference GEMM implementation
inp_T **outputs_ref = alloc2dArray<inp_T>(batch, out_feat);
out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat);
for (int i = 0; i < batch; i ++) {
for (int j = 0; j < out_feat; j ++) {
acc_T sum = biases[i][j];
......@@ -1033,21 +1033,21 @@ virtual_threads=%d\n",
sum += (acc_T) (inputs[i][k] * weights[j][k]);
}
// Set
outputs_ref[i][j] = (inp_T) sum;
outputs_ref[i][j] = (out_T) sum;
}
}
// Prepare the weight buffer
wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size);
packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN);
// Prepare the input buffer
inp_T *input_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * inp_size);
packBuffer<inp_T, INP_WIDTH>(input_buf, inputs, batch, in_feat, BATCH, BLOCK_IN);
// Prepare the weight buffer
wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size);
packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN);
// Prepare the bias buffer
acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * out_size);
packBuffer<acc_T, ACC_WIDTH>(bias_buf, biases, batch, out_feat, BATCH, BLOCK_OUT);
// Prepare the output buffer
inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * out_size);
out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * out_size);
#ifdef NO_SIM
// Invoke the VTA
......@@ -1069,8 +1069,8 @@ virtual_threads=%d\n",
#endif
// Unpack output data
inp_T **outputs = alloc2dArray<inp_T>(batch, out_feat);
unpackBuffer<inp_T, INP_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT);
out_T **outputs = alloc2dArray<out_T>(batch, out_feat);
unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT);
// Correctness checks
int err = 0;
......@@ -1087,15 +1087,15 @@ virtual_threads=%d\n",
}
// Free all allocated arrays
free2dArray<wgt_T>(weights, out_feat, in_feat);
free2dArray<inp_T>(inputs, batch, in_feat);
free2dArray<wgt_T>(weights, out_feat, in_feat);
free2dArray<acc_T>(biases, batch, out_feat);
free2dArray<inp_T>(outputs_ref, batch, out_feat);
free2dArray<inp_T>(outputs, batch, out_feat);
free2dArray<out_T>(outputs_ref, batch, out_feat);
free2dArray<out_T>(outputs, batch, out_feat);
freeBuffer((void *) insn_buf);
freeBuffer((void *) uop_buf);
freeBuffer((void *) weight_buf);
freeBuffer((void *) input_buf);
freeBuffer((void *) weight_buf);
freeBuffer((void *) bias_buf);
freeBuffer((void *) output_buf);
......
......@@ -7,21 +7,25 @@
#ifndef VTA_TESTLIB_H_
#define VTA_TESTLIB_H_
#include "vta_params.h"
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <vta/hw_spec.h>
#ifdef NO_SIM
#include "vta_pynq_driver.h"
#include <vta/driver.h>
#ifdef PYNQ_TARGET
#include "../../../src/pynq/pynq_driver.h"
#endif //PYNQ_TARGET
typedef uint64_t axi_T;
typedef uint32_t uop_T;
typedef int8_t wgt_T;
typedef int8_t inp_T;
typedef int8_t out_T;
typedef int32_t acc_T;
uint64_t vta (
......@@ -35,8 +39,7 @@ uint64_t vta (
#else //NO_SIM
#include "vta.h"
#include "vta_typedefs.h"
#include "../../../hardware/vivado/src/vta.h"
#endif //NO_SIM
......
# Builds the `vta` metal test executable from the PYNQ driver, the common test
# library and metal_test.cc.

# GNU make predefines CC (as "cc"), so "CC ?= g++" would never take effect and
# the C++ objects would be linked with the C driver. Replace only the built-in
# default; a CC given on the command line or in the environment is kept.
ifeq ($(origin CC),default)
CC = g++
endif
CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
LIBS = -l:libsds_lib.so -l:libdma.so
INCLUDE_DIR = ../../../include
DRIVER_DIR = ../../../src/pynq
TESTLIB_DIR = ../common
# Sources outside this directory are located through VPATH.
VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
SOURCES = pynq_driver.cc test_lib.cc
OBJECTS = pynq_driver.o test_lib.o metal_test.o
EXECUTABLE = vta

# Include top-level config file
# A `config` given by the caller wins; otherwise prefer ../../../config.mk
# when it exists and fall back to ../../../make/config.mk.
ifndef config
ifneq ("$(wildcard ../../../config.mk)", "")
  config = ../../../config.mk
else
  config = ../../../make/config.mk
endif
endif
include $(config)

# Define flags
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DDEBUG=0
# ADD_CFLAGS is not defined in this file; presumably it comes from the
# included config file -- confirm.
CFLAGS += $(ADD_CFLAGS)

# `all` and `clean` are commands, not files.
.PHONY: all clean

# All Target
all: $(EXECUTABLE)

# Every object also depends on the whole SOURCES list, so touching any listed
# source rebuilds all objects compiled by this rule.
%.o: %.cc $(SOURCES)
	$(CC) -c -o $@ $< $(CFLAGS)

$(EXECUTABLE): $(OBJECTS)
	$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)

clean:
	$(RM) -r *.o $(EXECUTABLE)
......@@ -9,8 +9,9 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "vta_test_lib.h"
#include "vta_pynq_driver.h"
#include <vta/driver.h>
#include "../../../src/pynq/pynq_driver.h"
#include "../common/test_lib.h"
// VTA invocation (present the same abstraction as in the simulation tests)
uint64_t vta (
......@@ -43,18 +44,18 @@ uint64_t vta (
#endif
// Program VTA
ProgramVTA(bitstream);
VTAProgram(bitstream);
// Get VTA handles
VTAHandle vta_fetch_handle = MapRegister(VTA_FETCH_ADDR, VTA_RANGE);
VTAHandle vta_load_handle = MapRegister(VTA_LOAD_ADDR, VTA_RANGE);
VTAHandle vta_compute_handle = MapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
VTAHandle vta_store_handle = MapRegister(VTA_STORE_ADDR, VTA_RANGE);
VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
// Physical address pointers
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
......@@ -65,29 +66,29 @@ uint64_t vta (
clock_gettime(CLOCK_REALTIME, &start);
// FETCH @ 0x10 : Data signal of insn_count_V
WriteMappedReg(vta_fetch_handle, 0x10, insn_count);
VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
// FETCH @ 0x18 : Data signal of insns_V
if (insns) WriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
// LOAD @ 0x10 : Data signal of weight_V
if (weights) WriteMappedReg(vta_load_handle, 0x10, weight_phy);
// LOAD @ 0x18 : Data signal of inputs_V
if (inputs) WriteMappedReg(vta_load_handle, 0x18, input_phy);
if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
// LOAD @ 0x10 : Data signal of inputs_V
if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
// LOAD @ 0x18 : Data signal of weight_V
if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
// COMPUTE @ 0x20 : Data signal of uops_V
if (uops) WriteMappedReg(vta_compute_handle, 0x20, uop_phy);
if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
// COMPUTE @ 0x28 : Data signal of biases_V
if (biases) WriteMappedReg(vta_compute_handle, 0x28, bias_phy);
if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
// STORE @ 0x10 : Data signal of outputs_V
if (outputs) WriteMappedReg(vta_store_handle, 0x10, output_phy);
if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
// VTA start
WriteMappedReg(vta_fetch_handle, 0x0, 0x1);
WriteMappedReg(vta_load_handle, 0x0, 0x81);
WriteMappedReg(vta_compute_handle, 0x0, 0x81);
WriteMappedReg(vta_store_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
int flag = 0, t = 0;
for (t = 0; t < 10000000; ++t) {
flag = ReadMappedReg(vta_compute_handle, 0x18);
flag = VTAReadMappedReg(vta_compute_handle, 0x18);
if (flag & VTA_DONE) break;
}
......@@ -104,10 +105,10 @@ uint64_t vta (
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
// Unmap VTA register
UnmapRegister(vta_fetch_handle, VTA_RANGE);
UnmapRegister(vta_load_handle, VTA_RANGE);
UnmapRegister(vta_compute_handle, VTA_RANGE);
UnmapRegister(vta_store_handle, VTA_RANGE);
VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
VTAUnmapRegister(vta_load_handle, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
VTAUnmapRegister(vta_store_handle, VTA_RANGE);
return t_fpga;
};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment