Commit 28a10b69 by Thierry Moreau Committed by Tianqi Chen

[REFACTOR] Code base refactoring (#5)

parent 0979e9aa
......@@ -54,9 +54,13 @@ endif
all: lib/libvta.$(SHARED_LIBRARY_SUFFIX)
SRC = $(wildcard src/*.cc src/*.cc)
ALL_OBJ = $(patsubst %.cc, build/%.o, $(SRC))
ALL_DEP = $(ALL_OBJ)
VTA_LIB_SRC = $(wildcard src/*.cc src/tvm/*.cc)
ifeq ($(TARGET), PYNQ_TARGET)
VTA_LIB_SRC += $(wildcard src/pynq/*.cc)
LDFLAGS += -L/usr/lib -lsds_lib
LDFLAGS += -L/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/ -l:libdma.so
endif
VTA_LIB_OBJ = $(patsubst %.cc, build/%.o, $(VTA_LIB_SRC))
test: $(TEST)
......@@ -65,7 +69,7 @@ build/src/%.o: src/%.cc
$(CXX) $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d
$(CXX) -c $(CFLAGS) -c $< -o $@
lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(ALL_DEP)
lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ)
@mkdir -p $(@D)
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o, $^) $(LDFLAGS)
......
#!/bin/bash
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/tvm/python
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/
python -m tvm.exec.rpc_server --load-library /home/xilinx/vta/lib/libvta.so
......@@ -2,9 +2,9 @@
ROOTDIR = $(CURDIR)
BUILD_DIR = $(ROOTDIR)/build
SCRIPT_DIR = $(ROOTDIR)/scripts
SRC_DIR = $(ROOTDIR)/../../src/hardware/hls
SRC_DIR = $(ROOTDIR)/src
SIM_DIR = $(ROOTDIR)/sim
TEST_DIR = $(ROOTDIR)/../../src/test
TEST_DIR = $(ROOTDIR)/../../tests/hardware/common
INCLUDE_DIR = $(ROOTDIR)/../../include
# Executables
......@@ -12,59 +12,28 @@ VIVADO_HLS = vivado_hls
VIVADO = vivado
HSI = hsi
# Build parameters:
# Include top-level config file
ifndef config
ifneq ("$(wildcard ../../config.mk)", "")
config = ../../config.mk
else
config = ../../make/config.mk
endif
endif
include $(config)
#---------------------
# Compilation parameters
#--------------------
# Number of threads during compilation
NUM_THREADS = 8
# Target Frequency
CLOCK_FREQ = 100
# Log of input width in bits
LOG_INP_WIDTH = 3
# Log of weight width in bits
LOG_WGT_WIDTH = 3
# Log of accum width in bits
LOG_ACC_WIDTH = 5
# Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_IN_BLOCK = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_OUT_BLOCK = 4
# Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17
# Log of out buffer size in Bytes
LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
# Derived parameter
# Input width in bits
INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
# Weight width in bits
WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
# Output width in bits
OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
# Tensor batch size
BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
# Tensor inner block size
IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_IN_BLOCK) ))" )
# Tensor outer block size
OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_OUT_BLOCK) ))" )
# Uop buffer size in Bytes
UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
# Inp buffer size in Bytes
INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
# Wgt buffer size in Bytes
WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
# Acc buffer size in Bytes
ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
# Out buffer size in Bytes
OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
# Timing closure compensation (0 for none, 3 for highest)
TIMING_CLOSURE_COMP = 0
# Derive clock target period
TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" )
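# For example, with the default CLOCK_FREQ = 100 the ceiling division above
# gives TARGET_PER = (1000 + 100 - 1) / 100 = 10, i.e. a 10 ns clock period.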
......@@ -85,7 +54,7 @@ ip:
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
$(LOG_BATCH) $(LOG_OUT_BLOCK) $(LOG_IN_BLOCK) \
$(LOG_BATCH) $(LOG_BLOCK_OUT) $(LOG_BLOCK_IN) \
$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)
......
......@@ -62,7 +62,7 @@ if { [llength $argv] eq 19 } {
}
# C define flags to pass to compiler
set cflags "-I $include_dir -I $include_dir/hardware/hls \
set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
......@@ -127,7 +127,7 @@ open_project vta_sim
set_top vta
add_files $src_dir/vta.cc -cflags $cflags
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags
add_files -tb $test_dir/test_lib.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csim_design -clean
......
......@@ -8,8 +8,8 @@
#include <stdlib.h>
#include <iostream>
#include "vta.h"
#include "vta_test_lib.h"
#include "../src/vta.h"
#include "../../../tests/hardware/common/test_lib.h"
int main(void)
{
......
......@@ -8,7 +8,7 @@
#include <stdlib.h>
#include <string.h>
#include "vta.h"
#include "./vta.h"
void fetch (
uint32_t insn_count,
......
......@@ -11,8 +11,88 @@
#include <ap_int.h>
#include <hls_stream.h>
#include "vta_typedefs.h"
#include "vta_params.h"
#include <vta/hw_spec.h>
/* \typedef uop_T Micro-op datatype*/
typedef ap_uint<UOP_WIDTH> uop_T;
/* \typedef inp_T Input datatype*/
typedef ap_int<INP_WIDTH> inp_T;
/* \typedef wgt_T Weight datatype*/
typedef ap_int<WGT_WIDTH> wgt_T;
/* \typedef out_T Output datatype*/
typedef ap_int<OUT_WIDTH> out_T;
/* \typedef acc_T Accumulator datatype*/
typedef ap_int<ACC_WIDTH> acc_T;
/* \typedef mul_T Multiplier output datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
/* \typedef inp_vec_T Input vector datatype*/
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
/* \typedef wgt_vec_T Weight vector datatype*/
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
/* \typedef acc_vec_T Accumulator vector datatype*/
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
/* \typedef out_vec_T Output vector datatype*/
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
/* \typedef inp_idx_T Input SRAM index datatype*/
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
/* \typedef wgt_idx_T Weight SRAM index datatype*/
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
/* \typedef acc_idx_T Accumulator SRAM index datatype*/
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
/* \typedef opcode_T Opcode datatype*/
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
/* \typedef insn_T Instruction datatype*/
typedef ap_uint<INS_WIDTH> insn_T;
/* \typedef loop_T Loop bound datatype*/
typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
/* \typedef memop_id_T Memory operation ID datatype*/
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
/* \typedef memop_sram_T Memory operation SRAM index datatype*/
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
/* \typedef memop_dram_T Memory operation DRAM index datatype*/
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
/* \typedef memop_size_T Memory operation range datatype*/
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
/* \typedef memop_stride_T Memory operation stride datatype*/
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
/* \typedef memop_pad_T Memory operation pad width datatype*/
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_imm_T ALU operation immediate datatype*/
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
/*!
* \brief Fetch module.
......
/*!
* Copyright (c) 2018 by Contributors
* \file vta_typedefs.h
* \brief Type definitions for VTA HLS design.
*/
#ifndef VTA_TYPEDEFS_H_
#define VTA_TYPEDEFS_H_
#include <assert.h>
#include <ap_axi_sdata.h>
#include <ap_int.h>
#include <hls_stream.h>
#include "vta_params.h"
/* \typedef uop_T Micro-op datatype*/
typedef ap_uint<UOP_WIDTH> uop_T;
/* \typedef inp_T Input datatype*/
typedef ap_int<INP_WIDTH> inp_T;
/* \typedef wgt_T Weight datatype*/
typedef ap_int<WGT_WIDTH> wgt_T;
/* \typedef out_T Output datatype*/
typedef ap_int<OUT_WIDTH> out_T;
/* \typedef acc_T Accumulator datatype*/
typedef ap_int<ACC_WIDTH> acc_T;
/* \typedef mul_T Multiplier output datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
/* \typedef inp_vec_T Input vector datatype*/
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
/* \typedef wgt_vec_T Weight vector datatype*/
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
/* \typedef acc_vec_T Accumulator vector datatype*/
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
/* \typedef out_vec_T Output vector datatype*/
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
/* \typedef inp_idx_T Input SRAM index datatype*/
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
/* \typedef wgt_idx_T Weight SRAM index datatype*/
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
/* \typedef acc_idx_T Accumulator SRAM index datatype*/
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
/* \typedef opcode_T Opcode datatype*/
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
/* \typedef insn_T Instruction datatype*/
typedef ap_uint<INS_WIDTH> insn_T;
/* \typedef loop_T Loop bound datatype*/
typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
/* \typedef memop_id_T Memory operation ID datatype*/
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
/* \typedef memop_sram_T Memory operation SRAM index datatype*/
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
/* \typedef memop_dram_T Memory operation DRAM index datatype*/
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
/* \typedef memop_size_T Memory operation range datatype*/
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
/* \typedef memop_stride_T Memory operation stride datatype*/
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
/* \typedef memop_pad_T Memory operation pad width datatype*/
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_imm_T ALU operation immediate datatype*/
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
#endif // VTA_TYPEDEFS_H_
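As a rough sketch of how the log-parameters resolve into these HLS types, assuming the default configuration (LOG_INP_WIDTH=3 and LOG_BLOCK_IN=4, hence INP_WIDTH=8 and BLOCK_IN=16); the *_example names are illustrative and not part of the header:

#include <ap_int.h>  // Vivado HLS arbitrary-precision types

// Sketch only: what the typedefs above resolve to under the defaults.
typedef ap_int<8> inp_T_example;            // ap_int<INP_WIDTH>: one 8-bit input element
typedef ap_uint<8 * 16> inp_vec_T_example;  // ap_uint<INP_WIDTH*BLOCK_IN>: one 128-bit
                                            // word packing BLOCK_IN = 16 input elements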
/*!
* Copyright (c) 2018 by Contributors
* \file vta_driver.h
* \brief General driver interface.
*/
#ifndef VTA_DRIVER_H_
#define VTA_DRIVER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdlib.h>
#include <stdint.h>
/*! \brief Memory management constants with libxlnk_cma */
#define CACHED 1
/*! \brief Memory management constants with libxlnk_cma */
#define NOT_CACHED 0
/*! \brief VTA command handle */
typedef void * VTAHandle;
/*!
* \brief Allocates a physically contiguous region in memory (limited by MAX_XFER).
* \param size Size of the region in Bytes.
* \param cached Whether the region should be cached (1) or non-cached (0).
* \return A pointer to the allocated region.
*/
void* VTAMemAlloc(size_t size, int cached);
/*!
* \brief Frees a physically contiguous region in memory.
* \param buf Buffer to free.
*/
void VTAMemFree(void* buf);
/*!
* \brief Returns a physical address to the region of memory allocated with VTAMemAlloc.
* \param buf Pointer to memory region allocated with VTAMemAlloc.
* \return The physical address of the memory region.
*/
uint32_t VTAGetMemPhysAddr(void* buf);
/*!
* \brief Flushes the region of memory out of the CPU cache to DRAM.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed.
* \param size Size of the region to flush in Bytes.
*/
void VTAFlushCache(void* buf, int size);
/*!
* \brief Invalidates the region of memory that is cached.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated.
* \param size Size of the region to invalidate in Bytes.
*/
void VTAInvalidateCache(void* buf, int size);
/*!
* \brief Returns a memory map to FPGA configuration registers.
* \param addr The base physical address of the configuration registers.
* \param length The size of the memory mapped region in bytes.
* \return A pointer to the memory mapped region.
*/
void *VTAMapRegister(unsigned addr, size_t length);
/*!
* \brief Deletes the configuration register memory map.
* \param vta The memory mapped region.
* \param length The size of the memory mapped region in bytes.
*/
void VTAUnmapRegister(void *vta, size_t length);
/*!
* \brief Writes to a memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to write to.
* \param val The value to be written to the memory mapped register.
*/
void VTAWriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
/*!
* \brief Reads from the memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to read from.
* \return The value read from the memory mapped register.
*/
unsigned VTAReadMappedReg(VTAHandle vta_base, unsigned offset);
/*!
* \brief Programs the bitstream onto the FPGA.
* \param bitstream The path to the bit stream file.
*/
void VTAProgram(const char* bitstream);
#ifdef __cplusplus
}
#endif
#endif // VTA_DRIVER_H_
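A minimal usage sketch of this driver API (hedged: error handling is elided, the 4096-byte size is illustrative, and <vta/driver.h> is assumed to be on the include path):

#include <cstdio>
#include <vta/driver.h>

int main() {
  // Allocate a physically contiguous, cached region for DMA.
  void* buf = VTAMemAlloc(4096, CACHED);
  if (buf == nullptr) return 1;
  // The physical address is what the accelerator's DMA engine consumes.
  std::printf("physical address: 0x%08x\n", VTAGetMemPhysAddr(buf));
  // Flush so that CPU-side writes reach DRAM before VTA reads them.
  VTAFlushCache(buf, 4096);
  VTAMemFree(buf);
  return 0;
}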
......@@ -3,8 +3,13 @@
* \file vta_defines.h
* \brief Preprocessor definitions for VTA HLS design and runtime.
*/
#ifndef VTA_DEFINES_H_
#define VTA_DEFINES_H_
#ifndef VTA_HW_SPEC_H_
#define VTA_HW_SPEC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
......@@ -556,4 +561,7 @@ typedef struct {
uint32_t wgt_idx : LOG_WGT_BUFF_DEPTH;
} VTAUop;
#endif // VTA_DEFINES_H_
#ifdef __cplusplus
}
#endif
#endif // VTA_HW_SPEC_H_
/*!
* Copyright (c) 2018 by Contributors
* \file runtime.h
* \brief VTA runtime library.
*/
#ifndef VTA_RUNTIME_H_
#define VTA_RUNTIME_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./driver.h"
#define VTA_MEMCPY_H2D 1
#define VTA_MEMCPY_D2H 2
#define VTA_MEMCPY_D2D 3
#define VTA_DEBUG_DUMP_INSN (1 << 1)
#define VTA_DEBUG_DUMP_UOP (1 << 2)
#define VTA_DEBUG_SKIP_READ_BARRIER (1 << 3)
#define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4)
#define VTA_DEBUG_FORCE_SERIAL (1 << 5)
/*! \brief VTA command handle */
typedef void * VTACommandHandle;
/*! \brief Shutdown hook of VTA to clean up resources */
void VTARuntimeShutdown();
/*!
* \brief Get thread local command handle.
* \return A thread local command handle.
*/
VTACommandHandle VTATLSCommandHandle();
/*!
* \brief Allocate data buffer.
* \param cmd The VTA command handle.
* \param size Buffer size.
* \return A pointer to the allocated buffer.
*/
void* VTABufferAlloc(VTACommandHandle cmd, size_t size);
/*!
* \brief Free data buffer.
* \param cmd The VTA command handle.
* \param buffer The data buffer to be freed.
*/
void VTABufferFree(VTACommandHandle cmd, void* buffer);
/*!
* \brief Get the buffer access pointer on CPU.
* \param cmd The VTA command handle.
* \param buffer The data buffer.
* \return The pointer that can be accessed by the CPU.
*/
void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer);
/*!
* \brief Copy data buffer from one location to another.
* \param cmd The VTA command handle.
* \param from The source buffer base address.
* \param from_offset The offset of the source buffer.
* \param to The target buffer base address.
* \param to_offset The offset of the target buffer.
* \param size Size of copy.
* \param kind_mask The memory copy kind.
*/
void VTABufferCopy(VTACommandHandle cmd,
const void* from,
size_t from_offset,
void* to,
size_t to_offset,
size_t size,
int kind_mask);
/*!
* \brief Set debug mode on the command handle.
* \param cmd The VTA command handle.
* \param debug_flag The debug flag.
*/
void VTASetDebugMode(VTACommandHandle cmd, int debug_flag);
/*!
* \brief Perform a write barrier to make a memory region visible to the CPU.
* \param cmd The VTA command handle.
* \param buffer The head buffer pointer.
* \param elem_bits The size in bits of each element.
* \param start The start of the region (in elements).
* \param extent The extent of the region (in elements).
*/
void VTAWriteBarrier(VTACommandHandle cmd,
void* buffer, uint32_t elem_bits,
uint32_t start, uint32_t extent);
/*!
* \brief Perform a read barrier to make a memory region visible to VTA.
* \param cmd The VTA command handle.
* \param buffer The head buffer pointer.
* \param elem_bits The size in bits of each element.
* \param start The start of the region (in elements).
* \param extent The extent of the region (in elements).
*/
void VTAReadBarrier(VTACommandHandle cmd,
void* buffer, uint32_t elem_bits,
uint32_t start, uint32_t extent);
/*!
* \brief Perform a 2D data load from DRAM.
* Sizes are measured in units of vector elements.
* \param cmd The VTA command handle.
* \param src_dram_addr Source DRAM address.
* \param src_elem_offset The source DRAM offset in number of unit elements.
* \param x_size The lowest dimension (x axis) size in number of unit elements.
* \param y_size The number of rows (y axis).
* \param x_stride The x axis stride.
* \param x_pad_before The start padding on x axis.
* \param y_pad_before The start padding on y axis.
* \param x_pad_after The end padding on x axis.
* \param y_pad_after The end padding on y axis.
* \param dst_sram_index Destination SRAM index.
* \param dst_memory_type Destination memory type.
*/
void VTALoadBuffer2D(VTACommandHandle cmd,
void* src_dram_addr,
uint32_t src_elem_offset,
uint32_t x_size,
uint32_t y_size,
uint32_t x_stride,
uint32_t x_pad_before,
uint32_t y_pad_before,
uint32_t x_pad_after,
uint32_t y_pad_after,
uint32_t dst_sram_index,
uint32_t dst_memory_type);
/*!
* \brief Perform a 2D data store into DRAM.
* Sizes are measured in units of vector elements.
* \param cmd The VTA command handle.
* \param src_sram_index Source SRAM index.
* \param src_memory_type Source memory type.
* \param dst_dram_addr Destination DRAM address.
* \param dst_elem_offset The destination DRAM offset in number of unit elements.
* \param x_size The lowest dimension (x axis) size in number of unit elements.
* \param y_size The number of rows.
* \param x_stride The x axis stride.
*/
void VTAStoreBuffer2D(VTACommandHandle cmd,
uint32_t src_sram_index,
uint32_t src_memory_type,
void* dst_dram_addr,
uint32_t dst_elem_offset,
uint32_t x_size,
uint32_t y_size,
uint32_t x_stride);
/*!
* \brief Push uop into kernel buffer.
* In GEMM mode, do a blocked GEMM with 2d access pattern.
* In ALU mode, do a vectorized ALU operation with 2d access pattern.
*
* \code
*
* DType accum[ACC_BUFF_DEPTH][l][n];
* DType weight[WGT_BUFF_DEPTH][n][m];
* DType input[INP_BUFF_DEPTH][l][m];
* if reset_out == 1
* accum[dst_index] = 0
* elif mode == 0
* accum[dst_index] += GEMM(input[src_index], weight[wgt_index]);
* else
* if (use_imm)
* accum[dst_index] = opcode(accum[dst_index], imm_val);
* else
* accum[dst_index] = opcode(accum[dst_index], accum[src_index]);
*
* \endcode
*
* \param mode Mode of the micro-op: 0 for GEMM, 1 for ALU.
* \param reset_out Resets the accum to 0.
* \param dst_index The accum memory index.
* \param src_index The input memory (gemm) / accum memory (alu) index.
* \param wgt_index The weight memory index.
* \param opcode The ALU opcode.
* \param use_imm Use immediate in ALU mode if set to true.
* \param imm_val Immediate value in ALU mode.
*/
void VTAUopPush(uint32_t mode,
uint32_t reset_out,
uint32_t dst_index,
uint32_t src_index,
uint32_t wgt_index,
uint32_t opcode,
uint32_t use_imm,
uint32_t imm_val);
/*!
* \brief Mark start of a micro op loop.
* \param extent The extent of the loop.
* \param dst_factor The accum factor.
* \param src_factor The input factor.
* \param wgt_factor The weight factor.
*/
void VTAUopLoopBegin(uint32_t extent,
uint32_t dst_factor,
uint32_t src_factor,
uint32_t wgt_factor);
/*!
* \brief Mark end of a micro op loop.
*/
void VTAUopLoopEnd();
/*!
* \brief Push GEMM uop kernel into the command handle.
* \param uop_handle The uop cache handle.
* \param finit The initialization function that constructs the uop kernel.
* \param signature The closure arguments passed to finit.
* \param nbytes Number of bytes in the closure arguments.
* \return 0 if success.
*/
int VTAPushGEMMOp(void** uop_handle,
int (*finit)(void*),
void* signature,
int nbytes);
/*!
* \brief Push ALU uop kernel into the command handle.
* \param uop_handle The uop cache handle.
* \param finit The initialization function that constructs the uop kernel.
* \param signature The closure arguments passed to finit.
* \param nbytes Number of bytes in the closure arguments.
* \return 0 if success.
*/
int VTAPushALUOp(void** uop_handle,
int (*finit)(void*),
void* signature,
int nbytes);
/*!
* \brief Push dependence token.
* \param cmd The VTA command handle.
* \param from_qid The source queue.
* \param to_qid The destination queue.
* \return 0 if success.
*/
int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid);
/*!
* \brief Pop dependence signal.
* \param cmd The VTA command handle.
* \param from_qid The source queue.
* \param to_qid The destination queue.
* \return 0 if success.
*/
int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid);
/*!
* \brief Synchronize the command handle.
* Commit all the instructions to VTA and wait until
* the accelerator finishes its job.
* Perform all of the out-of-order DRAM stores.
* \param cmd The VTA command handle.
* \param wait_cycles The limit of poll cycles.
*
*/
void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles);
#ifdef __cplusplus
}
#endif
#endif // VTA_RUNTIME_H_
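A minimal end-to-end sketch of the runtime API above (hedged: the buffer size, uop indices, loop factors, and wait-cycle budget are illustrative placeholders rather than values from this commit, and a real program would also issue VTALoadBuffer2D/VTAStoreBuffer2D calls):

#include <vta/runtime.h>

// Sketch: finit runs once to record the uop sequence; the runtime caches it
// in the uop buffer keyed by (signature, nbytes).
static int gemm_finit(void* signature) {
  (void)signature;  // no closure arguments in this sketch
  VTAUopLoopBegin(/*extent=*/8, /*dst_factor=*/1, /*src_factor=*/1, /*wgt_factor=*/1);
  VTAUopPush(/*mode=*/0, /*reset_out=*/0,
             /*dst_index=*/0, /*src_index=*/0, /*wgt_index=*/0,
             /*opcode=*/0, /*use_imm=*/0, /*imm_val=*/0);
  VTAUopLoopEnd();
  return 0;
}

int main() {
  VTACommandHandle cmd = VTATLSCommandHandle();
  void* buf = VTABufferAlloc(cmd, 1024);  // illustrative size
  // ... fill VTABufferCPUPtr(cmd, buf), then make it visible to VTA:
  VTAReadBarrier(cmd, buf, /*elem_bits=*/8, /*start=*/0, /*extent=*/1024);
  static void* gemm_hdl = nullptr;
  VTAPushGEMMOp(&gemm_hdl, gemm_finit, /*signature=*/nullptr, /*nbytes=*/0);
  VTASynchronize(cmd, /*wait_cycles=*/1 << 20);  // commit and poll until done
  VTABufferFree(cmd, buf);
  VTARuntimeShutdown();
  return 0;
}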
......@@ -25,3 +25,72 @@ ADD_LDFLAGS=
# the additional compile flags you want to add
ADD_CFLAGS=
# the hardware target
TARGET=PYNQ_TARGET
#---------------------
# VTA hardware parameters
#--------------------
# Log of input/activation width in bits (default 3 -> 8 bits)
LOG_INP_WIDTH = 3
# Log of kernel weight width in bits (default 3 -> 8 bits)
LOG_WGT_WIDTH = 3
# Log of accum width in bits (default 5 -> 32 bits)
LOG_ACC_WIDTH = 5
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_BLOCK_IN = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_BLOCK_OUT = 4
# Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17
#---------------------
# Derived VTA hardware parameters
#--------------------
# Input width in bits
INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
# Weight width in bits
WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
# Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
# Output width in bits
OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
# Tensor batch size
BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
# Tensor inner block size
IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_IN) ))" )
# Tensor outer block size
OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_OUT) ))" )
# Uop buffer size in Bytes
UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
# Inp buffer size in Bytes
INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
# Wgt buffer size in Bytes
WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
# Acc buffer size in Bytes
ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
# Log of out buffer size in Bytes
LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
# Out buffer size in Bytes
OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
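# Worked example with the defaults above: LOG_OUT_BUFF_SIZE = 17 + 3 - 5 = 15,
# so OUT_BUFF_SIZE = 1 << 15 = 32768 Bytes (one quarter of the 2^17-Byte acc
# buffer, since outputs are 8-bit and accumulators 32-bit).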
# Update ADD_CFLAGS
ADD_CFLAGS += \
-D$(TARGET) \
-DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_BLOCK_IN) -DLOG_BLOCK_OUT=$(LOG_BLOCK_OUT) \
-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
\ No newline at end of file
......@@ -4,15 +4,31 @@
* \brief VTA driver for Pynq board.
*/
#ifdef __cplusplus
extern "C" {
#endif
#include "vta_pynq_driver.h"
#ifdef __cplusplus
#include <vta/driver.h>
#include "./pynq_driver.h"
void* VTAMemAlloc(size_t size, int cached) {
return cma_alloc(size, cached);
}
void VTAMemFree(void* buf) {
cma_free(buf);
}
uint32_t VTAGetMemPhysAddr(void* buf) {
return cma_get_phy_addr(buf);
}
void VTAFlushCache(void* buf, int size) {
xlnkFlushCache(buf, size);
}
void VTAInvalidateCache(void* buf, int size) {
xlnkInvalidateCache(buf, size);
}
#endif
void *MapRegister(uint32_t addr, size_t length) {
void *VTAMapRegister(uint32_t addr, size_t length) {
// Align the base address with the pages
uint32_t virt_base = addr & ~(getpagesize() - 1);
......@@ -24,21 +40,21 @@ void *MapRegister(uint32_t addr, size_t length) {
return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base);
}
void UnmapRegister(void *vta, size_t length) {
void VTAUnmapRegister(void *vta, size_t length) {
// Unmap memory
int status = munmap(vta, length);
assert(status==0);
}
void WriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
*((volatile uint32_t *) (((char *) base_addr) + offset)) = val;
}
uint32_t ReadMappedReg(void* base_addr, uint32_t offset) {
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
return *((volatile uint32_t *) (((char *) base_addr) + offset));
}
void ProgramVTA(const char* bitstream) {
void VTAProgram(const char* bitstream) {
int elem;
FILE *src, *dst, *partial;
......
......@@ -23,7 +23,7 @@ extern "C" {
#include <unistd.h>
#ifdef __arm__
#include "libxlnk_cma.h"
#include <libxlnk_cma.h>
#else
void* cma_alloc(size_t size, int cached);
void cma_free(void* buf);
......@@ -32,31 +32,6 @@ void xlnkFlushCache(void* buf, int size);
void xlnkInvalidateCache(void* buf, int size);
#endif
/*! \brief VTA command handle */
typedef void * VTAHandle;
/*! \brief DMA command handle */
typedef struct {
/*! \brief Register map to the AXI DMA control registers*/
void *dma_register_map;
/*! \brief Transmit data descriptor*/
void *mm2s_descriptor_register_map;
/*! \brief Receive data descriptor*/
void *s2mm_descriptor_register_map;
/*! \brief Transmit data descriptor physical address*/
uint32_t mm2s_descriptor_phy;
/*! \brief Receive data descriptor physical address*/
uint32_t s2mm_descriptor_phy;
/*! \brief Descriptor size */
uint32_t descriptor_size;
/*! \brief Transaction count for tx channel */
uint32_t mm2s_count;
/*! \brief Transaction count for rx channel */
uint32_t s2mm_count;
/*! \brief Multi-channel mode enable */
int multichannel_en;
} DMAHandle;
/*! \brief partial bitstream status file path */
#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
/*! \brief bitstream destination file path */
......@@ -99,52 +74,8 @@ typedef struct {
*/
#define VTA_STORE_ADDR 0x43C30000
/*! \brief Memory management constants with libxlnk_cma */
#define CACHED 1
/*! \brief Memory management constants with libxlnk_cma */
#define NOT_CACHED 0
/*! \brief log2 of SDS buffer size limit */
#define LOG_MAX_XFER 22
/*! \brief SDS buffer size limit */
#define MAX_XFER (1<<LOG_MAX_XFER)
/*!
* \brief Returns a memory map to FPGA configuration registers.
* \param addr The base physical address of the configuration registers.
* \param length The size of the memory mapped region in bytes.
* \return A pointer to the memory mapped region.
*/
void *MapRegister(unsigned addr, size_t length);
/*!
* \brief Deletes the configuration register memory map.
* \param vta The memory mapped region.
* \param length The size of the memory mapped region in bytes.
*/
void UnmapRegister(void *vta, size_t length);
/*!
* \brief Writes to a memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to write to.
* \param val The value to be written to the memory mapped register.
*/
void WriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
/*!
* \brief Reads from the memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to read from.
* \return The value read from the memory mapped register.
*/
unsigned ReadMappedReg(VTAHandle vta_base, unsigned offset);
/*!
* \brief Programs the bitstream onto the FPGA.
* \param bitstream The path to the bit stream file.
*/
void ProgramVTA(const char* bitstream);
/*! \brief Buffer size limit */
#define MAX_XFER (1<<22)
#ifdef __cplusplus
}
......
/*!
* Copyright (c) 2018 by Contributors
* \file vta_runtime.cc
* \brief VTA runtime for PYNQ in C++11
*/
#include <cassert>
#include <cstring>
#include <vector>
#include <thread>
#include <memory>
#include <atomic>
#include <vta/driver.h>
#include <vta/hw_spec.h>
#include <vta/runtime.h>
#ifdef PYNQ_TARGET
#include "./pynq/pynq_driver.h"
#endif //PYNQ_TARGET
namespace vta {
/*! \brief Enable coherent access between VTA and CPU. */
static const bool kBufferCoherent = true;
/*!
* \brief Data buffer represents data on CMA.
*/
struct DataBuffer {
/*! \return Virtual address of the data. */
void* virt_addr() const {
return data_;
}
/*! \return Physical address of the data. */
uint32_t phy_addr() const {
return phy_addr_;
}
/*!
* \brief Invalidate the cache of a given location in the data buffer.
* \param offset The offset to the data.
* \param size The size of the data.
*/
void InvalidateCache(size_t offset, size_t size) {
if (!kBufferCoherent) {
VTAInvalidateCache(reinterpret_cast<void*>(phy_addr_ + offset), size);
}
}
/*!
* \brief Flush the cache of a given location in the data buffer.
* \param offset The offset to the data.
* \param size The size of the data.
*/
void FlushCache(size_t offset, size_t size) {
if (!kBufferCoherent) {
VTAFlushCache(reinterpret_cast<void*>(phy_addr_ + offset), size);
}
}
/*!
* \brief Allocate a buffer of a given size.
* \param size The size of the buffer.
*/
static DataBuffer* Alloc(size_t size) {
void* data = VTAMemAlloc(size, 1);
assert(data != nullptr);
DataBuffer* buffer = new DataBuffer();
buffer->data_ = data;
buffer->phy_addr_ = VTAGetMemPhysAddr(data);
return buffer;
}
/*!
* \brief Free the data buffer.
* \param buffer The buffer to be freed.
*/
static void Free(DataBuffer* buffer) {
VTAMemFree(buffer->data_);
delete buffer;
}
/*!
* \brief Create data buffer header from buffer ptr.
* \param buffer The buffer pointer.
* \return The corresponding data buffer header.
*/
static DataBuffer* FromHandle(const void* buffer) {
return const_cast<DataBuffer*>(
reinterpret_cast<const DataBuffer*>(buffer));
}
private:
/*! \brief The internal data. */
void* data_;
/*! \brief The physical address of the buffer, excluding header. */
uint32_t phy_addr_;
};
/*!
* \brief Micro op kernel.
* Contains functions to construct the kernel with prefix Push.
*/
class UopKernel {
public:
/*! \brief Loop information. */
struct LoopEntry {
uint32_t extent;
uint32_t dst_factor;
uint32_t src_factor;
uint32_t wgt_factor;
};
/*!
* \brief Construct UopKernel with signature.
* \param signature The pointer to signature.
* \param nbytes Number of bytes.
*/
UopKernel(const char* signature, int nbytes)
: signature_(signature, signature + nbytes) {
}
/*!
* \brief Verify if the signature is correct.
* \param signature Signature ptr.
* \param nbytes Number of bytes.
*/
bool MatchSignature(void* signature, int nbytes) const {
if (static_cast<size_t>(nbytes) != signature_.size()) return false;
return memcmp(signature, signature_.data(), nbytes) == 0;
}
/*! \return Whether the kernel is cached in SRAM. */
bool cached() const {
return sram_begin_ != sram_end_;
}
/*! \return The length of the micro op sequence. */
size_t size() const {
return seq_.size();
}
/*! \return The micro-op data. */
const VTAUop* data() const {
return seq_.data();
}
/*! \return The loop structure. */
const std::vector<LoopEntry>& loop() const {
return loop_;
}
/*!
* \brief Declare loop start.
* \param extent The loop extent.
* \param dst_factor Loop factor of accum index.
* \param src_factor Loop factor of input index
* \param wgt_factor Loop factor of weight index.
*/
void PushLoopBegin(uint32_t extent,
uint32_t dst_factor,
uint32_t src_factor,
uint32_t wgt_factor) {
LoopEntry le;
le.extent = extent;
le.dst_factor = dst_factor;
le.src_factor = src_factor;
le.wgt_factor = wgt_factor;
assert(seq_.size() == 0);
assert(loop_.size() < 2);
loop_.push_back(le);
++loop_ptr_;
}
/*!
* \brief Declare loop end.
*/
void PushLoopEnd() {
--loop_ptr_;
}
/*!
* \brief Push micro op into kernel.
* \param mode Mode of the micro-op: 0 for GEMM, 1 for ALU.
* \param reset_out Resets the accum to 0.
* \param dst_index The accum memory index.
* \param src_index The input memory (gemm) / accum memory (alu) index.
* \param wgt_index The weight memory index.
* \param opcode The ALU opcode.
* \param use_imm Use immediate in ALU mode if set to true.
* \param imm_val Immediate value in ALU mode.
*/
void Push(uint32_t mode,
uint32_t reset_out,
uint32_t dst_index,
uint32_t src_index,
uint32_t wgt_index,
uint32_t opcode,
uint32_t use_imm,
uint32_t imm_val) {
// The loop nest structure
VerifyDep(dst_index);
VTAUop op;
op.reset_out = reset_out;
op.dst_idx = dst_index;
op.src_idx = src_index;
op.wgt_idx = wgt_index;
seq_.push_back(op);
// Ensure that mode is consistent if set
if (mode_==0xFFFFFFFF) {
mode_ = mode;
} else {
assert(mode_==mode);
}
// Check kernel op and imm/imm_val in ALU mode
if (mode==1) {
if (opcode_==0xFFFFFFFF) {
opcode_=opcode;
use_imm_=use_imm;
imm_val_=imm_val;
} else {
assert(opcode_==opcode);
assert(use_imm_==use_imm);
assert(imm_val_==imm_val);
}
}
}
/*! \brief Dump kernel micro ops to stdout. */
void Dump() {
uint32_t size = seq_.size();
printf("There are %u uops\n", size);
for (uint32_t i = 0; i < size; ++i) {
printf("[%04u]\t acc=%u, inp=%u, wgt=%u, reset_out=%u\n",
i,
seq_[i].dst_idx,
seq_[i].src_idx,
seq_[i].wgt_idx,
seq_[i].reset_out);
}
printf("\n");
}
public:
// The kernel's mode, opcode, immediate setting and value
uint32_t mode_{0xFFFFFFFF}; // UOP type: 0xFFFFFFFF - unset, 0 - GEMM, 1 - ALU
uint32_t opcode_{0xFFFFFFFF};
bool use_imm_{false};
uint16_t imm_val_{0};
private:
// Verify that we don't write to the same acc_mem index two cycles in a row
void VerifyDep(uint32_t dst_index) {
size_t step = std::min(static_cast<size_t>(2U), seq_.size());
for (size_t i = seq_.size() - step; i < seq_.size(); ++i) {
assert(seq_[i].dst_idx != dst_index);
}
}
// The uop buffer
template<int, bool, bool>
friend class UopQueue;
friend class CommandQueue;
// SRAM location if begin != end.
uint32_t sram_begin_{0};
uint32_t sram_end_{0};
// The signature used for verification
std::vector<char> signature_;
// Internal sequence
std::vector<VTAUop> seq_;
// The loop nest structure specific to ALU instructions
std::vector<LoopEntry> loop_;
// The loop pointer
size_t loop_ptr_{0};
};
/*!
* \brief Base class of all queues to send and recv serial data.
* \param kElemBytes Element unit bytes.
* \param kMaxBytes Maximum number of bytes.
* \param kCoherent Whether we have coherent access to the buffer.
* \param kAlwaysCache Whether we should use cached memory.
*/
class BaseQueue {
public:
~BaseQueue() {
if (dram_buffer_ != nullptr) {
VTAMemFree(dram_buffer_);
}
}
/*! \return Content of DRAM buffer. */
char* dram_buffer() const {
return dram_buffer_;
}
/*! \return Physical address of DRAM. */
uint32_t dram_phy_addr() const {
return dram_phy_addr_;
}
/*! \return Whether there is pending information. */
bool pending() const {
return sram_begin_ != sram_end_;
}
/*! \brief Initialize the space of the buffer. */
void InitSpace(uint32_t elem_bytes, uint32_t max_bytes, bool coherent, bool always_cache) {
coherent_ = coherent;
always_cache_ = always_cache;
elem_bytes_ = elem_bytes;
dram_buffer_ = static_cast<char*>(VTAMemAlloc(
max_bytes, coherent || always_cache_));
assert(dram_buffer_ != nullptr);
dram_phy_addr_ = VTAGetMemPhysAddr(dram_buffer_);
}
/*!
* \brief Reset the pointer of the buffer.
* Set SRAM pointer to be the current end.
*/
void Reset() {
dram_begin_ = dram_end_ = 0;
sram_begin_ = sram_end_;
}
void AutoReadBarrier() {
ReadBarrier(elem_bytes_ * 8, 0, dram_end_);
}
/*! \brief Read barrier to make sure that data written by the CPU is visible to VTA. */
void ReadBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) {
if (!coherent_ && always_cache_ && dram_extent != 0) {
dram_begin = dram_begin * elem_bits / 8;
dram_extent = dram_extent * elem_bits / 8;
VTAFlushCache(reinterpret_cast<void*>(dram_phy_addr_ + dram_begin),
dram_extent);
}
}
/*! \brief Write barrier to make sure that data written by VTA is visible to the CPU. */
void WriteBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) {
if (!coherent_ && always_cache_ && dram_extent != 0) {
dram_begin = dram_begin * elem_bits / 8;
dram_extent = dram_extent * elem_bits / 8;
VTAInvalidateCache(reinterpret_cast<void*>(dram_phy_addr_ + dram_begin),
dram_extent);
}
}
protected:
// Cache coherence access
bool coherent_{false};
// Make the buffer cacheable
bool always_cache_{false};
// Element bytes
uint32_t elem_bytes_{0};
// Begin location of current SRAM read in FIFO mode
uint32_t sram_begin_{0};
// End location of current SRAM write in FIFO mode
uint32_t sram_end_{0};
// The current pending offset in DRAM in FIFO mode
uint32_t dram_begin_{0};
// The current pending offset in DRAM in FIFO mode
uint32_t dram_end_{0};
// The buffer in DRAM
char* dram_buffer_{nullptr};
// Physical address of the buffer
uint32_t dram_phy_addr_;
};
/*!
* \brief Micro op buffer that manages the micro op cache.
*/
template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
class UopQueue : public BaseQueue {
public:
void InitSpace() {
BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache);
}
// Push data to the queue
template<typename FAutoSync>
void Push(UopKernel* kernel, FAutoSync fautosync) {
if (kernel->cached()) return;
size_t num_op = kernel->size();
if (dram_end_ + num_op > kMaxElems) {
fautosync();
assert(dram_end_ <= kMaxElems);
}
assert(num_op <= kMaxNumUop);
uint32_t uop_begin = 0;
if (sram_end_ + num_op > kMaxElems) {
// Need to evict
cache_ptr_ = 0;
sram_end_ = num_op;
} else {
uop_begin = sram_end_;
sram_end_ += num_op;
}
// Simple eviction policy
uint32_t evict_begin = cache_ptr_;
for (;cache_ptr_ < cache_.size(); ++cache_ptr_) {
if (cache_[cache_ptr_]->sram_begin_ >= sram_end_) break;
cache_[cache_ptr_]->sram_begin_ = 0;
cache_[cache_ptr_]->sram_end_ = 0;
}
memcpy(dram_buffer_ + dram_end_ * kElemBytes,
kernel->data(),
num_op * kElemBytes);
dram_end_ += num_op;
kernel->sram_begin_ = uop_begin;
kernel->sram_end_ = sram_end_;
assert(uop_begin != sram_end_);
cache_.insert(cache_.begin() + cache_ptr_, kernel);
cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_ptr_);
cache_ptr_ = evict_begin + 1;
}
// Flush as weight load
void FlushUopLoad(VTAMemInsn* insn) {
if (sram_begin_ != sram_end_) {
assert((dram_end_ - dram_begin_) == (sram_end_ - sram_begin_));
insn->memory_type = MEM_ID_UOP;
insn->sram_base = sram_begin_;
insn->dram_base = dram_phy_addr_ / kElemBytes + dram_begin_;
insn->y_size = 1;
insn->x_size = (dram_end_ - dram_begin_);
insn->x_stride = (dram_end_ - dram_begin_);
insn->y_pad_0 = 0;
insn->y_pad_1 = 0;
insn->x_pad_0 = 0;
insn->x_pad_1 = 0;
// Reset indices
sram_begin_ = sram_end_;
dram_begin_ = dram_end_;
}
}
private:
// Cache pointer
uint32_t cache_ptr_{0};
// Cached ring, sorted by sram_begin
std::vector<UopKernel*> cache_;
// Constants
static constexpr int kElemBytes = sizeof(VTAUop);
static constexpr int kMaxNumUop = UOP_BUFF_DEPTH;
static constexpr int kMaxElems = kMaxBytes / kElemBytes;
};
// Internal kernel structure
class UopKernelMap {
public:
// Simple hash map
UopKernel** Get(void* signature,
int nbytes) {
uint32_t key = 0;
assert(nbytes == 0 || nbytes == sizeof(int));
if (nbytes == sizeof(int)) {
memcpy(&key, signature, sizeof(int));
key = key + 1;
}
assert(key < 100);
if (kmap_.size() <= key) {
kmap_.resize(key + 1, nullptr);
}
return &(kmap_[key]);
}
private:
std::vector<UopKernel*> kmap_;
};
enum PipelineStage : int {
kNoneStage = 0,
kLoadStage = 1,
kComputeStage = 2,
kStoreStage = 3
};
// Instruction Queue
template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
class InsnQueue : public BaseQueue {
public:
/*! \brief Initialize the space. */
void InitSpace() {
BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache);
// Initialize the stage
std::fill(pending_pop_prev_, pending_pop_prev_ + 4, 0);
std::fill(pending_pop_next_, pending_pop_next_ + 4, 0);
}
/*! \return The data pointer. */
VTAGenericInsn* data() {
return reinterpret_cast<VTAGenericInsn*>(dram_buffer_);
}
/*! \return Number of instructions. */
uint32_t count() {
return dram_end_;
}
// Insert a pending dependency pop
void DepPop(int from, int to) {
// NOTE: This instruction executes on queue[to]
if (from < to) {
if (pending_pop_prev_[to]) {
this->CommitPendingPop(to);
}
pending_pop_prev_[to] = 1;
} else {
if (pending_pop_next_[to]) {
this->CommitPendingPop(to);
}
pending_pop_next_[to] = 1;
}
// Impossible condition
assert(from != kLoadStage || to != kStoreStage);
assert(from != kStoreStage || to != kLoadStage);
}
// Insert a dependency push
void DepPush(int from, int to) {
// NOTE: this instruction executes on queue[from]
this->CommitPendingPop(from);
if (dram_end_ != 0) {
VTAMemInsn* mptr =
reinterpret_cast<VTAMemInsn*>(dram_buffer_) + dram_end_ - 1;
if (GetPipelineStage(mptr) == from) {
if (from < to && !mptr->push_next_dep) {
// push(LD->C) or push(C->ST)
mptr->push_next_dep = true; return;
} else if (from > to && !mptr->push_prev_dep) {
// push(C->LD) or push(ST->C)
mptr->push_prev_dep = true; return;
}
}
}
if (from < to) {
// Push next dep
PushNoop(from, false, true, false, false);
} else {
// Push prev dep
PushNoop(from, true, false, false, false);
}
}
// Create a new instruction for a GEMM stage
VTAGemInsn* CreateGemInsn() {
return reinterpret_cast<VTAGemInsn*>(
Create(kComputeStage));
}
// Create a new instruction for an ALU stage
VTAAluInsn* CreateAluInsn() {
return reinterpret_cast<VTAAluInsn*>(
Create(kComputeStage));
}
// Create a new instruction for a memory stage
VTAMemInsn* CreateMemInsn(int memory_type) {
return reinterpret_cast<VTAMemInsn*>(
Create(GetMemPipelineStage(memory_type)));
}
// Create a new instruction for a store stage
VTAMemInsn* CreateStoreInsn() {
return reinterpret_cast<VTAMemInsn*>(
Create(kStoreStage));
}
// Rewrite instruction stream to force serial execution
void RewriteForceSerial() {
int insn_count = count();
VTAMemInsn* mem_ptr = reinterpret_cast<VTAMemInsn*>(data());
for (int i = 1; i < insn_count; ++i) {
PipelineStage prev = GetPipelineStage(mem_ptr + i - 1);
PipelineStage now = GetPipelineStage(mem_ptr + i);
if (prev==kLoadStage && now==kComputeStage) {
mem_ptr[i - 1].push_prev_dep = false;
mem_ptr[i - 1].push_next_dep = true;
mem_ptr[i].pop_prev_dep = true;
mem_ptr[i].pop_next_dep = false;
} else if (prev==kComputeStage && now==kLoadStage) {
mem_ptr[i - 1].push_prev_dep = true;
mem_ptr[i - 1].push_next_dep = false;
mem_ptr[i].pop_prev_dep = false;
mem_ptr[i].pop_next_dep = true;
} else if (prev==kStoreStage && now==kComputeStage) {
mem_ptr[i - 1].push_prev_dep = true;
mem_ptr[i - 1].push_next_dep = false;
mem_ptr[i].pop_prev_dep = false;
mem_ptr[i].pop_next_dep = true;
} else if (prev==kComputeStage && now==kStoreStage) {
mem_ptr[i - 1].push_prev_dep = false;
mem_ptr[i - 1].push_next_dep = true;
mem_ptr[i].pop_prev_dep = true;
mem_ptr[i].pop_next_dep = false;
} else {
mem_ptr[i - 1].push_prev_dep = false;
mem_ptr[i - 1].push_next_dep = false;
mem_ptr[i].pop_prev_dep = false;
mem_ptr[i].pop_next_dep = false;
}
}
}
// Helper function: Get Opcode string
const char* getOpcodeString(int opcode, bool use_imm) {
// The string name
if (opcode==ALU_OPCODE_MIN) {
if (use_imm) {
return "min imm";
} else {
return "min";
}
} else if (opcode==ALU_OPCODE_MAX) {
if (use_imm) {
return "max imm";
} else {
return "max";
}
} else if (opcode==ALU_OPCODE_ADD) {
if (use_imm) {
return "add imm";
} else {
return "add";
}
} else if (opcode==ALU_OPCODE_SUB) {
if (use_imm) {
return "sub imm";
} else {
return "sub";
}
} else if (opcode==ALU_OPCODE_MUL) {
if (use_imm) {
return "mul imm";
} else {
return "mul";
}
} else if (opcode==ALU_OPCODE_SHL) {
return "shl";
} else if (opcode==ALU_OPCODE_SHR) {
return "shr";
}
return "unknown op";
}
// Dump instructions in the queue
void DumpInsn() {
// Keep tabs on dependence queues
int l2g_queue = 0;
int g2l_queue = 0;
int s2g_queue = 0;
int g2s_queue = 0;
// Converter
union VTAInsn c;
// Iterate over all instructions
int insn_count = count();
const VTAGenericInsn* insn = data();
printf("There are %u instructions\n", insn_count);
for (int i = 0; i < insn_count; ++i) {
// Fetch instruction and decode opcode
c.generic = insn[i];
printf("INSTRUCTION %u: ", i);
if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) {
if (c.mem.x_size == 0) {
if (c.mem.opcode == OPCODE_STORE) {
printf("NOP-STORE-STAGE\n");
} else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) {
printf("NOP-COMPUTE-STAGE\n");
} else {
printf("NOP-MEMORY-STAGE\n");
}
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
static_cast<int>(c.mem.pop_prev_dep),
static_cast<int>(c.mem.pop_next_dep),
static_cast<int>(c.mem.push_prev_dep),
static_cast<int>(c.mem.push_next_dep));
// Count status in queues
if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) {
if (c.mem.opcode == OPCODE_STORE) {
assert(c.mem.pop_next_dep == false);
assert(c.mem.push_next_dep == false);
if (c.mem.pop_prev_dep) g2s_queue--;
if (c.mem.push_prev_dep) s2g_queue++;
} else if (c.mem.opcode == OPCODE_LOAD &&
(c.mem.memory_type == MEM_ID_INP ||
c.mem.memory_type == MEM_ID_WGT) ) {
assert(c.mem.pop_prev_dep == false);
assert(c.mem.push_prev_dep == false);
if (c.mem.pop_next_dep) g2l_queue--;
if (c.mem.push_next_dep) l2g_queue++;
} else {
if (c.mem.pop_prev_dep) l2g_queue--;
if (c.mem.push_prev_dep) g2l_queue++;
if (c.mem.pop_next_dep) s2g_queue--;
if (c.mem.push_next_dep) g2s_queue++;
}
} else if (c.mem.opcode == OPCODE_GEMM) {
// Count status in queues
if (c.gemm.pop_prev_dep) l2g_queue--;
if (c.gemm.push_prev_dep) g2l_queue++;
if (c.gemm.pop_next_dep) s2g_queue--;
if (c.gemm.push_next_dep) g2s_queue++;
}
printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
continue;
}
// Print instruction field information
if (c.mem.opcode==OPCODE_LOAD) {
printf("LOAD ");
if (c.mem.memory_type == MEM_ID_UOP) printf("UOP\n");
if (c.mem.memory_type == MEM_ID_WGT) printf("WGT\n");
if (c.mem.memory_type == MEM_ID_INP) printf("INP\n");
if (c.mem.memory_type == MEM_ID_ACC) printf("ACC\n");
}
if (c.mem.opcode==OPCODE_STORE) {
printf("STORE\n");
}
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
static_cast<int>(c.mem.pop_prev_dep),
static_cast<int>(c.mem.pop_next_dep),
static_cast<int>(c.mem.push_prev_dep),
static_cast<int>(c.mem.push_next_dep));
printf("\tDRAM: 0x%08x, SRAM:0x%04x\n",
static_cast<int>(c.mem.dram_base),
static_cast<int>(c.mem.sram_base));
printf("\ty: size=%d, pad=[%d, %d]\n",
static_cast<int>(c.mem.y_size),
static_cast<int>(c.mem.y_pad_0),
static_cast<int>(c.mem.y_pad_1));
printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n",
static_cast<int>(c.mem.x_size),
static_cast<int>(c.mem.x_stride),
static_cast<int>(c.mem.x_pad_0),
static_cast<int>(c.mem.x_pad_1));
} else if (c.mem.opcode==OPCODE_GEMM) {
// Print instruction field information
printf("GEMM\n");
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
static_cast<int>(c.mem.pop_prev_dep),
static_cast<int>(c.mem.pop_next_dep),
static_cast<int>(c.mem.push_prev_dep),
static_cast<int>(c.mem.push_next_dep));
printf("\trange (%d, %d)\n",
static_cast<int>(c.gemm.uop_bgn),
static_cast<int>(c.gemm.uop_end));
printf("\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
static_cast<int>(c.gemm.iter_out),
static_cast<int>(c.gemm.wgt_factor_out),
static_cast<int>(c.gemm.src_factor_out),
static_cast<int>(c.gemm.dst_factor_out));
printf("\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
static_cast<int>(c.gemm.iter_in),
static_cast<int>(c.gemm.wgt_factor_in),
static_cast<int>(c.gemm.src_factor_in),
static_cast<int>(c.gemm.dst_factor_in));
} else if (c.mem.opcode == OPCODE_ALU) {
// Print instruction field information
printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
static_cast<int>(c.mem.pop_prev_dep),
static_cast<int>(c.mem.pop_next_dep),
static_cast<int>(c.mem.push_prev_dep),
static_cast<int>(c.mem.push_next_dep));
printf("\trange (%d, %d)\n",
static_cast<int>(c.alu.uop_bgn),
static_cast<int>(c.alu.uop_end));
printf("\touter loop - iter: %d, dst: %d, src: %d\n",
static_cast<int>(c.alu.iter_out),
static_cast<int>(c.alu.dst_factor_out),
static_cast<int>(c.alu.src_factor_out));
printf("\tinner loop - iter: %d, dst: %d, src: %d\n",
static_cast<int>(c.alu.iter_in),
static_cast<int>(c.alu.dst_factor_in),
static_cast<int>(c.alu.src_factor_in));
} else if (c.mem.opcode == OPCODE_FINISH) {
printf("FINISH\n");
}
// Count status in queues
if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) {
if (c.mem.opcode == OPCODE_STORE) {
assert(c.mem.pop_next_dep == false);
assert(c.mem.push_next_dep == false);
if (c.mem.pop_prev_dep) g2s_queue--;
if (c.mem.push_prev_dep) s2g_queue++;
} else if (c.mem.opcode == OPCODE_LOAD &&
(c.mem.memory_type == MEM_ID_INP ||
c.mem.memory_type == MEM_ID_WGT) ) {
assert(c.mem.pop_prev_dep == false);
assert(c.mem.push_prev_dep == false);
if (c.mem.pop_next_dep) g2l_queue--;
if (c.mem.push_next_dep) l2g_queue++;
} else {
if (c.mem.pop_prev_dep) l2g_queue--;
if (c.mem.push_prev_dep) g2l_queue++;
if (c.mem.pop_next_dep) s2g_queue--;
if (c.mem.push_next_dep) g2s_queue++;
}
} else if (c.mem.opcode == OPCODE_GEMM ||
c.mem.opcode == OPCODE_ALU) {
// Count status in queues
if (c.gemm.pop_prev_dep) l2g_queue--;
if (c.gemm.push_prev_dep) g2l_queue++;
if (c.gemm.pop_next_dep) s2g_queue--;
if (c.gemm.push_next_dep) g2s_queue++;
}
printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
}
}
// Commit all pending pop of corresponding stage
void CommitPendingPop(int stage) {
// Handle the LD<->compute queue
// NOTE: pop executes on target(stage)
assert(stage > 0 && stage < 4);
if (pending_pop_prev_[stage] ||
pending_pop_next_[stage]) {
PushNoop(stage, false, false,
pending_pop_prev_[stage],
pending_pop_next_[stage]);
pending_pop_prev_[stage] = 0;
pending_pop_next_[stage] = 0;
}
}
void CommitPending() {
for (int i = kLoadStage; i <= kStoreStage; ++i) {
CommitPendingPop(i);
}
}
bool PendingPop() {
for (int i = kLoadStage; i <= kStoreStage; ++i) {
if (pending_pop_prev_[i]) return true;
if (pending_pop_next_[i]) return true;
}
return false;
}
protected:
/*! \return A pointer to a newly appended instruction slot in the buffer. */
VTAGenericInsn* NextInsn() {
VTAGenericInsn* insn = data() + dram_end_;
++dram_end_;
assert(dram_end_ < kMaxElems);
return insn;
}
// Create a new instruction for a given stage
VTAGenericInsn* Create(PipelineStage stage) {
VTAGenericInsn* gptr = NextInsn();
VTAMemInsn* mptr = reinterpret_cast<VTAMemInsn*>(gptr);
mptr->pop_prev_dep = pending_pop_prev_[stage];
mptr->pop_next_dep = pending_pop_next_[stage];
mptr->push_prev_dep = false;
mptr->push_next_dep = false;
pending_pop_prev_[stage] = 0;
pending_pop_next_[stage] = 0;
return gptr;
}
// Get stage of the memory
static PipelineStage GetMemPipelineStage(int memory_type) {
if (memory_type == MEM_ID_ACC) return kComputeStage;
if (memory_type == MEM_ID_UOP) return kComputeStage;
return kLoadStage;
}
// Get stage of the computation
static PipelineStage GetPipelineStage(VTAMemInsn* insn) {
if (insn->opcode == OPCODE_GEMM) return kComputeStage;
if (insn->opcode == OPCODE_ALU) return kComputeStage;
if (insn->opcode == OPCODE_LOAD) {
if (insn->x_size == 0) return kNoneStage;
if (insn->memory_type == MEM_ID_ACC) return kComputeStage;
if (insn->memory_type == MEM_ID_UOP) return kComputeStage;
return kLoadStage;
}
if (insn->opcode == OPCODE_STORE) {
// FIXME: Right now memory_type is a 2-bit field which means that MEM_ID_OUT will appear as 0
// For now we'll refrain from checking the memory_type to avoid an assertion error...
return kStoreStage;
}
assert(false);
return kNoneStage;
}
// Push no-op
void PushNoop(int stage,
bool push_prev_dep, bool push_next_dep,
bool pop_prev_dep, bool pop_next_dep) {
VTAMemInsn* insn = reinterpret_cast<VTAMemInsn*>(NextInsn());
insn->opcode = (stage==kStoreStage ? OPCODE_STORE : OPCODE_LOAD);
insn->push_prev_dep = push_prev_dep;
insn->push_next_dep = push_next_dep;
insn->pop_prev_dep = pop_prev_dep;
insn->pop_next_dep = pop_next_dep;
insn->sram_base = 0;
insn->dram_base = 0;
insn->y_size = 0;
insn->x_size = 0;
insn->x_stride = 0;
insn->y_pad_0 = 0;
insn->y_pad_1 = 0;
insn->x_pad_0 = 0;
insn->x_pad_1 = 0;
insn->memory_type = (stage == kLoadStage ? MEM_ID_INP : MEM_ID_UOP);
}
private:
// Pending pop of each instruction queue, qid=0 is not used
int pending_pop_prev_[4];
int pending_pop_next_[4];
static constexpr int kElemBytes = sizeof(VTAGenericInsn);
static constexpr int kMaxElems = kMaxBytes / kElemBytes;
};
/*!
* \brief The command queue object that handles the request.
*/
class CommandQueue {
public:
CommandQueue() {
this->InitSpace();
}
void InitSpace() {
uop_queue_.InitSpace();
insn_queue_.InitSpace();
// VTA stage handles
vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
printf("Initialize VTACommandHandle...\n");
}
~CommandQueue() {
// Close VTA stage handle
VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE);
VTAUnmapRegister(vta_load_handle_, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle_, VTA_RANGE);
VTAUnmapRegister(vta_store_handle_, VTA_RANGE);
printf("Close VTACommandhandle...\n");
}
uint32_t GetElemBytes(uint32_t memory_id) {
switch (memory_id){
case MEM_ID_UOP: return UOP_ELEM_BYTES;
case MEM_ID_INP: return INP_ELEM_BYTES;
case MEM_ID_WGT: return WGT_ELEM_BYTES;
case MEM_ID_ACC: return ACC_ELEM_BYTES;
case MEM_ID_OUT: return INP_ELEM_BYTES;
default: break;
}
printf("Memory id not recognized: %d\n", memory_id);
assert(false);
return 0;
}
void LoadBuffer2D(void* src_dram_addr,
uint32_t src_elem_offset,
uint32_t x_size,
uint32_t y_size,
uint32_t x_stride,
uint32_t x_pad_before,
uint32_t y_pad_before,
uint32_t x_pad_after,
uint32_t y_pad_after,
uint32_t dst_sram_index,
uint32_t dst_memory_type) {
VTAMemInsn* insn = insn_queue_.CreateMemInsn(dst_memory_type);
insn->opcode = OPCODE_LOAD;
insn->memory_type = dst_memory_type;
insn->sram_base = dst_sram_index;
DataBuffer* src = DataBuffer::FromHandle(src_dram_addr);
insn->dram_base = src->phy_addr() / GetElemBytes(dst_memory_type) + src_elem_offset;
insn->y_size = y_size;
insn->x_size = x_size;
insn->x_stride = x_stride;
insn->y_pad_0 = y_pad_before;
insn->y_pad_1 = y_pad_after;
insn->x_pad_0 = x_pad_before;
insn->x_pad_1 = x_pad_after;
this->CheckInsnOverFlow();
}
void StoreBuffer2D(uint32_t src_sram_index,
uint32_t src_memory_type,
void* dst_dram_addr,
uint32_t dst_elem_offset,
uint32_t x_size,
uint32_t y_size,
uint32_t x_stride) {
VTAMemInsn* insn = insn_queue_.CreateStoreInsn();
insn->opcode = OPCODE_STORE;
insn->memory_type = src_memory_type;
insn->sram_base = src_sram_index;
DataBuffer* dst = DataBuffer::FromHandle(dst_dram_addr);
insn->dram_base = dst->phy_addr() / GetElemBytes(src_memory_type) + dst_elem_offset;
insn->y_size = y_size;
insn->x_size = x_size;
insn->x_stride = x_stride;
insn->y_pad_0 = 0;
insn->y_pad_1 = 0;
insn->x_pad_0 = 0;
insn->x_pad_1 = 0;
this->CheckInsnOverFlow();
}
void DepPush(int from_qid, int to_qid) {
insn_queue_.DepPush(from_qid, to_qid);
}
void DepPop(int from_qid, int to_qid) {
insn_queue_.DepPop(from_qid, to_qid);
}
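// Example (hypothetical call sequence): to make the compute stage wait on a
// load, the producer pushes a token and the consumer later pops it:
//   DepPush(kLoadStage, kComputeStage);  // load signals compute
//   DepPop(kLoadStage, kComputeStage);   // compute waits on load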
void ReadBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) {
if (!(debug_flag_ & VTA_DEBUG_SKIP_READ_BARRIER)) {
uint32_t elem_bytes = (elem_bits + 8 - 1) / 8;
DataBuffer::FromHandle(buffer)->FlushCache(
elem_bytes * start, elem_bytes * extent);
}
}
void WriteBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) {
if (!(debug_flag_ & VTA_DEBUG_SKIP_WRITE_BARRIER)) {
uint32_t elem_bytes = (elem_bits + 8 - 1) / 8;
DataBuffer::FromHandle(buffer)->InvalidateCache(
elem_bytes * start, elem_bytes * extent);
}
}
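// Both barriers round the element width up to whole bytes: for example, a
// 12-bit element occupies (12 + 7) / 8 = 2 bytes, so a barrier over 4 such
// elements starting at index 2 covers bytes [4, 12) of the buffer.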
void Synchronize(uint32_t wait_cycles) {
// Insert dependences to force serialization
if (debug_flag_ & VTA_DEBUG_FORCE_SERIAL) {
insn_queue_.RewriteForceSerial();
}
// This will issue finish after last store finishes
insn_queue_.DepPush(kStoreStage, kComputeStage);
insn_queue_.DepPush(kLoadStage, kComputeStage);
insn_queue_.DepPop(kStoreStage, kComputeStage);
insn_queue_.DepPop(kLoadStage, kComputeStage);
insn_queue_.CommitPendingPop(kComputeStage);
// NOTE: the FINISH instruction cannot carry pop dependence flags
VTAGemInsn* insn = insn_queue_.CreateGemInsn();
insn->opcode = OPCODE_FINISH;
assert(!insn_queue_.PendingPop());
// Return early if there are no instructions to execute at all
if (insn_queue_.count() == 0) return;
// Synchronization for the queues
uop_queue_.AutoReadBarrier();
insn_queue_.AutoReadBarrier();
// Dump instructions if debug enabled
if (debug_flag_ & VTA_DEBUG_DUMP_INSN) {
insn_queue_.DumpInsn();
}
// Make sure that the last instruction is a finish instruction
assert(reinterpret_cast<VTAMemInsn*>(
insn_queue_.data())[insn_queue_.count()-1].opcode == OPCODE_FINISH);
#ifdef PYNQ_TARGET
// Make sure that we don't exceed contiguous physical memory limits
assert(insn_queue_.count() < MAX_XFER);
// NOTE: Register address map is derived from the auto-generated
// driver files available under hardware/build/vivado/<design>/export/driver
// FETCH @ 0x10 : Data signal of insn_count_V
VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_queue_.count());
// FETCH @ 0x18 : Data signal of insns_V
VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_queue_.dram_phy_addr());
// LOAD @ 0x10 : Data signal of inputs_V
VTAWriteMappedReg(vta_load_handle_, 0x10, 0);
// LOAD @ 0x18 : Data signal of weight_V
VTAWriteMappedReg(vta_load_handle_, 0x18, 0);
// COMPUTE @ 0x20 : Data signal of uops_V
VTAWriteMappedReg(vta_compute_handle_, 0x20, 0);
// COMPUTE @ 0x28 : Data signal of biases_V
VTAWriteMappedReg(vta_compute_handle_, 0x28, 0);
// STORE @ 0x10 : Data signal of outputs_V
VTAWriteMappedReg(vta_store_handle_, 0x10, 0);
// VTA start
VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START);
VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART);
VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART);
VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART);
// Loop until the VTA is done
unsigned t, flag = 0;
for (t = 0; t < wait_cycles; ++t) {
flag = VTAReadMappedReg(vta_compute_handle_, 0x18);
if (flag == VTA_DONE) break;
std::this_thread::yield();
}
// Report error if timeout
assert(t < wait_cycles);
#endif //PYNQ_TARGET
// Reset buffers
uop_queue_.Reset();
insn_queue_.Reset();
}
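// Example: a blocking flush with an effectively unbounded timeout (this is
// exactly what AutoSync below does):
//   Synchronize(1u << 31);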
// Get the kernel currently being recorded
UopKernel* record_kernel() const {
assert(record_kernel_ != nullptr);
return record_kernel_;
}
// Set debug flag
void SetDebugFlag(int debug_flag) {
debug_flag_ = debug_flag;
}
void PushGEMMOp(void** uop_handle,
int (*finit)(void*),
void* signature,
int nbytes) {
UopKernelMap** uptr = reinterpret_cast<UopKernelMap**>(uop_handle);
if (uptr[0] == nullptr) {
uptr[0] = new UopKernelMap();
}
UopKernel** kptr = uptr[0]->Get(signature, nbytes);
if (kptr[0] == nullptr) {
record_kernel_ = new UopKernel(static_cast<char*>(signature), nbytes);
assert((*finit)(signature) == 0);
kptr[0] = static_cast<UopKernel*>(record_kernel_);
if (debug_flag_ & VTA_DEBUG_DUMP_UOP) {
record_kernel_->Dump();
}
record_kernel_ = nullptr;
}
this->PushGEMMOp(static_cast<UopKernel*>(kptr[0]));
this->CheckInsnOverFlow();
}
void PushALUUop(void** uop_handle,
int (*finit)(void*),
void* signature,
int nbytes) {
UopKernelMap** uptr = reinterpret_cast<UopKernelMap**>(uop_handle);
if (uptr[0] == nullptr) {
uptr[0] = new UopKernelMap();
}
UopKernel** kptr = uptr[0]->Get(signature, nbytes);
if (kptr[0] == nullptr) {
record_kernel_ = new UopKernel(static_cast<char*>(signature), nbytes);
assert((*finit)(signature) == 0);
kptr[0] = static_cast<UopKernel*>(record_kernel_);
if (debug_flag_ & VTA_DEBUG_DUMP_UOP) {
record_kernel_->Dump();
}
record_kernel_ = nullptr;
}
this->PushALUUop(static_cast<UopKernel*>(kptr[0]));
this->CheckInsnOverFlow();
}
static std::shared_ptr<CommandQueue>& ThreadLocal() {
static std::shared_ptr<CommandQueue> inst =
std::make_shared<CommandQueue>();
return inst;
}
static void Shutdown() {
ThreadLocal().reset();
}
private:
// Push GEMM uop to the command buffer
void PushGEMMOp(UopKernel* kernel) {
uop_queue_.Push(kernel,
[this]() { this->AutoSync(); });
if (uop_queue_.pending()) {
VTAMemInsn* insn = insn_queue_.CreateMemInsn(MEM_ID_UOP);
insn->opcode = OPCODE_LOAD;
uop_queue_.FlushUopLoad(insn);
}
VTAGemInsn* insn = insn_queue_.CreateGemInsn();
insn->opcode = OPCODE_GEMM;
insn->uop_bgn = kernel->sram_begin_;
insn->uop_end = kernel->sram_end_;
const std::vector<UopKernel::LoopEntry> &loop = kernel->loop();
if (loop.size() > 0) {
insn->iter_out = loop[0].extent;
insn->wgt_factor_out = loop[0].wgt_factor;
insn->src_factor_out = loop[0].src_factor;
insn->dst_factor_out = loop[0].dst_factor;
} else {
insn->iter_out = 1;
insn->wgt_factor_out = 0;
insn->src_factor_out = 0;
insn->dst_factor_out = 0;
}
if (loop.size() > 1) {
insn->iter_in = loop[1].extent;
insn->wgt_factor_in = loop[1].wgt_factor;
insn->src_factor_in = loop[1].src_factor;
insn->dst_factor_in = loop[1].dst_factor;
} else {
insn->iter_in = 1;
insn->wgt_factor_in = 0;
insn->src_factor_in = 0;
insn->dst_factor_in = 0;
}
}
// Push ALU uop to the command buffer
void PushALUUop(UopKernel* kernel) {
uop_queue_.Push(kernel,
[this]() { this->AutoSync(); });
if (uop_queue_.pending()) {
VTAMemInsn* insn = insn_queue_.CreateMemInsn(MEM_ID_UOP);
insn->opcode = OPCODE_LOAD;
uop_queue_.FlushUopLoad(insn);
}
VTAAluInsn* insn = insn_queue_.CreateAluInsn();
insn->opcode = OPCODE_ALU;
insn->uop_bgn = kernel->sram_begin_;
insn->uop_end = kernel->sram_end_;
insn->alu_opcode = kernel->opcode_;
insn->use_imm = kernel->use_imm_;
insn->imm = kernel->imm_val_;
const std::vector<UopKernel::LoopEntry> &loop = kernel->loop();
if (loop.size() == 0) {
insn->iter_out = 1;
insn->dst_factor_out = 0;
insn->src_factor_out = 0;
insn->iter_in = 1;
insn->dst_factor_in = 0;
insn->src_factor_in = 0;
} else if (loop.size() == 1) {
insn->iter_out = 1;
insn->dst_factor_out = 0;
insn->src_factor_out = 0;
insn->iter_in = loop[0].extent;
insn->dst_factor_in = loop[0].dst_factor;
insn->src_factor_in = loop[0].src_factor;
} else {
insn->iter_out = loop[0].extent;
insn->dst_factor_out = loop[0].dst_factor;
insn->src_factor_out = loop[0].src_factor;
insn->iter_in = loop[1].extent;
insn->dst_factor_in = loop[1].dst_factor;
insn->src_factor_in = loop[1].src_factor;
}
}
void CheckInsnOverFlow() {
// Each API call commits at most one pending store, one pending load, and
// one uop load, so checking the instruction count once per call suffices
if (insn_queue_.count() >= MAX_XFER) {
this->AutoSync();
}
}
// Auto-sync when the instruction queue overflows
void AutoSync() {
this->Synchronize(1u << 31);
}
// VTA handles (register maps)
VTAHandle vta_fetch_handle_{nullptr};
VTAHandle vta_load_handle_{nullptr};
VTAHandle vta_compute_handle_{nullptr};
VTAHandle vta_store_handle_{nullptr};
// Internal debug flag
int debug_flag_{0};
// The kernel we are currently recording
UopKernel* record_kernel_{nullptr};
// Micro op queue
UopQueue<MAX_XFER, true, true> uop_queue_;
// Instruction queue
InsnQueue<MAX_XFER, true, true> insn_queue_;
};
} // namespace vta
VTACommandHandle VTATLSCommandHandle() {
return vta::CommandQueue::ThreadLocal().get();
}
void VTARuntimeShutdown() {
vta::CommandQueue::Shutdown();
}
void* VTABufferAlloc(VTACommandHandle cmd, size_t size) {
return vta::DataBuffer::Alloc(size);
}
void VTABufferFree(VTACommandHandle cmd, void* buffer) {
vta::DataBuffer::Free(vta::DataBuffer::FromHandle(buffer));
}
void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) {
return vta::DataBuffer::FromHandle(buffer)->virt_addr();
}
void VTABufferCopy(VTACommandHandle cmd,
const void* from,
size_t from_offset,
void* to,
size_t to_offset,
size_t size,
int kind_mask) {
vta::DataBuffer* from_buffer = nullptr;
vta::DataBuffer* to_buffer = nullptr;
if (kind_mask & 2) {
from_buffer = vta::DataBuffer::FromHandle(from);
from = from_buffer->virt_addr();
}
if (kind_mask & 1) {
to_buffer = vta::DataBuffer::FromHandle(to);
to = to_buffer->virt_addr();
}
if (from_buffer) {
from_buffer->InvalidateCache(from_offset, size);
}
memcpy(static_cast<char*>(to) + to_offset,
static_cast<const char*>(from) + from_offset,
size);
if (to_buffer) {
to_buffer->FlushCache(to_offset, size);
}
}
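// Example (hypothetical values): a copy from a VTA buffer back to host memory
// sets only bit 1 of kind_mask, marking the source as a device buffer:
//   VTABufferCopy(cmd, vta_buf, 0, host_ptr, 0, nbytes, /*kind_mask=*/2);
// The source cache lines are invalidated before the memcpy so that stale data
// is not read from the CPU cache.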
void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) {
static_cast<vta::CommandQueue*>(cmd)->
SetDebugFlag(debug_flag);
}
void VTAWriteBarrier(VTACommandHandle cmd,
void* buffer, uint32_t elem_bits,
uint32_t start, uint32_t extent) {
static_cast<vta::CommandQueue*>(cmd)->
WriteBarrier(buffer, elem_bits, start, extent);
}
void VTAReadBarrier(VTACommandHandle cmd,
void* buffer, uint32_t elem_bits,
uint32_t start, uint32_t extent) {
static_cast<vta::CommandQueue*>(cmd)->
ReadBarrier(buffer, elem_bits, start, extent);
}
void VTALoadBuffer2D(VTACommandHandle cmd,
void* src_dram_addr,
uint32_t src_elem_offset,
uint32_t x_size,
uint32_t y_size,
uint32_t x_stride,
uint32_t x_pad_before,
uint32_t y_pad_before,
uint32_t x_pad_after,
uint32_t y_pad_after,
uint32_t dst_sram_index,
uint32_t dst_memory_type) {
static_cast<vta::CommandQueue*>(cmd)->
LoadBuffer2D(src_dram_addr, src_elem_offset,
x_size, y_size, x_stride,
x_pad_before, y_pad_before,
x_pad_after, y_pad_after,
dst_sram_index, dst_memory_type);
}
void VTAStoreBuffer2D(VTACommandHandle cmd,
uint32_t src_sram_index,
uint32_t src_memory_type,
void* dst_dram_addr,
uint32_t dst_elem_offset,
uint32_t x_size,
uint32_t y_size,
uint32_t x_stride) {
static_cast<vta::CommandQueue*>(cmd)->
StoreBuffer2D(src_sram_index, src_memory_type,
dst_dram_addr, dst_elem_offset,
x_size, y_size, x_stride);
}
void VTAUopPush(uint32_t mode,
uint32_t reset_out,
uint32_t dst_index,
uint32_t src_index,
uint32_t wgt_index,
uint32_t opcode,
uint32_t use_imm,
uint32_t imm_val) {
vta::CommandQueue::ThreadLocal()->record_kernel()
->Push(mode, reset_out, dst_index, src_index,
wgt_index, opcode, use_imm, imm_val);
}
void VTAUopLoopBegin(uint32_t extent,
uint32_t dst_factor,
uint32_t src_factor,
uint32_t wgt_factor) {
vta::CommandQueue::ThreadLocal()->record_kernel()
->PushLoopBegin(extent, dst_factor, src_factor, wgt_factor);
}
void VTAUopLoopEnd() {
vta::CommandQueue::ThreadLocal()->record_kernel()
->PushLoopEnd();
}
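// Example (hypothetical indices and factors): a finit callback recording a
// two-level micro-kernel would issue
//   VTAUopLoopBegin(/*extent=*/8, /*dst_factor=*/1, /*src_factor=*/1, /*wgt_factor=*/0);
//   VTAUopPush(/*mode=*/0, /*reset_out=*/0, dst, src, wgt, 0, 0, 0);
//   VTAUopLoopEnd();
// Only the two outermost loop levels map onto iter_out/iter_in in the GEMM
// and ALU instructions above.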
int VTAPushGEMMOp(void** uop_handle,
int (*finit)(void*),
void* signature,
int nbytes) {
vta::CommandQueue::ThreadLocal()->
PushGEMMOp(uop_handle, finit, signature, nbytes);
return 0;
}
int VTAPushALUOp(void** uop_handle,
int (*finit)(void*),
void* signature,
int nbytes) {
vta::CommandQueue::ThreadLocal()->
PushALUUop(uop_handle, finit, signature, nbytes);
return 0;
}
int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid) {
static_cast<vta::CommandQueue*>(cmd)->
DepPush(from_qid, to_qid);
return 0;
}
int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) {
static_cast<vta::CommandQueue*>(cmd)->
DepPop(from_qid, to_qid);
return 0;
}
void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) {
static_cast<vta::CommandQueue*>(cmd)->
Synchronize(wait_cycles);
}
// Simply include the driver for now.
#include <tvm/runtime/registry.h>
#include <dmlc/thread_local.h>
#include <vta/runtime.h>
#include "../../tvm/src/runtime/workspace_pool.h"
namespace tvm {
namespace runtime {
std::string VTARPCGetPath(const std::string& name) {
static const PackedFunc* f =
runtime::Registry::Get("tvm.contrib.rpc.server.workpath");
CHECK(f != nullptr) << "require tvm.contrib.rpc.server.workpath";
return (*f)(name);
}
// Global functions that can be called
TVM_REGISTER_GLOBAL("tvm.contrib.vta.init")
.set_body([](TVMArgs args, TVMRetValue* rv) {
std::string path = VTARPCGetPath(args[0]);
VTAProgram(path.c_str());
LOG(INFO) << "VTA initialization ended with bitstream " << path;
});
TVM_REGISTER_GLOBAL("tvm.contrib.rpc.server.shutdown")
.set_body([](TVMArgs args, TVMRetValue* rv) {
VTARuntimeShutdown();
});
class VTADeviceAPI final : public DeviceAPI {
public:
void SetDevice(TVMContext ctx) final {}
void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final {
if (kind == kExist) {
*rv = 1;
}
}
void* AllocDataSpace(TVMContext ctx,
size_t size, size_t alignment,
TVMType type_hint) final {
return VTABufferAlloc(VTATLSCommandHandle(), size);
}
void FreeDataSpace(TVMContext ctx, void* ptr) final {
VTABufferFree(VTATLSCommandHandle(), ptr);
}
void CopyDataFromTo(const void* from,
size_t from_offset,
void* to,
size_t to_offset,
size_t size,
TVMContext ctx_from,
TVMContext ctx_to,
TVMStreamHandle stream) final {
int kind_mask = 0;
if (ctx_from.device_type != kDLCPU) {
kind_mask |= 2;
}
if (ctx_to.device_type != kDLCPU) {
kind_mask |= 1;
}
VTABufferCopy(VTATLSCommandHandle(),
from, from_offset,
to, to_offset,
size, kind_mask);
}
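// Example: a device-to-host copy (ctx_from on ext_dev, ctx_to on CPU) yields
// kind_mask = 2, so only the source is treated as a VTA buffer handle.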
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
}
void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
void FreeWorkspace(TVMContext ctx, void* data) final;
static const std::shared_ptr<VTADeviceAPI>& Global() {
static std::shared_ptr<VTADeviceAPI> inst =
std::make_shared<VTADeviceAPI>();
return inst;
}
};
struct VTAWorkspacePool : public WorkspacePool {
VTAWorkspacePool() :
WorkspacePool(static_cast<DLDeviceType>(kExtDev),
VTADeviceAPI::Global()) {}
};
void* VTADeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) {
return dmlc::ThreadLocalStore<VTAWorkspacePool>::Get()
->AllocWorkspace(ctx, size);
}
void VTADeviceAPI::FreeWorkspace(TVMContext ctx, void* data) {
dmlc::ThreadLocalStore<VTAWorkspacePool>::Get()->FreeWorkspace(ctx, data);
}
TVM_REGISTER_GLOBAL("device_api.ext_dev")
.set_body([](TVMArgs args, TVMRetValue* rv) {
DeviceAPI* ptr = VTADeviceAPI::Global().get();
*rv = static_cast<void*>(ptr);
});
} // namespace runtime
} // namespace tvm
CC ?= g++
CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
LIBS = -l:libsds_lib.so -l:libdma.so
SRC_DIR = ../../src
INCLUDE_DIR = ../../include
DRIVER_DIR = $(SRC_DIR)/driver/pynq
TESTLIB_DIR = $(SRC_DIR)/test
VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
SOURCES = vta_pynq_driver.c vta_test_lib.cc
OBJECTS = vta_pynq_driver.o vta_test_lib.o driver_test.o
EXECUTABLE = vta
# VTA Parameters
# Log of input width in bits
LOG_INP_WIDTH = 3
# Log of weight width in bits
LOG_WGT_WIDTH = 3
# Log of accum width in bits
LOG_ACC_WIDTH = 5
# Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_IN_BLOCK = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_OUT_BLOCK = 4
# Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17
# Log of out buffer size in Bytes
LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
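# With the defaults above this evaluates to 17 + 3 - 5 = 15, i.e. a 32 kB
# output buffer (the accumulator buffer scaled by the out/acc width ratio).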
# Define flags
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM \
-DDEBUG=0 -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_IN_BLOCK) -DLOG_BLOCK_OUT=$(LOG_OUT_BLOCK) \
-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
# All Target
all: $(EXECUTABLE)
%.o: %.cc $(SOURCES)
$(CC) -c -o $@ $< $(CFLAGS)
$(EXECUTABLE): $(OBJECTS)
$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)
clean:
rm -rf *.o $(EXECUTABLE)
......@@ -4,7 +4,7 @@
* \brief Test library for the VTA design simulation and driver tests.
*/
#include "vta_test_lib.h"
#include "./test_lib.h"
const char* getOpcodeString(int opcode, bool use_imm) {
// Returns string name
......@@ -153,7 +153,7 @@ void free3dArray(T *** array, int rows, int cols, int depth) {
void * allocBuffer(size_t num_bytes) {
#ifdef NO_SIM
return cma_alloc(num_bytes, CACHED);
return VTAMemAlloc(num_bytes, CACHED);
#else
return malloc(num_bytes);
#endif
......@@ -161,7 +161,7 @@ void * allocBuffer(size_t num_bytes) {
void freeBuffer(void * buffer) {
#ifdef NO_SIM
return cma_free(buffer);
return VTAMemFree(buffer);
#else
return free(buffer);
#endif
......@@ -353,7 +353,7 @@ VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {
// Allocate buffer
#ifdef NO_SIM
VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
#else
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
#endif
......@@ -388,7 +388,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
// Allocate buffer
#ifdef NO_SIM
VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
#else
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
#endif
......@@ -449,7 +449,7 @@ VTAUop * getMapALUUops(int vector_size, bool uop_compression) {
// Allocate buffer
#ifdef NO_SIM
VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
#else
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
#endif
......@@ -762,7 +762,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
}
// Compute reference output
inp_T **outputs_ref = alloc2dArray<inp_T>(batch, vector_size);
out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
for (int i = 0; i < batch; i ++) {
for (int j = 0; j < vector_size; j ++) {
acc_T tmp = 0;
......@@ -802,7 +802,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
tmp = inputs[i][j] >> immediate[i / BATCH];
}
// Set
outputs_ref[i][j] = (inp_T) tmp;
outputs_ref[i][j] = (out_T) tmp;
}
}
......@@ -811,7 +811,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
packBuffer<acc_T, ACC_WIDTH>(bias_buf, inputs, batch, vector_size * input_sets, BATCH, BLOCK_OUT);
// Prepare output buffer
inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets);
out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets);
#ifdef NO_SIM
// Invoke the VTA
......@@ -833,8 +833,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
#endif
// Unpack output buffer
inp_T **outputs = alloc2dArray<inp_T>(batch, vector_size);
unpackBuffer<inp_T, INP_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT);
out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT);
// Correctness checks
int err = 0;
......@@ -853,8 +853,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
// Free all allocated arrays
free(immediate);
free2dArray<acc_T>(inputs, batch, vector_size * input_sets);
free2dArray<inp_T>(outputs_ref, batch, vector_size);
free2dArray<inp_T>(outputs, batch, vector_size);
free2dArray<out_T>(outputs_ref, batch, vector_size);
free2dArray<out_T>(outputs, batch, vector_size);
freeBuffer(insn_buf);
freeBuffer(uop_buf);
freeBuffer(bias_buf);
......@@ -891,17 +891,17 @@ virtual_threads=%d\n",
int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2;
int uop_size = uop_compression ? block / BATCH * virtual_threads :
block / BATCH * block / BLOCK_IN * block / BLOCK_OUT * virtual_threads;
int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT;
int inp_size = batch / BATCH * in_feat / BLOCK_IN;
int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT;
int out_size = batch / BATCH * out_feat / BLOCK_OUT;
// Blocked buffer sizes (in terms of elements)
int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT;
int inp_block_size = block / BATCH * block / BLOCK_IN;
int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT;
int out_block_size = block / BATCH * block / BLOCK_OUT;
// Make sure we don't exceed buffer bounds
assert(uop_size <= UOP_BUFF_DEPTH);
assert(wgt_block_size <= WGT_BUFF_DEPTH);
assert(inp_block_size <= INP_BUFF_DEPTH);
assert(wgt_block_size <= WGT_BUFF_DEPTH);
assert(out_block_size <= ACC_BUFF_DEPTH);
// Initialize instruction buffer
......@@ -1017,15 +1017,15 @@ virtual_threads=%d\n",
printMicroOp(uop_size, uop_buf);
#endif
// Initialize weights
wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat);
// Initialize inputs
inp_T **inputs = allocInit2dArray<inp_T, INP_WIDTH>(batch, in_feat);
// Initialize weights
wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat);
// Initialize biases
acc_T **biases = allocInit2dArray<acc_T, ACC_WIDTH>(batch, out_feat);
// Reference GEMM implementation
inp_T **outputs_ref = alloc2dArray<inp_T>(batch, out_feat);
out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat);
for (int i = 0; i < batch; i ++) {
for (int j = 0; j < out_feat; j ++) {
acc_T sum = biases[i][j];
......@@ -1033,21 +1033,21 @@ virtual_threads=%d\n",
sum += (acc_T) (inputs[i][k] * weights[j][k]);
}
// Set
outputs_ref[i][j] = (inp_T) sum;
outputs_ref[i][j] = (out_T) sum;
}
}
// Prepare the weight buffer
wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size);
packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN);
// Prepare the input buffer
inp_T *input_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * inp_size);
packBuffer<inp_T, INP_WIDTH>(input_buf, inputs, batch, in_feat, BATCH, BLOCK_IN);
// Prepare the weight buffer
wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size);
packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN);
// Prepare the bias buffer
acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * out_size);
packBuffer<acc_T, ACC_WIDTH>(bias_buf, biases, batch, out_feat, BATCH, BLOCK_OUT);
// Prepare the output buffer
inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * out_size);
out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * out_size);
#ifdef NO_SIM
// Invoke the VTA
......@@ -1069,8 +1069,8 @@ virtual_threads=%d\n",
#endif
// Unpack output data
inp_T **outputs = alloc2dArray<inp_T>(batch, out_feat);
unpackBuffer<inp_T, INP_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT);
out_T **outputs = alloc2dArray<out_T>(batch, out_feat);
unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT);
// Correctness checks
int err = 0;
......@@ -1087,15 +1087,15 @@ virtual_threads=%d\n",
}
// Free all allocated arrays
free2dArray<wgt_T>(weights, out_feat, in_feat);
free2dArray<inp_T>(inputs, batch, in_feat);
free2dArray<wgt_T>(weights, out_feat, in_feat);
free2dArray<acc_T>(biases, batch, out_feat);
free2dArray<inp_T>(outputs_ref, batch, out_feat);
free2dArray<inp_T>(outputs, batch, out_feat);
free2dArray<out_T>(outputs_ref, batch, out_feat);
free2dArray<out_T>(outputs, batch, out_feat);
freeBuffer((void *) insn_buf);
freeBuffer((void *) uop_buf);
freeBuffer((void *) weight_buf);
freeBuffer((void *) input_buf);
freeBuffer((void *) weight_buf);
freeBuffer((void *) bias_buf);
freeBuffer((void *) output_buf);
......
......@@ -7,21 +7,25 @@
#ifndef VTA_TESTLIB_H_
#define VTA_TESTLIB_H_
#include "vta_params.h"
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <vta/hw_spec.h>
#ifdef NO_SIM
#include "vta_pynq_driver.h"
#include <vta/driver.h>
#ifdef PYNQ_TARGET
#include "../../../src/pynq/pynq_driver.h"
#endif //PYNQ_TARGET
typedef uint64_t axi_T;
typedef uint32_t uop_T;
typedef int8_t wgt_T;
typedef int8_t inp_T;
typedef int8_t out_T;
typedef int32_t acc_T;
uint64_t vta (
......@@ -35,8 +39,7 @@ uint64_t vta (
#else //NO_SIM
#include "vta.h"
#include "vta_typedefs.h"
#include "../../../hardware/vivado/src/vta.h"
#endif //NO_SIM
......
CC ?= g++
CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
LIBS = -l:libsds_lib.so -l:libdma.so
INCLUDE_DIR = ../../../include
DRIVER_DIR = ../../../src/pynq
TESTLIB_DIR = ../common
VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
SOURCES = pynq_driver.cc test_lib.cc
OBJECTS = pynq_driver.o test_lib.o metal_test.o
EXECUTABLE = vta
# Include top-level config file
ifndef config
ifneq ("$(wildcard ../../../config.mk)", "")
config = ../../../config.mk
else
config = ../../../make/config.mk
endif
endif
include $(config)
# Define flags
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DDEBUG=0
CFLAGS += $(ADD_CFLAGS)
# All Target
all: $(EXECUTABLE)
%.o: %.cc $(SOURCES)
$(CC) -c -o $@ $< $(CFLAGS)
$(EXECUTABLE): $(OBJECTS)
$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)
clean:
rm -rf *.o $(EXECUTABLE)
......@@ -9,8 +9,9 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "vta_test_lib.h"
#include "vta_pynq_driver.h"
#include <vta/driver.h>
#include "../../../src/pynq/pynq_driver.h"
#include "../common/test_lib.h"
// VTA invocation (presents the same abstraction as in the simulation tests)
uint64_t vta (
......@@ -43,18 +44,18 @@ uint64_t vta (
#endif
// Program VTA
ProgramVTA(bitstream);
VTAProgram(bitstream);
// Get VTA handles
VTAHandle vta_fetch_handle = MapRegister(VTA_FETCH_ADDR, VTA_RANGE);
VTAHandle vta_load_handle = MapRegister(VTA_LOAD_ADDR, VTA_RANGE);
VTAHandle vta_compute_handle = MapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
VTAHandle vta_store_handle = MapRegister(VTA_STORE_ADDR, VTA_RANGE);
VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
// Physical address pointers
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
......@@ -65,29 +66,29 @@ uint64_t vta (
clock_gettime(CLOCK_REALTIME, &start);
// FETCH @ 0x10 : Data signal of insn_count_V
WriteMappedReg(vta_fetch_handle, 0x10, insn_count);
VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
// FETCH @ 0x18 : Data signal of insns_V
if (insns) WriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
// LOAD @ 0x10 : Data signal of weight_V
if (weights) WriteMappedReg(vta_load_handle, 0x10, weight_phy);
// LOAD @ 0x18 : Data signal of inputs_V
if (inputs) WriteMappedReg(vta_load_handle, 0x18, input_phy);
if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
// LOAD @ 0x10 : Data signal of inputs_V
if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
// LOAD @ 0x18 : Data signal of weight_V
if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
// COMPUTE @ 0x20 : Data signal of uops_V
if (uops) WriteMappedReg(vta_compute_handle, 0x20, uop_phy);
if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
// COMPUTE @ 0x28 : Data signal of biases_V
if (biases) WriteMappedReg(vta_compute_handle, 0x28, bias_phy);
if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
// STORE @ 0x10 : Data signal of outputs_V
if (outputs) WriteMappedReg(vta_store_handle, 0x10, output_phy);
if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
// VTA start
WriteMappedReg(vta_fetch_handle, 0x0, 0x1);
WriteMappedReg(vta_load_handle, 0x0, 0x81);
WriteMappedReg(vta_compute_handle, 0x0, 0x81);
WriteMappedReg(vta_store_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
int flag = 0, t = 0;
for (t = 0; t < 10000000; ++t) {
flag = ReadMappedReg(vta_compute_handle, 0x18);
flag = VTAReadMappedReg(vta_compute_handle, 0x18);
if (flag & VTA_DONE) break;
}
......@@ -104,10 +105,10 @@ uint64_t vta (
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
// Unmap VTA register
UnmapRegister(vta_fetch_handle, VTA_RANGE);
UnmapRegister(vta_load_handle, VTA_RANGE);
UnmapRegister(vta_compute_handle, VTA_RANGE);
UnmapRegister(vta_store_handle, VTA_RANGE);
VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
VTAUnmapRegister(vta_load_handle, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
VTAUnmapRegister(vta_store_handle, VTA_RANGE);
return t_fpga;
};
......