/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ /*! * \file vta.h * \brief Type definitions and prototype for VTA HLS design. */ #ifndef VTA_VTA_H_ #define VTA_VTA_H_ #include <ap_axi_sdata.h> #include <ap_int.h> #include <assert.h> #include <hls_stream.h> #include <vta/hw_spec.h> /*! * Define HLS stream depth */ #define PRAGMA_SUB(x) _Pragma (#x) #define PRAGMA_HLS(x) PRAGMA_SUB(x) #define STREAM_IN_DEPTH 8 /* \typedef bus_T memory bus datatype*/ typedef ap_uint<VTA_BUS_WIDTH> bus_T; /* \typedef uop_T Micro-op datatype*/ typedef ap_uint<VTA_UOP_WIDTH> uop_T; /* \typedef inp_T Input datatype*/ typedef ap_int<VTA_INP_WIDTH> inp_T; /* \typedef wgt_T Weight datatype*/ typedef ap_int<VTA_WGT_WIDTH> wgt_T; /* \typedef out_T Output datatype*/ typedef ap_int<VTA_OUT_WIDTH> out_T; /* \typedef acc_T Accumulator datatype*/ typedef ap_int<VTA_ACC_WIDTH> acc_T; /* \typedef mul_T Multiplier output datatype*/ typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+1> mul_T; /* \typedef sum_T GEMM accumulator datatype*/ typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+VTA_LOG_BLOCK_IN+1> sum_T; /* \typedef uop_idx_T Micro-op SRAM index datatype*/ typedef ap_uint<VTA_LOG_UOP_BUFF_DEPTH+1> uop_idx_T; /* \typedef inp_idx_T Input SRAM index datatype*/ typedef ap_uint<VTA_LOG_INP_BUFF_DEPTH+1> inp_idx_T; /* \typedef wgt_idx_T Weight SRAM index datatype*/ typedef ap_uint<VTA_LOG_WGT_BUFF_DEPTH+1> wgt_idx_T; /* \typedef acc_idx_T Accumulator SRAM index datatype*/ typedef ap_uint<VTA_LOG_ACC_BUFF_DEPTH+1> acc_idx_T; /* \typedef opcode_T Opcode datatype*/ typedef ap_uint<VTA_OPCODE_BIT_WIDTH> opcode_T; /* \typedef insn_T Instruction datatype*/ typedef ap_uint<VTA_INS_WIDTH> insn_T; /* \typedef loop_T Loop bound datatype*/ typedef ap_uint<VTA_LOOP_ITER_WIDTH> loop_T; /* \typedef memop_id_T Memory operation ID datatype*/ typedef ap_uint<VTA_MEMOP_ID_BIT_WIDTH> memop_id_T; /* \typedef memop_sram_T Memory operation SRAM index datatype*/ typedef ap_uint<VTA_MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T; /* \typedef memop_dram_T Memory operation DRAM index datatype*/ typedef ap_uint<VTA_MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T; /* \typedef memop_size_T Memory operation range datatype*/ typedef ap_uint<VTA_MEMOP_SIZE_BIT_WIDTH> memop_size_T; /* \typedef memop_stride_T Memory operation stride datatype*/ typedef ap_uint<VTA_MEMOP_STRIDE_BIT_WIDTH> memop_stride_T; /* \typedef memop_pad_T Memory operation pad width datatype*/ typedef ap_uint<VTA_MEMOP_PAD_BIT_WIDTH> memop_pad_T; /* \typedef aluop_opcode_T ALU operation opcode datatype*/ typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T; /* \typedef aluop_imm_T ALU operation immediate datatype*/ typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T; /* \typedef aluop_shr_arg_T ALU operation shift right immediate datatype*/ typedef ap_int<VTA_SHR_ARG_BIT_WIDTH> aluop_shr_arg_T; /* \typedef aluop_mul_arg_T ALU operation multiply datatype*/ typedef ap_int<VTA_MUL_ARG_BIT_WIDTH> aluop_mul_arg_T; /*! * \brief Fetch module. * Reads in \a insn_count instructions via DMA and pushes them to the * appropriate load, gemm or store queue. * \param insns Instruction data base address in DRAM. AXI-4 master port. * \param insn_count Total instruction count. AXI-lite memory mapped register. * \param load_queue Load instruction queue. AXI-stream FIFO. * \param gemm_queue GEMM instruction queue. AXI-stream FIFO. * \param store_queue Store instruction queue. AXI-stream FIFO. */ void fetch( uint32_t insn_count, volatile insn_T *insns, hls::stream<insn_T> &load_queue, hls::stream<insn_T> &gemm_queue, hls::stream<insn_T> &store_queue); /*! * \brief Load module. * Reads in load instructions from the load queue, and performs appropriate * DMA load operation to the \a wgt_mem and \a inp_mem SRAM buffers from DRAM. * Updates dependence queues accordingly. * \param inputs Input data base address in DRAM. AXI-4 master port. * \param weights Weight data base address in DRAM. AXI-4 master port. * \param load_queue Load instruction queue. AXI-stream FIFO. * \param g2l_dep_queue Dependence queue from GEMM to load stage. * AXI-stream FIFO. * \param l2g_dep_queue Dependence queue from load to GEMM stage. * AXI-stream FIFO. * \param inp_mem Local input SRAM buffer. Write only single port BRAM. * \param wgt_mem Local weight SRAM buffer. Write only single port BRAM. */ void load( volatile bus_T *inputs, volatile bus_T *weights, hls::stream<insn_T> &load_queue, hls::stream<bool> &g2l_dep_queue, hls::stream<bool> &l2g_dep_queue, bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO], bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]); /*! * \brief Compute module. * Reads in GEMM instructions from the gemm queue, and performs appropriate * GEMM/ALU instructions. Reads in data from the \a wgt_mem and \a inp_mem, * and writes computation results into the \a out_mem. Updates dependence * queues accordingly. * \param done Signal that indicates that VLA is done. AXI-lite memory mapped * register. * \param uops Micro-op data base address in DRAM. AXI-4 master port. * \param biases Bias data base address in DRAM. AXI-4 master port. * \param gemm_queue GEMM instruction queue. AXI-stream FIFO. * \param l2g_dep_queue Dependence queue from load to gemm stage. * AXI-stream FIFO. * \param s2g_dep_queue Dependence queue from store to gemm stage. * AXI-stream FIFO. * \param g2l_dep_queue Dependence queue from gemm to load stage. * AXI-stream FIFO. * \param g2s_dep_queue Dependence queue from gemm to store stage. * AXI-stream FIFO. * \param inp_mem Local input SRAM buffer. Read only single port BRAM. * \param wgt_mem Local weight SRAM buffer. Read only single port BRAM. * \param out_mem Local output SRAM buffer. Write only single port BRAM. */ void compute( volatile uint32_t &done, volatile uop_T *uops, volatile bus_T *biases, hls::stream<insn_T> &gemm_queue, hls::stream<bool> &l2g_dep_queue, hls::stream<bool> &s2g_dep_queue, hls::stream<bool> &g2l_dep_queue, hls::stream<bool> &g2s_dep_queue, bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO], bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO], bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]); /*! * \brief Store module. * Reads in store instructions from the store queue, and performs appropriate * store instructions from the output buffer in SRAM to DRAM. Updates dependence * queues accordingly. * \param outputs Output data base address in DRAM. AXI-4 master port. * \param store_queue Store instruction queue. AXI-stream FIFO. * \param g2s_dep_queue Dependence queue from gemm to store stage. * AXI-stream FIFO. * \param s2g_dep_queue Dependence queue from store to gemm stage. * AXI-stream FIFO. * \param out_mem Local output SRAM buffer. Read only single port BRAM. */ void store( volatile bus_T *outputs, hls::stream<insn_T> &store_queue, hls::stream<bool> &g2s_dep_queue, hls::stream<bool> &s2g_dep_queue, bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]); /*! * \brief VTA wrapper for simulation purpose only. * Orchestrates dataflow execution of the fetch, load, GEMM and store stages. * \param insn_count Total instruction count. AXI-lite memory mapped register. * \param insns Instruction data base address in DRAM. AXI-4 master port. * \param uops Micro-op data base address in DRAM. AXI-4 master port. * \param inputs Input data base address in DRAM. AXI-4 master port. * \param weights Weight data base address in DRAM. AXI-4 master port. * \param biases Bias data base address in DRAM. AXI-4 master port. * \param outputs Output data base address in DRAM. AXI-4 master port. */ void vta( uint32_t insn_count, volatile insn_T *insns, volatile uop_T *uops, volatile bus_T *inputs, volatile bus_T *weights, volatile bus_T *biases, volatile bus_T *outputs); #endif // VTA_VTA_H_