Commit f5960ba6 by Thierry Moreau Committed by Tianqi Chen

[SCHEDULER, HW] Auto scheduler for conv2d, hardware generation (#20)

* Hardware generation fixes/sweep, auto scheduling for VTA conv2d

* Hardware generation fixes/sweep, auto scheduling for VTA conv2d

* derive hw spec from config file

* up to date hardware spec
parent 33a309b2
# Directories
ROOTDIR = $(CURDIR)
BUILD_DIR = $(ROOTDIR)/../../build/hardware/xilinx
BUILD_NAME = build
BUILD_DIR = $(ROOTDIR)/../../$(BUILD_NAME)/hardware/xilinx
SCRIPT_DIR = $(ROOTDIR)/scripts
SRC_DIR = $(ROOTDIR)/src
SIM_DIR = $(ROOTDIR)/sim
......@@ -12,6 +13,15 @@ VIVADO_HLS = vivado_hls
VIVADO = vivado
HSI = hsi
# HLS Mode
MODE = all
# SLURM
SLURM = false
# Prevent generation of DSP
NO_DSP = false
# Prevent generation of ALU
NO_ALU = false
# Include top-level config file
ifndef config
ifneq ("$(wildcard ../../config.mk)", "")
......@@ -41,13 +51,18 @@ $(shell echo "$$(( (1000 + $(VTA_HW_COMP_CLOCK_FREQ) - 1) / $(VTA_HW_COMP_CLOCK_
# Derive config name
CONF = \
$(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
$(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_LOG_UOP_BUFF_SIZE)_$(VTA_LOG_INP_BUFF_SIZE)_$(VTA_LOG_WGT_BUFF_SIZE)_$(VTA_LOG_ACC_BUFF_SIZE)_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
.PHONY: all ip bit driver clean
ifeq ($(SLURM), true)
IP_BUILD_PATH = /scratch/hls/$(CONF)
HW_BUILD_PATH = /scratch/vivado/$(CONF)
endif
.PHONY: all ip bit driver clean clean_all
all: driver
all: bit
ip:
mkdir -p $(IP_BUILD_PATH)
......@@ -57,20 +72,32 @@ ip:
$(VTA_LOG_INP_WIDTH) $(VTA_LOG_WGT_WIDTH) $(VTA_LOG_ACC_WIDTH) $(VTA_LOG_OUT_WIDTH) \
$(VTA_LOG_BATCH) $(VTA_LOG_BLOCK_OUT) $(VTA_LOG_BLOCK_IN) \
$(VTA_LOG_UOP_BUFF_SIZE) $(VTA_LOG_INP_BUFF_SIZE) $(VTA_LOG_WGT_BUFF_SIZE) \
$(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE)
$(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE) \
$(MODE) $(NO_DSP) $(NO_ALU)
ifeq ($(SLURM), true)
mkdir -p $(BUILD_DIR)/hls
mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/.
endif
bit: ip
mkdir -p $(HW_BUILD_PATH)
cd $(HW_BUILD_PATH) && \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
-tclargs $(IP_BUILD_PATH) $(VTA_HW_COMP_THREADS) $(VTA_HW_COMP_CLOCK_FREQ) \
-tclargs $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) $(VTA_HW_COMP_CLOCK_FREQ) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
ifeq ($(SLURM), true)
mkdir -p $(BUILD_DIR)/vivado
mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/.
endif
driver: bit
cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
cd $(HW_BUILD_PATH)/bsp && make
clean:
rm -rf $(BUILD_DIR)
\ No newline at end of file
rm -rf *.out *.log *.sb figures
clean_all: clean
rm -rf $(BUILD_DIR)
#!/usr/bin/env python
import argparse
import datetime
import logging
import numpy as np
import os
import pandas as pd
import re
import time
from collections import namedtuple
from numpy import floor, ceil, log2, log10
from subprocess import call
FPGA = namedtuple("FPGAConstraints",
['bram_w', 'bram_d', 'num_bram'])
Hardware = namedtuple("HWConfig",
['batch', 'block_in', 'block_out',
'input_w', 'weight_w', 'accum_w', 'out_w', 'uop_w'])
def find_bram_confs(fpga, hw_conf, log_uop_sizeB):
# Derive sizes
input_elem_size_b = hw_conf.batch*hw_conf.block_in*hw_conf.input_w
weight_elem_size_b = hw_conf.block_in*hw_conf.block_out*hw_conf.weight_w
accum_elem_size_b = hw_conf.batch*hw_conf.block_out*hw_conf.accum_w
input_min_bram = (input_elem_size_b+fpga.bram_w-1)/fpga.bram_w
weight_min_bram = (weight_elem_size_b+fpga.bram_w-1)/fpga.bram_w
accum_min_bram = (accum_elem_size_b+fpga.bram_w-1)/fpga.bram_w
# Exploring all possible BRAM distributions
bram_confs = []
uop_bram = pow(2, log_uop_sizeB) * 8 / (fpga.bram_w * fpga.bram_d)
for log_i_bram in range(int(log2(input_min_bram)), int(ceil(log2(fpga.num_bram)))):
i_bram = pow(2, log_i_bram)
for log_w_bram in range(int(log2(weight_min_bram)), int(ceil(log2(fpga.num_bram)))):
w_bram = pow(2, log_w_bram)
for log_a_bram in range(int(log2(accum_min_bram)), int(ceil(log2(fpga.num_bram)))):
a_bram = pow(2, log_a_bram)
total_bram = uop_bram + i_bram + w_bram + a_bram + a_bram / hw_conf.accum_w * hw_conf.out_w
if total_bram <= fpga.num_bram:
# Right now we need to restrict uop width
input_elems = i_bram * fpga.bram_w * fpga.bram_d / input_elem_size_b
weight_elems = w_bram * fpga.bram_w * fpga.bram_d / weight_elem_size_b
accum_elems = a_bram * fpga.bram_w * fpga.bram_d / accum_elem_size_b
if log2(input_elems) + log2(weight_elems) + log2(accum_elems) <= hw_conf.uop_w:
log_inp_sizeB = int(log2(i_bram*fpga.bram_d*fpga.bram_w/8))
log_wgt_sizeB = int(log2(w_bram*fpga.bram_d*fpga.bram_w/8))
log_acc_sizeB = int(log2(a_bram*fpga.bram_d*fpga.bram_w/8))
bram_confs.append([log_uop_sizeB, log_inp_sizeB, log_wgt_sizeB, log_acc_sizeB])
# Filter out configs that are suboptimal
suboptimal = [False] * len(bram_confs)
for i in range(0, len(bram_confs)):
for j in range(i + 1, len(bram_confs)):
leq_list = [a <= b for a, b in zip(bram_confs[i], bram_confs[j])]
geq_list = [a >= b for a, b in zip(bram_confs[i], bram_confs[j])]
leq = all(leq_list)
geq = all(geq_list)
if leq:
suboptimal[i] = True
if geq:
suboptimal[j] = True
opt_bram_confs = [x[0] for x in zip(bram_confs, suboptimal) if not x[1]]
return opt_bram_confs
def get_make_command(job, build_dir, hw_conf, bram_conf, mode, slurm=False):
cmd = ""
if slurm:
cmd += "#!/bin/bash\n"
cmd += "#SBATCH --job-name={}\n".format(job)
cmd += "#SBATCH --output={}.out\n".format(job)
cmd += "srun "
if mode=="hls":
cmd += "make ip"
else:
cmd += "make"
cmd += " SLURM={} MODE=skip_sim NO_DSP=false NO_ALU=false".format("true" if slurm else "false")
cmd += " BUILD_NAME={}".format(build_dir)
cmd += " VTA_LOG_INP_WIDTH={}".format(int(log2(hw_conf.input_w)))
cmd += " VTA_LOG_WGT_WIDTH={}".format(int(log2(hw_conf.weight_w)))
cmd += " VTA_LOG_BATCH={}".format(int(log2(hw_conf.batch)))
cmd += " VTA_LOG_BLOCK_IN={}".format(int(log2(hw_conf.block_in)))
cmd += " VTA_LOG_BLOCK_OUT={}".format(int(log2(hw_conf.block_out)))
cmd += " VTA_LOG_UOP_BUFF_SIZE={}".format(bram_conf[0])
cmd += " VTA_LOG_INP_BUFF_SIZE={}".format(bram_conf[1])
cmd += " VTA_LOG_WGT_BUFF_SIZE={}".format(bram_conf[2])
cmd += " VTA_LOG_ACC_BUFF_SIZE={}\n".format(bram_conf[3])
return cmd
def cli():
parser = argparse.ArgumentParser(
description='Analyze HLS experiments'
)
parser.add_argument(
'-mode', dest='mode', action='store', type=str, required=True,
choices=["hls", "vivado"], help='hls synthesis or full compilation'
)
parser.add_argument(
'-base_dir', dest='base_dir', action='store', type=str, required=False,
default="../../build/hardware/xilinx/", help='path to build directory'
)
parser.add_argument(
'-min_ibw', dest='min_ibw', action='store', type=int, required=False,
default=3, help='log2 of minimum input bit-width'
)
parser.add_argument(
'-max_ibw', dest='max_ibw', action='store', type=int, required=False,
default=3, help='log2 of maximum input bit-width'
)
parser.add_argument(
'-min_wbw', dest='min_wbw', action='store', type=int, required=False,
default=3, help='log2 of minimum weight bit-width'
)
parser.add_argument(
'-max_wbw', dest='max_wbw', action='store', type=int, required=False,
default=3, help='log2 of maximum weight bit-width'
)
parser.add_argument(
'-acc_bw', dest='acc_bw', action='store', type=int, required=False,
default=32, help='accumulator bit-width'
)
parser.add_argument(
'-uop_bw', dest='uop_bw', action='store', type=int, required=False,
default=32, help='micro-op bit-width'
)
parser.add_argument(
'-min_batch', dest='min_batch', action='store', type=int, required=False,
default=0, help='log2 of minimum batch size'
)
parser.add_argument(
'-max_batch', dest='max_batch', action='store', type=int, required=False,
default=8, help='log2 of maximum batch size'
)
parser.add_argument(
'-min_ic', dest='min_ic', action='store', type=int, required=False,
default=0, help='log2 of minimum input channels'
)
parser.add_argument(
'-max_ic', dest='max_ic', action='store', type=int, required=False,
default=8, help='log2 of maximum input channels'
)
parser.add_argument(
'-min_oc', dest='min_oc', action='store', type=int, required=False,
default=0, help='log2 of minimum output channels'
)
parser.add_argument(
'-max_oc', dest='max_oc', action='store', type=int, required=False,
default=8, help='log2 of maximum output channels'
)
parser.add_argument(
'-uop_sizeB', dest='uop_sizeB', action='store', type=int, required=False,
default=14, help='log2 of uop buffer in B'
)
parser.add_argument(
'-bram_w', dest='bram_w', action='store', type=int, required=False,
default=32, help='FPGA BRAM port width in b'
)
parser.add_argument(
'-bram_d', dest='bram_d', action='store', type=int, required=False,
default=1024, help='FPGA BRAM depth'
)
parser.add_argument(
'-num_bram', dest='num_bram', action='store', type=int, required=False,
default=124, help='FPGA total BRAM'
)
parser.add_argument(
'-slurm', dest='slurm', action='store_true',
help='Run on cluster using slurm'
)
args = parser.parse_args()
# Logging
logging.basicConfig(filename='compile_designs.log',level=logging.DEBUG)
# FPGA config
pynq = FPGA(args.bram_w, args.bram_d, args.num_bram)
# Get timestamp
timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S')
build_dir = "build_{}".format(timestamp)
num_confs = 0
for log_ibw in range(args.min_ibw, args.max_ibw+1):
ibw = pow(2, log_ibw)
for log_wbw in range(args.min_wbw, args.max_wbw+1):
wbw = pow(2, log_wbw)
for log_batch in range(args.min_batch, args.max_batch+1):
batch = pow(2, log_batch)
for log_ic in range(args.min_ic, args.max_ic+1):
ic = pow(2, log_ic)
for log_oc in range(args.min_oc, args.max_oc+1):
oc = pow(2, log_oc)
conf = Hardware(batch, ic, oc, ibw, wbw, args.acc_bw, ibw, args.uop_bw)
bram_confs = find_bram_confs(pynq, conf, args.uop_sizeB)
for b in bram_confs:
job = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_100MHz_10ns".format(
batch, ic, oc, ibw, wbw, b[0], b[1], b[2], b[3])
num_confs += 1
cmd = get_make_command(job, build_dir, conf, b, args.mode, args.slurm)
sb_file = job+".sb"
file = open(sb_file,"w")
file.write(cmd)
file.close()
call(["echo", cmd])
if args.slurm:
call(["sbatch", sb_file])
else:
call(cmd.split(" "))
if __name__ == '__main__':
cli()
......@@ -22,8 +22,11 @@
# Arg 15: wgt buffer size in B (log)
# Arg 16: acc buffer size in B (log)
# Arg 17: out buffer size in B (log)
# Arg 18: mode
# Arg 19: no_dsp
# Arg 20: no_alu
if { [llength $argv] eq 19 } {
if { [llength $argv] eq 22 } {
set src_dir [lindex $argv 2]
set sim_dir [lindex $argv 3]
set test_dir [lindex $argv 4]
......@@ -41,10 +44,13 @@ if { [llength $argv] eq 19 } {
set wgt_buff_size [lindex $argv 16]
set acc_buff_size [lindex $argv 17]
set out_buff_size [lindex $argv 18]
set mode [lindex $argv 19]
set no_dsp [lindex $argv 20]
set no_alu [lindex $argv 21]
} else {
set src_dir "../src/"
set sim_dir "../sim/"
set test_dir "../../src/test/"
set src_dir "../src"
set sim_dir "../sim"
set test_dir "../../src/test"
set include_dir "../../include"
set target_period 10
set inp_width 3
......@@ -59,17 +65,11 @@ if { [llength $argv] eq 19 } {
set wgt_buff_size 15
set acc_buff_size 17
set out_buff_size 15
set mode "all"
set no_dsp "true"
set no_alu "false"
}
# C define flags to pass to compiler
set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
-DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \
-DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
# This is necessary because of a Vivado restriction that doesn't allow for
# buses wider than 1024 bits.
......@@ -122,56 +122,89 @@ proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
}
}
# C define flags to pass to compiler
set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
-DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \
-DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
if {$no_dsp=="true"} {
append cflags " -DNO_DSP"
}
if {$no_alu=="true"} {
append cflags " -DNO_ALU"
}
# HLS behavioral sim
open_project vta_sim
set_top vta
add_files $src_dir/vta.cc -cflags $cflags
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
add_files -tb $test_dir/test_lib.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csim_design -clean
close_project
if {$mode=="all" || $mode=="sim"} {
open_project vta_sim
set_top vta
add_files $src_dir/vta.cc -cflags $cflags
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
add_files -tb $test_dir/test_lib.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csim_design -clean
close_project
}
# Generate fetch stage
open_project vta_fetch
set_top fetch
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
export_design -format ip_catalog
close_project
if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} {
open_project vta_fetch
set_top fetch
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
}
close_project
}
# Generate load stage
open_project vta_load
set_top load
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
export_design -format ip_catalog
close_project
if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} {
open_project vta_load
set_top load
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
}
close_project
}
# Generate compute stage
open_project vta_compute
set_top compute
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
export_design -format ip_catalog
close_project
if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} {
open_project vta_compute
set_top compute
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
}
close_project
}
# Generate store stage
open_project vta_store
set_top store
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
export_design -format ip_catalog
close_project
if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} {
open_project vta_store
set_top store
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
}
close_project
}
exit
......@@ -59,44 +59,31 @@ if { [llength $argv] eq 12 } {
# Derive input mem parameters
set inp_mem_width [expr $inp_width * $batch * $in_block]
set inp_mem_depth [expr $inp_mem_size * 8 / $inp_mem_width]
set inp_bus_width 1024
set inp_part [expr $inp_mem_width / $inp_bus_width]
if {[expr $inp_part == 0]} {
set inp_part 1
set inp_bus_width $inp_mem_width
}
set inp_mem_depth [expr $inp_mem_size * 8 / ($inp_mem_width * $inp_part)]
# Derive weight mem parameters
set wgt_mem_width [expr $wgt_width * $out_block * $in_block]
set wgt_mem_depth [expr $wgt_mem_size * 8 / $wgt_mem_width]
set wgt_bus_width 1024
set wgt_part [expr $wgt_mem_width / $wgt_bus_width]
if {[expr $wgt_part == 0]} {
set wgt_part 1
set wgt_bus_width $wgt_mem_width
}
set wgt_mem_depth [expr $wgt_mem_size * 8 / ($wgt_mem_width * $wgt_part)]
# Derive output mem parameters
set out_mem_width [expr $out_width * $batch * $out_block]
set out_mem_depth [expr $out_mem_size * 8 / $out_mem_width]
set out_bus_width 1024
set out_part [expr $out_mem_width / $out_bus_width]
if {[expr $out_part == 0]} {
set out_part 1
set out_bus_width $out_mem_width
}
puts $inp_mem_width
puts $inp_mem_depth
puts $inp_bus_width
puts $inp_part
puts $wgt_mem_width
puts $wgt_mem_depth
puts $wgt_bus_width
puts $wgt_part
puts $out_mem_width
puts $out_mem_depth
puts $out_bus_width
puts $out_part
set out_mem_depth [expr $out_mem_size * 8 / ($out_mem_width * $out_part)]
# User defined paths
set proj_name vta
......@@ -937,10 +924,12 @@ launch_runs impl_1 -to_step write_bitstream -jobs $num_threads
wait_on_run impl_1
# Export hardware description file and bitstream files to export/ dir
file mkdir $proj_path/export
file copy -force $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.sysdef \
$proj_path/export/vta.hdf
file copy -force $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.bit \
$proj_path/export/vta.bit
if {[file exist $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.bit]} {
file mkdir $proj_path/export
file copy -force $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.sysdef \
$proj_path/export/vta.hdf
file copy -force $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.bit \
$proj_path/export/vta.bit
}
exit
......@@ -16,40 +16,36 @@ int main(void) {
printParameters();
#endif
// Buffer indexing
assert(VTA_LOG_ACC_BUFF_DEPTH >= VTA_LOG_INP_BUFF_DEPTH);
// Micro op bound
assert(VTA_UOP_GEM_3_1 < VTA_UOP_WIDTH);
assert(VTA_UOP_ALU_3_1 < VTA_UOP_WIDTH);
// Instruction alignment checks
assert(VTA_UOP_GEM_2_1 < VTA_UOP_WIDTH);
assert(VTA_UOP_ALU_1_1 < VTA_UOP_WIDTH);
// Make sure there is no misaligment
assert(VTA_INSN_GEM_9_1 < VTA_INSN_GEM_A_0);
assert(VTA_INSN_MEM_7_1 < VTA_INSN_MEM_8_0);
assert(VTA_INSN_GEM_8_1 < VTA_INSN_GEM_9_0);
// Instruction bounds
assert(VTA_INSN_MEM_E_1 < VTA_INS_WIDTH);
assert(VTA_INSN_GEM_E_1 < VTA_INS_WIDTH);
assert(VTA_INSN_ALU_F_1 < VTA_INS_WIDTH);
assert(VTA_INSN_GEM_F_1 < VTA_INS_WIDTH);
assert(VTA_INSN_ALU_G_1 < VTA_INS_WIDTH);
int status = 0;
// Run ALU test (vector-scalar operators)
status |= alu_test(VTA_ALU_OPCODE_MIN, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MIN, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_SHL, true, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_SHL, true, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, false);
// Run ALU test (vector-vector operators)
status |= alu_test(VTA_ALU_OPCODE_MIN, false, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MIN, false, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);
// Run blocked GEMM test
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
......
......@@ -92,7 +92,7 @@ typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/
typedef ap_uint<VTA_LOG_ACC_WIDTH> aluop_sh_imm_T;
typedef ap_int<VTA_LOG_ACC_WIDTH> aluop_sh_imm_T;
/*!
* \brief Fetch module.
......
"""VTA configuration constants (should match hw_spec.h"""
from __future__ import absolute_import as _abs
# Log of input/activation width in bits (default 3 -> 8 bits)
VTA_LOG_INP_WIDTH = 3
# Log of kernel weight width in bits (default 3 -> 8 bits)
VTA_LOG_WGT_WIDTH = 3
# Log of accum width in bits (default 5 -> 32 bits)
VTA_LOG_ACC_WIDTH = 5
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
VTA_LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
VTA_LOG_BLOCK_IN = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
VTA_LOG_BLOCK_OUT = 4
VTA_LOG_OUT_WIDTH = VTA_LOG_INP_WIDTH
# Log of uop buffer size in Bytes
VTA_LOG_UOP_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
VTA_LOG_ACC_BUFF_SIZE = 17
import os
python_vta_dir = os.path.dirname(__file__)
print python_vta_dir
filename = os.path.join(python_vta_dir, '../../config.mk')
VTA_PYNQ_BRAM_W = 32
VTA_PYNQ_BRAM_D = 1024
VTA_PYNQ_NUM_BRAM = 124
keys = ["VTA_LOG_INP_WIDTH", "VTA_LOG_WGT_WIDTH", "VTA_LOG_ACC_WIDTH",
"VTA_LOG_BATCH", "VTA_LOG_BLOCK_IN", "VTA_LOG_BLOCK_OUT",
"VTA_LOG_UOP_BUFF_SIZE", "VTA_LOG_INP_BUFF_SIZE",
"VTA_LOG_WGT_BUFF_SIZE", "VTA_LOG_ACC_BUFF_SIZE"]
params = {}
if os.path.isfile(filename):
with open(filename) as f:
for line in f:
for k in keys:
if k+" =" in line:
val = line.split("=")[1].strip()
params[k] = int(val)
# print params
else:
print "Error: {} not found. Please make sure you have config.mk in your vta root".format(filename)
exit()
# The Constants
VTA_WGT_WIDTH = 8
VTA_INP_WIDTH = VTA_WGT_WIDTH
VTA_OUT_WIDTH = 32
VTA_INP_WIDTH = 1 << params["VTA_LOG_INP_WIDTH"]
VTA_WGT_WIDTH = 1 << params["VTA_LOG_WGT_WIDTH"]
VTA_OUT_WIDTH = 1 << params["VTA_LOG_ACC_WIDTH"]
VTA_TARGET = "VTA_PYNQ_TARGET"
# Dimensions of the GEMM unit
# (BATCH,BLOCK_IN) x (BLOCK_IN,BLOCK_OUT)
VTA_BATCH = 1
VTA_BLOCK_IN = 16
VTA_BLOCK_OUT = 16
VTA_BATCH = 1 << params["VTA_LOG_BATCH"]
VTA_BLOCK_IN = 1 << params["VTA_LOG_BLOCK_IN"]
VTA_BLOCK_OUT = 1 << params["VTA_LOG_BLOCK_OUT"]
# log-2 On-chip wgt buffer size in Bytes
VTA_LOG_WGT_BUFF_SIZE = 15
# log-2 On-chip uop buffer size in Bytes
VTA_LOG_UOP_BUFF_SIZE = params["VTA_LOG_UOP_BUFF_SIZE"]
# log-2 On-chip input buffer size in Bytes
VTA_LOG_INP_BUFF_SIZE = 15
VTA_LOG_INP_BUFF_SIZE = params["VTA_LOG_INP_BUFF_SIZE"]
# log-2 On-chip wgt buffer size in Bytes
VTA_LOG_WGT_BUFF_SIZE = params["VTA_LOG_WGT_BUFF_SIZE"]
# log-2 On-chip output buffer size in Bytes
VTA_LOG_OUT_BUFF_SIZE = 17
# On-chip wgt buffer size in Bytes
VTA_WGT_BUFF_SIZE = 1 << VTA_LOG_WGT_BUFF_SIZE
VTA_LOG_OUT_BUFF_SIZE = params["VTA_LOG_ACC_BUFF_SIZE"]
# Uop buffer size
VTA_UOP_BUFF_SIZE = 1 << VTA_LOG_UOP_BUFF_SIZE
# Input buffer size
VTA_INP_BUFF_SIZE = 1 << VTA_LOG_INP_BUFF_SIZE
# On-chip wgt buffer size in Bytes
VTA_WGT_BUFF_SIZE = 1 << VTA_LOG_WGT_BUFF_SIZE
# Output buffer size.
VTA_OUT_BUFF_SIZE = 1 << VTA_LOG_OUT_BUFF_SIZE
......@@ -69,11 +83,7 @@ VTA_MEM_ID_OUT = 4
VTA_ALU_OPCODE_MIN = 0
VTA_ALU_OPCODE_MAX = 1
VTA_ALU_OPCODE_ADD = 2
VTA_ALU_OPCODE_SUB = 3
VTA_ALU_OPCODE_MUL = 4
VTA_ALU_OPCODE_SHL = 5
VTA_ALU_OPCODE_SHR = 6
VTA_ALU_OPCODE_UNSET = 7
VTA_ALU_OPCODE_SHR = 3
# Task queue id (pipeline stage)
VTA_QID_LOAD_INP = 1
......
......@@ -617,7 +617,6 @@ def inject_alu_intrin(stmt_in):
extents.append(tmp_body.extent)
tmp_body = tmp_body.body
# Derive opcode
alu_opcode = spec.VTA_ALU_OPCODE_UNSET
if isinstance(loop_body.value, tvm.expr.Add):
alu_opcode = spec.VTA_ALU_OPCODE_ADD
lhs = loop_body.value.a
......@@ -640,9 +639,9 @@ def inject_alu_intrin(stmt_in):
rhs = loop_body.value.b
elif isinstance(loop_body.value, tvm.expr.Call):
if loop_body.value.name == 'shift_left':
alu_opcode = spec.VTA_ALU_OPCODE_SHL
alu_opcode = spec.VTA_ALU_OPCODE_SHR
lhs = loop_body.value.args[0]
rhs = loop_body.value.args[1]
rhs = tvm.ir_pass.Simplify(-loop_body.value.args[1])
elif loop_body.value.name == 'shift_right':
alu_opcode = spec.VTA_ALU_OPCODE_SHR
lhs = loop_body.value.args[0]
......@@ -732,7 +731,8 @@ def inject_alu_intrin(stmt_in):
tvm.ir_pass.Simplify(c/(spec.VTA_BATCH*spec.VTA_BLOCK_OUT)) for c in dst_coeff]
# Flatten the outer loops
src_coeff, dst_coeff, extents = _flatten_loop(src_coeff, dst_coeff, extents)
if extents:
src_coeff, dst_coeff, extents = _flatten_loop(src_coeff, dst_coeff, extents)
# Insert ALU micro-ops
irb = tvm.ir_builder.create()
......
......@@ -117,7 +117,6 @@ class UopKernel {
// The loop nest structure
VerifyDep(dst_index);
VTAUop op;
op.reset_out = reset_out;
op.dst_idx = dst_index;
op.src_idx = src_index;
op.wgt_idx = wgt_index;
......@@ -128,6 +127,12 @@ class UopKernel {
} else {
assert(mode_ == mode);
}
// Set reset_out field if unset
if (reset_out_ == 0xFFFFFFFF) {
reset_out_ = reset_out;
} else {
assert(reset_out_ == reset_out);
}
// Check kernel op and imm/imm_val in ALU mode
if (mode == 1) {
if (opcode_ == 0xFFFFFFFF) {
......@@ -146,12 +151,11 @@ class UopKernel {
uint32_t size = seq_.size();
printf("There are %u uops\n", size);
for (uint32_t i = 0; i < size; ++i) {
printf("[%04u]\t acc=%u, inp=%u, wgt=%u, reset_out=%u\n",
printf("[%04u]\t acc=%u, inp=%u, wgt=%u\n",
i,
seq_[i].dst_idx,
seq_[i].src_idx,
seq_[i].wgt_idx,
seq_[i].reset_out);
seq_[i].wgt_idx);
}
printf("\n");
}
......@@ -160,6 +164,7 @@ class UopKernel {
// The kernel's mode, opcode, immediate setting and value
uint32_t mode_{0xFFFFFFFF}; // UOP type: 0xFFFFFFFF - unset, 0 - GEMM, 1 - ALU
uint32_t opcode_{0xFFFFFFFF};
uint32_t reset_out_{0xFFFFFFFF};
bool use_imm_{false};
uint16_t imm_val_{0};
......@@ -521,20 +526,6 @@ class InsnQueue : public BaseQueue {
} else {
return "add";
}
} else if (opcode == VTA_ALU_OPCODE_SUB) {
if (use_imm) {
return "sub imm";
} else {
return "sub";
}
} else if (opcode == VTA_ALU_OPCODE_MUL) {
if (use_imm) {
return "mul imm";
} else {
return "mul";
}
} else if (opcode == VTA_ALU_OPCODE_SHL) {
return "shl";
} else if (opcode == VTA_ALU_OPCODE_SHR) {
return "shr";
}
......@@ -641,6 +632,7 @@ class InsnQueue : public BaseQueue {
static_cast<int>(c.mem.pop_next_dep),
static_cast<int>(c.mem.push_prev_dep),
static_cast<int>(c.mem.push_next_dep));
printf("\treset_out: %d\n", static_cast<int>(c.gemm.reset_reg));
printf("\trange (%d, %d)\n",
static_cast<int>(c.gemm.uop_bgn),
static_cast<int>(c.gemm.uop_end));
......@@ -662,6 +654,7 @@ class InsnQueue : public BaseQueue {
static_cast<int>(c.mem.pop_next_dep),
static_cast<int>(c.mem.push_prev_dep),
static_cast<int>(c.mem.push_next_dep));
printf("\treset_out: %d\n", static_cast<int>(c.alu.reset_reg));
printf("\trange (%d, %d)\n",
static_cast<int>(c.alu.uop_bgn),
static_cast<int>(c.alu.uop_end));
......@@ -1081,6 +1074,7 @@ class CommandQueue {
}
VTAGemInsn* insn = insn_queue_.CreateGemInsn();
insn->opcode = VTA_OPCODE_GEMM;
insn->reset_reg = kernel->reset_out_;
insn->uop_bgn = kernel->sram_begin_;
insn->uop_end = kernel->sram_end_;
const std::vector<UopKernel::LoopEntry> &loop = kernel->loop();
......@@ -1119,6 +1113,7 @@ class CommandQueue {
}
VTAAluInsn* insn = insn_queue_.CreateAluInsn();
insn->opcode = VTA_OPCODE_ALU;
insn->reset_reg = kernel->reset_out_;
insn->uop_bgn = kernel->sram_begin_;
insn->uop_end = kernel->sram_end_;
insn->alu_opcode = kernel->opcode_;
......
......@@ -6,10 +6,14 @@ from tvm.contrib import rpc, util
host = "pynq"
port = 9091
target = "llvm -target=armv7-none-linux-gnueabihf"
bit = "vta.bit"
bit = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_100MHz_10ns.bit".format(
vta.VTA_BATCH, vta.VTA_BLOCK_IN, vta.VTA_BLOCK_OUT,
vta.VTA_INP_WIDTH, vta.VTA_WGT_WIDTH,
vta.VTA_LOG_UOP_BUFF_SIZE, vta.VTA_LOG_INP_BUFF_SIZE,
vta.VTA_LOG_WGT_BUFF_SIZE, vta.VTA_LOG_OUT_BUFF_SIZE)
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
bitstream = os.path.join(curr_path, "./", bit)
bitstream = os.path.join(curr_path, "../../../../vta_bitstreams/bitstreams/", bit)
def test_program_rpc():
assert tvm.module.enabled("rpc")
......
......@@ -480,8 +480,8 @@ if __name__ == "__main__":
test_padded_load()
print("Load/store test")
test_save_load_out()
print("GEMM test")
test_gemm()
# print("GEMM test")
# test_gemm()
print("Max immediate")
test_alu(tvm.max, np.maximum, use_imm=True)
print("Max")
......@@ -492,6 +492,8 @@ if __name__ == "__main__":
test_alu(lambda x, y: x + y)
print("Shift right immediate")
test_alu(lambda x, y: x >> y, np.right_shift, use_imm=True)
print("Shift left immediate")
test_alu(lambda x, y: x << y, np.left_shift, use_imm=True)
print("Relu")
test_relu()
# print("Shift and scale")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment