Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
tic
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
wenyuanbo
tic
Commits
dae77cdb
Commit
dae77cdb
authored
May 02, 2018
by
Thierry Moreau
Committed by
Tianqi Chen
Jul 11, 2018
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[HARDWARE, TEST] Fixed hardware generation flow (#34)
parent
9f0e8ffe
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
551 additions
and
198 deletions
+551
-198
vta/apps/pynq_rpc/start_rpc_server.sh
+1
-1
vta/hardware/xilinx/Makefile
+46
-25
vta/hardware/xilinx/scripts/hls.tcl
+45
-38
vta/hardware/xilinx/scripts/vivado.tcl
+9
-9
vta/hardware/xilinx/sim/vta_test.cc
+3
-0
vta/make/config.json
+2
-2
vta/make/sim_sample.json
+2
-2
vta/make/vta_config.py
+79
-1
vta/python/vta/environment.py
+5
-1
vta/python/vta/ir_pass.py
+4
-4
vta/tests/hardware/common/test_lib.cc
+334
-1
vta/tests/hardware/common/test_lib.h
+13
-3
vta/tests/hardware/pynq/Makefile
+7
-12
vta/tests/hardware/pynq/metal_test.cc
+1
-99
No files found.
vta/apps/pynq_rpc/start_rpc_server.sh
View file @
dae77cdb
#!/bin/bash
export
PYTHONPATH
=
${
PYTHONPATH
}
:/home/xilinx/tvm/python:/home/xilinx/vta/python
export
PYTHONPATH
=
${
PYTHONPATH
}
:/home/xilinx/
vta/nnvm/
tvm/python:/home/xilinx/vta/python
export
LD_LIBRARY_PATH
=
${
LD_LIBRARY_PATH
}
:/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/
python
-m
vta.exec.rpc_server
vta/hardware/xilinx/Makefile
View file @
dae77cdb
...
...
@@ -13,8 +13,10 @@ VIVADO_HLS = vivado_hls
VIVADO
=
vivado
HSI
=
hsi
# HLS Mode
MODE
=
all
# HLS mode
MODE
=
skip_sim
# Debug flag
DEBUG
=
false
# SLURM
SLURM
=
false
# Prevent generation of DSP
...
...
@@ -22,15 +24,26 @@ NO_DSP = false
# Prevent generation of ALU
NO_ALU
=
false
# Include top-level config file
ifndef
config
ifneq
(
"$(wildcard ../../config.mk)"
,
""
)
config
=
../../config.mk
else
config
=
../../make/config.mk
endif
endif
include
$(config)
# Process VTA JSON config
VTA_CONFIG
=
python
$(CURDIR)
/../../make/vta_config.py
CFLAGS
:=
$(
shell
${
VTA_CONFIG
}
--cflags
)
VTA_TARGET
:=
$(
shell
${
VTA_CONFIG
}
--target
)
#---------------------
# VTA Parameters
#--------------------
VTA_INP_WIDTH
:=
$(
shell
${
VTA_CONFIG
}
--get-inpwidth
)
VTA_WGT_WIDTH
:=
$(
shell
${
VTA_CONFIG
}
--get-wgtwidth
)
VTA_ACC_WIDTH
:=
$(
shell
${
VTA_CONFIG
}
--get-accwidth
)
VTA_OUT_WIDTH
:=
$(
shell
${
VTA_CONFIG
}
--get-outwidth
)
VTA_BATCH
:=
$(
shell
${
VTA_CONFIG
}
--get-batch
)
VTA_IN_BLOCK
:=
$(
shell
${
VTA_CONFIG
}
--get-blockin
)
VTA_OUT_BLOCK
:=
$(
shell
${
VTA_CONFIG
}
--get-blockout
)
VTA_UOP_BUFF_SIZE
:=
$(
shell
${
VTA_CONFIG
}
--get-uopbuffsize
)
VTA_INP_BUFF_SIZE
:=
$(
shell
${
VTA_CONFIG
}
--get-inpbuffsize
)
VTA_WGT_BUFF_SIZE
:=
$(
shell
${
VTA_CONFIG
}
--get-wgtbuffsize
)
VTA_ACC_BUFF_SIZE
:=
$(
shell
${
VTA_CONFIG
}
--get-accbuffsize
)
VTA_OUT_BUFF_SIZE
:=
$(
shell
${
VTA_CONFIG
}
--get-outbuffsize
)
#---------------------
# Compilation parameters
...
...
@@ -50,8 +63,8 @@ TARGET_PER = \
$(
shell
echo
"
$$
(( (1000 +
$(VTA_HW_COMP_CLOCK_FREQ)
- 1
)
/
$(VTA_HW_COMP_CLOCK_FREQ)
-
$(VTA_HW_COMP_TIMING_COMP)))
"
)
# Derive config name
CONF
=
\
$(VTA_BATCH)
x
$(VTA_IN_BLOCK)
x
$(VTA_OUT_BLOCK)
_
$(VTA_INP_WIDTH)
bx
$(VTA_WGT_WIDTH)
b_
$(VTA_LOG_UOP_BUFF_SIZE)
_
$(VTA_LOG_INP_BUFF_SIZE)
_
$(VTA_LOG_WGT_BUFF_SIZE)
_
$(VTA_LOG_ACC_BUFF_SIZE
)
_
$(VTA_HW_COMP_CLOCK_FREQ)
MHz_
$(TARGET_PER)
ns
CONF
_ROOT
=
$(
shell
${
VTA_CONFIG
}
--cfg-str
)
CONF
=
$(CONF_ROOT
)
_
$(VTA_HW_COMP_CLOCK_FREQ)
MHz_
$(TARGET_PER)
ns
IP_BUILD_PATH
=
$(BUILD_DIR)
/hls/
$(CONF)
HW_BUILD_PATH
=
$(BUILD_DIR)
/vivado/
$(CONF)
...
...
@@ -60,26 +73,34 @@ ifeq ($(SLURM), true)
HW_BUILD_PATH
=
/scratch/vivado/
$(CONF)
endif
.PHONY
:
all ip bit driver clean clean_all
# IP file path
IP_PATH
=
$(BUILD_DIR)
/hls/
$(CONF)
/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip
# Bitstream file path
BIT_PATH
=
$(BUILD_DIR)
/vivado/
$(CONF)
/export/
$(CONF)
.bit
all
:
bit
.PHONY
:
all ip bit bsp clean clean_all
ip
:
all
:
bsp
ip
:
$(IP_PATH)
bit
:
$(BIT_PATH)
$(IP_PATH)
:
$(SRC_DIR)/*
mkdir
-p
$(IP_BUILD_PATH)
cd
$(IP_BUILD_PATH)
&&
\
$(VIVADO_HLS)
-f
$(SCRIPT_DIR)
/hls.tcl
\
-tclargs
$(SRC_DIR)
$(SIM_DIR)
$(TEST_DIR)
$(INCLUDE_DIR)
$(TARGET_PE
R)
\
$(VTA_LOG_INP_WIDTH)
$(VTA_LOG_WGT_WIDTH)
$(VTA_LOG_ACC_WIDTH)
$(VTA_LOG_OUT_WIDTH
)
\
$(VTA_LOG_BATCH)
$(VTA_LOG_BLOCK_OUT)
$(VTA_LOG_BLOCK_IN
)
\
$(VTA_LOG_UOP_BUFF_SIZE)
$(VTA_LOG_INP_BUFF_SIZE)
$(VTA_LOG_WGT_BUFF_SIZE
)
\
$(VTA_LOG_ACC_BUFF_SIZE)
$(VTA_LOG_OU
T_BUFF_SIZE)
\
$(MODE)
$(NO_DSP)
$(NO_ALU
)
-tclargs
$(SRC_DIR)
$(SIM_DIR)
$(TEST_DIR)
$(INCLUDE_DI
R)
\
$(MODE)
$(DEBUG)
$(NO_DSP)
$(NO_ALU)
$(TARGET_PER
)
\
$(VTA_INP_WIDTH)
$(VTA_WGT_WIDTH)
$(VTA_ACC_WIDTH)
$(VTA_OUT_WIDTH
)
\
$(VTA_BATCH)
$(VTA_IN_BLOCK)
$(VTA_OUT_BLOCK
)
\
$(VTA_UOP_BUFF_SIZE)
$(VTA_INP_BUFF_SIZE)
$(VTA_WG
T_BUFF_SIZE)
\
$(VTA_ACC_BUFF_SIZE)
$(VTA_OUT_BUFF_SIZE
)
ifeq
($(SLURM),
true)
mkdir
-p
$(BUILD_DIR)/hls
mv
$(IP_BUILD_PATH)
$(BUILD_DIR)/hls/.
endif
bit
:
ip
$(BIT_PATH)
:
$(IP_PATH)
mkdir
-p
$(HW_BUILD_PATH)
cd
$(HW_BUILD_PATH)
&&
\
$(VIVADO)
-mode
tcl
-source
$(SCRIPT_DIR)
/vivado.tcl
\
...
...
@@ -92,12 +113,12 @@ ifeq ($(SLURM), true)
mv
$(HW_BUILD_PATH)
$(BUILD_DIR)/vivado/.
endif
driver
:
bit
bsp
:
$(BIT_PATH)
cd
$(HW_BUILD_PATH)
&&
$(HSI)
-mode
tcl
-source
$(SCRIPT_DIR)
/hsi.tcl
-nojournal
-nolog
cd
$(HW_BUILD_PATH)
/bsp
&&
make
clean
:
rm
-rf
*
.out
*
.log
*
.sb figures
clean
_
all
:
clean
cleanall
:
clean
rm
-rf
$(BUILD_DIR)
vta/hardware/xilinx/scripts/hls.tcl
View file @
dae77cdb
...
...
@@ -9,65 +9,69 @@
# Arg 2: path to sim sources
# Arg 3: path to test sources
# Arg 4: path to include sources
# Arg 5: target clock period
# Arg 6: input type width
(
log
)
# Arg 7: weight type width
(
log
)
# Arg 8: accum type width
(
log
)
# Arg 9: output type width
(
log
)
# Arg 10: batch size
(
log
)
# Arg 11: in block size
(
log
)
# Arg 12: out block size
(
log
)
# Arg 13: uop buffer size in B
(
log
)
# Arg 14: inp buffer size in B
(
log
)
# Arg 15: wgt buffer size in B
(
log
)
# Arg 16: acc buffer size in B
(
log
)
# Arg 17: out buffer size in B
(
log
)
# Arg 18: mode
# Arg 19: no_dsp
# Arg 20: no_alu
# Arg 5: mode
# Arg 6: debug
# Arg 7: no_dsp
# Arg 8: no_alu
# Arg 9: target clock period
# Arg 10: input type width
(
log
)
# Arg 11: weight type width
(
log
)
# Arg 12: accum type width
(
log
)
# Arg 13: output type width
(
log
)
# Arg 14: batch size
(
log
)
# Arg 15: in block size
(
log
)
# Arg 16: out block size
(
log
)
# Arg 17: uop buffer size in B
(
log
)
# Arg 18: inp buffer size in B
(
log
)
# Arg 19: wgt buffer size in B
(
log
)
# Arg 20: acc buffer size in B
(
log
)
# Arg 21: out buffer size in B
(
log
)
if
{
[
llength
$argv
]
eq 2
2
}
{
if
{
[
llength
$argv
]
eq 2
3
}
{
set src_dir
[
lindex
$argv
2
]
set sim_dir
[
lindex
$argv
3
]
set test_dir
[
lindex
$argv
4
]
set include_dir
[
lindex
$argv
5
]
set target_period
[
lindex
$argv
6
]
set inp_width
[
lindex
$argv
7
]
set wgt_width
[
lindex
$argv
8
]
set acc_width
[
lindex
$argv
9
]
set out_width
[
lindex
$argv
10
]
set batch
[
lindex
$argv
11
]
set block_in
[
lindex
$argv
12
]
set block_out
[
lindex
$argv
13
]
set uop_buff_size
[
lindex
$argv
14
]
set inp_buff_size
[
lindex
$argv
15
]
set wgt_buff_size
[
lindex
$argv
16
]
set acc_buff_size
[
lindex
$argv
17
]
set out_buff_size
[
lindex
$argv
18
]
set mode
[
lindex
$argv
19
]
set no_dsp
[
lindex
$argv
20
]
set no_alu
[
lindex
$argv
21
]
set mode
[
lindex
$argv
6
]
set debug
[
lindex
$argv
7
]
set no_dsp
[
lindex
$argv
8
]
set no_alu
[
lindex
$argv
9
]
set target_period
[
lindex
$argv
10
]
set inp_width
[
lindex
$argv
11
]
set wgt_width
[
lindex
$argv
12
]
set acc_width
[
lindex
$argv
13
]
set out_width
[
lindex
$argv
14
]
set batch
[
lindex
$argv
15
]
set block_in
[
lindex
$argv
16
]
set block_out
[
lindex
$argv
17
]
set uop_buff_size
[
lindex
$argv
18
]
set inp_buff_size
[
lindex
$argv
19
]
set wgt_buff_size
[
lindex
$argv
20
]
set acc_buff_size
[
lindex
$argv
21
]
set out_buff_size
[
lindex
$argv
22
]
}
else
{
set src_dir
"../src"
set sim_dir
"../sim"
set test_dir
"../../src/test"
set include_dir
"../../include"
set mode
"all"
set debug
"false"
set no_dsp
"true"
set no_alu
"false"
set target_period 10
set inp_width 3
set wgt_width 3
set acc_width 5
set out_width 3
set batch 1
set block_out 4
set block_in 4
set block_out 4
set uop_buff_size 15
set inp_buff_size 15
set wgt_buff_size 15
set acc_buff_size 17
set out_buff_size 15
set mode
"all"
set no_dsp
"true"
set no_alu
"false"
exit
}
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
...
...
@@ -124,12 +128,15 @@ proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
# C define flags to pass to compiler
set cflags
"-I
$include
_dir -I
$src
_dir -I
$test
_dir
\
-DVTA_
DEBUG=0 -DVTA_
LOG_WGT_WIDTH=
$wgt
_width -DVTA_LOG_INP_WIDTH=
$inp
_width
\
-DVTA_LOG_WGT_WIDTH=
$wgt
_width -DVTA_LOG_INP_WIDTH=
$inp
_width
\
-DVTA_LOG_ACC_WIDTH=
$acc
_width -DVTA_LOG_OUT_WIDTH=
$out
_width
\
-DVTA_LOG_BATCH=
$batch
-DVTA_LOG_BLOCK_OUT=
$block
_out -DVTA_LOG_BLOCK_IN=
$block
_in
\
-DVTA_LOG_UOP_BUFF_SIZE=
$uop
_buff_size -DVTA_LOG_INP_BUFF_SIZE=
$inp
_buff_size
\
-DVTA_LOG_WGT_BUFF_SIZE=
$wgt
_buff_size -DVTA_LOG_ACC_BUFF_SIZE=
$acc
_buff_size
\
-DVTA_LOG_OUT_BUFF_SIZE=
$out
_buff_size"
if
{
$debug
==
"true"
}
{
append cflags
" -DVTA_DEBUG=1"
}
if
{
$no
_dsp==
"true"
}
{
append cflags
" -DNO_DSP"
}
...
...
vta/hardware/xilinx/scripts/vivado.tcl
View file @
dae77cdb
...
...
@@ -26,15 +26,15 @@ if { [llength $argv] eq 12 } {
set ip_path
[
lindex
$argv
0
]
set num_threads
[
lindex
$argv
1
]
set clock_freq
[
lindex
$argv
2
]
set inp_width
[
lindex
$argv
3
]
set wgt_width
[
lindex
$argv
4
]
set out_width
[
lindex
$argv
5
]
set batch
[
lindex
$argv
6
]
set out_block
[
lindex
$argv
7
]
set in_block
[
lindex
$argv
8
]
set inp_mem_size
[
lindex
$argv
9
]
set wgt_mem_size
[
lindex
$argv
10
]
set out_mem_size
[
lindex
$argv
11
]
set inp_width
[
expr
1 <<
[
lindex
$argv
3
]
]
set wgt_width
[
expr
1 <<
[
lindex
$argv
4
]
]
set out_width
[
expr
1 <<
[
lindex
$argv
5
]
]
set batch
[
expr
1 <<
[
lindex
$argv
6
]
]
set out_block
[
expr
1 <<
[
lindex
$argv
7
]
]
set in_block
[
expr
1 <<
[
lindex
$argv
8
]
]
set inp_mem_size
[
expr
1 <<
[
lindex
$argv
9
]
]
set wgt_mem_size
[
expr
1 <<
[
lindex
$argv
10
]
]
set out_mem_size
[
expr
1 <<
[
lindex
$argv
11
]
]
if
{
$clock
_freq eq 100
}
{
set clock_id 0
puts
"Setting clock frequency to 100MHz"
...
...
vta/hardware/xilinx/sim/vta_test.cc
View file @
dae77cdb
...
...
@@ -53,5 +53,8 @@ int main(void) {
status
|=
blocked_gemm_test
(
256
,
256
,
VTA_BLOCK_OUT
*
4
,
true
,
1
);
status
|=
blocked_gemm_test
(
256
,
256
,
VTA_BLOCK_OUT
*
4
,
false
,
1
);
// Simple GEMM unit test
status
|=
gemm_test
(
64
,
64
,
64
,
true
);
return
status
;
}
vta/make/config.json
View file @
dae77cdb
...
...
@@ -7,8 +7,8 @@
"LOG_BATCH"
:
0
,
"LOG_BLOCK_IN"
:
4
,
"LOG_BLOCK_OUT"
:
4
,
"LOG_UOP_BUFF_SIZE"
:
1
5
,
"LOG_UOP_BUFF_SIZE"
:
1
4
,
"LOG_INP_BUFF_SIZE"
:
15
,
"LOG_WGT_BUFF_SIZE"
:
1
5
,
"LOG_WGT_BUFF_SIZE"
:
1
8
,
"LOG_ACC_BUFF_SIZE"
:
17
}
vta/make/sim_sample.json
View file @
dae77cdb
...
...
@@ -7,8 +7,8 @@
"LOG_BATCH"
:
0
,
"LOG_BLOCK_IN"
:
4
,
"LOG_BLOCK_OUT"
:
4
,
"LOG_UOP_BUFF_SIZE"
:
1
5
,
"LOG_UOP_BUFF_SIZE"
:
1
4
,
"LOG_INP_BUFF_SIZE"
:
15
,
"LOG_WGT_BUFF_SIZE"
:
1
5
,
"LOG_WGT_BUFF_SIZE"
:
1
8
,
"LOG_ACC_BUFF_SIZE"
:
17
}
vta/make/vta_config.py
View file @
dae77cdb
...
...
@@ -28,6 +28,32 @@ def main():
help
=
"print all the config json"
)
parser
.
add_argument
(
"--target"
,
action
=
"store_true"
,
help
=
"print the target"
)
parser
.
add_argument
(
"--cfg-str"
,
action
=
"store_true"
,
help
=
"print the configuration string"
)
parser
.
add_argument
(
"--get-inpwidth"
,
action
=
"store_true"
,
help
=
"returns log of input bitwidth"
)
parser
.
add_argument
(
"--get-wgtwidth"
,
action
=
"store_true"
,
help
=
"returns log of weight bitwidth"
)
parser
.
add_argument
(
"--get-accwidth"
,
action
=
"store_true"
,
help
=
"returns log of accum bitwidth"
)
parser
.
add_argument
(
"--get-outwidth"
,
action
=
"store_true"
,
help
=
"returns log of output bitwidth"
)
parser
.
add_argument
(
"--get-batch"
,
action
=
"store_true"
,
help
=
"returns log of tensor batch dimension"
)
parser
.
add_argument
(
"--get-blockin"
,
action
=
"store_true"
,
help
=
"returns log of tensor block in dimension"
)
parser
.
add_argument
(
"--get-blockout"
,
action
=
"store_true"
,
help
=
"returns log of tensor block out dimension"
)
parser
.
add_argument
(
"--get-uopbuffsize"
,
action
=
"store_true"
,
help
=
"returns log of micro-op buffer size in B"
)
parser
.
add_argument
(
"--get-inpbuffsize"
,
action
=
"store_true"
,
help
=
"returns log of input buffer size in B"
)
parser
.
add_argument
(
"--get-wgtbuffsize"
,
action
=
"store_true"
,
help
=
"returns log of weight buffer size in B"
)
parser
.
add_argument
(
"--get-accbuffsize"
,
action
=
"store_true"
,
help
=
"returns log of accum buffer size in B"
)
parser
.
add_argument
(
"--get-outbuffsize"
,
action
=
"store_true"
,
help
=
"returns log of output buffer size in B"
)
args
=
parser
.
parse_args
()
if
len
(
sys
.
argv
)
==
1
:
...
...
@@ -46,13 +72,17 @@ def main():
raise
RuntimeError
(
"Cannot find config in
%
s"
%
str
(
path_list
))
cfg
=
json
.
load
(
open
(
ok_path_list
[
0
]))
cfg
[
"LOG_OUT_WIDTH"
]
=
cfg
[
"LOG_INP_WIDTH"
]
cfg
[
"LOG_OUT_BUFF_SIZE"
]
=
cfg
[
"LOG_ACC_BUFF_SIZE"
]
+
cfg
[
"LOG_ACC_WIDTH"
]
-
cfg
[
"LOG_OUT_WIDTH"
]
pkg
=
get_pkg_config
(
cfg
)
if
args
.
target
:
print
(
pkg
.
target
)
if
args
.
cflags
:
print
(
" "
.
join
(
pkg
.
cflags
))
cflags_str
=
" "
.
join
(
pkg
.
cflags
)
if
cfg
[
"TARGET"
]
==
"pynq"
:
cflags_str
+=
" -DVTA_TARGET_PYNQ"
print
(
cflags_str
)
if
args
.
ldflags
:
print
(
" "
.
join
(
pkg
.
ldflags
))
...
...
@@ -60,6 +90,54 @@ def main():
if
args
.
cfg_json
:
print
(
pkg
.
cfg_json
)
if
args
.
cfg_str
:
cfg_str
=
"{}x{}x{}_{}bx{}b_{}_{}_{}_{}"
.
format
(
(
1
<<
cfg
[
"LOG_BATCH"
]),
(
1
<<
cfg
[
"LOG_BLOCK_IN"
]),
(
1
<<
cfg
[
"LOG_BLOCK_OUT"
]),
(
1
<<
cfg
[
"LOG_INP_WIDTH"
]),
(
1
<<
cfg
[
"LOG_WGT_WIDTH"
]),
cfg
[
"LOG_UOP_BUFF_SIZE"
],
cfg
[
"LOG_INP_BUFF_SIZE"
],
cfg
[
"LOG_WGT_BUFF_SIZE"
],
cfg
[
"LOG_ACC_BUFF_SIZE"
])
print
cfg_str
if
args
.
get_inpwidth
:
print
(
cfg
[
"LOG_INP_WIDTH"
])
if
args
.
get_wgtwidth
:
print
(
cfg
[
"LOG_WGT_WIDTH"
])
if
args
.
get_accwidth
:
print
(
cfg
[
"LOG_ACC_WIDTH"
])
if
args
.
get_outwidth
:
print
(
cfg
[
"LOG_OUT_WIDTH"
])
if
args
.
get_batch
:
print
(
cfg
[
"LOG_BATCH"
])
if
args
.
get_blockin
:
print
(
cfg
[
"LOG_BLOCK_IN"
])
if
args
.
get_blockout
:
print
(
cfg
[
"LOG_BLOCK_OUT"
])
if
args
.
get_uopbuffsize
:
print
(
cfg
[
"LOG_UOP_BUFF_SIZE"
])
if
args
.
get_inpbuffsize
:
print
(
cfg
[
"LOG_INP_BUFF_SIZE"
])
if
args
.
get_wgtbuffsize
:
print
(
cfg
[
"LOG_WGT_BUFF_SIZE"
])
if
args
.
get_outbuffsize
:
print
(
cfg
[
"LOG_OUT_BUFF_SIZE"
])
if
args
.
get_accbuffsize
:
print
(
cfg
[
"LOG_ACC_BUFF_SIZE"
])
if
__name__
==
"__main__"
:
main
()
vta/python/vta/environment.py
View file @
dae77cdb
...
...
@@ -130,11 +130,15 @@ class Environment(object):
self
.
BLOCK_IN
*
self
.
WGT_WIDTH
)
self
.
ACC_ELEM_BITS
=
(
self
.
BATCH
*
self
.
BLOCK_
IN
*
self
.
BLOCK_
OUT
*
self
.
ACC_WIDTH
)
self
.
OUT_ELEM_BITS
=
(
self
.
BATCH
*
self
.
BLOCK_OUT
*
self
.
OUT_WIDTH
)
self
.
INP_ELEM_BYTES
=
self
.
INP_ELEM_BITS
//
8
self
.
WGT_ELEM_BYTES
=
self
.
WGT_ELEM_BITS
//
8
self
.
ACC_ELEM_BYTES
=
self
.
ACC_ELEM_BITS
//
8
self
.
OUT_ELEM_BYTES
=
self
.
OUT_ELEM_BITS
//
8
# dtypes
self
.
acc_dtype
=
"int
%
d"
%
self
.
ACC_WIDTH
self
.
inp_dtype
=
"int
%
d"
%
self
.
INP_WIDTH
...
...
vta/python/vta/ir_pass.py
View file @
dae77cdb
...
...
@@ -339,7 +339,7 @@ def inject_dma_intrin(stmt_in):
base
=
0
for
i
in
range
(
1
,
ndim
+
1
):
if
not
util
.
equal_const_int
(
buf
.
strides
[
ndim
-
i
]
-
x_size
,
0
):
raise
RuntimeError
(
"scope
%
s need
need
to have block=
%
d"
%
(
scope
,
elem_block
))
raise
RuntimeError
(
"scope
%
s need
s
to have block=
%
d"
%
(
scope
,
elem_block
))
x_size
=
x_size
*
buf
.
shape
[
ndim
-
i
]
if
util
.
equal_const_int
(
x_size
-
elem_block
,
0
):
base
=
i
+
1
...
...
@@ -469,10 +469,10 @@ def inject_dma_intrin(stmt_in):
if
pad_before
or
pad_after
:
raise
RuntimeError
(
"Do not support copy into DRAM with pad"
)
if
src
.
scope
==
env
.
acc_scope
:
elem_width
=
env
.
INP_WIDTH
# output compression to inp type
elem_bytes
=
env
.
INP_ELEM_BYTES
# output compression to inp type
elem_width
=
env
.
OUT_WIDTH
elem_bytes
=
env
.
OUT_ELEM_BYTES
mem_type
=
env
.
dev
.
MEM_ID_OUT
data_type
=
"int
%
d"
%
env
.
INP
_WIDTH
data_type
=
"int
%
d"
%
env
.
OUT
_WIDTH
task_qid
=
env
.
dev
.
QID_STORE_OUT
else
:
raise
RuntimeError
(
"Do not support copy
%
s->dram"
%
(
src
.
scope
))
...
...
vta/tests/hardware/common/test_lib.cc
View file @
dae77cdb
/*!
* Copyright (c) 2018 by Contributors
* \file
vta_
test_lib.cpp
 * \file test_lib.cc
* \brief Test library for the VTA design simulation and driver tests.
*/
#include "./test_lib.h"
#ifdef NO_SIM
#ifdef VTA_TARGET_PYNQ
/*!
 * \brief Programs the FPGA with "vta.bit", launches one VTA run and times it.
 *
 * The physical address of every non-null buffer is written to the matching
 * fetch/load/compute/store register, the four modules are started, and the
 * compute module's status register is polled until VTA_DONE is set or the
 * poll budget is exhausted.
 *
 * \param insn_count Number of instructions in \p insns.
 * \param insns Instruction buffer (may be null; its register is then left untouched).
 * \param uops Micro-op buffer (may be null).
 * \param inputs Input buffer (may be null).
 * \param weights Weight buffer (may be null).
 * \param biases Bias/accumulator buffer (may be null).
 * \param outputs Output buffer (may be null).
 * \return Elapsed time in nanoseconds between launching the run and the end of
 *         polling. A timeout is only reported on stdout; the elapsed time is
 *         still returned.
 */
uint64_t vta(
  uint32_t insn_count,
  VTAGenericInsn *insns,
  VTAUop *uops,
  inp_T *inputs,
  wgt_T *weights,
  acc_T *biases,
  inp_T *outputs) {
  // Upper bound on status-register polls before the run is reported as timed out
  const int kMaxPolls = 10000000;

  // Performance counter variables
  uint64_t t_fpga;
  struct timespec start, stop;

  // Bitstream file name
  char bitstream[128];
  snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");

#if VTA_DEBUG == 1
  printf("INFO - Programming FPGA: %s!\n", bitstream);
#endif

  // Program VTA
  VTAProgram(bitstream);

  // Get VTA handles
  void *vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
  void *vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
  void *vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
  void *vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);

  // Physical address pointers (0 stands for "buffer not provided")
  uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
  uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
  uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
  uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
  uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
  uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;

#if VTA_DEBUG == 1
  printf("INFO - Starting FPGA!\n");
#endif

  // Use the monotonic clock: this is an interval measurement and must not be
  // distorted by wall-clock adjustments made while the run is in flight.
  clock_gettime(CLOCK_MONOTONIC, &start);

  // FETCH @ 0x10 : Data signal of insn_count_V
  VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
  // FETCH @ 0x18 : Data signal of insns_V
  if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
  // LOAD @ 0x10 : Data signal of inputs_V
  if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
  // LOAD @ 0x18 : Data signal of weight_V
  if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
  // COMPUTE @ 0x20 : Data signal of uops_V
  if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
  // COMPUTE @ 0x28 : Data signal of biases_V
  if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
  // STORE @ 0x10 : Data signal of outputs_V
  if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);

  // VTA start
  VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
  VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
  VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
  VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);

  // Busy-poll the compute module's status register for the done flag
  int flag = 0, t = 0;
  for (t = 0; t < kMaxPolls; ++t) {
    flag = VTAReadMappedReg(vta_compute_handle, 0x18);
    if (flag & VTA_DONE) break;
  }

  // The loop only runs to completion when VTA_DONE was never observed
  const bool timed_out = (t == kMaxPolls);
  if (timed_out) {
    printf("\tWARNING: VTA TIMEOUT!!!!\n");
  }
#if VTA_DEBUG == 1
  if (!timed_out) {
    printf("INFO - FPGA Finished!\n");
  }
#endif

  clock_gettime(CLOCK_MONOTONIC, &stop);
  // The nanosecond difference may be negative; the unsigned sum still wraps to
  // the correct total because the seconds term compensates.
  t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);

  // Unmap VTA register
  VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
  VTAUnmapRegister(vta_load_handle, VTA_RANGE);
  VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
  VTAUnmapRegister(vta_store_handle, VTA_RANGE);

  return t_fpga;
}
#endif // VTA_TARGET_PYNQ
#endif // NO_SIM
uint32_t
globalSeed
;
const
char
*
getOpcodeString
(
int
opcode
,
bool
use_imm
)
{
...
...
@@ -1122,3 +1225,232 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
return
-
1
;
}
}
/*!
 * \brief Single-shot GEMM unit test: one load of each operand, one GEMM
 *        instruction, one store, checked against a software reference.
 *
 * \param batch Batch size; must be a multiple of VTA_BATCH.
 * \param in_channels Input channels; must be a multiple of VTA_BLOCK_IN.
 * \param out_channels Output channels; must be a multiple of VTA_BLOCK_OUT.
 * \param uop_compression Apply micro-op compression.
 * \return 0 if every output matches the reference, -1 otherwise.
 */
int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression) {
  // The matrix dimensions must be whole multiples of the hardware tile dimensions
  assert(batch % VTA_BATCH == 0);
  assert(in_channels % VTA_BLOCK_IN == 0);
  assert(out_channels % VTA_BLOCK_OUT == 0);

  printf("=====================================================================================\n");
  printf("INFO - GEMM test: batch=%d, in_channels=%d, out_channels=%d, uop_comp=%d\n",
         batch, in_channels, out_channels, uop_compression);

  // Dimensions expressed in hardware tiles (exact thanks to the assertions above)
  const int batch_tiles = batch / VTA_BATCH;
  const int in_tiles = in_channels / VTA_BLOCK_IN;
  const int out_tiles = out_channels / VTA_BLOCK_OUT;

  // Derive number of elements that need to be loaded/stored
  int ins_size = 7;
  int uop_size = uop_compression ? batch_tiles : batch_tiles * in_tiles * out_tiles;
  int inp_size = batch_tiles * in_tiles;
  int wgt_size = in_tiles * out_tiles;
  int out_size = batch_tiles * out_tiles;

  // Make sure we don't exceed buffer bounds
  assert(uop_size <= VTA_UOP_BUFF_DEPTH);
  assert(inp_size <= VTA_INP_BUFF_DEPTH);
  assert(wgt_size <= VTA_WGT_BUFF_DEPTH);
  assert(out_size <= VTA_ACC_BUFF_DEPTH);

  // Initialize instruction buffer
  VTAGenericInsn *insn_buf =
      static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
  int insn_idx = 0;

  // Load uops
  insn_buf[insn_idx++] = get1DLoadStoreInsn(
      VTA_OPCODE_LOAD,   // opcode
      VTA_MEM_ID_UOP,    // type
      0,                 // sram offset
      0,                 // dram offset
      uop_size,          // size
      0,                 // pop prev dep
      0,                 // pop next dep
      0,                 // push prev dep
      0);                // push next dep
  // Load bias
  insn_buf[insn_idx++] = get1DLoadStoreInsn(
      VTA_OPCODE_LOAD,   // opcode
      VTA_MEM_ID_ACC,    // type
      0,                 // sram offset
      0,                 // dram offset
      out_size,          // size
      0,                 // pop prev dep
      0,                 // pop next dep
      1,                 // push prev dep
      0);                // push next dep
  // Load weight block (pop next)
  insn_buf[insn_idx++] = get1DLoadStoreInsn(
      VTA_OPCODE_LOAD,   // opcode
      VTA_MEM_ID_WGT,    // type
      0,                 // sram offset
      0,                 // dram offset
      wgt_size,          // size
      0,                 // pop prev dep
      1,                 // pop next dep
      0,                 // push prev dep
      0);                // push next dep
  // Load input block (push next)
  insn_buf[insn_idx++] = get1DLoadStoreInsn(
      VTA_OPCODE_LOAD,   // opcode
      VTA_MEM_ID_INP,    // type
      0,                 // sram offset
      0,                 // dram offset
      inp_size,          // size
      0,                 // pop prev dep
      0,                 // pop next dep
      0,                 // push prev dep
      1);                // push next dep
  // Perform GEMM (pop prev, push next)
  insn_buf[insn_idx++] = getGEMMInsn(
      0,                 // uop offset
      batch_tiles,       // batch
      in_tiles,          // in_channels
      out_tiles,         // out_channels
      uop_compression,   // uop_compression
      1,                 // pop_prev_dep
      0,                 // pop_next_dep
      0,                 // push prev dep
      1);                // push_next_dep
  // Store output block (pop prev, push prev)
  insn_buf[insn_idx++] = get1DLoadStoreInsn(
      VTA_OPCODE_STORE,  // opcode
      VTA_MEM_ID_OUT,    // type
      0,                 // sram offset
      0,                 // dram offset
      out_size,          // size
      1,                 // pop prev dep
      0,                 // pop next dep
      1,                 // push prev dep
      0);                // push next dep
  // Finish
  insn_buf[insn_idx++] = getFinishInsn(0, 1);

  // The buffer was sized from ins_size; catch drift between it and the code above
  assert(insn_idx == ins_size);

  // Prepare the uop buffer
  VTAUop *uop_buf = getGEMMUops(batch_tiles, in_tiles, out_tiles, uop_compression, 0);

#if VTA_DEBUG == 1
  printInstruction(ins_size, insn_buf);
  printMicroOp(uop_size, uop_buf);
#endif

  // Initialize inputs
  inp_T **inputs = allocInit2dArray<inp_T, VTA_INP_WIDTH>(batch, in_channels);
  // Initialize weights
  wgt_T **weights = allocInit2dArray<wgt_T, VTA_WGT_WIDTH>(out_channels, in_channels);
  // Initialize biases
  acc_T **biases = allocInit2dArray<acc_T, VTA_ACC_WIDTH>(batch, out_channels);

  // Reference GEMM implementation: out = (out_T)(bias + inputs * weights^T)
  out_T **outputs_ref = alloc2dArray<out_T>(batch, out_channels);
  for (int i = 0; i < batch; i++) {
    for (int j = 0; j < out_channels; j++) {
      acc_T sum = biases[i][j];
      for (int k = 0; k < in_channels; k++) {
        sum += (acc_T) (inputs[i][k] * weights[j][k]);
      }
      // Narrow the accumulator to the output type
      outputs_ref[i][j] = (out_T) sum;
    }
  }

  // Prepare the input buffer
  inp_T *input_buf = static_cast<inp_T *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
  packBuffer<inp_T, VTA_INP_WIDTH>(input_buf, inputs, batch, in_channels,
                                   VTA_BATCH, VTA_BLOCK_IN);
  // Prepare the weight buffer
  wgt_T *weight_buf = static_cast<wgt_T *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
  packBuffer<wgt_T, VTA_WGT_WIDTH>(weight_buf, weights, out_channels, in_channels,
                                   VTA_BLOCK_OUT, VTA_BLOCK_IN);
  // Prepare the bias buffer
  acc_T *bias_buf = static_cast<acc_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
  packBuffer<acc_T, VTA_ACC_WIDTH>(bias_buf, biases, batch, out_channels,
                                   VTA_BATCH, VTA_BLOCK_OUT);

  // Prepare the output buffer. Size it from the output tile shape rather than
  // the input element size, which is only the same when VTA_BLOCK_OUT equals
  // VTA_BLOCK_IN and VTA_OUT_WIDTH equals VTA_INP_WIDTH.
  // NOTE(review): assumes VTA_OUT_WIDTH is the output element width in bits
  // (it is the width argument handed to unpackBuffer below) - confirm against
  // the hardware spec header, and prefer its output-element-bytes macro if one exists.
  const int out_elem_bytes = VTA_BATCH * VTA_BLOCK_OUT * VTA_OUT_WIDTH / 8;
  out_T *output_buf = static_cast<out_T *>(allocBuffer(out_elem_bytes * out_size));

#ifdef NO_SIM
  // Invoke the VTA
  uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, input_buf, weight_buf, bias_buf, output_buf);
  // Report on timing
  printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
  printf("INFO - Throughput: %.3lfGOPs/s\n",
         static_cast<float>(batch) * in_channels * out_channels * 2 / t_fpga);
#else
  // Invoke the VTA
  vta(ins_size,
      (volatile insn_T *) insn_buf,
      (volatile uop_T *) uop_buf,
      (volatile inp_vec_T *) input_buf,
      (volatile wgt_vec_T *) weight_buf,
      (volatile acc_vec_T *) bias_buf,
      (volatile out_vec_T *) output_buf);
#endif

  // Unpack output data
  out_T **outputs = alloc2dArray<out_T>(batch, out_channels);
  unpackBuffer<out_T, VTA_OUT_WIDTH>(outputs, output_buf, batch, out_channels,
                                     VTA_BATCH, VTA_BLOCK_OUT);

  // Correctness checks
  int err = 0;
  for (int i = 0; i < batch; i++) {
    for (int j = 0; j < out_channels; j++) {
      if (outputs_ref[i][j] != outputs[i][j]) {
        err++;
#if VTA_DEBUG == 1
        printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
               static_cast<int>(outputs_ref[i][j]),
               static_cast<int>(outputs[i][j]));
#endif
      }
    }
  }

  // Free all allocated arrays
  free2dArray<inp_T>(inputs, batch, in_channels);
  free2dArray<wgt_T>(weights, out_channels, in_channels);
  free2dArray<acc_T>(biases, batch, out_channels);
  free2dArray<out_T>(outputs_ref, batch, out_channels);
  free2dArray<out_T>(outputs, batch, out_channels);
  freeBuffer(insn_buf);
  freeBuffer(uop_buf);
  freeBuffer(input_buf);
  freeBuffer(weight_buf);
  freeBuffer(bias_buf);
  freeBuffer(output_buf);

  if (err == 0) {
    printf("INFO - GEMM test successful!\n");
    return 0;
  } else {
    printf("INFO - GEMM test failed, got %d errors!\n", err);
    return -1;
  }
}
\ No newline at end of file
vta/tests/hardware/common/test_lib.h
View file @
dae77cdb
/*!
* Copyright (c) 2018 by Contributors
* \file
vta_
test_lib.cpp
 * \file test_lib.h
* \brief Test library for the VTA design simulation and driver tests.
*/
...
...
@@ -17,9 +17,9 @@
#include <vta/driver.h>
#ifdef VTA_
PYNQ_TARGET
#ifdef VTA_
TARGET_PYNQ
#include "../../../src/pynq/pynq_driver.h"
#endif // VTA_
PYNQ_TARGET
#endif // VTA_
TARGET_PYNQ
typedef
uint64_t
axi_T
;
typedef
uint32_t
uop_T
;
...
...
@@ -300,4 +300,14 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
int
blocked_gemm_test
(
int
batch
,
int
channels
,
int
block
,
bool
uop_compression
,
int
virtual_threads
);
/*!
* \brief VTA GEMM unit test.
* \param batch Batch size.
* \param in_channels Input channels.
* \param out_channels Output channels.
* \param uop_compression Apply micro-op compression.
 * \return 0 if every output matches the reference, -1 otherwise.
*/
int
gemm_test
(
int
batch
,
int
in_channels
,
int
out_channels
,
bool
uop_compression
);
#endif // TESTS_HARDWARE_COMMON_TEST_LIB_H_
vta/tests/hardware/pynq/Makefile
View file @
dae77cdb
CC
?=
g++
CFLAGS
=
-Wall
-O3
-std
=
c++11
-I
/usr/include
LDFLAGS
=
-L
/usr/lib
-L
/opt/python3.6/lib/python3.6/site-packages/pynq/lib/
LIBS
=
-l
:libsds_lib.so
-l
:libdma.so
LIBS
=
-l
:libsds_lib.so
-l
:libdma.so
-lstdc
++
INCLUDE_DIR
=
../../../include
DRIVER_DIR
=
../../../src/pynq
TESTLIB_DIR
=
../common
...
...
@@ -10,19 +10,14 @@ SOURCES = pynq_driver.cc test_lib.cc
OBJECTS
=
pynq_driver.o test_lib.o metal_test.o
EXECUTABLE
=
vta
# Include top-level config file
ifndef
config
ifneq
(
"$(wildcard ../../../config.mk)"
,
""
)
config
=
../../../config.mk
else
config
=
../../../make/config.mk
endif
endif
include
$(config)
# Include VTA config
VTA_CONFIG
=
python ../../../make/vta_config.py
CFLAGS
+=
`
${
VTA_CONFIG
}
--cflags
`
LDFLAGS
+=
`
${
VTA_CONFIG
}
--ldflags
`
VTA_TARGET
:=
$(
shell
${
VTA_CONFIG
}
--target
)
# Define flags
CFLAGS
+=
-I
$(INCLUDE_DIR)
-DNO_SIM
-DDEBUG
=
0
CFLAGS
+=
$(ADD_CFLAGS)
CFLAGS
+=
-I
$(INCLUDE_DIR)
-DNO_SIM
-DVTA_DEBUG
=
0
# All Target
all
:
$(EXECUTABLE)
...
...
vta/tests/hardware/pynq/metal_test.cc
View file @
dae77cdb
/*!
* Copyright (c) 2018 by Contributors
* \file
driver
_test.cpp
* \file
metal
_test.cc
* \brief Bare-metal test to test driver and VTA design.
*/
...
...
@@ -13,104 +13,6 @@
#include "../../../src/pynq/pynq_driver.h"
#include "../common/test_lib.h"
// VTA invocation (present the same abstraction as in the simulation tests)
uint64_t
vta
(
uint32_t
insn_count
,
VTAGenericInsn
*
insns
,
VTAUop
*
uops
,
inp_T
*
inputs
,
wgt_T
*
weights
,
acc_T
*
biases
,
inp_T
*
outputs
)
{
// Performance counter variables
uint64_t
t_fpga
;
struct
timespec
start
,
stop
;
// Derive bitstream file
char
bitstream
[
128
];
char
str_batch_size
[
4
];
char
str_block_out_size
[
4
];
char
str_block_in_size
[
4
];
char
str_block_bit_width
[
4
];
snprintf
(
str_batch_size
,
sizeof
(
str_batch_size
),
"%d"
,
VTA_BATCH
);
snprintf
(
str_block_out_size
,
sizeof
(
str_block_out_size
),
"%d"
,
VTA_BLOCK_OUT
);
snprintf
(
str_block_in_size
,
sizeof
(
str_block_in_size
),
"%d"
,
VTA_BLOCK_IN
);
snprintf
(
str_block_bit_width
,
sizeof
(
str_block_bit_width
),
"%d"
,
VTA_WGT_WIDTH
);
snprintf
(
bitstream
,
sizeof
(
bitstream
),
"%s"
,
"vta.bit"
);
#if VTA_DEBUG == 1
printf
(
"INFO - Programming FPGA: %s!
\n
"
,
bitstream
);
#endif
// Program VTA
VTAProgram
(
bitstream
);
// Get VTA handles
VTAHandle
vta_fetch_handle
=
VTAMapRegister
(
VTA_FETCH_ADDR
,
VTA_RANGE
);
VTAHandle
vta_load_handle
=
VTAMapRegister
(
VTA_LOAD_ADDR
,
VTA_RANGE
);
VTAHandle
vta_compute_handle
=
VTAMapRegister
(
VTA_COMPUTE_ADDR
,
VTA_RANGE
);
VTAHandle
vta_store_handle
=
VTAMapRegister
(
VTA_STORE_ADDR
,
VTA_RANGE
);
// Physical address pointers
uint32_t
insn_phy
=
insns
?
cma_get_phy_addr
(
insns
)
:
0
;
uint32_t
uop_phy
=
uops
?
cma_get_phy_addr
(
uops
)
:
0
;
uint32_t
input_phy
=
inputs
?
cma_get_phy_addr
(
inputs
)
:
0
;
uint32_t
weight_phy
=
weights
?
cma_get_phy_addr
(
weights
)
:
0
;
uint32_t
bias_phy
=
biases
?
cma_get_phy_addr
(
biases
)
:
0
;
uint32_t
output_phy
=
outputs
?
cma_get_phy_addr
(
outputs
)
:
0
;
#if VTA_DEBUG == 1
printf
(
"INFO - Starting FPGA!
\n
"
);
#endif
clock_gettime
(
CLOCK_REALTIME
,
&
start
);
// FETCH @ 0x10 : Data signal of insn_count_V
VTAWriteMappedReg
(
vta_fetch_handle
,
0x10
,
insn_count
);
// FETCH @ 0x18 : Data signal of insns_V
if
(
insns
)
VTAWriteMappedReg
(
vta_fetch_handle
,
0x18
,
insn_phy
);
// LOAD @ 0x10 : Data signal of inputs_V
if
(
inputs
)
VTAWriteMappedReg
(
vta_load_handle
,
0x10
,
input_phy
);
// LOAD @ 0x18 : Data signal of weight_V
if
(
weights
)
VTAWriteMappedReg
(
vta_load_handle
,
0x18
,
weight_phy
);
// COMPUTE @ 0x20 : Data signal of uops_V
if
(
uops
)
VTAWriteMappedReg
(
vta_compute_handle
,
0x20
,
uop_phy
);
// COMPUTE @ 0x28 : Data signal of biases_V
if
(
biases
)
VTAWriteMappedReg
(
vta_compute_handle
,
0x28
,
bias_phy
);
// STORE @ 0x10 : Data signal of outputs_V
if
(
outputs
)
VTAWriteMappedReg
(
vta_store_handle
,
0x10
,
output_phy
);
// VTA start
VTAWriteMappedReg
(
vta_fetch_handle
,
0x0
,
0x1
);
VTAWriteMappedReg
(
vta_load_handle
,
0x0
,
0x81
);
VTAWriteMappedReg
(
vta_compute_handle
,
0x0
,
0x81
);
VTAWriteMappedReg
(
vta_store_handle
,
0x0
,
0x81
);
int
flag
=
0
,
t
=
0
;
for
(
t
=
0
;
t
<
10000000
;
++
t
)
{
flag
=
VTAReadMappedReg
(
vta_compute_handle
,
0x18
);
if
(
flag
&
VTA_DONE
)
break
;
}
if
(
t
==
10000000
)
{
printf
(
"
\t
WARNING: VTA TIMEOUT!!!!
\n
"
);
#if VTA_DEBUG == 1
}
else
{
printf
(
"INFO - FPGA Finished!
\n
"
);
#endif
}
clock_gettime
(
CLOCK_REALTIME
,
&
stop
);
t_fpga
=
1000000000ULL
*
(
stop
.
tv_sec
-
start
.
tv_sec
)
+
(
stop
.
tv_nsec
-
start
.
tv_nsec
);
// Unmap VTA register
VTAUnmapRegister
(
vta_fetch_handle
,
VTA_RANGE
);
VTAUnmapRegister
(
vta_load_handle
,
VTA_RANGE
);
VTAUnmapRegister
(
vta_compute_handle
,
VTA_RANGE
);
VTAUnmapRegister
(
vta_store_handle
,
VTA_RANGE
);
return
t_fpga
;
}
int
main
(
void
)
{
#if VTA_DEBUG == 1
printParameters
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment