vta_get_started.py 15.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
Tianqi Chen committed
17
"""
18
.. _vta-get-started:
19

Tianqi Chen committed
20 21
Get Started with VTA
====================
22
**Author**: `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_
Tianqi Chen committed
23

24
This is an introductory tutorial on how to use TVM to program the VTA design.
Tianqi Chen committed
25

26 27 28 29
In this tutorial, we will demonstrate the basic TVM workflow to implement
a vector addition on the VTA design's vector ALU.
This process includes specific scheduling transformations necessary to lower
computation down to low-level accelerator operations.
Tianqi Chen committed
30

31 32 33
To begin, we need to import TVM which is our deep learning optimizing compiler.
We also need to import the VTA python package which contains VTA specific
extensions for TVM to target the VTA design.
Tianqi Chen committed
34 35 36
"""
from __future__ import absolute_import, print_function

37 38
import os
import tvm
Tianqi Chen committed
39
import vta
40 41 42 43 44 45 46 47
import numpy as np

######################################################################
# Loading in VTA Parameters
# ~~~~~~~~~~~~~~~~~~~~~~~~~
# VTA is a modular and customizable design. Consequently, the user
# is free to modify high-level hardware parameters that affect
# the hardware design layout.
48
# These parameters are specified in the :code:`vta_config.json` file by their
49 50 51 52
# :code:`log2` values.
# These VTA parameters can be loaded with the :code:`vta.get_env`
# function.
#
53
# Finally, the TVM target is also specified in the :code:`vta_config.json` file.
54 55 56 57 58 59 60 61 62 63 64 65 66 67
# When set to *sim*, execution will take place inside of a behavioral
# VTA simulator.
# If you want to run this tutorial on the Pynq FPGA development platform,
# follow the *VTA Pynq-Based Testing Setup* guide.

# Load the VTA environment; its attributes (env.TARGET, env.BATCH,
# env.BLOCK_OUT, env.acc_dtype, ...) are read throughout the rest of
# this script.
env = vta.get_env()

######################################################################
# FPGA Programming
# ----------------
# When targeting the Pynq FPGA development board, we need to configure
# the board with a VTA bitstream.

# We'll need the TVM RPC module and the VTA simulator module
68 69
from tvm import rpc
from tvm.contrib import util
70 71 72 73 74 75 76
from vta.testing import simulator

# The Pynq board's RPC endpoint comes from the OS environment; the default
# address and port below are used when the variables are not set
host = os.getenv("VTA_PYNQ_RPC_HOST", "192.168.2.99")
port = int(os.getenv("VTA_PYNQ_RPC_PORT", "9091"))

# We configure both the bitstream and the runtime system on the Pynq
77
# to match the VTA configuration specified by the vta_config.json file.
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
if env.TARGET == "pynq":

    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")
    remote = rpc.connect(host, port)

    # Reconfigure the JIT runtime
    vta.reconfig_runtime(remote)

    # Program the FPGA with a pre-compiled VTA bitstream.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    vta.program_fpga(remote, bitstream=None)

# In simulation mode, host the RPC server locally.
elif env.TARGET == "sim":
    remote = rpc.LocalSession()

# Any other target would leave `remote` undefined and only fail further
# down with a NameError, so stop here with an explicit message instead.
else:
    raise RuntimeError(
        "Unsupported VTA target %r: this tutorial handles 'pynq' and 'sim'"
        % (env.TARGET,))

######################################################################
# Computation Declaration
# -----------------------
# As a first step, we need to describe our computation.
# TVM adopts tensor semantics, with each intermediate result
# represented as a multi-dimensional array. The user needs to describe
# the computation rule that generates the output tensors.
#
# In this example we describe a vector addition, which requires multiple
# computation stages, as shown in the dataflow diagram below.
# First we describe the input tensors :code:`A` and :code:`B` that are living
# in main memory.
# Second, we need to declare intermediate tensors :code:`A_buf` and
# :code:`B_buf`, which will live in VTA's on-chip buffers.
# Having this extra computational stage allows us to explicitly
# stage cached reads and writes.
# Third, we describe the vector addition computation which will
# add :code:`A_buf` to :code:`B_buf` to produce :code:`C_buf`.
# The last operation is a cast and copy back to DRAM, into results tensor
# :code:`C`.
#
# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/vadd_dataflow.png
#      :align: center

######################################################################
# Input Placeholders
# ~~~~~~~~~~~~~~~~~~
# We describe the placeholder tensors :code:`A` and :code:`B` in a tiled data
# format to match the data layout requirements imposed by the VTA vector ALU.
#
# For VTA's general purpose operations such as vector adds, the tile size is
# :code:`(env.BATCH, env.BLOCK_OUT)`.
# The dimensions are specified in
129
# the :code:`vta_config.json` configuration file and are set by default to
130 131 132
# a (1, 16) vector.
#
# In addition, A and B's data types also need to match the :code:`env.acc_dtype`
133
# which is set by the :code:`vta_config.json` file to be a 32-bit integer.
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208

# Number of BLOCK_OUT-wide tiles along the output channel dimension
# (64 x 16 = 1024 output channels with the default configuration)
m = 64
# Number of BATCH-sized tiles along the batch dimension (1 x 1 = 1)
o = 1
# Tiled placeholder for the first operand
A = tvm.placeholder(
    (o, m, env.BATCH, env.BLOCK_OUT),
    dtype=env.acc_dtype,
    name="A")
# Tiled placeholder for the second operand
B = tvm.placeholder(
    (o, m, env.BATCH, env.BLOCK_OUT),
    dtype=env.acc_dtype,
    name="B")

######################################################################
# Copy Buffers
# ~~~~~~~~~~~~
# One specificity of hardware accelerators is that on-chip memory has to be
# explicitly managed.
# This means that we'll need to describe intermediate tensors :code:`A_buf`
# and :code:`B_buf` that can have a different memory scope than the original
# placeholder tensors :code:`A` and :code:`B`.
#
# Later in the scheduling phase, we can tell the compiler that :code:`A_buf`
# and :code:`B_buf` will live in the VTA's on-chip buffers (SRAM), while
# :code:`A` and :code:`B` will live in main memory (DRAM).
# We describe A_buf and B_buf as the result of a compute
# operation that is the identity function.
# This can later be interpreted by the compiler as a cached read operation.

# Identity copy of A, declared as its own compute stage
A_buf = tvm.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: A(*i),
    name="A_buf")
# Identity copy of B, declared as its own compute stage
B_buf = tvm.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: B(*i),
    name="B_buf")

######################################################################
# Vector Addition
# ~~~~~~~~~~~~~~~
# Now we're ready to describe the vector addition result tensor :code:`C`,
# with another compute operation.
# The compute function takes the shape of the tensor, as well as a lambda
# function that describes the computation rule for each position of the tensor.
#
# No computation happens during this phase, as we are only declaring how
# the computation should be done.

# Describe the in-VTA vector addition: each output element is the sum of the
# matching A_buf and B_buf elements, both cast to env.acc_dtype
C_buf = tvm.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: A_buf(*i).astype(env.acc_dtype) + B_buf(*i).astype(env.acc_dtype),
    name="C_buf")

######################################################################
# Casting the Results
# ~~~~~~~~~~~~~~~~~~~
# After the computation is done, we'll need to send the results computed by VTA
# back to main memory.

######################################################################
# .. note::
#
#   **Memory Store Restrictions**
#
#   One specificity of VTA is that it only supports DRAM stores in the narrow
#   :code:`env.inp_dtype` data type format.
#   This lets us reduce the data footprint for memory transfers (more on this
#   in the basic matrix multiply example).
#
# We perform one last typecast operation to the narrow
# input activation data format.

# Cast to output type, and send to main memory: C holds the C_buf values
# converted to env.inp_dtype and is the output argument of the built function
C = tvm.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: C_buf(*i).astype(env.inp_dtype),
    name="C")

######################################################################
# This concludes the computation declaration part of this tutorial.
Tianqi Chen committed
209

210

Tianqi Chen committed
211
######################################################################
212 213 214 215 216 217
# Scheduling the Computation
# --------------------------
# While the above lines describe the computation rule, we can obtain
# :code:`C` in many ways.
# TVM asks the user to provide an implementation of the computation called
# *schedule*.
Tianqi Chen committed
218
#
219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
# A schedule is a set of transformations to an original computation that
# transforms the implementation of the computation without affecting
# correctness.
# This simple VTA programming tutorial aims to demonstrate basic schedule
# transformations that will map the original schedule down to VTA hardware
# primitives.


######################################################################
# Default Schedule
# ~~~~~~~~~~~~~~~~
# After we construct the schedule, by default the schedule computes
# :code:`C` in the following way:

# Let's take a look at the generated schedule: create the default schedule
# for the operation producing C, then print its lowered form with A, B and C
# as the function arguments
s = tvm.create_schedule(C.op)

print(tvm.lower(s, [A, B, C], simple_mode=True))
Tianqi Chen committed
237 238

######################################################################
239 240 241 242 243
# Although this schedule makes sense, it won't compile to VTA.
# In order to obtain correct code generation, we need to apply scheduling
# primitives and code annotation that will transform the schedule into
# one that can be directly lowered onto VTA hardware intrinsics.
# Those include:
Tianqi Chen committed
244
#
245 246 247
#  - DMA copy operations which will take globally-scoped tensors and copy
#    those into locally-scoped tensors.
#  - Vector ALU operations that will perform the vector add.
Tianqi Chen committed
248 249

######################################################################
250 251 252 253 254 255 256
# Buffer Scopes
# ~~~~~~~~~~~~~
# First, we set the scope of the copy buffers to indicate to TVM that these
# intermediate tensors will be stored in the VTA's on-chip SRAM buffers.
# Below, we tell TVM that :code:`A_buf`, :code:`B_buf`, :code:`C_buf`
# will live in VTA's on-chip *accumulator buffer* which serves as
# VTA's general purpose register file.
Tianqi Chen committed
257
#
258 259 260 261
# Place every intermediate tensor in VTA's on-chip accumulator buffer
for buf in (A_buf, B_buf, C_buf):
    s[buf].set_scope(env.acc_scope)
Tianqi Chen committed
262 263

######################################################################
264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302
# DMA Transfers
# ~~~~~~~~~~~~~
# We need to schedule DMA transfers to move data living in DRAM to
# and from the VTA on-chip buffers.
# We insert :code:`dma_copy` pragmas to indicate to the compiler
# that the copy operations will be performed in bulk via DMA,
# which is common in hardware accelerators.

# Mark the first axis of each copy stage (the two loads and the final store)
# with the DMA pragma so that its copy loop is mapped to a DMA transfer
for tensor in (A_buf, B_buf, C):
    stage = s[tensor]
    stage.pragma(stage.op.axis[0], env.dma_copy)

######################################################################
# ALU Operations
# ~~~~~~~~~~~~~~
# VTA has a vector ALU that can perform vector operations on tensors
# in the accumulator buffer.
# In order to tell TVM that a given operation needs to be mapped to the
# VTA's vector ALU, we need to explicitly tag the vector addition loop
# with an :code:`env.alu` pragma.

# Tell TVM that the computation needs to be performed
# on VTA's vector ALU by tagging the first axis of the C_buf stage
s[C_buf].pragma(C_buf.op.axis[0], env.alu)

# Let's take a look at the finalized schedule; note that vta.lower is used
# here rather than tvm.lower
print(vta.lower(s, [A, B, C], simple_mode=True))

######################################################################
# This concludes the scheduling portion of this tutorial.

######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function. By default TVM compiles into a type-erased
# function that can be directly called from python side.
Tianqi Chen committed
303
#
304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
# In the following line, we use :code:`vta.build` to create a function.
# The build function takes the schedule, the desired signature of the
# function (including the inputs and outputs) as well as the target language
# we want to compile to.
#
# Arguments: the schedule, the function signature [A, B, C], the "ext_dev"
# target, the host target taken from the VTA environment, and the name
# given to the generated function
my_vadd = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_vadd")

######################################################################
# Saving the Module
# ~~~~~~~~~~~~~~~~~
# TVM lets us save our module into a file so it can be loaded back later. This
# is called ahead-of-time compilation and allows us to save some compilation
# time.
# More importantly, this allows us to cross-compile the executable on our
# development machine and send it over to the Pynq FPGA board over RPC for
# execution.

# Serialize the compiled module to an object file in a temporary directory
temp = util.tempdir()
obj_path = temp.relpath("vadd.o")
my_vadd.save(obj_path)

# Ship that object file over the RPC session
remote.upload(obj_path)

######################################################################
# Loading the Module
# ~~~~~~~~~~~~~~~~~~
# We can load the compiled module from the file system to run the code.

# Load the uploaded object file through the RPC session, referring to it
# by its file name
f = remote.load_module("vadd.o")

######################################################################
# Running the Function
# --------------------
# The compiled TVM function uses a concise C API and can be invoked from
# any language.
#
# TVM provides an array API in python to aid quick testing and prototyping.
# The array API is based on the `DLPack <https://github.com/dmlc/dlpack>`_ standard.
#
# - We first create a remote context (for remote execution on the Pynq).
# - Then :code:`tvm.nd.array` formats the data accordingly.
# - :code:`f()` runs the actual computation.
# - :code:`asnumpy()` copies the result array back in a format that can be
#   interpreted.
#

# Device context used to allocate the arrays on the remote side
ctx = remote.ext_dev(0)

# Draw random A and B operands; np.random.randint excludes the upper bound,
# so the values lie in the half-open int range [-128, 128)
A_orig = np.random.randint(
    -128, 128, size=(o * env.BATCH, m * env.BLOCK_OUT)
).astype(A.dtype)
B_orig = np.random.randint(
    -128, 128, size=(o * env.BATCH, m * env.BLOCK_OUT)
).astype(B.dtype)

# Pack A and B from the 2D layout into the 4D tiled layout
# (o, m, env.BATCH, env.BLOCK_OUT) expected by the placeholders
A_packed = A_orig.reshape(
    (o, env.BATCH, m, env.BLOCK_OUT)).transpose(0, 2, 1, 3)
B_packed = B_orig.reshape(
    (o, env.BATCH, m, env.BLOCK_OUT)).transpose(0, 2, 1, 3)

# Wrap the inputs, and a zero-filled output of C's dtype, as tvm.nd arrays
# on the device context
A_nd = tvm.nd.array(A_packed, ctx)
B_nd = tvm.nd.array(B_packed, ctx)
C_nd = tvm.nd.array(
    np.zeros((o, m, env.BATCH, env.BLOCK_OUT), dtype=C.dtype), ctx)

# Run the compiled vector add; the result is written into C_nd
f(A_nd, B_nd, C_nd)

######################################################################
# Verifying Correctness
# ---------------------
# Compute the reference result with numpy and assert that the output of the
# vector addition indeed is correct

# Reference result: add in the accumulator type, narrow to C's dtype to
# mirror the declared computation, then pack into the tiled layout
acc_sum = A_orig.astype(env.acc_dtype) + B_orig.astype(env.acc_dtype)
C_ref = acc_sum.astype(C.dtype)
C_ref = C_ref.reshape(
    (o, env.BATCH, m, env.BLOCK_OUT)).transpose(0, 2, 1, 3)
np.testing.assert_equal(C_ref, C_nd.asnumpy())
print("Successful vector add test!")

######################################################################
# Summary
# -------
# This tutorial provides a walk-through of TVM for programming the
# deep learning accelerator VTA with a simple vector addition example.
# The general workflow includes:
#
# - Programming the FPGA with the VTA bitstream over RPC.
# - Describing the vector add computation via a series of computations.
# - Describing how we want to perform the computation using schedule primitives.
# - Compiling the function to the VTA target.
# - Running the compiled module and verifying it against a numpy implementation.
#
# You are more than welcome to check out other examples and tutorials
# to learn more about the supported operations, schedule primitives
# and other features supported by TVM to program VTA.
#