"""
2D Convolution Optimization
===========================
**Author**: `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_

This tutorial provides an overview on how to use TVM to map a 2D convolution
workload efficiently on the VTA design.
We recommend covering the :ref:`vta-mat-mult-opt` tutorial first.

2D convolution is dominant in most computer vision deep neural networks.
In this tutorial, we will demonstrate TVM schedule optimizations to map
2D convolution operators in NCHW layout onto VTA.
We also introduce the notion of latency hiding, which allows us to
maximize VTA's compute and memory resource utilization.
"""

######################################################################
# RPC Setup
# ---------
# We start by programming the Pynq's FPGA and building its RPC runtime.

from __future__ import absolute_import, print_function

import os
import tvm
import vta
import numpy as np

from tvm import rpc
from tvm.contrib import util
from vta.testing import simulator

# Load VTA parameters from the vta/config/vta_config.json file
env = vta.get_env()

# We read the Pynq RPC host IP address and port number from the OS environment
host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))

# We configure both the bitstream and the runtime system on the Pynq
# to match the VTA configuration specified by the vta_config.json file.
if env.TARGET == "pynq":

    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")
    remote = rpc.connect(host, port)

    # Reconfigure the JIT runtime
    vta.reconfig_runtime(remote)

    # Program the FPGA with a pre-compiled VTA bitstream.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    vta.program_fpga(remote, bitstream=None)

# In simulation mode, host the RPC server locally.
elif env.TARGET == "sim":
    remote = rpc.LocalSession()

######################################################################
# Computation Declaration
# -----------------------
# As a first step, we need to describe our 2D convolution computation
# in NCHW format.
#
# We define the 2D convolution shape by the batch size,
# spatial dimensions, input channels, output channels, kernel dimensions,
# padding dimensions, and stride dimensions.
#
# We pick the shape of the 9th convolutional layer of the ResNet-18
# architecture as our convolution workload parameters.
#
# We've added extra operators to the 2D convolution that apply
# shifting and clipping to the output in order to mimic a fixed-point
# convolution followed by a rectified linear activation.
# We describe the TVM dataflow graph of the 2D convolution layer below:
#
# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/conv2d_dataflow.png
#      :align: center
#
# This computation is intentionally too large to fit onto VTA's on-chip
# buffers all at once. Therefore in the scheduling phase we'll
# rely on computation blocking strategies to break the computation down into
# manageable chunks.
#
# .. note::
#
#   *Spatial padding*
#
#   Note that we'll need to import the TOPI library to apply spatial padding
#   on the input feature map tensor.
#   Spatial padding facilitates blocking in the context of 2D convolutions,
#   because the same (x, y) spatial location of the input feature map of any
#   given layer is read more than once whenever the convolution kernel window
#   is larger than one.
#   On CPUs and GPUs, one way to increase the efficiency of memory accesses
#   when parallelizing work is spatial packing, which requires data re-layout.
#   VTA's load DMA engine can insert padding automatically, so the original
#   input feature map does not have to be re-packed in memory.
#
#   Below we show the effect of VTA's on-the-fly spatial padding as data is
#   loaded from DRAM into VTA's SRAM, following a 2D strided and padded
#   memory read.
#
#   .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/padding.png
#        :align: center
#        :width: 480px

import topi

# 2D convolution layer dimensions taken from ResNet-18 architecture
# (9th convolutional layer)
batch_size = 1
height = 14
width = 14
in_channels = 256
out_channels = 256
kernel_h = 3
kernel_w = 3
pad_h = 1
pad_w = 1
stride_h = 1
stride_w = 1
assert batch_size % env.BATCH == 0
assert in_channels % env.BLOCK_IN == 0
assert out_channels % env.BLOCK_OUT == 0

# Input feature map: (N, IC, H, W, n, ic)
data_shape = (batch_size // env.BATCH,
              in_channels // env.BLOCK_IN,
              height,
              width,
              env.BATCH,
              env.BLOCK_IN)
# Kernel: (OC, IC, H, W, oc, ic)
kernel_shape = (out_channels // env.BLOCK_OUT,
                in_channels // env.BLOCK_IN,
                kernel_h,
                kernel_w,
                env.BLOCK_OUT,
                env.BLOCK_IN)
# Derive output feature map dimensions
fout_height = (height + 2 * pad_h - kernel_h) // stride_h + 1
fout_width = (width + 2 * pad_w - kernel_w) // stride_w + 1
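# As a quick sanity check on the arithmetic above: for this layer,
# fout_height = (14 + 2 * 1 - 3) // 1 + 1 = 14, and likewise fout_width = 14,
# so the output preserves the 14 x 14 spatial extent of the input.
assert fout_height == 14 and fout_width == 14
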
# Output feature map: (N, OC, H, W, n, oc)
output_shape = (batch_size // env.BATCH,
                out_channels // env.BLOCK_OUT,
                fout_height,
                fout_width,
                env.BATCH,
                env.BLOCK_OUT)

# Convolution reduction axes
dy = tvm.reduce_axis((0, kernel_h), name='dy')
dx = tvm.reduce_axis((0, kernel_w), name='dx')
ic = tvm.reduce_axis((0, in_channels // env.BLOCK_IN), name='ic')
ic_tns = tvm.reduce_axis((0, env.BLOCK_IN), name='ic_tns')

# Input placeholder tensors
data = tvm.placeholder(data_shape,
                       name="data",
                       dtype=env.inp_dtype)
kernel = tvm.placeholder(kernel_shape,
                         name="kernel",
                         dtype=env.wgt_dtype)

# Copy buffers:
#   Apply spatial padding to input feature map
data_buf = topi.nn.pad(data,
                       [0, 0, pad_h, pad_w, 0, 0],
                       name="data_buf")
kernel_buf = tvm.compute(kernel_shape, lambda *i: kernel(*i), "kernel_buf")
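# Note on the copy buffers above: topi.nn.pad materializes the spatial
# padding, so data_buf has shape
# (batch_size // env.BATCH, in_channels // env.BLOCK_IN,
#  height + 2 * pad_h, width + 2 * pad_w, env.BATCH, env.BLOCK_IN).
# Once we assign SRAM scopes and DMA copy pragmas in the scheduling phase,
# both copy buffers become on-chip staging buffers filled by VTA's load
# DMA engine.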

# Declare 2D convolution
res_conv = tvm.compute(
    output_shape,
    lambda bo, co, i, j, bi, ci: tvm.sum(
      data_buf[bo, ic, i*stride_h+dy, j*stride_w+dx, bi, ic_tns].astype(env.acc_dtype) *
      kernel_buf[co, ic, dy, dx, ci, ic_tns].astype(env.acc_dtype),
    axis=[ic, dy, dx, ic_tns]),
    name="res_conv")

# Add shift stage for fixed-point normalization
res_shr = tvm.compute(output_shape,
                      lambda *i: res_conv(*i) >> 8,
                      name="res_shr")

# Apply clipping between (0, input max value)
inp_max = (1 << (env.INP_WIDTH - 1)) - 1
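# With the default VTA configuration (INP_WIDTH = 8), inp_max evaluates to
# (1 << 7) - 1 = 127, the largest value representable by the signed 8-bit
# input datatype.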
res_max = tvm.compute(output_shape,
                      lambda *i: tvm.max(res_shr(*i), 0),
                      "res_max")
res_min = tvm.compute(output_shape,
                      lambda *i: tvm.min(res_max(*i), inp_max),
                      "res_min")

# Result Tensor
res = tvm.compute(output_shape,
                  lambda *i: res_min(*i).astype(env.inp_dtype),
                  name="res")


######################################################################
# Scheduling the Computation
# --------------------------
# We'll look at a set of schedule transformations necessary to map the
# 2D convolution onto VTA in an efficient fashion.
# Those include:
#
# - Computation blocking
# - Virtual threading to increase compute utilization
# - Lowering to VTA hardware intrinsics

# Create TVM schedule
s = tvm.create_schedule(res.op)
# Let's look at the default TVM schedule
print(tvm.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# Blocking the Computation
# ~~~~~~~~~~~~~~~~~~~~~~~~
# The 2D convolution is by default too large for activations or kernel weights
# to fit on VTA's on-chip buffers all at once.
# We apply blocking along the input channels, output channels, and along
# the height spatial dimension.
# We don't apply blocking along the width spatial dimension since it's
# the innermost dimension in the NCHW layout (and consequently to increase
# locality, it's best not to block along the innermost dimension).

# Let's define tiling sizes
b_block = 1 // env.BATCH
oc_block = 128 // env.BLOCK_OUT
ic_block = 16 // env.BLOCK_IN
h_block = 7
w_block = 14
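# Under the default VTA configuration (BATCH = 1, BLOCK_IN = BLOCK_OUT = 16),
# these factors give b_block = 1, oc_block = 8, and ic_block = 1, so each
# compute block covers 8 output-channel tiles (128 channels) over a
# 7 x 14 spatial window, small enough to stage in VTA's on-chip buffers.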

# Tile the output tensor along the spatial and output channel dimensions
# (since by default we are doing single batch inference, the split along
#  the batch dimension has no effect)
b, oc, y, x, b_tns, oc_tns = s[res].op.axis
b_out, b_inn = s[res].split(b, factor=b_block)
oc_out, oc_inn = s[res].split(oc, factor=oc_block)
y_out, y_inn = s[res].split(y, factor=h_block)
x_out, x_inn = s[res].split(x, factor=w_block)
s[res].reorder(b_out, oc_out, y_out, x_out, b_inn, oc_inn, y_inn, x_inn, b_tns, oc_tns)

# Move intermediate computation into each output compute tile
s[res_conv].compute_at(s[res], x_out)
s[res_shr].compute_at(s[res], x_out)
s[res_max].compute_at(s[res], x_out)
s[res_min].compute_at(s[res], x_out)

# Apply additional loop split along reduction axis (input channel)
b_inn, oc_inn, y_inn, x_inn, b_tns, oc_tns = s[res_conv].op.axis
ic_out, ic_inn = s[res_conv].split(ic, factor=ic_block)

# Reorder axes.
# 1) Group the VTA tensor axes in the innermost position: b_tns, oc_tns, ic_tns
#    to allow TVM to tensorize.
# 2) We move the ic_out axis all the way out of the convolution loop to block
#    along the reduction axis.
# 3) Now we re-order the block axes: b_inn, oc_inn, y_inn, x_inn, ic_inn, dy, dx.
#    VTA runtime/hardware requires us to write to a different output feature map
#    location for every VTA tensor operation.
#    This restriction requires us to order one of oc_inn, y_inn or x_inn right
#    before b_tns, since they all affect output feature map indexing.
#    Therefore, we choose to bring x_inn inside as shown below.
s[res_conv].reorder(ic_out, b_inn, oc_inn, y_inn, ic_inn, dy, dx, x_inn, b_tns, oc_tns, ic_tns)

######################################################################
# Virtual Threading
# ~~~~~~~~~~~~~~~~~
# Virtual threading is a mechanism that increases task-level pipeline
# parallelism in the VTA hardware design.
# Put another way, it increases compute resource utilization by hiding
# memory access latency.
#
# In the implementation below, virtual threading distributes work across two
# threads split along the output channel axis.
# We show how work is split when computing the 2D convolution in the figure
# below.
#
# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/virtual_threading.png
#      :align: center
#      :width: 480px

# VTA only supports 2 virtual threads
v_threads = 2

# Perform virtual thread split along output channel outer axis
_, tx = s[res].split(oc_out, factor=v_threads)
s[res].reorder(tx, b_out)
s[res].bind(tx, tvm.thread_axis("cthread"))
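# Under the default VTA configuration, oc_out has extent 2 at this point
# (256 channels / 16-channel tiles / oc_block of 8), so each of the two
# virtual threads processes one 128-channel half of the output feature map,
# allowing the VTA runtime to overlap one thread's memory loads with the
# other thread's compute.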

# Let's look at the current TVM schedule after blocking and virtual threading
print(tvm.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# Lowering Copies to DMA Transfers
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Next we set the buffer scopes to the corresponding on-chip VTA SRAM buffers.
# We move the load loops into the 2D convolution computation loop to stage
# memory loads such that they fit in the on-chip SRAM buffers.
# Finally we annotate the load/store loop outer axes with the DMA copy pragma
# to perform bulk memory transfers on VTA.

# Set scope of SRAM buffers
s[data_buf].set_scope(env.inp_scope)
s[kernel_buf].set_scope(env.wgt_scope)
s[res_conv].set_scope(env.acc_scope)
s[res_shr].set_scope(env.acc_scope)
s[res_min].set_scope(env.acc_scope)
s[res_max].set_scope(env.acc_scope)

# Block data and kernel cache reads
s[data_buf].compute_at(s[res_conv], ic_out)
s[kernel_buf].compute_at(s[res_conv], ic_out)

# Use DMA copy pragma on DRAM->SRAM operations
s[data_buf].pragma(s[data_buf].op.axis[0], env.dma_copy)
s[kernel_buf].pragma(s[kernel_buf].op.axis[0], env.dma_copy)

# Use DMA copy pragma on SRAM->DRAM operation in each result block
# (this implies that these copies should be performed along b_tns,
# or result axis 4)
s[res].pragma(s[res].op.axis[4], env.dma_copy)

######################################################################
# Lowering Computation to VTA Compute Intrinsics
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The last phase is to lower the computation loops down to VTA hardware
# intrinsics by mapping the 2D convolution to tensor intrinsics,
# and mapping the shift and clipping computations to the vector ALU.

# Apply tensorization over the batch tensor tile axis
s[res_conv].tensorize(b_tns, env.gemm)
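# env.gemm is VTA's matrix-multiply tensor intrinsic: tensorizing at b_tns
# replaces the innermost (b_tns, oc_tns, ic_tns) loops with GEMM instructions
# operating on (BATCH, BLOCK_IN) x (BLOCK_IN, BLOCK_OUT) tiles.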

# Add an ALU pragma over the shift and clipping operations
s[res_shr].pragma(s[res_shr].op.axis[0], env.alu)
s[res_min].pragma(s[res_min].op.axis[0], env.alu)
s[res_max].pragma(s[res_max].op.axis[0], env.alu)

# Let's look at the final lowered TVM schedule after lowering memory
# loads/stores down to DMA copy intrinsics, and the computation down to
# VTA compute intrinsics.
print(vta.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from topi.testing import conv2d_nchw_python

# Compile the TVM module
my_conv = vta.build(s, [data, kernel, res], "ext_dev", env.target_host, name="my_conv")
temp = util.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly in the int range
# of [-128, 128) in NCHW layout
data_np = np.random.randint(
    -128, 128,
    size=(batch_size, in_channels, height, width)).astype(data.dtype)
kernel_np = np.random.randint(
    -128, 128,
    size=(out_channels, in_channels, kernel_h, kernel_w)).astype(kernel.dtype)

# Apply packing to the data and kernel arrays from a 4D NCHW
# to a 6D NCHWnc packed layout
data_packed = data_np.reshape(batch_size // env.BATCH,
                              env.BATCH,
                              in_channels // env.BLOCK_IN,
                              env.BLOCK_IN,
                              height,
                              width).transpose((0, 2, 4, 5, 1, 3))
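# Under the default configuration, data_packed now has shape
# (1, 16, 14, 14, 1, 16), matching the data_shape of the data placeholder
# declared earlier.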

kernel_packed = kernel_np.reshape(out_channels // env.BLOCK_OUT,
                                  env.BLOCK_OUT,
                                  in_channels // env.BLOCK_IN,
                                  env.BLOCK_IN,
                                  kernel_h,
                                  kernel_w).transpose((0, 2, 4, 5, 1, 3))

# Format the input/output arrays with tvm.nd.array to the DLPack standard
data_nd = tvm.nd.array(data_packed, ctx)
kernel_nd = tvm.nd.array(kernel_packed, ctx)
res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)

# Invoke the module to perform the computation
f(data_nd, kernel_nd, res_nd)
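
# As an aside (a minimal sketch, not needed for verification): the packed
# output can be brought back to a plain NCHW array for inspection by
# inverting the packing reshape/transpose.
res_nchw = res_nd.asnumpy().transpose((0, 4, 1, 5, 2, 3)).reshape(
    batch_size, out_channels, fout_height, fout_width)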

# Verify against numpy implementation
res_ref = conv2d_nchw_python(data_np.astype(env.acc_dtype),
                             kernel_np.astype(env.acc_dtype),
                             (stride_h, stride_w),
                             (pad_h, pad_w)).astype(env.acc_dtype)
res_ref = res_ref >> env.INP_WIDTH
res_ref = np.clip(res_ref, 0, inp_max)
res_ref = res_ref.astype(res.dtype)
res_ref = res_ref.reshape((batch_size // env.BATCH,
                           env.BATCH,
                           out_channels // env.BLOCK_OUT,
                           env.BLOCK_OUT,
                           fout_height,
                           fout_width)).transpose((0, 2, 4, 5, 1, 3))
tvm.testing.assert_allclose(res_ref, res_nd.asnumpy())
print("Successful 2D convolution test!")

######################################################################
# Summary
# -------
# This tutorial demonstrates how TVM scheduling primitives can be used to
# lower 2D convolution onto hardware accelerator intrinsics, making
# use of hardware specific optimizations, such as latency hiding with
# virtual threading.
#