# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
.. _basic-mat-mult:

Simple Matrix Multiply
======================
**Author**: `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_

In this tutorial, we will build on top of the :ref:`vta-get-started` tutorial
and introduce additional concepts required to implement matrix multiplication
on VTA with the TVM workflow.
"""

######################################################################
# RPC Setup
# ---------
# We start by programming the Pynq's FPGA and building its RPC runtime
# as we did in the VTA introductory tutorial.

from __future__ import absolute_import, print_function

import os
import tvm
import vta
import numpy as np
from tvm import rpc
from tvm.contrib import util
from vta.testing import simulator

# Load VTA parameters from the vta/config/vta_config.json file
env = vta.get_env()

# We read the Pynq RPC host IP address and port number from the OS environment
host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))

# We configure both the bitstream and the runtime system on the Pynq
# to match the VTA configuration specified by the vta_config.json file.
if env.TARGET == "pynq":

    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")
    remote = rpc.connect(host, port)

    # Reconfigure the JIT runtime
    vta.reconfig_runtime(remote)

    # Program the FPGA with a pre-compiled VTA bitstream.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    vta.program_fpga(remote, bitstream=None)

# In simulation mode, host the RPC server locally.
elif env.TARGET == "sim":
    remote = rpc.LocalSession()
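
######################################################################
# .. note::
#
#   To run this tutorial on real hardware rather than in simulation, set
#   the :code:`TARGET` field of :code:`vta_config.json` to :code:`"pynq"`
#   and point the two environment variables read above at your board
#   before launching the script, e.g.
#   :code:`VTA_PYNQ_RPC_HOST=192.168.2.99` and
#   :code:`VTA_PYNQ_RPC_PORT=9091` (these values match the defaults above
#   and are purely illustrative).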

######################################################################
# Computation Declaration
# -----------------------
# In this example we describe a simple matrix multiplication, which requires
# multiple computation stages, as shown in the dataflow diagram below.
# First we describe the input tensors :code:`A` and :code:`B` that live
# in main memory.
# Second, we need to declare intermediate tensors :code:`A_buf` and
# :code:`B_buf`, which will live in VTA's on-chip buffers.
# Having this extra computational stage allows us to explicitly
# stage cached reads and writes.
# Third, we describe the matrix multiplication computation over
# :code:`A_buf` and :code:`B_buf` to produce the product matrix :code:`C_buf`.
# The last operation is a cast and copy back to DRAM, into results tensor
# :code:`C`.
#
# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/gemm_dataflow.png
#      :align: center

######################################################################
# Data Layout
# ~~~~~~~~~~~
# We describe the placeholder tensors :code:`A` and :code:`B` in a tiled data
# format to match the data layout requirements imposed by the VTA tensor core.

######################################################################
# .. note::
#
#   **Data Tiling**
#
#   One source of complexity when targeting accelerators is making sure
#   that the data layout matches the layout imposed by the accelerator design.
#   VTA is designed around a *tensor core* that performs one matrix-matrix
#   operation per cycle between an activation matrix and a weight matrix,
#   adding the result matrix to an accumulator matrix, as shown in the
#   figure below.
#
#   .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/tensor_core.png
#        :align: center
#        :width: 480px
#
#   The dimensions of that matrix-matrix multiplication are specified in
#   the :code:`vta_config.json` configuration file.
#   The activation matrix has a :code:`(BATCH, BLOCK_IN)` shape
#   and the transposed weight matrix has a :code:`(BLOCK_OUT, BLOCK_IN)` shape,
#   implying that the resulting output matrix has a
#   :code:`(BATCH, BLOCK_OUT)` shape.
#   Consequently input and output tensors processed by VTA need to be
#   tiled according to the aforementioned dimensions.
#
#   The diagram below shows the impact of data tiling on a matrix that is
#   originally of shape (4, 8).
#   Tiling by a (2, 2) tile shape ensures that data within each tile is
#   contiguous.
#   The resulting tiled tensor has a shape of (2, 4, 2, 2).
#
#   .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/data_tiling.png
#        :align: center
#        :width: 480px
#
# We first define the variables :code:`m`, :code:`n`, :code:`o` to represent
# the shape of the matrix multiplication. These variables are multiplicative
# factors over the :code:`BLOCK_OUT`, :code:`BLOCK_IN`, and :code:`BATCH`
# tensor dimensions respectively.
# By default, the configuration file sets :code:`BATCH`, :code:`BLOCK_IN`, and
# :code:`BLOCK_OUT` to be 1, 16 and 16 respectively (:code:`BATCH` being set to
# 1 implies that our compute building block is vector-matrix multiply).

######################################################################
# .. note::
#
#   **Data Types**
#
#   It's important to not only match the inner-tile
#   dimensions of VTA's tensor core, but also to match the specific data types
#   expected by VTA.
#   VTA for now only supports fixed-point data types, whose integer width is
#   specified in the :code:`vta_config.json` file by :code:`INP_WIDTH` and
#   :code:`WGT_WIDTH` for the activation and weight data types respectively.
#   In addition, the accumulator data type integer width is specified by
#   :code:`ACC_WIDTH`.
#
#   By default, the configuration file sets :code:`INP_WIDTH`
#   and :code:`WGT_WIDTH` to 8.
#   The accumulator width :code:`ACC_WIDTH` is set to 32, in order to avoid
#   overflow during accumulation.
#   As a result, :code:`env.inp_dtype` and :code:`env.wgt_dtype` are both
#   narrow 8-bit integers, while :code:`env.acc_dtype` is a standard 32-bit
#   integer.
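
######################################################################
# To make the data tiling note above concrete, here is a small numpy sketch
# (purely illustrative, not used by the rest of this tutorial) that tiles
# the (4, 8) matrix from the diagram with a (2, 2) tile shape:

# 4x8 matrix, tiled into a 2x4 grid of 2x2 tiles -> shape (2, 4, 2, 2)
x = np.arange(32).reshape(4, 8)
x_tiled = x.reshape(2, 2, 4, 2).transpose((0, 2, 1, 3))
assert x_tiled.shape == (2, 4, 2, 2)
# x_tiled[i, j] is the (2, 2) tile at block position (i, j) of the original
# matrix; the same reshape/transpose pattern is applied to the real inputs
# at the end of this tutorial.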

# Output channel factor m - total 16x16=256 output channels
m = 16
# Input channel factor n - total 16x16=256 input channels
n = 16
# Batch factor o (we use single batch inference)
o = 1
# A placeholder tensor in tiled data format
A = tvm.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="A", dtype=env.inp_dtype)
# B placeholder tensor in tiled data format
B = tvm.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="B", dtype=env.wgt_dtype)
# A copy buffer
A_buf = tvm.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: A(*i), "A_buf")
# B copy buffer
B_buf = tvm.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: B(*i), "B_buf")

######################################################################
# Matrix Multiplication
# ~~~~~~~~~~~~~~~~~~~~~
# Now we're ready to describe the matrix multiplication result tensor :code:`C`,
# with another compute operation.
# The compute function takes the shape of the tensor, as well as a lambda
# function that describes the computation rule for each position of the tensor.
#
# In order to implement matrix multiplication, the lambda function needs to
# include a reduction formula over the input channel dimension axes.
# To create a reduction formula, we can declare a reduction axis using
# :code:`tvm.reduce_axis`, which takes in the range over which to reduce.
# :code:`tvm.sum` takes in the expression to be reduced as well as
# the reduction axes, and computes the sum of values over all :code:`k` in
# the declared ranges.
#
# Note that the reduction needs to be performed over the 32-bit
# :code:`env.acc_dtype` accumulator data type.
#
# No computation happens during this phase, as we are only declaring how
# the computation should be done.

# Outer input feature reduction axis
ko = tvm.reduce_axis((0, n), name="ko")
# Inner input feature reduction axis
ki = tvm.reduce_axis((0, env.BLOCK_IN), name="ki")
# Describe the in-VTA matrix multiplication
C_buf = tvm.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda bo, co, bi, ci: tvm.sum(
        A_buf[bo, ko, bi, ki].astype(env.acc_dtype) *
        B_buf[co, ko, ci, ki].astype(env.acc_dtype),
        axis=[ko, ki]),
    name="C_buf")

######################################################################
# Casting the Results
# ~~~~~~~~~~~~~~~~~~~
# After the computation is done, we'll need to send the results computed by VTA
# back to main memory.

######################################################################
# .. note::
#
#   **Memory Store Restrictions**
#
#   A particularity of VTA is that it only supports DRAM stores in the narrow
#   :code:`env.inp_dtype` data type format.
#   This lets us reduce the data footprint for memory transfers, but also lets
#   us quantize the wide accumulator data type down to a data format that
#   matches the input activation data type.
#   This means that in the context of neural network inference, the outputs
#   of a given layer after activation can be consumed directly by the next
#   layer.
#
# We perform one last typecast operation to the narrow
# input activation data format.

# Cast to output type, and send to main memory
C = tvm.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: C_buf(*i).astype(env.inp_dtype),
    name="C")

######################################################################
# This concludes the computation declaration part of this tutorial.
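
######################################################################
# Before moving on to scheduling, it can help to see the computation we just
# declared written out in plain numpy. The helper below is our own addition
# for illustration and mirrors the reduction rule of :code:`C_buf`, assuming
# :code:`env.acc_dtype` is the 32-bit integer discussed in the data types
# note; the authoritative correctness check against VTA's output happens at
# the end of this tutorial.

def tiled_gemm_ref(a, b):
    # a has shape (o, n, BATCH, BLOCK_IN); b has shape (m, n, BLOCK_OUT, BLOCK_IN).
    # C[bo, co, bi, ci] = sum over ko, ki of a[bo, ko, bi, ki] * b[co, ko, ci, ki],
    # accumulated in 32 bits, which np.einsum expresses directly:
    return np.einsum("akbi,ckdi->acbd",
                     a.astype("int32"), b.astype("int32"))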

######################################################################
# Scheduling the Computation
# --------------------------
# While the above lines describe the computation rule, we can obtain
# :code:`C` in many ways.
# TVM asks the user to provide an implementation of the computation called
# a *schedule*.
#
# A schedule is a set of transformations to an original computation that
# transforms the implementation of the computation without affecting
# correctness.
# This simple VTA programming tutorial aims to demonstrate basic schedule
# transformations that will map the original schedule down to VTA hardware
# primitives.

######################################################################
# Default Schedule
# ~~~~~~~~~~~~~~~~
# After we construct the schedule, by default the schedule computes
# :code:`C` in the following way:

# Let's take a look at the generated schedule
s = tvm.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))

######################################################################
# Although this schedule makes sense, it won't compile to VTA.
# In order to obtain correct code generation, we need to apply scheduling
# primitives and code annotations that will transform the schedule into
# one that can be directly lowered onto VTA hardware intrinsics.
# Those include:
#
# - DMA copy operations which will take globally-scoped tensors and copy
#   those into locally-scoped tensors.
# - Tensor operations that will perform the matrix multiplication.

######################################################################
# Buffer Scopes
# ~~~~~~~~~~~~~
# First, we set the scope of the buffers to tell TVM that these buffers
# will live in VTA's on-chip SRAM buffers.
# Below, we tell TVM that :code:`A_buf`, :code:`B_buf`, :code:`C_buf`
# will respectively live in VTA's on-chip input, weight and accumulator
# memory.

######################################################################
# .. note::
#
#   **VTA's On-Chip SRAMs**
#
#   VTA has three different memory scopes, each corresponding to different
#   on-chip SRAM buffers.
#
#   - :code:`env.inp_scope`: Input buffer, which is a read-only SRAM buffer
#     that stores input matrices of shape :code:`(env.BATCH, env.BLOCK_IN)`
#     of type :code:`env.inp_dtype`. The input buffer contains
#     `2 ^ LOG_INP_BUFF_SIZE` matrix elements (as specified in the
#     :code:`vta_config.json` file).
#   - :code:`env.wgt_scope`: Weight buffer, which is a read-only SRAM buffer
#     that stores weight matrices of shape :code:`(env.BLOCK_OUT, env.BLOCK_IN)`
#     of type :code:`env.wgt_dtype`. The weight buffer contains
#     `2 ^ LOG_WGT_BUFF_SIZE` matrix elements.
#   - :code:`env.acc_scope`: Accumulator buffer, which is a read/write SRAM
#     buffer that stores accumulator matrices of shape
#     :code:`(env.BATCH, env.BLOCK_OUT)` of type :code:`env.acc_dtype`.
#     The accumulator buffer is VTA's general-purpose register file: it holds
#     both intermediate results of convolutions and matrix multiplications
#     as well as intermediate results of pooling, batch normalization, and
#     activation layers. The accumulator buffer contains
#     `2 ^ LOG_ACC_BUFF_SIZE` matrix elements.

# Set the intermediate tensors' scope to VTA's on-chip buffers
s[A_buf].set_scope(env.inp_scope)
s[B_buf].set_scope(env.wgt_scope)
s[C_buf].set_scope(env.acc_scope)
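
######################################################################
# As a quick illustration of the note above (our own addition, using only
# the :code:`env` fields already introduced), we can print the tile shape
# and data type that each on-chip scope expects:

for scope, shape, dtype in [
        (env.inp_scope, (env.BATCH, env.BLOCK_IN), env.inp_dtype),
        (env.wgt_scope, (env.BLOCK_OUT, env.BLOCK_IN), env.wgt_dtype),
        (env.acc_scope, (env.BATCH, env.BLOCK_OUT), env.acc_dtype)]:
    print("%s: tile shape %s, dtype %s" % (scope, shape, dtype))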

######################################################################
# DMA Transfers
# ~~~~~~~~~~~~~
# We need to schedule DMA transfers to move data living in DRAM to
# and from the VTA on-chip buffers.
# This can be achieved using the :code:`compute_at` schedule primitive
# which nests the copying of the buffers into the computation loop
# that performs the matrix multiplication.
#
# We insert :code:`dma_copy` pragmas to indicate to the compiler
# that the copy operations will be performed in bulk via DMA,
# which is common in hardware accelerators.
# Finally, we print the temporary schedule to observe the effects of
# moving the copy operations into the matrix multiplication loop.

# Move buffer copy into matrix multiply loop
s[A_buf].compute_at(s[C_buf], ko)
s[B_buf].compute_at(s[C_buf], ko)

# Tag the buffer copies with the DMA pragma to insert a DMA transfer
s[A_buf].pragma(s[A_buf].op.axis[0], env.dma_copy)
s[B_buf].pragma(s[B_buf].op.axis[0], env.dma_copy)
s[C].pragma(s[C].op.axis[0], env.dma_copy)

# Let's take a look at the transformed schedule
print(tvm.lower(s, [A, B, C], simple_mode=True))

######################################################################
# Tensorization
# ~~~~~~~~~~~~~
# The last step of the schedule transformation consists of applying
# *tensorization* to our schedule.
# Tensorization is analogous to vectorization, but extends the concept
# to a higher-dimensional unit of computation.
# Consequently, tensorization imposes data layout constraints as discussed
# earlier when declaring the data layout input placeholders.
# We've already arranged our tensors in a tiled format, so the next thing
# we need to perform is loop reordering to accommodate tensorization.
#
# Here we choose to move the outermost reduction axis all the way out.
# This dictates that we first iterate over input channels, then batch
# dimensions, and finally output channels.
# Lastly, we apply the tensorization scheduling primitive :code:`tensorize`
# along the outer axis of the inner-most matrix-matrix multiplication tensor
# block.
# We print the finalized schedule that is ready for code generation
# by the VTA runtime JIT compiler.

s[C_buf].reorder(
    ko,
    s[C_buf].op.axis[0],
    s[C_buf].op.axis[1],
    s[C_buf].op.axis[2],
    s[C_buf].op.axis[3],
    ki)
s[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)

# Let's take a look at the finalized schedule
print(vta.lower(s, [A, B, C], simple_mode=True))

######################################################################
# This concludes the scheduling portion of this tutorial.

######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function.

# Build GEMM VTA kernel
my_gemm = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_gemm")

# Write the compiled module into an object file.
temp = util.tempdir()
my_gemm.save(temp.relpath("gemm.o"))

# Send the executable over RPC
remote.upload(temp.relpath("gemm.o"))

# Load the compiled module
f = remote.load_module("gemm.o")

######################################################################
# Running the Function
# --------------------
# The compiled TVM function uses a concise C API and can be invoked from
# any language.
#
# TVM provides an array API in Python to aid quick testing and prototyping.
# The array API is based on the `DLPack <https://github.com/dmlc/dlpack>`_
# standard.
#
# - We first create a remote context (for remote execution on the Pynq).
# - Then :code:`tvm.nd.array` formats the data accordingly.
# - :code:`f()` runs the actual computation.
# - :code:`asnumpy()` copies the result array back in a format that can be
#   interpreted.

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the A and B arrays randomly in the int range [-128, 128)
A_orig = np.random.randint(
    -128, 128, size=(o * env.BATCH, n * env.BLOCK_IN)).astype(A.dtype)
B_orig = np.random.randint(
    -128, 128, size=(m * env.BLOCK_OUT, n * env.BLOCK_IN)).astype(B.dtype)

# Apply packing to the A and B arrays from a 2D to a 4D packed layout
A_packed = A_orig.reshape(
    o, env.BATCH, n, env.BLOCK_IN).transpose((0, 2, 1, 3))
B_packed = B_orig.reshape(
    m, env.BLOCK_OUT, n, env.BLOCK_IN).transpose((0, 2, 1, 3))

# Format the input/output arrays with tvm.nd.array to the DLPack standard
A_nd = tvm.nd.array(A_packed, ctx)
B_nd = tvm.nd.array(B_packed, ctx)
C_nd = tvm.nd.array(np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(C.dtype), ctx)

# Invoke the module to perform the computation
f(A_nd, B_nd, C_nd)
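
######################################################################
# As an optional aside (our own sketch, not part of the original tutorial
# flow): TVM modules provide a :code:`time_evaluator` helper that invokes a
# function repeatedly and reports the mean runtime, which can be handy for
# quick performance measurements once correctness is verified below.
# Uncomment to try it; note that in simulation the reported time does not
# reflect real hardware performance.

# evaluator = f.time_evaluator(f.entry_name, ctx, number=10)
# print("Average per-run time: %g s" % evaluator(A_nd, B_nd, C_nd).mean)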

######################################################################
# Verifying Correctness
# ---------------------
# Compute the reference result with numpy and assert that the output of the
# matrix multiplication is indeed correct.

# Compute reference result with numpy
C_ref = np.dot(A_orig.astype(env.acc_dtype),
               B_orig.T.astype(env.acc_dtype)).astype(C.dtype)
C_ref = C_ref.reshape(
    o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
np.testing.assert_equal(C_ref, C_nd.asnumpy())
print("Successful matrix multiply test!")

######################################################################
# Summary
# -------
# This tutorial showcases the TVM workflow to implement a simple matrix
# multiplication example on VTA.
# The general workflow includes:
#
# - Programming the FPGA with the VTA bitstream over RPC.
# - Describing matrix multiplication via a series of computations.
# - Describing how we want to perform the computation using schedule primitives.
# - Compiling the function to the VTA target.
# - Running the compiled module and verifying it against a numpy implementation.
#