# opt_conv_tensorcore.py
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
.. _opt-conv-tensorcore:

How to optimize convolution using TensorCores
=============================================
**Author**: `Siyuan Feng <https://github.com/Hzfengsy>`_

In this tutorial, we will demonstrate how to write a high performance convolution
schedule using TensorCores in TVM. In this example, we assume the input to
convolution has a large batch. We strongly recommend covering the :ref:`opt-conv-gpu` tutorial first.

"""

################################################################
# TensorCore Introduction
# -----------------------
# Each Tensor Core provides a 4x4x4 matrix processing array that operates
# :code:`D = A * B + C`, where A, B, C and D are 4x4 matrices as Figure shows.
# The matrix multiplication inputs A and B are FP16 matrices, while the accumulation
# matrices C and D may be FP16 or FP32 matrices.
#
# However, CUDA programmers can only use warp-level primitive
# :code:`wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag)` to perform
# 16x16x16 half-precision matrix multiplication on tensor cores. Before invoking
# the matrix multiplication, programmers must load data from memory into registers
# with primitive :code:`wmma::load_matrix_sync`, explicitly. The NVCC compiler translates
# that primitive into multiple memory load instructions. At run time, every thread loads
# 16 elements from matrix A and 16 elements from B.

################################################################
# Preparation and Algorithm
# -------------------------
# We use the fixed size for input tensors with 256 channels and 14 x 14 dimensions.
# The batch size is 256. Convolution filters contain 512 filters of size 3 x 3.
# We use stride size 1 and padding size 1 for the convolution. In the example, we use
# NHWCnc memory layout.The following code defines the convolution algorithm in TVM.

import tvm
from tvm import te
import numpy as np
from tvm.contrib import nvcc

# The sizes of inputs and filters
batch_size = 256
height = 14
width = 14
in_channels = 256
out_channels = 512
kernel_h = 3
kernel_w = 3
pad_h = 1
pad_w = 1
stride_h = 1
stride_w = 1

# TensorCore shape: each wmma fragment covers a 16x16 tile.
block_size = 16

# The blocked layouts below require every blocked dimension to be an
# exact multiple of the fragment size.
assert batch_size % block_size == 0
assert in_channels % block_size == 0
assert out_channels % block_size == 0

# Input feature map: (N, H, W, IC, n, ic)
data_shape = (batch_size // block_size,
              height,
              width,
              in_channels // block_size,
              block_size,
              block_size)
# Kernel: (H, W, IC, OC, ic, oc)
kernel_shape = (kernel_h,
                kernel_w,
                in_channels // block_size,
                out_channels // block_size,
                block_size,
                block_size)
# Output feature map: (N, H, W, OC, n, oc)
output_shape = (batch_size // block_size,
                height,
                width,
                out_channels // block_size,
                block_size,
                block_size)

# Reduction axes: kernel height/width, the blocked input-channel axis,
# and the inner (intra-block) input-channel axis.
kh = te.reduce_axis((0, kernel_h), name='kh')
kw = te.reduce_axis((0, kernel_w), name='kw')
ic = te.reduce_axis((0, in_channels // block_size), name='ic')
ii = te.reduce_axis((0, block_size), name='ii')

# Algorithm
A = te.placeholder(data_shape, name='A', dtype="float16")
W = te.placeholder(kernel_shape, name='W', dtype="float16")

# Zero-pad the input feature map on the spatial (h, w) dimensions only.
Apad = te.compute(
    (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w,
     in_channels // block_size, block_size, block_size),
    lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
        tvm.tir.all(h >= pad_h, h - pad_h < height,
                    w >= pad_w, w - pad_w < width),
        A[n, h - pad_h, w - pad_w, i, nn, ii],
        tvm.tir.const(0., "float16")),
    name='Apad')

# Convolution: float16 inputs, accumulated in float32 (matching the
# TensorCore mixed-precision contract described above).
Conv = te.compute(
    output_shape,
    lambda n, h, w, o, nn, oo: te.sum(
        Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32") *
        W[kh, kw, ic, o, ii, oo].astype("float32"),
        axis=[ic, kh, kw, ii]),
    name="Conv")

s = te.create_schedule(Conv.op)
# Fold the padding computation into its consumer instead of materializing it.
s[Apad].compute_inline()

###############################################################################
# Memory Scope
# ------------
# In traditional GPU schedule, we have global, shared and local memory scope.
# To support TensorCores, we add another three special memory scope: :code:`wmma.matrix_a`,
# :code:`wmma.matrix_b` and :code:`wmma.accumulator`. On hardware, all fragments scope
# stores at the on-chip registers level, the same place with local memory.

# Designate the memory hierarchy:
# stage the padded input and the kernel in shared memory, then load
# 16x16 tiles into TensorCore fragments (wmma.matrix_a / wmma.matrix_b)
# and accumulate into a wmma.accumulator stage written back to Conv.
AS = s.cache_read(Apad, 'shared', [Conv])
WS = s.cache_read(W, 'shared', [Conv])
AF = s.cache_read(AS, 'wmma.matrix_a', [Conv])
WF = s.cache_read(WS, 'wmma.matrix_b', [Conv])
ConvF = s.cache_write(Conv, 'wmma.accumulator')

###############################################################################
# Define Tensor Intrinsic
# -----------------------
# In fact, TensorCore is a special hardware operation. So, we can just use tensorize
# to replace a unit of computation with the TensorCore instruction. The first thing is
# that we need to define tensor intrinsic.
#
# There are four basic operation in TensorCore: :code:`fill_fragment`, :code:`load_matrix`,
# :code:`mma_sync` and :code:`store_matrix`. Since :code:`fill_fragment` and :code:`mma_sync`
# are both used in matrix multiplication, so we can just write following three intrinsics.

def intrin_wmma_load_matrix(scope):
    """Declare a 16x16 wmma load intrinsic.

    Copies one 16x16 float16 tile from shared memory into a TensorCore
    fragment whose memory scope is *scope* ('wmma.matrix_a' or
    'wmma.matrix_b'), via tvm_load_matrix_sync.
    """
    n = 16
    A = te.placeholder((n, n), name='A', dtype='float16')
    BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='shared', data_alignment=32, offset_factor=256)
    C = te.compute((n, n), lambda i, j: A[i, j], name='C')
    BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256)

    def intrin_func(ins, outs):
        # ins/outs carry the buffers matched against BA/BC; use distinct
        # names so the enclosing declarations are not shadowed.
        ib = tvm.tir.ir_builder.create()
        buf_src = ins[0]
        buf_dst = outs[0]
        # elem_offset // 256 turns the element offset into a fragment
        # index (each 16x16 fragment holds 256 elements).
        ib.emit(tvm.tir.call_intrin('handle', 'tvm_load_matrix_sync',
                                    buf_dst.data, n, n, n, buf_dst.elem_offset // 256,
                                    buf_src.access_ptr('r'), n, 'row_major'))
        return ib.get()

    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})


def intrin_wmma_gemm():
    """Declare the 16x16x16 wmma GEMM intrinsic.

    Computes C += A * B on TensorCores with float16 inputs and float32
    accumulation, using tvm_fill_fragment for reduction init and
    tvm_mma_sync for each update step.
    """
    n = 16
    A = te.placeholder((n, n), name='A', dtype='float16')
    B = te.placeholder((n, n), name='B', dtype='float16')
    k = te.reduce_axis((0, n), name="k")
    C = te.compute((n, n),
                   lambda ii, jj:
                   te.sum(A[ii, k].astype('float') * B[k, jj].astype('float'), axis=k),
                   name='C')
    BA = tvm.tir.decl_buffer(A.shape, A.dtype, name='BA', scope='wmma.matrix_a', data_alignment=32, offset_factor=256)
    BB = tvm.tir.decl_buffer(B.shape, B.dtype, name='BB', scope='wmma.matrix_b', data_alignment=32, offset_factor=256)
    BC = tvm.tir.decl_buffer(C.shape, C.dtype, name='BC', scope='wmma.accumulator', data_alignment=32, offset_factor=256)

    def intrin_func(ins, outs):
        BA, BB = ins
        BC, = outs

        def init():
            # Zero the accumulator fragment before the first update.
            ib = tvm.tir.ir_builder.create()
            ib.emit(tvm.tir.call_intrin('handle', 'tvm_fill_fragment', BC.data, n, n, n, BC.elem_offset // 256, 0.0))
            return ib.get()

        def update():
            # One mma_sync step: BC = BA * BB + BC.
            ib = tvm.tir.ir_builder.create()
            ib.emit(tvm.tir.call_intrin('handle', 'tvm_mma_sync',
                                        BC.data, BC.elem_offset // 256,
                                        BA.data, BA.elem_offset // 256,
                                        BB.data, BB.elem_offset // 256,
                                        BC.data, BC.elem_offset // 256))
            return ib.get()

        # decl_tensor_intrin expects the (body, reduce_init, reduce_update)
        # triple; the plain body is the same as one update step.
        return update(), init(), update()

    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})


def intrin_wmma_store_matrix():
    """Declare a 16x16 wmma store intrinsic.

    Copies one float32 accumulator fragment back out to global memory
    via tvm_store_matrix_sync.
    """
    n = 16
    A = te.placeholder((n, n), name='A', dtype='float32')
    BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='wmma.accumulator', data_alignment=32, offset_factor=256)
    C = te.compute((n, n), lambda i, j: A[i, j], name='C')
    BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope='global', data_alignment=32, offset_factor=256)

    def intrin_func(ins, outs):
        # Distinct local names avoid shadowing the enclosing BA/BC buffers.
        ib = tvm.tir.ir_builder.create()
        buf_src = ins[0]
        buf_dst = outs[0]
        # elem_offset // 256 is the fragment index (256 elements per fragment).
        ib.emit(tvm.tir.call_intrin('handle', 'tvm_store_matrix_sync',
                                    buf_src.data, n, n, n, buf_src.elem_offset // 256,
                                    buf_dst.access_ptr('w'), n, 'row_major'))
        return ib.get()

    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})

###############################################################################
# Scheduling the Computation
# --------------------------
# To use TensorCores in TVM, we must schedule the computation into specific structure
# to match the tensor intrinsic. The same as traditional GPU programs, we can also use
# shared memory to boost the speed. If you have any questions about blocking and shared
# memory, please refer :ref:`opt-conv-gpu`.
#
# In this example, each block contains 2x4 warps, and each warp calls 4x2 TensorCore
# instructions. Thus, the output shape of each warp is 64x32 and each block outputs
# 128x128 titles. Due to the limit of shared memory space, we only load 2 blocks (2x128x128 tiles)
# one time.
#
# .. note::
#
#   *Warp-level Operation*
#
#   Note that all TensorCore instructions are warp-level instructions, which means all 32 threads
#   in a warp should do this instruction simultaneously. Making theadIdx.x extent=32 is one of the
#   easiest way to solve this. Then We can bind threadIdx.x to any loops except those contain
#   TensorCore intrinsics directly or indirectly. Also note that it is not the unique solution.
#   The only thing we should do is to make sure all threads in a warp can call TensorCore at the same time.

# Define tiling sizes: each block holds block_row_warps x block_col_warps
# warps; each warp computes warp_row_tiles x warp_col_tiles TensorCore
# tiles; `chunk` splits the input-channel reduction for shared-memory staging.
block_row_warps = 4
block_col_warps = 2
warp_row_tiles = 2
warp_col_tiles = 4
warp_size = 32
chunk = 2

# CUDA grid/block axes used by the bindings below.
block_x = te.thread_axis('blockIdx.x')
block_y = te.thread_axis('blockIdx.y')
block_z = te.thread_axis('blockIdx.z')
thread_x = te.thread_axis('threadIdx.x')
thread_y = te.thread_axis('threadIdx.y')
thread_z = te.thread_axis('threadIdx.z')

# Tile the output: fuse h/w into one grid axis (blockIdx.z), split the
# blocked batch and output-channel axes over thread blocks
# (blockIdx.x/y) and warps (threadIdx.y/z); nci/oci are per-warp tiles.
nc, hc, wc, oc, nnc, ooc = Conv.op.axis
block_k = s[Conv].fuse(hc, wc)
s[Conv].bind(block_k, block_z)
nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
block_i, nc = s[Conv].split(nc, factor=block_row_warps)
oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
block_j, oc = s[Conv].split(oc, factor=block_col_warps)
s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
s[Conv].bind(block_i, block_x)
s[Conv].bind(block_j, block_y)
s[Conv].bind(nc, thread_y)
s[Conv].bind(oc, thread_z)

# Schedule local computation: keep the accumulator fragment inside each
# warp's tile, and split the blocked input-channel reduction by `chunk`.
s[ConvF].compute_at(s[Conv], oc)
n, h, w, o, nnf, oof = ConvF.op.axis
ko, ki = s[ConvF].split(ic, factor=chunk)
s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

# Move intermediate computation (the fragment loads) into each output
# compute tile, at the innermost reduction loop kw.
s[AF].compute_at(s[ConvF], kw)
s[WF].compute_at(s[ConvF], kw)

# Schedule for A's shared memory: cooperative copy, with the copy loops
# bound across threadIdx.y/z/x so every thread of the block participates.
s[AS].compute_at(s[ConvF], kh)
n, h, w, i, nn, ii = AS.op.axis
tx, xo = s[AS].split(n, nparts=block_row_warps)
ty, yo = s[AS].split(xo, nparts=block_col_warps)
t = s[AS].fuse(nn, ii)
to, ti = s[AS].split(t, factor=warp_size)
s[AS].bind(tx, thread_y)
s[AS].bind(ty, thread_z)
s[AS].bind(ti, thread_x)

# Schedule for W's shared memory: same cooperative-copy pattern, with
# the innermost lanes vectorized.
s[WS].compute_at(s[ConvF], kh)
kh, kw, ic, o, ii, oo = WS.op.axis
tx, xo = s[WS].split(o, nparts=block_row_warps)
ty, yo = s[WS].split(xo, nparts=block_col_warps)
t = s[WS].fuse(ii, oo)
to, ti = s[WS].split(t, nparts=warp_size)
s[WS].bind(tx, thread_y)
s[WS].bind(ty, thread_z)
s[WS].bind(to, thread_x)
s[WS].vectorize(ti)
print(tvm.lower(s, [A, W, Conv], simple_mode=True))

###############################################################################
# Lowering Computation to Intrinsics
# ----------------------------------
# The last phase is to lower the computation loops down to TensorCore hardware intrinsics
# by mapping the 2D convolution to tensor intrinsics

# Tensorize the innermost 16x16 tiles so TVM emits wmma intrinsics:
# fragment loads for A and W, the accumulator store on Conv's nnc axis,
# and the fill/mma_sync pair on ConvF's nnf axis.
s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_a'))
s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_b'))
s[Conv].tensorize(nnc, intrin_wmma_store_matrix())
s[ConvF].tensorize(nnf, intrin_wmma_gemm())
print(tvm.lower(s, [A, W, Conv], simple_mode=True))

###############################################################################
# Generate CUDA Kernel
# --------------------
# Finally we use TVM to generate and compile the CUDA kernel, and evaluate the latency of convolution.
# Since TensorCores are only supported in NVIDIA GPU with Compute Capability 7.0 or higher, it may not
# be able to run on our build server

# Build and benchmark only when the device actually has TensorCores
# (Compute Capability 7.0+); otherwise skip silently.
ctx = tvm.gpu(0)
if nvcc.have_tensorcore(ctx.compute_version):
    with tvm.target.build_config(auto_unroll_max_step=16):
        func = tvm.build(s, [A, W, Conv], 'cuda')
    # Random float16 inputs; the output buffer is float32 (Conv.dtype).
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
    # Average latency over 10 kernel launches.
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))

###############################################################################
# Summary
# -------
# This tutorial demonstrates how TVM scheduling primitives can be used to
# call TensorCores on specific GPUs.