Commit d64bf6b5 by Minmin Sun (孙敏敏), committed by Tianqi Chen

Auto TensorCore CodeGen (#4234)

* Add Auto TensorCore Unit Test

* Rebase to tvm master branch & Add auto tensor core

* Code Refine

* Add tensor core switch by pragma

* Add pragma in tensor core example code

* Get real tile size to replace hard coded 16

* support more than 2 dimensions (e.g. batchmatmul) for buffer bind scope

* support batch matmul

* Move cuda env check to tensor_core.cc

* Code refine for tensor_core.cc

* Refine comments

* Some refinements of code and comment

* Update TensorCore UT to pass the CPU test

* remove redundant code

* matmul's storage align for different layout

* Add support for different position of type cast

* Add formal tutorial for auto tensorcore codegen

* move tensorcore check up to tutorial code

* code and doc refine

* comment out tune_and_evaluate in tutorial

* fix cpplint error
parent 281f643c
......
@@ -1248,6 +1248,8 @@ constexpr const char* reduce_scope = "reduce_scope";
constexpr const char* pragma_scope_prefix = "pragma_";
/*! \brief Import llvm source or file into the final code gen module */
constexpr const char* pragma_import_llvm = "pragma_import_llvm";
/*! \brief Try to modify the AST to support Tensor Core */
constexpr const char* pragma_tensor_core = "pragma_tensor_core";
/*!
 * \brief Mark of prefetch scope, value=offset,
 *  run prefetch of Tensor on the current loop scope
......
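The pragma constant above is what a schedule attaches to a loop in order to opt in; a minimal sketch, mirroring the unit test at the bottom of this change (where CL is the cache_write stage and ko the outer reduction axis):

    s[CL].pragma(ko, 'tensor_core')  # mark this scope for the TensorCore rewrite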
......
@@ -206,6 +206,20 @@ Stmt StorageFlatten(Stmt stmt,
                    Map<Tensor, Buffer> extern_buffer,
                    int cache_line_size,
                    bool create_bound_attribute = false);

/*!
 * \brief Try to modify the AST to support TensorCore.
 *
 * \param stmt The stmt to be transformed.
 * \param schedule The original schedule.
 * \param extern_buffer Map specifying external
 *  buffer assignment of inputs and outputs.
 * \return Transformed stmt.
 */
Stmt RewriteForTensorCore(Stmt stmt,
                          Schedule schedule,
                          Map<Tensor, Buffer> extern_buffer);

/*!
 * \brief Verify if there is any argument bound to compact buffer.
 *
......
......
@@ -387,6 +387,7 @@ def lower(sch,
    binds, arg_list = get_binds(args, compact, binds)
    # Phase 1
    stmt = ir_pass.RewriteForTensorCore(stmt, sch, binds)
    stmt = ir_pass.StorageFlatten(stmt, binds, 64, cfg.instrument_bound_checkers)
    stmt = ir_pass.CanonicalSimplify(stmt)
    for f in lower_phase1:
......
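Because the rewrite now runs at the start of Phase 1 in lower(), before StorageFlatten, its effect can be checked without any new user-facing code. A small sketch, assuming a schedule s over tensors A, B, C such as the ones in the test below:

    print(tvm.lower(s, [A, B, C], simple_mode=True))  # IR after the TensorCore rewrite
    func = tvm.build(s, [A, B, C], 'cuda')
    print(func.imported_modules[0].get_source())      # generated CUDA; should contain wmma intrinsics if the rewrite applied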
......
@@ -94,6 +94,12 @@ TVM_REGISTER_API("ir_pass.StorageFlatten")
    }
  });

TVM_REGISTER_API("ir_pass.RewriteForTensorCore")
.set_body_typed<Stmt(const Stmt&, const Schedule&, const Map<Tensor, Buffer>&)>
([](const Stmt& stmt, const Schedule& schedule, const Map<Tensor, Buffer>& extern_buffer) {
    return RewriteForTensorCore(stmt, schedule, extern_buffer);
  });

TVM_REGISTER_API("ir_pass.AttrsEqual")
.set_body_typed<bool(const NodeRef&, const NodeRef&)>([](const NodeRef& lhs, const NodeRef& rhs) {
    return AttrsEqual()(lhs, rhs);
......
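This registration exposes the pass to Python as tvm.ir_pass.RewriteForTensorCore. A rough sketch of driving it by hand, outside of lower(); the InferBound/ScheduleOps/get_binds steps mirror what lower() itself does and are assumptions about this TVM version rather than part of the patch:

    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    binds, _ = tvm.build_module.get_binds([A, B, C], compact=False)
    stmt = tvm.ir_pass.RewriteForTensorCore(stmt, s, binds)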
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# 'License'); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import tvm
import topi
import numpy as np
from tvm.contrib import nvcc

def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96):
    # fp16 inputs, fp32 accumulation; the cast happens before the multiply.
    A = tvm.placeholder((n, l), name='A', dtype='float16')
    B = tvm.placeholder((l, m), name='B', dtype='float16')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m),
                    lambda i, j: tvm.sum(A[i, k].astype('float32') * B[k, j].astype('float32'), axis=k))

    s = tvm.create_schedule(C.op)
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    # Cache inputs in shared memory and registers, and the output in registers.
    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CL = s.cache_write(C, "local")

    # Tiling parameters.
    bx = 4
    by = 32
    step_k = 8
    v = 4
    TX = 8
    TY = 1
    tile_x = bx * TX
    tile_y = by * TY
    WX = min(warp_tile_m, tile_x)
    tile_k = 16
    vthread = 1

    yo, ty = s[C].split(y, tile_y * vthread)
    vy, ty = s[C].split(ty, tile_y)
    ty, yi = s[C].split(ty, TY)

    xo, xi = s[C].split(x, tile_x)
    tz, xi = s[C].split(xi, WX)
    tx, xi = s[C].split(xi, TX)

    ko, ki = s[CL].split(k, step_k * tile_k)
    kl, ki = s[CL].split(ki, tile_k)

    s[C].reorder(yo, xo, tz, ty, tx, yi, xi)
    s[C].bind(yo, tvm.thread_axis("blockIdx.y"))
    s[C].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[C].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[C].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[C].bind(vy, tvm.thread_axis((0, vthread), "vthread", name="vy"))

    s[CL].compute_at(s[C], tx)
    yo, xo = CL.op.axis
    s[CL].reorder(ko, kl, ki, yo, xo)

    # Cooperative fetching of A into shared memory.
    s[AA].compute_at(s[CL], ko)
    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v)
    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
    tx, vec = s[AA].split(tx, factor=v)
    fused = s[AA].fuse(s[AA].op.axis[0], xo)
    _, ty = s[AA].split(fused, factor=by)
    s[AA].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[AA].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[AA].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[AA].vectorize(vec)

    # Cooperative fetching of B into shared memory.
    s[BB].compute_at(s[CL], ko)
    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v)
    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
    tx, vec = s[BB].split(tx, factor=v)
    fused = s[BB].fuse(s[BB].op.axis[0], xo)
    _, ty = s[BB].split(fused, factor=by)
    s[BB].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[BB].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[BB].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[BB].vectorize(vec)

    s[AL].compute_at(s[CL], kl)
    s[BL].compute_at(s[CL], kl)

    # Ask the new pass to rewrite this scope with TensorCore intrinsics.
    s[CL].pragma(ko, 'tensor_core')

    func = tvm.build(s, [A, B, C], 'cuda')
    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
    b_np = np.random.uniform(size=(l, m)).astype(B.dtype)
    c_np = np.zeros((n, m), dtype=np.float32)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('gemm m=%d n=%d k=%d: %f ms' % (m, n, l, evaluator(a, b, c).mean * 1e3))

    # Verify against numpy.
    c_np = np.dot(a_np, b_np)
    np.testing.assert_allclose(c_np, c.asnumpy(), rtol=1e-3)

def tensor_core_batch_matmul(warp_tile_m=16, m=64, n=32, l=96, batch=2):
    # fp16 inputs, fp32 accumulation; the cast happens after the multiply.
    A = tvm.placeholder((batch, n, l), name='A', dtype='float16')
    B = tvm.placeholder((batch, l, m), name='B', dtype='float16')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((batch, n, m),
                    lambda b, i, j: tvm.sum((A[b, i, k] * B[b, k, j]).astype('float32'), axis=k))

    s = tvm.create_schedule(C.op)
    z, y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CL = s.cache_write(C, "local")

    # Tiling parameters.
    bx = 2
    by = 32
    step_k = 8
    v = 4
    TX = 8
    TY = 1
    tile_x = bx * TX
    tile_y = by * TY
    WX = min(warp_tile_m, tile_x)
    tile_k = 16
    vthread = 1

    yo, ty = s[C].split(y, tile_y * vthread)
    vy, ty = s[C].split(ty, tile_y)
    ty, yi = s[C].split(ty, TY)

    xo, xi = s[C].split(x, tile_x)
    tz, xi = s[C].split(xi, WX)
    tx, xi = s[C].split(xi, TX)

    ko, ki = s[CL].split(k, step_k * tile_k)
    kl, ki = s[CL].split(ki, tile_k)

    # The batch axis is bound to blockIdx.z.
    s[C].reorder(z, yo, xo, tz, ty, tx, yi, xi)
    s[C].bind(z, tvm.thread_axis("blockIdx.z"))
    s[C].bind(yo, tvm.thread_axis("blockIdx.y"))
    s[C].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[C].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[C].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[C].bind(vy, tvm.thread_axis((0, vthread), "vthread", name="vy"))

    s[CL].compute_at(s[C], tx)
    zo, yo, xo = CL.op.axis
    s[CL].reorder(ko, kl, ki, zo, yo, xo)

    s[AA].compute_at(s[CL], ko)
    xo, xi = s[AA].split(s[AA].op.axis[2], factor=bx * v)
    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
    tx, vec = s[AA].split(tx, factor=v)
    fused = s[AA].fuse(s[AA].op.axis[1], xo)
    _, ty = s[AA].split(fused, factor=by)
    s[AA].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[AA].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[AA].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[AA].vectorize(vec)

    s[BB].compute_at(s[CL], ko)
    xo, xi = s[BB].split(s[BB].op.axis[2], factor=bx * v)
    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
    tx, vec = s[BB].split(tx, factor=v)
    fused = s[BB].fuse(s[BB].op.axis[1], xo)
    _, ty = s[BB].split(fused, factor=by)
    s[BB].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[BB].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[BB].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[BB].vectorize(vec)

    s[AL].compute_at(s[CL], kl)
    s[BL].compute_at(s[CL], kl)

    s[CL].pragma(ko, 'tensor_core')

    func = tvm.build(s, [A, B, C], 'cuda')
    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=(batch, n, l)).astype(A.dtype)
    b_np = np.random.uniform(size=(batch, l, m)).astype(B.dtype)
    c_np = np.zeros((batch, n, m), dtype=np.float32)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((batch, n, m), dtype=C.dtype), ctx)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('batch gemm m=%d n=%d k=%d batch=%d: %f ms' %
          (m, n, l, batch, evaluator(a, b, c).mean * 1e3))

    # Verify against numpy, batch by batch.
    for bs in range(batch):
        c_np[bs, :, :] = np.dot(a_np[bs, :, :], b_np[bs, :, :])
    np.testing.assert_allclose(c_np, c.asnumpy(), rtol=1e-3)

def test_tensor_core_matmul():
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    tensor_core_matmul(16)  # test with warp_tile 16x16x16
    tensor_core_matmul(8)   # test with warp_tile 8x32x16
    tensor_core_matmul(32)  # test with warp_tile 32x8x16


def test_tensor_core_batch_matmul():
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    tensor_core_batch_matmul()


if __name__ == '__main__':
    test_tensor_core_matmul()
    test_tensor_core_batch_matmul()