chisel_accel.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import tvm
import numpy as np
import tsim
import sys

""" Vector Bit Slice and Pack Function
Parameters
----------
A : Vector to be sliced and packed
slice_width : slice width

Returns
---------
C: 2d matrix where each cloumn (because of bit packing) represents each bit slice of A
"""
def slice(A, slice_width):
    assert np.log2(slice_width) % 1 == 0, "only power of 2 is supported"
    dtype = type(A[0]) 
    row = 0
    # currently only supports uint
    if dtype is np.uint8: row = 8 // slice_width
    elif dtype is np.uint16: row = 16 // slice_width
    elif dtype is np.uint32: row = 32 // slice_width
    elif dtype is np.uint64: row = 64 // slice_width
    else: raise ValueError("datatype currently not supported")
    if (row >= 8):
        dtype = 'uint' + str(row)
    else:
        dtype = 'uint8'

    C = np.zeros((row, len(A))).astype(dtype) # sliced and transform 

    # create mask
    slice_mask = 2**(slice_width)-1
    # slice and pack
    for x in range(len(A)):
        for y in range(row):
            C[y][x] = (np.uint64(A[x]) >> np.uint64(slice_width * y)) & np.uint64(slice_mask)
    return C

def slice_mat(A, slice_width):
    assert np.log2(slice_width) % 1 == 0, "only power of 2 is supported"
    dtype = type(A[0][0]) 
    row = 0
    # currently only supports uint
    if dtype is np.uint8: row = 8 // slice_width
    elif dtype is np.uint16: row = 16 // slice_width
    elif dtype is np.uint32: row = 32 // slice_width
    elif dtype is np.uint64: row = 64 // slice_width
    else: raise ValueError("datatype currently not supported")
    if (row >= 8):
        dtype = 'uint' + str(row)
    else:
        dtype = 'uint8'

    # 3d array (bits, row, clmn)
    C = np.zeros((row, A.shape[0], A.shape[1])).astype(dtype) # sliced and transform 

    # create mask
    slice_mask = 2**(slice_width)-1
    # slice and pack
    for z in range(A.shape[0]):
        C[:, z, :] = slice(A[z], slice_width)
    return C

""" Matrix Multiplication Function
Parameters
----------
A : Matrix A
B: Matrix B
i_width : weight slice width
w_width : activation slice width

Returns
---------
C: result of A * B
"""
# A is a n*m matrix, B is a m*p matrix(not transposed yet)
def matrix_multiply(A, B, i_width, w_width):
    assert A.shape[1] == B.shape[0], "can't perform multiplication"
    BT = B.transpose()
    cycles = 0
    B_sliced = slice_mat(BT, w_width)
    C = np.zeros((A.shape[0], B.shape[1])).astype('uint64')
    for i in range(A.shape[0]):
        A_sliced = slice(A[i], i_width)
        test = test_accel(A_sliced, B_sliced, i_width, w_width)
        C[i] = test[0]
        cycles += test[1]
        np.testing.assert_array_equal(C[i], compute(A_sliced, B_sliced, i_width, w_width))
        print("PASS row " + str(i))

    np.testing.assert_array_equal(C, np.matmul(A.astype('uint64'),B))
    print("result: ")
    print(C)
    print("TEST PASSED, cycles: " + str(cycles))
    return C

""" Software Verification Function
Parameter Dimesions
---------
A (bits, y) and B (bits, y, x) (transposed)

Takes 1 vector and 1 matrix input (sliced and packed)

Returns
---------
Resulting vector
"""
def compute(A, B, i_width, w_width):
    assert A.shape[1] == B.shape[1], "sliced shape not match"
    # reset hardware accumulator
    accum = np.zeros(A.shape[1])
    for x in range(A.shape[0]):
        for y in range(B.shape[0]):
            accum += np.matmul(A[x].astype('uint64'), B[y].transpose()) << np.uint64(x*i_width + y*w_width)
    # get value from accumulator
    return accum

"""Testing Function for Matrix Vector Multiplication"""
def test_accel(A, B, i_width, w_width):
    assert A.shape[1] == B.shape[2], "sliced shape not match"
    dtype = A.dtype
    ctx = tvm.cpu(0)
    f = tsim.load_module()

    a_arr = []
    b_arr = []
    for i in range(A.shape[0]):
        list_a = np.zeros(A.shape[1]).astype(dtype)
        for j in range(A.shape[1]):
            list_a[j] = A[i][j]
        a_arr.append(tvm.nd.array(list_a.astype(dtype), ctx))

    for i in range(B.shape[0]):
        # transpose
        list_b = np.zeros((B.shape[2], B.shape[1])).astype(dtype)
        for j in range(B.shape[2]):
            for k in range(B.shape[1]):
                list_b[j][k] = B[i][j][k]
        b_arr.append(tvm.nd.array(list_b.astype(dtype), ctx))

    cycles = 0
    accum = tvm.nd.array(np.zeros(A.shape[1]).astype("uint32"), ctx)
    for i in range(len(a_arr)):
        for j in range(len(b_arr)):
            shift = np.uint8(i*i_width + j*w_width)
            if i == 0 and j == 0: 
                cycles += f(b_arr[j], a_arr[i], shift, accum, np.uint32(1)) # reset accumulator
            else: 
                cycles += f(b_arr[j], a_arr[i], shift, accum, np.uint32(0)) # no reset

    return (accum.asnumpy(), cycles)

""" Matrix Generator
Parameters
----------     
dtype : String, datatype generated (supports only uint)
i_width : weight bit slices(needs to be less than actual bit width)
w_width : activation bit slices(needs to be less than actual bit width)
"""
def top_test(dtype, i_width, w_width):

    # only supports positive values (up to 2**(bits-1))
    rmax = 127 
    # (m,16) * (16,16) GEMM
    rrow = np.random.randint(7) + 1 
    clmn = 16
    A = np.random.randint(rmax, size=(rrow,clmn)).astype(dtype)
    B = np.random.randint(rmax, size=(clmn,clmn)).astype(dtype)

    print("A: " + str(A))
    print("B: " + str(B))
    # perform GEMM
    matrix_multiply(A, B, i_width, w_width)

if __name__ == "__main__":
    tsim.init("chisel")
    for i in range(1):
        # reg1 and reg2 bits in hardware/chisel/src/main/Compute.scala must be modified for slices greater than 8 bits
        if sys.argv[1] == 'serial':
          # generates a random uint8 GEMM with 2-bit(8/4) input and 4-bit(8/2) weight 
          top_test("uint8", 4, 2)
        elif sys.argv[1] == 'parallel':
          # generates a random uint8 GEMM with 8-bit input and 8-bit weight (bit parallel) 
          top_test('uint8', 8, 8)