Commit 36b34738 by Yuwei Hu, committed by Tianqi Chen

[TOPI] 1bit dense operator on x86_64 (#629)

* add x86_64 target

* add binary dense operator

* rebase

* improve schedule

* remove x86 target

* improve schedule
parent 3b9f1652
...@@ -176,3 +176,39 @@ def schedule_global_pool(outs):
        The computation schedule for the op.
    """
    return _default_schedule(outs, False)


@tvm.target.generic_func
def schedule_binarize_pack(outs):
    """Schedule for binarize_pack

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of binarize_pack
        in the format of an array of tensors.

    Returns
    -------
    sch: Schedule
        The computation schedule for the op.
    """
    return _default_schedule(outs, False)


@tvm.target.generic_func
def schedule_binary_dense(outs):
    """Schedule for binary_dense

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of binary_dense
        in the format of an array of tensors.

    Returns
    -------
    sch: Schedule
        The computation schedule for the op.
    """
    return _default_schedule(outs, False)
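
These generic functions are only fallbacks: tvm.target.generic_func dispatches on whichever target is active at call time, so the x86 registrations added later in this commit override _default_schedule under an llvm target. A minimal sketch of the dispatch (illustration only; the placeholder shape is made up):

import tvm
import topi

A = tvm.placeholder((64, 1024), name='A')
packed = topi.nn.binarize_pack(A)  # defined in the bnn.py hunk below
with tvm.target.create('llvm'):
    # resolves to the schedule registered below with
    # @generic.schedule_binarize_pack.register(["cpu"]), not the fallback
    s = topi.generic.schedule_binarize_pack(packed)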
...@@ -13,3 +13,4 @@ from .mapping import *
from .pooling import *
from .softmax import *
from .conv2d_transpose import *
from .bnn import *
"""Binary Neural Network (BNN) Operators"""
from __future__ import absolute_import as _abs
import tvm
from .. import tag
from ..util import simplify, get_const_int


def binarize_pack(data, axis=None, name="PackedInput"):
    """Binarization and bit-packing along a certain axis.

    Parameters
    ----------
    data : tvm.Tensor
        n-D input, can be any layout.

    axis : None or int
        The axis along which to do binarization and bit-packing;
        defaults to the last axis.

    name : str, optional
        The name prefix for the generated operators.

    Returns
    -------
    output : tvm.Tensor
        n-D, the same layout as the input, dtype is uint32.
    """
    ishape = data.shape
    if axis is None:
        axis = len(ishape) - 1
    assert get_const_int(ishape[axis]) % 32 == 0
    n = len(ishape)
    oshape = tuple(simplify(ishape[i] // 32) if i == axis \
        else ishape[i] for i in range(n))

    def _binarize_pack(*indices):
        start_idx = [indices[i] * 32 if i == axis else indices[i] for i in range(n)]
        packed = tvm.const(0, 'uint32')
        for j in range(32):
            idx = [start_idx[i] + j if i == axis else start_idx[i] for i in range(n)]
            sign = (data(*idx) >= 0).astype("uint32")
            # accumulate sign bits MSB-first: element j lands in bit 31 - j
            packed = (packed | sign)
            if j == 31:
                return packed
            packed = packed << 1

    return tvm.compute(oshape, _binarize_pack, name=name, tag='binarize_pack')
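
A minimal NumPy sketch of the same packing semantics (illustration only, not part of the commit; the helper name binarize_pack_ref is made up):

import numpy as np

def binarize_pack_ref(x, axis=-1):
    # pack sign bits (x >= 0) MSB-first into uint32 words along `axis`
    x = np.moveaxis(np.asarray(x), axis, -1)
    assert x.shape[-1] % 32 == 0
    bits = (x >= 0).astype(np.uint32)
    bits = bits.reshape(x.shape[:-1] + (x.shape[-1] // 32, 32))
    packed = np.zeros(bits.shape[:-1], dtype=np.uint32)
    for j in range(32):
        packed = (packed << 1) | bits[..., j]
    return np.moveaxis(packed, -1, axis)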


def binary_dense(data, weight):
    """Binary matrix multiplication using xor and bit-count.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim], dtype is uint32.

    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim], dtype is uint32.

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim], dtype is float32.
    """
    assert data.dtype == 'uint32' and weight.dtype == 'uint32', \
        "dtype of data and weight should be uint32"
    assert len(data.shape) == 2 and len(weight.shape) == 2, \
        "only support 2-dim binary dense"
    batch, in_dim = data.shape
    out_dim, _ = weight.shape
    k = tvm.reduce_axis((0, in_dim), name='k')
    matmul = tvm.compute((batch, out_dim), lambda i, j: \
        tvm.sum(tvm.popcount(data[i, k] ^ weight[j, k]), axis=k), \
        tag='binary_dense')

    return tvm.compute((batch, out_dim), lambda i, j: \
        32 * in_dim - 2. * matmul(i, j), \
        tag=tag.ELEMWISE)
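
The final compute decodes the popcount back into a real dot product: each output element is a dot product of two +/-1 vectors of length K = 32 * in_dim, and popcount(data ^ weight) counts the m mismatching bit positions. Matches contribute +1 and mismatches -1, so the dot product is (K - m) - m = K - 2m, which is exactly the 32 * in_dim - 2. * matmul(i, j) expression above. A quick NumPy check of the identity (illustration only):

import numpy as np

K = 32                                    # one packed word, for brevity
a = np.random.randint(2, size=K) * 2 - 1  # random +/-1 vector
b = np.random.randint(2, size=K) * 2 - 1
m = int(np.sum(a != b))                   # what popcount(a ^ b) counts
assert a.dot(b) == K - 2 * m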
...@@ -3,3 +3,5 @@
from __future__ import absolute_import as _abs
from .conv2d import schedule_conv2d
from .binarize_pack import schedule_binarize_pack
from .binary_dense import schedule_binary_dense
# pylint: disable=invalid-name
"""Schedule for binarization and bit-packing."""
from __future__ import absolute_import as _abs
import tvm
from .. import generic


@generic.schedule_binarize_pack.register(["cpu"])
def schedule_binarize_pack(outs):
    """Schedule for binarize_pack.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of binarize_pack
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for binarize_pack.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(Out):
        # parallelize the packing over the outermost axis
        s[Out].parallel(Out.op.axis[0])

    def traverse(OP):
        # schedule binarize_pack
        if OP.tag == 'binarize_pack':
            Out = OP.output(0)
            _schedule(Out)
        else:
            raise RuntimeError("Unsupported operator: %s" % OP.tag)

    traverse(outs[0].op)
    return s
# pylint: disable=invalid-name, unused-variable, unused-argument
"""Schedule for binary dense operator."""
from __future__ import absolute_import as _abs
import tvm
from .. import tag
from .. import generic


@generic.schedule_binary_dense.register(["cpu"])
def schedule_binary_dense(outs):
    """Schedule for binary_dense.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of binary_dense
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for binary_dense.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(A, B, C):
        # split the reduction so the popcounts can be vectorized,
        # and parallelize over the rows of the output
        s[C].split(s[C].op.reduce_axis[0], factor=8)
        s[C].parallel(s[C].op.axis[0])
        if C.op in s.outputs:
            Out = C
        else:
            Out = outs[0].op.output(0)
        # vectorize the elementwise epilogue in chunks of 8 floats
        xo, xi = s[Out].split(Out.op.axis[1], factor=8)
        s[Out].vectorize(xi)

    def traverse(OP):
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(OP.tag):
            if OP not in s.outputs:
                s[OP].compute_inline()
            for tensor in OP.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)
        # schedule binary_dense
        elif OP.tag == 'binary_dense':
            output = OP.output(0)
            data = OP.input_tensors[0]
            weight = OP.input_tensors[1]
            _schedule(data, weight, output)
        else:
            raise RuntimeError("Unsupported operator: %s" % OP.tag)

    traverse(outs[0].op)
    return s
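
The factor-8 splits line up with AVX2: eight float32 lanes fill one 256-bit register, and splitting the reduction gives LLVM a fixed-width inner loop of popcounts to vectorize. One way to see what the schedule produces, sketched here with made-up placeholder shapes, is to print the lowered IR:

import tvm
import topi

data = tvm.placeholder((1, 128), dtype='uint32', name='data')
weight = tvm.placeholder((1000, 128), dtype='uint32', name='weight')
out = topi.nn.binary_dense(data, weight)
with tvm.target.create('llvm'):
    s = topi.generic.schedule_binary_dense(out)
print(tvm.lower(s, [data, weight, out], simple_mode=True))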
"""Test code for binary neural network operators."""
import numpy as np
import tvm
import topi
from topi.util import get_const_tuple
from tvm.contrib.pickle_memoize import memoize
def verify_binary_dense(batch, in_dim, out_dim):
A = tvm.placeholder((batch, in_dim), name='A')
B = tvm.placeholder((out_dim, in_dim), name='B')
bnn_A = topi.nn.binarize_pack(A)
bnn_B = topi.nn.binarize_pack(B)
# binary dense
bnn_A1 = tvm.placeholder(bnn_A.shape, dtype=bnn_A.dtype)
bnn_B1 = tvm.placeholder(bnn_B.shape, dtype=bnn_B.dtype)
bnn_C = topi.nn.binary_dense(bnn_A1, bnn_B1)
# schedule
with tvm.target.create('llvm'):
s1 = topi.generic.schedule_binarize_pack(bnn_A)
s2 = topi.generic.schedule_binarize_pack(bnn_B)
s3 = topi.generic.schedule_binary_dense(bnn_C)
dtype = A.dtype
@memoize("topi.tests.test_topi_binary_dense")
def get_ref_data():
# generate random matrix of +1 or -1 value
a_np = (np.random.randint(2, size=(batch, in_dim)) * 2 - 1).astype(dtype)
b_np = (np.random.randint(2, size=(out_dim, in_dim)) * 2 - 1).astype(dtype)
c_np = np.dot(a_np, b_np.T)
return (a_np, b_np, c_np)
a_np, b_np, c_np = get_ref_data()
ctx = tvm.cpu(0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(b_np, ctx)
bnn_a = tvm.nd.array(np.zeros(get_const_tuple(bnn_A.shape), dtype=bnn_A.dtype), ctx)
bnn_b = tvm.nd.array(np.zeros(get_const_tuple(bnn_B.shape), dtype=bnn_B.dtype), ctx)
bnn_c = tvm.nd.array(np.zeros(get_const_tuple(bnn_C.shape), dtype=bnn_C.dtype), ctx)
f1 = tvm.build(s1, [A, bnn_A], 'llvm -mcpu=core-avx2')
f2 = tvm.build(s2, [B, bnn_B], 'llvm -mcpu=core-avx2')
f3 = tvm.build(s3, [bnn_A1, bnn_B1, bnn_C], 'llvm -mcpu=core-avx2')
f1(a, bnn_a)
f2(b, bnn_b)
f3(bnn_a, bnn_b, bnn_c)
np.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5)
def test_binary_dense():
verify_binary_dense(1, 4096, 1024)
verify_binary_dense(1, 1024, 1000)
if __name__ == "__main__":
test_binary_dense()