Commit 36b34738 by Yuwei Hu, committed by Tianqi Chen

[TOPI] 1bit dense operator on x86_64 (#629)

* add x86_64 target

* add binary dense operator

* rebase

* improve schedule

* remove x86 target

* improve schedule
parent 3b9f1652
...@@ -176,3 +176,39 @@ def schedule_global_pool(outs):
        The computation schedule for the op.
    """
    return _default_schedule(outs, False)


@tvm.target.generic_func
def schedule_binarize_pack(outs):
    """Schedule for binarize_pack

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of binarize_pack
        in the format of an array of tensors.

    Returns
    -------
    sch: Schedule
        The computation schedule for the op.
    """
    return _default_schedule(outs, False)


@tvm.target.generic_func
def schedule_binary_dense(outs):
    """Schedule for binary_dense

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of binary_dense
        in the format of an array of tensors.

    Returns
    -------
    sch: Schedule
        The computation schedule for the op.
    """
    return _default_schedule(outs, False)
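
These generic functions are only fallbacks: tvm.target.generic_func dispatches on whichever target is active at call time, so the x86 registrations added later in this commit override _default_schedule under an llvm target. A minimal sketch of the dispatch (illustration only; the placeholder shape is made up):

import tvm
import topi

A = tvm.placeholder((64, 1024), name='A')
packed = topi.nn.binarize_pack(A)  # defined in the bnn.py hunk below
with tvm.target.create('llvm'):
    # resolves to the schedule registered below with
    # @generic.schedule_binarize_pack.register(["cpu"]), not the fallback
    s = topi.generic.schedule_binarize_pack(packed)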
...@@ -13,3 +13,4 @@ from .mapping import *
from .pooling import *
from .softmax import *
from .conv2d_transpose import *
from .bnn import *
"""Binary Neural Network (BNN) Operators"""
from __future__ import absolute_import as _abs
import tvm
from .. import tag
from ..util import simplify, get_const_int


def binarize_pack(data, axis=None, name="PackedInput"):
    """Binarization and bit-packing along a certain axis.

    Parameters
    ----------
    data : tvm.Tensor
        n-D input, can be any layout.

    axis : None or int
        The axis along which to do binarization and bit-packing;
        defaults to the last axis.

    name : str, optional
        The name prefix for the generated operators.

    Returns
    -------
    output : tvm.Tensor
        n-D, the same layout as the input, dtype is uint32.
    """
    ishape = data.shape
    if axis is None:
        axis = len(ishape) - 1
    assert get_const_int(ishape[axis]) % 32 == 0
    n = len(ishape)
    oshape = tuple(simplify(ishape[i] // 32) if i == axis \
        else ishape[i] for i in range(n))

    def _binarize_pack(*indices):
        start_idx = [indices[i] * 32 if i == axis else indices[i] for i in range(n)]
        packed = tvm.const(0, 'uint32')
        for j in range(32):
            idx = [start_idx[i] + j if i == axis else start_idx[i] for i in range(n)]
            sign = (data(*idx) >= 0).astype("uint32")
            # accumulate sign bits MSB-first: element j lands in bit 31 - j
            packed = (packed | sign)
            if j == 31:
                return packed
            packed = packed << 1

    return tvm.compute(oshape, _binarize_pack, name=name, tag='binarize_pack')
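
A minimal NumPy sketch of the same packing semantics (illustration only, not part of the commit; the helper name binarize_pack_ref is made up):

import numpy as np

def binarize_pack_ref(x, axis=-1):
    # pack sign bits (x >= 0) MSB-first into uint32 words along `axis`
    x = np.moveaxis(np.asarray(x), axis, -1)
    assert x.shape[-1] % 32 == 0
    bits = (x >= 0).astype(np.uint32)
    bits = bits.reshape(x.shape[:-1] + (x.shape[-1] // 32, 32))
    packed = np.zeros(bits.shape[:-1], dtype=np.uint32)
    for j in range(32):
        packed = (packed << 1) | bits[..., j]
    return np.moveaxis(packed, -1, axis)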


def binary_dense(data, weight):
    """Binary matrix multiplication using xor and bit-count.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim], dtype is uint32.

    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim], dtype is uint32.

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim], dtype is float32.
    """
    assert data.dtype == 'uint32' and weight.dtype == 'uint32', \
        "dtype of data and weight should be uint32"
    assert len(data.shape) == 2 and len(weight.shape) == 2, \
        "only support 2-dim binary dense"
    batch, in_dim = data.shape
    out_dim, _ = weight.shape
    k = tvm.reduce_axis((0, in_dim), name='k')
    matmul = tvm.compute((batch, out_dim), lambda i, j: \
        tvm.sum(tvm.popcount(data[i, k] ^ weight[j, k]), axis=k), \
        tag='binary_dense')

    return tvm.compute((batch, out_dim), lambda i, j: \
        32 * in_dim - 2. * matmul(i, j), \
        tag=tag.ELEMWISE)
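
The final compute decodes the popcount back into a real dot product: each output element is a dot product of two +/-1 vectors of length K = 32 * in_dim, and popcount(data ^ weight) counts the m mismatching bit positions. Matches contribute +1 and mismatches -1, so the dot product is (K - m) - m = K - 2m, which is exactly the 32 * in_dim - 2. * matmul(i, j) expression above. A quick NumPy check of the identity (illustration only):

import numpy as np

K = 32                                    # one packed word, for brevity
a = np.random.randint(2, size=K) * 2 - 1  # random +/-1 vector
b = np.random.randint(2, size=K) * 2 - 1
m = int(np.sum(a != b))                   # what popcount(a ^ b) counts
assert a.dot(b) == K - 2 * m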
...@@ -3,3 +3,5 @@
from __future__ import absolute_import as _abs
from .conv2d import schedule_conv2d
from .binarize_pack import schedule_binarize_pack
from .binary_dense import schedule_binary_dense
# pylint: disable=invalid-name
"""Schedule for binarization and bit-packing."""
from __future__ import absolute_import as _abs
import tvm
from .. import generic


@generic.schedule_binarize_pack.register(["cpu"])
def schedule_binarize_pack(outs):
    """Schedule for binarize_pack.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of binarize_pack
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for binarize_pack.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(Out):
        # parallelize the packing over the outermost axis
        s[Out].parallel(Out.op.axis[0])

    def traverse(OP):
        # schedule binarize_pack
        if OP.tag == 'binarize_pack':
            Out = OP.output(0)
            _schedule(Out)
        else:
            raise RuntimeError("Unsupported operator: %s" % OP.tag)

    traverse(outs[0].op)
    return s
# pylint: disable=invalid-name, unused-variable, unused-argument
"""Schedule for binary dense operator."""
from __future__ import absolute_import as _abs
import tvm
from .. import tag
from .. import generic


@generic.schedule_binary_dense.register(["cpu"])
def schedule_binary_dense(outs):
    """Schedule for binary_dense.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of binary_dense
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for binary_dense.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(A, B, C):
        # split the reduction so the popcounts can be vectorized,
        # and parallelize over the rows of the output
        s[C].split(s[C].op.reduce_axis[0], factor=8)
        s[C].parallel(s[C].op.axis[0])
        if C.op in s.outputs:
            Out = C
        else:
            Out = outs[0].op.output(0)
        # vectorize the elementwise epilogue in chunks of 8 floats
        xo, xi = s[Out].split(Out.op.axis[1], factor=8)
        s[Out].vectorize(xi)

    def traverse(OP):
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(OP.tag):
            if OP not in s.outputs:
                s[OP].compute_inline()
            for tensor in OP.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)
        # schedule binary_dense
        elif OP.tag == 'binary_dense':
            output = OP.output(0)
            data = OP.input_tensors[0]
            weight = OP.input_tensors[1]
            _schedule(data, weight, output)
        else:
            raise RuntimeError("Unsupported operator: %s" % OP.tag)

    traverse(outs[0].op)
    return s
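
The factor-8 splits line up with AVX2: eight float32 lanes fill one 256-bit register, and splitting the reduction gives LLVM a fixed-width inner loop of popcounts to vectorize. One way to see what the schedule produces, sketched here with made-up placeholder shapes, is to print the lowered IR:

import tvm
import topi

data = tvm.placeholder((1, 128), dtype='uint32', name='data')
weight = tvm.placeholder((1000, 128), dtype='uint32', name='weight')
out = topi.nn.binary_dense(data, weight)
with tvm.target.create('llvm'):
    s = topi.generic.schedule_binary_dense(out)
print(tvm.lower(s, [data, weight, out], simple_mode=True))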
"""Test code for binary neural network operators."""
import numpy as np
import tvm
import topi
from topi.util import get_const_tuple
from tvm.contrib.pickle_memoize import memoize
def verify_binary_dense(batch, in_dim, out_dim):
A = tvm.placeholder((batch, in_dim), name='A')
B = tvm.placeholder((out_dim, in_dim), name='B')
bnn_A = topi.nn.binarize_pack(A)
bnn_B = topi.nn.binarize_pack(B)
# binary dense
bnn_A1 = tvm.placeholder(bnn_A.shape, dtype=bnn_A.dtype)
bnn_B1 = tvm.placeholder(bnn_B.shape, dtype=bnn_B.dtype)
bnn_C = topi.nn.binary_dense(bnn_A1, bnn_B1)
# schedule
with tvm.target.create('llvm'):
s1 = topi.generic.schedule_binarize_pack(bnn_A)
s2 = topi.generic.schedule_binarize_pack(bnn_B)
s3 = topi.generic.schedule_binary_dense(bnn_C)
dtype = A.dtype
@memoize("topi.tests.test_topi_binary_dense")
def get_ref_data():
# generate random matrix of +1 or -1 value
a_np = (np.random.randint(2, size=(batch, in_dim)) * 2 - 1).astype(dtype)
b_np = (np.random.randint(2, size=(out_dim, in_dim)) * 2 - 1).astype(dtype)
c_np = np.dot(a_np, b_np.T)
return (a_np, b_np, c_np)
a_np, b_np, c_np = get_ref_data()
ctx = tvm.cpu(0)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(b_np, ctx)
bnn_a = tvm.nd.array(np.zeros(get_const_tuple(bnn_A.shape), dtype=bnn_A.dtype), ctx)
bnn_b = tvm.nd.array(np.zeros(get_const_tuple(bnn_B.shape), dtype=bnn_B.dtype), ctx)
bnn_c = tvm.nd.array(np.zeros(get_const_tuple(bnn_C.shape), dtype=bnn_C.dtype), ctx)
f1 = tvm.build(s1, [A, bnn_A], 'llvm -mcpu=core-avx2')
f2 = tvm.build(s2, [B, bnn_B], 'llvm -mcpu=core-avx2')
f3 = tvm.build(s3, [bnn_A1, bnn_B1, bnn_C], 'llvm -mcpu=core-avx2')
f1(a, bnn_a)
f2(b, bnn_b)
f3(bnn_a, bnn_b, bnn_c)
np.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5)
def test_binary_dense():
verify_binary_dense(1, 4096, 1024)
verify_binary_dense(1, 1024, 1000)
if __name__ == "__main__":
test_binary_dense()