Commit fc83c7f2 by Andrew Tulloch, committed by Tianqi Chen

[TVM] [NNPACK] Modernize and improve NNPACK bindings (#2084)

parent 9f441d81
@@ -9,6 +9,10 @@ if(USE_NNPACK)
   include_directories(${PTHREAD_POOL_PATH}/include)
   find_library(NNPACK_CONTRIB_LIB nnpack ${NNPACK_PATH}/lib)
   find_library(NNPACK_PTHREAD_CONTRIB_LIB pthreadpool ${NNPACK_PATH}/lib)
+  find_library(NNPACK_CPUINFO_CONTRIB_LIB cpuinfo ${NNPACK_PATH}/lib)
+  find_library(NNPACK_CLOG_CONTRIB_LIB clog ${NNPACK_PATH}/lib)
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_CONTRIB_LIB})
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_PTHREAD_CONTRIB_LIB})
+  list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_CPUINFO_CONTRIB_LIB})
+  list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_CLOG_CONTRIB_LIB})
 endif(USE_NNPACK)
@@ -63,14 +63,32 @@ def fully_connected_output(lhs, rhs, nthreads=1):
         "tvm.contrib.nnpack.fully_connected_output",
         ins[0], ins[1], outs[0], nthreads), name="C")
 
-def convolution_inference(data, kernel, bias, padding, stride, nthreads=1):
-    """Create an extern op to do inference convolution of 3D tensor data and
+class ConvolutionAlgorithm:
+    AUTO = 0
+    FFT_8x8 = 1
+    FFT_16x16 = 2
+    WT_8x8 = 3
+    IMPLICIT_GEMM = 4
+    DIRECT = 5
+    WT_8x8_FP16 = 6
+
+
+class ConvolutionTransformStrategy:
+    COMPUTE = 1
+    PRECOMPUTE = 2
+
+
+def convolution_inference(
+        data, kernel, bias, padding, stride, nthreads=1,
+        algorithm=ConvolutionAlgorithm.AUTO):
+    """Create an extern op to do inference convolution of 4D tensor data and
     4D tensor kernel and 1D tensor bias with nnpack.
 
     Parameters
     ----------
     data : Tensor
-        data 3D tensor input[input_channels][input_height][input_width] of
+        data 4D tensor input[batch][input_channels][input_height][input_width] of
         FP32 elements.
 
     kernel : Tensor
         kernel 4D tensor kernel[output_channels][input_channels][kernel_height]
@@ -88,23 +106,108 @@ def convolution_inference(data, kernel, bias, padding, stride, nthreads=1):
     Returns
     -------
     output : Tensor
-        output 3D tensor output[output_channels][output_height][output_width]
+        output 4D tensor output[batch][output_channels][output_height][output_width]
         of FP32 elements.
     """
     assert isinstance(padding, list) and len(padding) == 4
     assert isinstance(stride, list) and len(stride) == 2
-    _, input_height, input_width = data.shape
+    batch, _, input_height, input_width = data.shape
     output_channels, _, kernel_height, kernel_width = kernel.shape
     output_height = (input_height + padding[0] + padding[1] - kernel_height) / stride[0] + 1
     output_width = (input_width + padding[0] + padding[1] - kernel_width) / stride[1] + 1
 
     return _api.extern(
-        (output_channels, output_height, output_width), [data, kernel, bias],
+        (batch, output_channels, output_height, output_width),
+        [data, kernel, bias] if bias is not None else [data, kernel],
         lambda ins, outs: _intrin.call_packed(
-            "tvm.contrib.nnpack.convolution_inference", ins[0], ins[1], ins[2],
+            "tvm.contrib.nnpack.convolution_inference",
+            ins[0],
+            ins[1],
+            ins[2] if bias is not None else 0,
             outs[0], padding[0], padding[1], padding[2], padding[3],
-            stride[0], stride[1], nthreads), name="C")
+            stride[0], stride[1], nthreads, algorithm), name="C")
+
+
+def convolution_inference_without_weight_transform(
+        data, transformed_kernel, bias, padding, stride, nthreads=1,
+        algorithm=ConvolutionAlgorithm.AUTO):
+    """Create an extern op to do inference convolution of 4D tensor data and
+    4D pre-transformed tensor kernel and 1D tensor bias with nnpack.
+
+    Parameters
+    ----------
+    data : Tensor
+        data 4D tensor input[batch][input_channels][input_height][input_width] of
+        FP32 elements.
+
+    transformed_kernel : Tensor
+        transformed_kernel 4D tensor kernel[output_channels][input_channels][tile]
+        [tile] of FP32 elements.
+
+    bias : Tensor
+        bias 1D array bias[output_channels] of FP32 elements.
+
+    padding : list
+        padding A 4-dim list of [pad_top, pad_bottom, pad_left, pad_right],
+        which indicates the padding around the feature map.
+
+    stride : list
+        stride A 2-dim list of [stride_height, stride_width], which indicates
+        the stride.
+
+    Returns
+    -------
+    output : Tensor
+        output 4D tensor output[batch][output_channels][output_height][output_width]
+        of FP32 elements.
+    """
+    assert algorithm in (ConvolutionAlgorithm.WT_8x8,
+                         ConvolutionAlgorithm.WT_8x8_FP16)
+    assert isinstance(padding, list) and len(padding) == 4
+    assert isinstance(stride, list) and len(stride) == 2
+    batch, _, input_height, input_width = data.shape
+    output_channels, _, _, _ = transformed_kernel.shape
+    kernel_height, kernel_width = (3, 3)
+    output_height = (input_height + padding[0] + padding[1] - kernel_height) / stride[0] + 1
+    output_width = (input_width + padding[0] + padding[1] - kernel_width) / stride[1] + 1
+    return _api.extern(
+        (batch, output_channels, output_height, output_width),
+        [data, transformed_kernel, bias] if bias is not None else [data, transformed_kernel],
+        lambda ins, outs: _intrin.call_packed(
+            "tvm.contrib.nnpack.convolution_inference_without_weight_transform",
+            ins[0],
+            ins[1],
+            ins[2] if bias is not None else 0,
+            outs[0], padding[0], padding[1], padding[2], padding[3],
+            stride[0], stride[1], nthreads, algorithm), name="C")
+
+
+def convolution_inference_weight_transform(
+        kernel, nthreads=1,
+        algorithm=ConvolutionAlgorithm.AUTO):
+    """Create an extern op to pre-transform a 4D tensor kernel for inference
+    convolution with nnpack.
+
+    Parameters
+    ----------
+    kernel : Tensor
+        kernel 4D tensor kernel[output_channels][input_channels][kernel_height]
+        [kernel_width] of FP32 elements.
+
+    Returns
+    -------
+    output : Tensor
+        output 4D tensor output[output_channels][input_channels][tile][tile]
+        of FP32 elements.
+    """
+    assert algorithm in (ConvolutionAlgorithm.WT_8x8, ConvolutionAlgorithm.WT_8x8_FP16)
+    output_channels, input_channels, _, _ = kernel.shape
+    transform_tile_size = 8
+    return _api.extern(
+        (output_channels, input_channels, transform_tile_size, transform_tile_size),
+        [kernel],
+        lambda ins, outs: _intrin.call_packed(
+            "tvm.contrib.nnpack.convolution_inference_weight_transform",
+            ins[0], outs[0], nthreads, algorithm), name="transform_kernel")
 
 def convolution_output(data, kernel, bias, padding, nthreads=1):
     """Create an extern op to compute convolution of 4D tensor data and
@@ -144,4 +247,5 @@ def convolution_output(data, kernel, bias, padding, nthreads=1):
             "tvm.contrib.nnpack.convolution_output", ins[0], ins[1], ins[2],
             outs[0], padding[0], padding[1], padding[2], padding[3], nthreads), name="C")
 
 _init_api("tvm.contrib.nnpack")
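For orientation, here is a usage sketch of the updated Python API. It is an illustration, not part of the commit: the placeholder shapes are arbitrary, and a TVM build with USE_NNPACK enabled is assumed.

import tvm
from tvm.contrib import nnpack

# NCHW placeholders: input[batch][channels][height][width],
# kernel[out_channels][in_channels][kernel_h][kernel_w].
data = tvm.placeholder((1, 16, 48, 48), name='data')
kernel = tvm.placeholder((16, 16, 3, 3), name='kernel')
bias = tvm.placeholder((16,), name='bias')

# Direct path: pick an algorithm explicitly; pass bias=None to skip the bias add.
conv = nnpack.convolution_inference(
    data, kernel, bias, [1, 1, 1, 1], [1, 1],
    algorithm=nnpack.ConvolutionAlgorithm.FFT_16x16)

# Winograd path: transform the kernel once, then reuse it across inferences.
# Only WT_8x8 and WT_8x8_FP16 pass the asserts in the functions above.
tkernel = nnpack.convolution_inference_weight_transform(
    kernel, algorithm=nnpack.ConvolutionAlgorithm.WT_8x8)
wconv = nnpack.convolution_inference_without_weight_transform(
    data, tkernel, None, [1, 1, 1, 1], [1, 1],
    algorithm=nnpack.ConvolutionAlgorithm.WT_8x8)

s = tvm.create_schedule(wconv.op)
f = tvm.build(s, [data, kernel, wconv], target="llvm")

Scheduling wconv.op pulls the kernel-transform stage into the same build, which mirrors what the tests below do.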
@@ -10,20 +10,30 @@ using namespace runtime;
 typedef dmlc::ThreadLocalStore<NNPackThreadLocalEntry> NNPackThreadLocalStore;
 
 NNPackThreadLocalEntry* NNPackThreadLocalEntry::ThreadLocal() {
   return NNPackThreadLocalStore::Get();
 }
 
 bool NNPackConfig(uint64_t nthreads) {
   NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
-  if (entry->threadpool != NULL &&
-      pthreadpool_get_threads_count(entry->threadpool) != nthreads) {
+  if (entry->threadpool && pthreadpool_get_threads_count(entry->threadpool) == nthreads) {
+    CHECK_NE(nthreads, 1);
+    return true;
+  }
+  if (entry->threadpool) {
     pthreadpool_destroy(entry->threadpool);
-    entry->threadpool = NULL;
+    entry->threadpool = nullptr;
   }
-  if (entry->threadpool == NULL) {
-    entry->threadpool = pthreadpool_create(nthreads);
+
+  if (nthreads == 1) {
+    // a null threadpool means the function is invoked on the calling thread,
+    // which is the desired logic for nthreads == 1
+    CHECK(!entry->threadpool);
+    return true;
   }
+
+  entry->threadpool = pthreadpool_create(nthreads);
   return true;
 }
...
@@ -15,7 +15,7 @@ namespace contrib {
 using namespace runtime;
 
 struct NNPackThreadLocalEntry {
-  pthreadpool_t threadpool{NULL};
+  pthreadpool_t threadpool{nullptr};
   static NNPackThreadLocalEntry* ThreadLocal();
 };
...
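The rewritten NNPackConfig gives nthreads a simple contract: a cached pthreadpool is reused while the requested size matches, nthreads == 1 leaves the threadpool null so NNPACK runs on the calling thread, and any other size destroys a stale pool and creates a fresh one. A sketch of how this surfaces in the Python API, assuming (as the runtime code suggests) that each NNPACK operator calls NNPackConfig with its nthreads argument when it executes; shapes are arbitrary:

import tvm
from tvm.contrib import nnpack

data = tvm.placeholder((1, 16, 48, 48), name='data')
kernel = tvm.placeholder((16, 16, 3, 3), name='kernel')

# When this op runs, NNPackConfig(1) keeps the thread-local threadpool null,
# so the NNPACK kernel executes serially on the calling thread.
serial = nnpack.convolution_inference(
    data, kernel, None, [1, 1, 1, 1], [1, 1], nthreads=1)

# NNPackConfig(4) creates a 4-thread pthreadpool and caches it in thread-local
# storage; later ops asking for 4 threads reuse it, and a different count
# tears the pool down and rebuilds it at the new size.
parallel = nnpack.convolution_inference(
    data, kernel, None, [1, 1, 1, 1], [1, 1], nthreads=4)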
...@@ -290,10 +290,10 @@ variable-rgx=[a-z_][a-z0-9_]{2,30}$ ...@@ -290,10 +290,10 @@ variable-rgx=[a-z_][a-z0-9_]{2,30}$
variable-name-hint=[a-z_][a-z0-9_]{2,30}$ variable-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct function names # Regular expression matching correct function names
function-rgx=[a-z_][a-z0-9_]{2,30}$ function-rgx=[a-z_][a-z0-9_]{2,48}$
# Naming hint for function names # Naming hint for function names
function-name-hint=[a-z_][a-z0-9_]{2,30}$ function-name-hint=[a-z_][a-z0-9_]{2,48}$
# Regular expression matching correct class names # Regular expression matching correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$ class-rgx=[A-Z_][a-zA-Z0-9]+$
......
...@@ -100,7 +100,7 @@ def np_conv(na, nw, padding, stride=1): ...@@ -100,7 +100,7 @@ def np_conv(na, nw, padding, stride=1):
return nb return nb
def test_convolution_inference(): def test_convolution_inference():
BATCH = 32 BATCH = 8
IH = 48 IH = 48
IW = 48 IW = 48
IC = 16 IC = 16
@@ -111,40 +111,111 @@ def test_convolution_inference():
     OH = (IH + 2*PAD - K) + 1
     OW = (IW + 2*PAD - K) + 1
-    dshape = (IC, IH, IW)
+    dshape = (BATCH, IC, IH, IW)
     kshape = (OC, IC, K, K)
     bshape = (OC, )
-    oshape = (OC, OH, OW)
+    oshape = (BATCH, OC, OH, OW)
 
     data = tvm.placeholder(dshape, name='data')
     kernel = tvm.placeholder(kshape, name='kernel')
     bias = tvm.placeholder(bshape, name='bias')
-    output = nnpack.convolution_inference(data, kernel, bias,
-                                          [PAD, PAD, PAD, PAD], [STRIDE, STRIDE])
-    s = tvm.create_schedule(output.op)
-
-    def verify(target="llvm"):
+    def verify(target="llvm",
+               algorithm=nnpack.ConvolutionAlgorithm.AUTO,
+               with_bias=True):
         if not tvm.module.enabled(target):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
             print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
+        output = nnpack.convolution_inference(
+            data, kernel, bias if with_bias else None,
+            [PAD, PAD, PAD, PAD], [STRIDE, STRIDE],
+            algorithm=algorithm)
+        s = tvm.create_schedule(output.op)
         f = tvm.build(s, [data, kernel, bias, output], target)
         na = np.random.uniform(size=dshape).astype(data.dtype)
         nb = np.random.uniform(size=kshape).astype(kernel.dtype)
         nc = np.zeros(bshape, dtype=bias.dtype)
         ta = tvm.nd.array(na, ctx)
         tb = tvm.nd.array(nb, ctx)
         tc = tvm.nd.array(nc, ctx)
         td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx)
         f(ta, tb, tc, td)
-        nd = np_conv(np.reshape(na, (1, IC, IH, IW)), nb, PAD, STRIDE)
+        nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape(1, bshape[0], 1, 1)
         tvm.testing.assert_allclose(
-            td.asnumpy(), nd.reshape(IC, IH, IW), rtol=1e-5)
-    verify()
+            td.asnumpy(), nd.reshape(BATCH, IC, IH, IW), rtol=1e-5)
+
+    for algorithm in [
+            nnpack.ConvolutionAlgorithm.AUTO,
+            nnpack.ConvolutionAlgorithm.FFT_8x8,
+            nnpack.ConvolutionAlgorithm.FFT_16x16,
+            nnpack.ConvolutionAlgorithm.WT_8x8,
+            nnpack.ConvolutionAlgorithm.IMPLICIT_GEMM,
+            nnpack.ConvolutionAlgorithm.WT_8x8_FP16,
+    ]:
+        for with_bias in [True, False]:
+            verify(algorithm=algorithm, with_bias=with_bias)
+
+
+def test_convolution_inference_without_weight_transform():
+    BATCH = 6
+    IH = 48
+    IW = 48
+    IC = 16
+    OC = 16
+    K = 3
+    PAD = 1
+    STRIDE = 1
+
+    OH = (IH + 2*PAD - K) + 1
+    OW = (IW + 2*PAD - K) + 1
+    dshape = (BATCH, IC, IH, IW)
+    kshape = (OC, IC, K, K)
+    bshape = (OC, )
+    oshape = (BATCH, OC, OH, OW)
+
+    data = tvm.placeholder(dshape, name='data')
+    kernel = tvm.placeholder(kshape, name='kernel')
+    bias = tvm.placeholder(bshape, name='bias')
+    def verify(target="llvm",
+               algorithm=nnpack.ConvolutionAlgorithm.AUTO,
+               with_bias=True):
+        if not tvm.module.enabled(target):
+            print("skip because %s is not enabled..." % target)
+            return
+        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
+            print("skip because extern function is not available")
+            return
+        ctx = tvm.cpu(0)
+        transformed_kernel = nnpack.convolution_inference_weight_transform(
+            kernel, algorithm=algorithm)
+        output = nnpack.convolution_inference_without_weight_transform(
+            data, transformed_kernel, bias if with_bias else None,
+            [PAD, PAD, PAD, PAD], [STRIDE, STRIDE],
+            algorithm=algorithm)
+        s = tvm.create_schedule(output.op)
+        f = tvm.build(s, [data, kernel, bias, output], target)
+        na = np.random.uniform(size=dshape).astype(data.dtype)
+        nb = np.random.uniform(size=kshape).astype(kernel.dtype)
+        nc = np.random.uniform(size=bshape).astype(bias.dtype) if with_bias else np.zeros(bshape, dtype=bias.dtype)
+        ta = tvm.nd.array(na, ctx)
+        tb = tvm.nd.array(nb, ctx)
+        tc = tvm.nd.array(nc, ctx)
+        td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx)
+        f(ta, tb, tc, td)
+        nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape(1, bshape[0], 1, 1)
+        tvm.testing.assert_allclose(
+            td.asnumpy(), nd.reshape(BATCH, IC, IH, IW), rtol=1e-5)
+
+    for algorithm in [nnpack.ConvolutionAlgorithm.WT_8x8]:
+        for with_bias in [True, False]:
+            verify(algorithm=algorithm, with_bias=with_bias)
 
 def test_convolution_output():
     BATCH = 32
...