Fix vmlal.s16 code generation for int8 x int8 -> int32 (#2748)

a7e35fc3 · Andrew Tulloch · Tianqi Chen · 2239508b · a7e35fc3 · a7e35fc3
Commit a7e35fc3, authored Mar 08, 2019 by Andrew Tulloch; committed by Tianqi Chen on Mar 08, 2019.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 61 additions and 1 deletion

src/pass/lower_intrin.cc
+17 -1

tests/python/unittest/test_codegen_arm.py
+44 -0

No files found.
--- a/src/pass/lower_intrin.cc
+++ b/src/pass/lower_intrin.cc
@@ -50,7 +50,23 @@ class IntrinInjecter : public IRMutator {
    // on ARM.
    if (const Broadcast* bcast = e.as<Broadcast>()) {
      if (const Cast* cast = bcast->value.as<Cast>()) {
-        if (cast->type.bits() == cast->value.type().bits() * 2) {
+        auto should_swap = [&]() {
+          // Maintain behaviour (int8 -> int16, fp16 -> fp32).
+          if (cast->type.bits() == cast->value.type().bits() * 2) {
+            return true;
+          }
+          // Check both operands are integer-like.
+          if (!cast->type.is_uint() && !cast->type.is_int()) {
+            return false;
+          }
+          if (!cast->value.type().is_uint() && !cast->value.type().is_int()) {
+            return false;
+          }
+          // If both are integer-like, swap if we have a widening cast.
+          return cast->type.bits() > cast->value.type().bits();
+        };
+        if (should_swap()) {
          Expr new_bcast = Broadcast::make(cast->value, bcast->lanes);
          return Cast::make(bcast->type, new_bcast);
        }

--- a/tests/python/unittest/test_codegen_arm.py
+++ b/tests/python/unittest/test_codegen_arm.py
@@ -26,5 +26,49 @@ def test_popcount():
    check_correct_assembly('uint32', 2, 2)
    check_correct_assembly('uint64', 2, 3)
+def test_vmlal_s16():
+    target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon'
+    def check_correct_assembly(N):
+        K = tvm.var("K")
+        A = tvm.placeholder((K, N), dtype="int8", name='A')
+        B = tvm.placeholder((K, N), dtype="int8", name='A')
+        k = tvm.reduce_axis((0, K))
+        C = tvm.compute((N, ), lambda n: tvm.sum(
+            A[k, n].astype("int32") * B[k, n].astype("int32"), axis=[k]), name='C')
+        s = tvm.create_schedule(C.op)
+        s[C].vectorize(s[C].op.axis[0])
+        f = tvm.build(s, [A, B, C], target)
+        # Verify we see the correct number of vmlal.s16 instructions
+        assembly = f.get_source('asm')
+        matches = re.findall("vmlal.s16", assembly)
+        assert (len(matches) == N // 4)
+    check_correct_assembly(4)
+    check_correct_assembly(8)
+    check_correct_assembly(16)
+    def check_broadcast_correct_assembly(N):
+        K = tvm.var("K")
+        A = tvm.placeholder((K, N), dtype="int8", name='A')
+        B = tvm.placeholder((K,), dtype="int8", name='A')
+        k = tvm.reduce_axis((0, K))
+        C = tvm.compute((N, ), lambda n: tvm.sum(
+            A[k, n].astype("int32") * B[k].astype("int32"),
+            axis=[k]), name='C')
+        s = tvm.create_schedule(C.op)
+        s[C].vectorize(s[C].op.axis[0])
+        f = tvm.build(s, [A, B, C], target)
+        # Verify we see the correct number of vmlal.s16 instructions
+        assembly = f.get_source('asm')
+        matches = re.findall("vmlal.s16", assembly)
+        assert len(matches) == N // 4
+    check_broadcast_correct_assembly(8)
+    check_broadcast_correct_assembly(16)
+    check_broadcast_correct_assembly(32)
+    check_broadcast_correct_assembly(64)
 if __name__ == "__main__":
    test_popcount()
+    test_vmlal_s16()