Commit 9d5cba20 authored by Zhi, committed by Haichen Shen

[tvm][any] broadcast with values other than one (#3967)

* [tvm][any] broadcast with values other than 1

* Add test for incompatible runtime values

* Remove hybrid script compact buffer binding

* retrigger ci
parent 15ae9780
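In one sentence: a Relay dimension declared `Any()` can now bind at runtime to an extent other than 1 when broadcast against a static shape. A minimal sketch of the enabled behavior, assuming the 0.6-era Relay API (`relay.Module`, `create_executor`):

```python
import numpy as np
import tvm
from tvm import relay

# Any() binds to 2 at runtime here; before this change, topi compute
# effectively treated Any as extent 1.
x = relay.var("x", shape=(relay.Any(),), dtype="float32")
y = relay.var("y", shape=(3, 2), dtype="float32")
mod = relay.Module()
mod["main"] = relay.Function([x, y], relay.add(x, y))

x_np = np.random.uniform(size=(2,)).astype("float32")
y_np = np.random.uniform(size=(3, 2)).astype("float32")
ex = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(), target="llvm")
result = ex.evaluate()(x_np, y_np)
tvm.testing.assert_allclose(result.asnumpy(), x_np + y_np)
```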
@@ -206,6 +206,14 @@ Stmt StorageFlatten(Stmt stmt,
Map<Tensor, Buffer> extern_buffer,
int cache_line_size,
bool create_bound_attribute = false);
/*!
 * \brief Verify whether any argument is bound to a compact buffer.
 *
 * \param stmt The stmt to be verified.
 * \return true if any buffer_bind_scope attribute is found,
 * otherwise false.
 */
bool VerifyCompactBuffer(Stmt stmt);
/*!
* \brief Remove No Op from the Stmt.
......
@@ -264,7 +264,7 @@ def build_config(**kwargs):
return config
def get_binds(args, binds=None):
def get_binds(args, compact=False, binds=None):
"""Internal function to get binds and arg_list given arguments.
Parameters
@@ -272,6 +272,9 @@ def get_binds(args, binds=None):
args : list of Buffer or Tensor or Var
The argument lists to the function.
compact : bool
    If the statement has already been bound to a compact buffer.
binds : dict of :any:`Tensor` to :any:`Buffer`, optional
Dictionary that maps the Tensor to Buffer which specified the data layout
requirement of the function. By default, a new compact buffer is created
@@ -290,12 +293,15 @@ def get_binds(args, binds=None):
arg_list = []
for x in args:
if isinstance(x, tensor.Tensor):
any_dim = any(isinstance(i, expr.Var) for i in x.shape)
buffer_type = "auto_broadcast" if any_dim and not compact else ""
if x not in binds:
buf = api.decl_buffer(x.shape,
dtype=x.dtype,
name=x.name,
data_alignment=cfg.data_alignment,
offset_factor=cfg.offset_factor)
offset_factor=cfg.offset_factor,
buffer_type=buffer_type)
binds[x] = buf
arg_list.append(buf)
else:
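The new buffer_type flag can be exercised directly through decl_buffer. A minimal sketch of what get_binds now declares for an unbound tensor with a symbolic dimension (hypothetical tensor `A`; decl_buffer's existing `buffer_type` option is assumed):

```python
import tvm

# Hypothetical example: a tensor with a symbolic (Any) dimension that is
# not present in `binds` now gets an auto_broadcast buffer rather than a
# compact one.
n = tvm.var("n")
A = tvm.placeholder((n, 2), name="A")
buf = tvm.decl_buffer(A.shape, dtype=A.dtype, name=A.name,
                      buffer_type="auto_broadcast")
```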
@@ -361,7 +367,6 @@ def lower(sch,
The result function, if with_api_wrapper=False
Then the Stmt before make api is returned.
"""
binds, arg_list = get_binds(args, binds)
cfg = current_build_config()
add_lower_pass = cfg.add_lower_pass if cfg.add_lower_pass else []
if cfg.dump_pass_ir:
@@ -377,11 +382,16 @@ def lower(sch,
for f in lower_phase0:
stmt = f(stmt)
compact = ir_pass.VerifyCompactBuffer(stmt)
binds, arg_list = get_binds(args, compact, binds)
# Phase 1
stmt = ir_pass.StorageFlatten(stmt, binds, 64, cfg.instrument_bound_checkers)
stmt = ir_pass.CanonicalSimplify(stmt)
for f in lower_phase1:
stmt = f(stmt)
# Phase 2
if not simple_mode:
stmt = ir_pass.LoopPartition(stmt, cfg.partition_const_loop)
@@ -400,6 +410,7 @@ def lower(sch,
cfg.unroll_explicit)
for f in lower_phase2:
stmt = f(stmt)
# Phase 3
stmt = ir_pass.Simplify(stmt)
stmt = ir_pass.LowerStorageAccessInfo(stmt)
@@ -413,6 +424,7 @@ def lower(sch,
stmt = ir_pass.InstrumentBoundCheckers(stmt)
if simple_mode:
return stmt
return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func)
......
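Note the ordering change in lower: get_binds now runs after the Phase 0 passes, so VerifyCompactBuffer can inspect any buffer_bind_scope attributes those passes introduce before buffers are declared. A minimal end-to-end sketch of the entry point (hypothetical tensors):

```python
import tvm

n = tvm.var("n")
A = tvm.placeholder((n, 2), name="A")
B = tvm.placeholder((n, 2), name="B")
C = tvm.compute((n, 2), lambda i, j: A[i, j] + B[i, j], name="C")
s = tvm.create_schedule(C.op)
# simple_mode=True returns the Stmt before MakeAPI, as documented above.
print(tvm.lower(s, [A, B, C], simple_mode=True))
```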
@@ -159,5 +159,6 @@ REGISTER_PASS(VerifyMemory);
REGISTER_PASS(VerifyGPUCode);
REGISTER_PASS(DecorateDeviceScope);
REGISTER_PASS(InstrumentBoundCheckers);
REGISTER_PASS(VerifyCompactBuffer);
} // namespace ir
} // namespace tvm
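Registering the pass exposes it to Python as tvm.ir_pass.VerifyCompactBuffer. A small sketch, for illustration only (in the real pipeline the check runs on the Phase 0 statement, before StorageFlatten):

```python
import tvm

n = tvm.var("n")
A = tvm.placeholder((n,), name="A")
B = tvm.compute((n,), lambda i: A[i] + 1.0, name="B")
s = tvm.create_schedule(B.op)
stmt = tvm.lower(s, [A, B], simple_mode=True)
# This simple schedule has no buffer_bind_scope attribute anywhere.
assert not tvm.ir_pass.VerifyCompactBuffer(stmt)
```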
@@ -331,8 +331,19 @@ Buffer BufferWithOffsetAlignment(Array<Expr> shape,
Type dtype,
std::string name,
int data_alignment,
int offset_factor) {
int offset_factor,
bool compact) {
auto data = Var(name, Handle());
bool has_any = false;
if (!compact) {
for (const auto& it : shape) {
if (it.as<Variable>()) {
has_any = true;
break;
}
}
}
BufferType buffer_type = has_any ? kAutoBroadcast : kDefault;
Expr elem_offset;
if (offset_factor != 0) {
@@ -342,10 +353,11 @@ Buffer BufferWithOffsetAlignment(Array<Expr> shape,
}
return BufferNode::make(data, dtype, shape, Array<Expr>(), elem_offset, name, "",
data_alignment, offset_factor, kDefault);
data_alignment, offset_factor, buffer_type);
}
void GetBinds(const Array<Tensor>& args,
bool compact,
const std::unordered_map<Tensor, Buffer>& binds,
Map<Tensor, Buffer>* out_binds,
Array<NodeRef>* out_arg_list,
@@ -355,7 +367,7 @@ void GetBinds(const Array<Tensor>& args,
for (const auto &x : args) {
if (out_binds->find(x) == out_binds->end()) {
auto buf = BufferWithOffsetAlignment(x->shape, x->dtype, x->op->name,
config->data_alignment, config->offset_factor);
config->data_alignment, config->offset_factor, compact);
out_binds->Set(x, buf);
out_arg_list->push_back(buf);
} else {
@@ -380,9 +392,6 @@ Stmt BuildStmt(Schedule sch,
bool loop_partition,
Array<NodeRef> *out_arg_list,
const BuildConfig& config) {
Map<Tensor, Buffer> out_binds;
GetBinds(args, binds, &out_binds, out_arg_list, config);
sch = sch.normalize();
// Phase 0
@@ -390,6 +399,10 @@ Stmt BuildStmt(Schedule sch,
auto stmt = schedule::ScheduleOps(sch, bounds, false);
stmt = ir::InjectPrefetch(stmt);
bool compact = ir::VerifyCompactBuffer(stmt);
Map<Tensor, Buffer> out_binds;
GetBinds(args, compact, binds, &out_binds, out_arg_list, config);
// Phase 1
stmt = ir::StorageFlatten(stmt, out_binds, 64,
config->instrument_bound_checkers);
......
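The C++ build path mirrors the Python change. Explicit binds still win: a tensor present in binds keeps its declared buffer, so compact binding can be forced even for symbolic shapes. A hedged sketch (hypothetical tensors and buffer names):

```python
import tvm

n = tvm.var("n")
A = tvm.placeholder((n,), name="A")
B = tvm.compute((n,), lambda i: A[i] * 2.0, name="B")
s = tvm.create_schedule(B.op)
# Tensors supplied in `binds` keep their (compact) buffers; only unbound
# tensors with symbolic dimensions fall back to auto_broadcast.
Ab = tvm.decl_buffer(A.shape, A.dtype, name="Ab")
Bb = tvm.decl_buffer(B.shape, B.dtype, name="Bb")
fadd = tvm.build(s, [A, B], target="llvm", binds={A: Ab, B: Bb})
```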
@@ -180,31 +180,6 @@ Stmt HybridOpNode::BuildProvide(
bool debug_keep_trivial_loop) const {
CHECK_EQ(stage->op.operator->(), this);
Stmt ret = AttrStmt::make(make_zero(Int(32)), attr::extern_scope, 0, this->body);
auto f_push_bind = [&ret](Buffer buffer, Tensor tensor) {
Array<NodeRef> bind_spec;
Array<Expr> tuple;
bind_spec.push_back(buffer);
bind_spec.push_back(tensor);
for (size_t k = 0; k < buffer->shape.size(); ++k) {
tuple.push_back(make_const(buffer->shape[k].type(), 0));
tuple.push_back(buffer->shape[k]);
}
ret = AttrStmt::make(
bind_spec, attr::buffer_bind_scope,
Call::make(Handle(), intrinsic::tvm_tuple, tuple, Call::Intrinsic), ret);
};
for (int i = static_cast<int>(outputs.size()) - 1; i >= 0; --i) {
Buffer buffer = decl_buffer(
outputs[i]->shape,
outputs[i]->dtype);
f_push_bind(buffer, stage->op.output(i));
}
auto curr_inputs = InputTensors();
for (int i = static_cast<int>(curr_inputs.size()) - 1; i >= 0; --i) {
Buffer buffer = decl_buffer(curr_inputs[i]->shape, curr_inputs[i]->dtype);
f_push_bind(buffer, curr_inputs[i]);
}
std::unordered_map<Tensor, Tensor> rmap;
for (int i = 0; i < this->num_outputs(); ++i) {
rmap[outputs[i]] = stage->op.output(i);
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2019 by Contributors
* \file verify_compact_buffer.cc
 * \brief Verify whether any compact buffer is bound to the statement.
*/
#include <tvm/buffer.h>
#include <tvm/expr.h>
#include <tvm/ir.h>
#include <tvm/ir_visitor.h>
#include <tvm/tensor.h>
#include <unordered_map>
namespace tvm {
namespace ir {
class VerifyBuffer : public IRVisitor {
public:
bool Verify(const Stmt& stmt) {
this->Visit(stmt);
return is_compact_;
}
void Visit_(const AttrStmt* op) final {
IRVisitor::Visit_(op);
if (op->attr_key == attr::buffer_bind_scope) {
is_compact_ = true;
}
}
private:
bool is_compact_{false};
};
bool VerifyCompactBuffer(Stmt stmt) {
VerifyBuffer verifier;
return verifier.Verify(stmt);
}
} // namespace ir
} // namespace tvm
@@ -37,7 +37,7 @@ def test_reduce_prims():
s[R].compute_inline()
# one line to build the function.
def check_device(device, host="stackvm"):
def check_device(device, host="llvm"):
ctx = tvm.context(device, 0)
if not tvm.module.enabled(host):
return
......
@@ -47,17 +47,35 @@ def verify_any_broadcast(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op):
tvm.testing.assert_allclose(result.asnumpy(), np_op(x_np, y_np))
def test_any_broadcast():
# Test broadcast with 1s
verify_any_broadcast((relay.Any(),), (3, 2), (1,), (3, 2), relay.add, np.add)
verify_any_broadcast((relay.Any(), 2), (1, 2), (1, 2), (1, 2), relay.add, np.add)
verify_any_broadcast((relay.Any(), 2), (1, 2), (3, 2), (1, 2), relay.add, np.add)
verify_any_broadcast((relay.Any(), 2), (3, 2), (1, 2), (3, 2), relay.add, np.add)
verify_any_broadcast((relay.Any(), 2), (3, relay.Any()), (1, 2), (3, 1), relay.add, np.add)
# The following currently fail because topi compute treats Any as 1
# will requires auto_broadcast buffer to solve the problem
# TODO(@zhiics): Fix this
# verify_any_broadcast((relay.Any(),), (3, 2), (2,), (3, 2), relay.add, np.add)
# verify_any_broadcast((relay.Any(), 2), (3, 2), (3, 2), (3, 2), relay.add, np.add)
# Test broadcast with values other than 1
verify_any_broadcast((relay.Any(),), (3, 2), (2,), (3, 2), relay.add, np.add)
verify_any_broadcast((relay.Any(), 2), (3, 2), (3, 2), (3, 2), relay.add, np.add)
def test_any_broadcast_fail():
# Test broadcast with incompatible values at runtime
def check_fail(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op):
try:
verify_any_broadcast(
x_shape, y_shape, x_np_shape, y_np_shape, op, np_op)
except tvm._ffi.base.TVMError:
pass
else:
assert False
check_fail((relay.Any(),), (3, 2), (1,), (4, 2), relay.add, np.add)
check_fail((relay.Any(), 2), (3, 2), (4, 2), (4, 2), relay.add, np.add)
check_fail((relay.Any(), 2), (3, relay.Any()), (1, 2), (4, 1), relay.add, np.add)
check_fail((relay.Any(), 2), (3, 3), (1, 3), (3, 3), relay.add, np.add)
check_fail((relay.Any(),), (3, 2), (2,), (4, 2), relay.add, np.add)
def test_any_concat():
x = relay.var('x', shape=(relay.Any(), 2), dtype="float32")
@@ -285,6 +303,7 @@ def test_recursive_concat_with_wrong_annotation():
if __name__ == "__main__":
test_any_broadcast()
test_any_broadcast_fail()
test_any_concat()
test_any_reshape()
test_any_take()
......
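The verify_any_broadcast helper itself is elided above; only its final assertion is shown. A plausible sketch of such a helper, assuming the era's Relay VM executor API (the body is a guess; only the signature and the final assert come from the diff):

```python
import numpy as np
import tvm
from tvm import relay

def verify_any_broadcast(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op):
    # Sketch only: build a broadcasting binary op with Any() dims and
    # check the VM result against NumPy.
    dtype = "float32"
    x = relay.var("x", shape=x_shape, dtype=dtype)
    y = relay.var("y", shape=y_shape, dtype=dtype)
    mod = relay.Module()
    mod["main"] = relay.Function([x, y], op(x, y))
    x_np = np.random.uniform(size=x_np_shape).astype(dtype)
    y_np = np.random.uniform(size=y_np_shape).astype(dtype)
    ex = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(), target="llvm")
    result = ex.evaluate()(x_np, y_np)
    tvm.testing.assert_allclose(result.asnumpy(), np_op(x_np, y_np))
```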
@@ -42,13 +42,14 @@ def test_popcount():
check_correct_assembly('uint32', 2, 2)
check_correct_assembly('uint64', 2, 3)
def test_vmlal_s16():
target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon'
def check_correct_assembly(N):
K = tvm.var("K")
A = tvm.placeholder((K, N), dtype="int8", name='A')
B = tvm.placeholder((K, N), dtype="int8", name='A')
B = tvm.placeholder((K, N), dtype="int8", name='B')
k = tvm.reduce_axis((0, K))
C = tvm.compute((N, ), lambda n: tvm.sum(
A[k, n].astype("int32") * B[k, n].astype("int32"), axis=[k]), name='C')
@@ -60,14 +61,15 @@ def test_vmlal_s16():
assembly = f.get_source('asm')
matches = re.findall("vmlal.s16", assembly)
assert (len(matches) == N // 4)
check_correct_assembly(4)
check_correct_assembly(8)
check_correct_assembly(16)
check_correct_assembly(32)
check_correct_assembly(64)
def check_broadcast_correct_assembly(N):
K = tvm.var("K")
A = tvm.placeholder((K, N), dtype="int8", name='A')
B = tvm.placeholder((K,), dtype="int8", name='A')
B = tvm.placeholder((K,), dtype="int8", name='B')
k = tvm.reduce_axis((0, K))
C = tvm.compute((N, ), lambda n: tvm.sum(
A[k, n].astype("int32") * B[k].astype("int32"),
@@ -85,6 +87,7 @@ def test_vmlal_s16():
check_broadcast_correct_assembly(32)
check_broadcast_correct_assembly(64)
if __name__ == "__main__":
test_popcount()
test_vmlal_s16()
@@ -188,8 +188,21 @@ def test_buffer_broadcast_expr():
fadd(a, b, c, 4, 1)
tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
def check_auto_bind():
if not tvm.module.enabled("llvm"):
return
# Let build bind buffers
fadd = tvm.build(s, [A, B, C, o1, x], target='llvm', name='bcast_add')
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), ctx)
b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx)
c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx)
fadd(a, b, c, 4, 1)
tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
check_stride()
check_no_stride()
check_auto_bind()
if __name__ == "__main__":
......
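The setup for check_auto_bind is elided above. A hypothetical reconstruction under which the quoted body type-checks (only o1 and x appear in the quoted argument list; every other name is assumed):

```python
import tvm

# Hypothetical setup, for illustration only.
m0, n0 = tvm.var("m0"), tvm.var("n0")
m1, n1 = tvm.var("m1"), tvm.var("n1")
o0, o1, x = tvm.var("o0"), tvm.var("o1"), tvm.var("x")
A = tvm.placeholder((m0, n0), name="A")
B = tvm.placeholder((m1, n1), name="B")
C = tvm.compute((o0, o1 // x), lambda i, j: A[i, j] + B[i, j], name="C")
s = tvm.create_schedule(C.op)
# With no explicit binds, build lets GetBinds declare auto_broadcast
# buffers; o1 and x are then passed as scalars: fadd(a, b, c, 4, 1).
```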
@@ -74,9 +74,11 @@ inline BroadcastHelper BroadcastShape(const tvm::Array<tvm::Expr>& shape1,
} else if (var1) {
bh.common_shape.push_front(shape2[s2_size - i]);
bh.vars2.push_front(bh.all_vars[0]);
bh.vars1.push_front(bh.all_vars[0]);
} else if (var2) {
bh.common_shape.push_front(shape1[s1_size - i]);
bh.vars1.push_front(bh.all_vars[0]);
bh.vars2.push_front(bh.all_vars[0]);
} else {
CHECK(false) << "Incompatible broadcast dims: " << shape1[s1_size - i]
<< " and " << shape2[s2_size - i] << " in: "
@@ -98,16 +100,18 @@ inline BroadcastHelper BroadcastShape(const tvm::Array<tvm::Expr>& shape1,
}
inline tvm::Array<tvm::Expr> InputIndexFromBroadcast(
const tvm::Array<tvm::Var>& ovars, const tvm::Tensor& T,
const std::deque<tvm::Var>& my_vars, const std::deque<tvm::Var>& all_vars) {
const tvm::Array<tvm::Var>& ovars,
const tvm::Tensor& T,
const std::deque<tvm::Var>& my_vars,
const std::deque<tvm::Var>& all_vars) {
tvm::Array<tvm::Expr> ivars;
CHECK_EQ(ovars.size(), all_vars.size());
// N^2, could use a map but NBD..
// N^2, could use a map but NBD.
size_t expected_dims = T->shape.size();
for (size_t i = 0; i < ovars.size(); ++i) {
bool found = false;
for (size_t j = 0; j < my_vars.size(); ++j) {
if (all_vars[i].same_as(my_vars[j])) {
if (all_vars[i].same_as(my_vars[j])) {
ivars.push_back(ovars[i]);
found = true;
break;
@@ -123,13 +127,12 @@ inline tvm::Array<tvm::Expr> InputIndexFromBroadcast(
return ivars;
}
template <typename FBinaryExpr>
inline tvm::Tensor WithBroadcast(FBinaryExpr op,
const tvm::Tensor& A,
const tvm::Tensor& B,
std::string name = "tensor",
std::string tag = "") {
const std::string& name = "tensor",
const std::string& tag = "") {
auto bh = BroadcastShape(A->shape, B->shape);
auto l = [&](tvm::Array<tvm::Var> ovars) {
return op(A(InputIndexFromBroadcast(ovars, A, bh.vars1, bh.all_vars)),
......
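Net effect of the broadcast.h change: when one side of a dimension is a variable, the shared output axis is now recorded for both inputs, so InputIndexFromBroadcast indexes both tensors with it; checking that the runtime extents actually agree is delegated to the auto_broadcast buffer. An equal-rank Python sketch of the rule, with "any" standing in for a symbolic dimension:

```python
def broadcast_shape(shape1, shape2):
    """Equal-rank sketch of the BroadcastShape rule after this change."""
    out = []
    for d1, d2 in zip(reversed(shape1), reversed(shape2)):
        if d1 == d2 or d2 == 1:
            out.append(d1)              # dims agree, or rhs broadcasts
        elif d1 == 1:
            out.append(d2)              # lhs broadcasts
        elif d1 == "any" or d2 == "any":
            # Take the concrete extent; both inputs now reuse this output
            # axis (previously the symbolic side was pinned to extent 1).
            out.append(d2 if d1 == "any" else d1)
        else:
            raise ValueError("Incompatible broadcast dims: %r vs %r" % (d1, d2))
    return tuple(reversed(out))

assert broadcast_shape(("any", 2), (3, 2)) == (3, 2)
assert broadcast_shape((1, 2), (3, 2)) == (3, 2)
```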