[Relay][External Codegen] Support data types for CSourceModuleCodegen args and output (#4934)

* Support int args and no extra buffers * Fixes * remove testing code * fix style * more style * use const args * style Co-authored-by: Jon Soifer <jonso@microsoft.com>

[Relay][External Codegen] Support data types for CSourceModuleCodegen args and output (#4934)
* Support int args and no extra buffers * Fixes * remove testing code * fix style * more style * use const args * style Co-authored-by: Jon Soifer <jonso@microsoft.com>
a2429c1f · Jon Soifer · GitHub · 87c20bb2 · a2429c1f · a2429c1f
Unverified Commit a2429c1f authored Feb 24, 2020 by Jon Soifer Committed by GitHub Feb 24, 2020
Showing with 139 additions and 53 deletions

src/relay/backend/contrib/codegen_c/codegen.cc
+34 -23

src/relay/backend/contrib/codegen_c/codegen_c.h
+72 -22

src/relay/backend/contrib/dnnl/codegen.cc
+15 -8

tests/python/relay/test_external_codegen.py
+18 -0

No files found.
--- a/src/relay/backend/contrib/codegen_c/codegen.cc
+++ b/src/relay/backend/contrib/codegen_c/codegen.cc
@@ -41,9 +41,11 @@ class CodegenC : public ExprVisitor, public CodegenCBase {
  explicit CodegenC(const std::string& id) { this->ext_func_id_ = id; }

  void VisitExpr_(const VarNode* node) {
-    ext_func_args_.push_back(node->name_hint());
+    ext_func_args_.push_back(GetRef<Var>(node));
    out_.clear();
-    out_.push_back({node->name_hint(), 0});
+    Output output;
+    output.name = node->name_hint();
+    out_.push_back(output);
  }

  void VisitExpr_(const CallNode* call) final {
@@ -70,6 +72,12 @@ class CodegenC : public ExprVisitor, public CodegenCBase {
    for (size_t i = 0; i < in_shape.size(); ++i) {
      macro_stream << ", " << in_shape[i];
    }
+
+    const auto* type_node = call->checked_type().as<TensorTypeNode>();
+    CHECK(type_node);
+    const auto& dtype = GetDtypeString(type_node);
+    macro_stream << ", " << dtype;
+
    macro_stream << ");";
    func_decl_.push_back(macro_stream.str());

@@ -83,20 +91,18 @@ class CodegenC : public ExprVisitor, public CodegenCBase {
          decl_stream << ", ";
        }
        first = false;
-        decl_stream << out.first;
+        decl_stream << out.name;
      }
    }

-    auto type_node = call->checked_type().as<TensorTypeNode>();
-    CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32))
-        << "Only support single output tensor with float type";
    std::string out = "buf_" + std::to_string(buf_idx_++);
    auto out_shape = GetShape(call->checked_type());
    int out_size = 1;
    for (size_t i = 0; i < out_shape.size(); ++i) {
      out_size *= out_shape[i];
    }
-    buf_stream << "float* " << out << " = (float*)std::malloc(4 * " << out_size << ");";
+    buf_stream << dtype << "* " << out << " = (" << dtype  // sized per element type, not 4 bytes
+               << "*)std::malloc(sizeof(" << dtype << ") * " << out_size << ");";
    buf_decl_.push_back(buf_stream.str());

    decl_stream << ", " << out << ");";
@@ -104,7 +110,12 @@ class CodegenC : public ExprVisitor, public CodegenCBase {

    // Update output buffer
    out_.clear();
-    out_.push_back({out, out_size});
+    Output output;
+    output.name = out;
+    output.dtype = dtype;
+    output.need_copy = true;
+    output.size = out_size;
+    out_.push_back(output);
  }

  /*!
@@ -128,7 +139,7 @@ class CodegenC : public ExprVisitor, public CodegenCBase {
  /*! \brief The index of allocated buffers. */
  int buf_idx_ = 0;
  /*! \brief The arguments of a C compiler compatible function. */
-  std::vector<std::string> ext_func_args_;
+  Array<Var> ext_func_args_;
  /*! \brief The statements of a C compiler compatible function. */
  std::vector<std::string> ext_func_body;
  /*! \brief The declaration statements of a C compiler compatible function. */
@@ -136,7 +147,7 @@ class CodegenC : public ExprVisitor, public CodegenCBase {
  /*! \brief The declaration statements of buffers. */
  std::vector<std::string> buf_decl_;
  /*! \brief The name and index pairs for output. */
-  std::vector<std::pair<std::string, int>> out_;
+  std::vector<Output> out_;
 };

 class CSourceCodegen : public CSourceModuleCodegenBase {
@@ -161,21 +172,21 @@ class CSourceCodegen : public CSourceModuleCodegenBase {

    // Append some common macro for operator definition.
    const char* operator_macro = R"op_macro(
-    #define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_)       \
-      extern "C" void p_ID_(float* a, float* b, float* out) { \
-        for (int64_t i = 0; i < p_DIM1_; ++i) {               \
-          out[i] = a[i] p_OP_ b[i];                           \
-        }                                                     \
+    #define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_, p_DTYPE)       \
+      extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) {    \
+        for (int64_t i = 0; i < p_DIM1_; ++i) {                        \
+          out[i] = a[i] p_OP_ b[i];                                    \
+        }                                                              \
      }

-    #define CSOURCE_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_)  \
-      extern "C" void p_ID_(float* a, float* b, float* out) {     \
-        for (int64_t i = 0; i < p_DIM1_; ++i) {                   \
-          for (int64_t j = 0; j < p_DIM2_; ++j) {                 \
-            int64_t k = i * p_DIM2_ + j;                          \
-            out[k] = a[k] p_OP_ b[k];                             \
-          }                                                       \
-        }                                                         \
+    #define CSOURCE_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_, p_DTYPE)  \
+      extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) {        \
+        for (int64_t i = 0; i < p_DIM1_; ++i) {                            \
+          for (int64_t j = 0; j < p_DIM2_; ++j) {                          \
+            int64_t k = i * p_DIM2_ + j;                                   \
+            out[k] = a[k] p_OP_ b[k];                                      \
+          }                                                                \
+        }                                                                  \
      }
    )op_macro";


--- a/src/relay/backend/contrib/codegen_c/codegen_c.h
+++ b/src/relay/backend/contrib/codegen_c/codegen_c.h
@@ -35,6 +35,13 @@ namespace tvm {
 namespace relay {
 namespace contrib {

+struct Output {
+  std::string name;       // identifier of the variable or buffer that holds the result
+  std::string dtype;      // C element type name; stays empty until a producer assigns it
+  int size{0};            // element count of the result; defaulted so it is never read unset
+  bool need_copy{false};  // when true, JitImpl memcpy's the result into the caller's `out`
+};
+
 class CSourceModuleCodegenBase {
 public:
  CSourceModuleCodegenBase() = default;
@@ -98,7 +105,7 @@ class CodegenCBase {
   * \brief Gerenate C code for the external function.
   *
   * \param func_name The name of the external function.
-   * \param arg_cnt The expected number of arguments.
+   * \param args arguments to the external function.
   *
   * \code
   *
@@ -116,16 +123,18 @@ class CodegenCBase {
   *
   * \endcode
   */
-  void GenerateBackendCFunc(const std::string& func_name, int arg_cnt) {
+  void GenerateBackendCFunc(const std::string& func_name,
+                            const Array<Var>& args,
+                            const Output& out) {
    // Print signature
    code_stream_ << "\n";
    code_stream_ << "extern \"C\" int " << func_name << "_wrapper_(";
-    for (int i = 0; i < arg_cnt - 1; i++) {
+    for (size_t i = 0; i < args.size(); i++) {
      code_stream_ << "DLTensor* arg" << i << ",\n";
      code_stream_ << "\t";
    }
-    if (arg_cnt > 0) {
-      code_stream_ << "DLTensor* arg" << arg_cnt - 1 << ") {\n";
+    if (args.size() > 0) {
+      code_stream_ << "DLTensor* arg" << args.size() << ") {\n";
    }

    EnterScope();
@@ -133,12 +142,13 @@ class CodegenCBase {
    // Generate the internal call.
    PrintIndents();
    code_stream_ << func_name << "_(";
-    for (int i = 0; i < arg_cnt - 1; i++) {
-      code_stream_ << "static_cast<float*>(arg" << i << "->data),\n";
+    for (size_t i = 0; i < args.size(); i++) {
+      const auto& dtype_str = GetDtypeString(args[i]);
+      code_stream_ << "static_cast<" << dtype_str << "*>(arg" << i << "->data),\n";
      PrintIndents();
    }
-    if (arg_cnt > 0) {
-      code_stream_ << "static_cast<float*>(arg" << arg_cnt - 1 << "->data)";
+    if (args.size() > 0) {
+      code_stream_ << "static_cast<" << out.dtype << "*>(arg" << args.size() << "->data)";
    }
    code_stream_ << ");\n";
    PrintIndents();
@@ -207,17 +217,21 @@ class CodegenCBase {
   *
   * \return The emitted code string.
   */
-  std::string JitImpl(std::string ext_func_id, std::vector<std::string> args,
-                      std::vector<std::string> buf_decl, std::vector<std::string> body,
-                      std::vector<std::pair<std::string, int>> out) {
+  std::string JitImpl(std::string ext_func_id, const Array<Var>& args,
+                      const std::vector<std::string>& buf_decl,
+                      const std::vector<std::string>& body,
+                      const std::vector<Output>& out) {
    // Create the signature. For example, it could be:
    // extern "C" void dnnl_0_(float* input0, float* input1, float* out, int M, int N) {}
    code_stream_ << "extern \"C\" void " << ext_func_id << "_(";

+    CHECK_EQ(out.size(), 1U) << "Internal error: only single output is support.";
+
    for (const auto& arg : args) {
-      code_stream_ << "float* " << arg << ", ";
+      const auto& dtype_str = GetDtypeString(arg);
+      code_stream_ << dtype_str << "* " << arg->name_hint() << ", ";
    }
-    code_stream_ << "float* out) {\n";
+    code_stream_ << out[0].dtype << "* out) {\n";
    this->EnterScope();

    // Function body
@@ -232,24 +246,60 @@ class CodegenCBase {
    }

    // Copy output
-    CHECK_EQ(out.size(), 1U) << "Internal error: only single output is support.";
-    this->PrintIndents();
-    code_stream_ << "std::memcpy(out, " << out[0].first << ", 4 * " << out[0].second << ");\n";
-
-    // Free buffers
-    for (size_t i = 0; i < buf_decl.size(); i++) {
+    if (out[0].need_copy) {
      this->PrintIndents();
-      code_stream_ << "std::free(buf_" << i << ");\n";
+      code_stream_ << "std::memcpy(out, " << out[0].name << ", sizeof(" << out[0].dtype << ") * "
+                   << out[0].size << ");\n";  // byte count = element size * element count
+      // Free buffers
+      for (size_t i = 0; i < buf_decl.size(); i++) {
+        this->PrintIndents();
+        code_stream_ << "std::free(buf_" << i << ");\n";
+      }
    }

    this->ExitScope();
    code_stream_ << "}\n";

    // Create the wrapper to call the ext_func
-    this->GenerateBackendCFunc(ext_func_id, args.size() + 1 /* output */);
+    this->GenerateBackendCFunc(ext_func_id, args, out[0]);
    return code_stream_.str();
  }

+  /*!
+   * \brief Returns dtype string
+   *
+   * \param var Var to get the dtype of
+   *
+   * \return The dtype string.
+   */
+  std::string GetDtypeString(const Var& var) {
+    auto ttype = var->checked_type().as<TensorTypeNode>();
+    CHECK(ttype) << "Expect TensorTypeNode";
+    return GetDtypeString(ttype);
+  }
+
+  /*!
+   * \brief Returns dtype string
+   *
+   * \param ttype TensorTypeNode* to get the dtype of
+   *
+   * \return The dtype string.
+   */
+  std::string GetDtypeString(const TensorTypeNode* ttype) {
+    std::string dtype;
+    if (runtime::TypeMatch(ttype->dtype, kDLFloat, 32)) {
+      dtype = "float";
+    } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 32)) {
+      dtype = "int";
+    } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 64)) {
+      dtype = "int64_t";
+    } else {
+      LOG(FATAL) << "Unsupported dtype " << ttype->dtype;
+    }
+
+    return dtype;
+  }
+
  /*! \brief The external function source code stream. */
  std::ostringstream code_stream_;


--- a/src/relay/backend/contrib/dnnl/codegen.cc
+++ b/src/relay/backend/contrib/dnnl/codegen.cc
@@ -45,9 +45,11 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase {
  explicit CodegenDNNL(const std::string& id) { this->ext_func_id_ = id; }

  void VisitExpr_(const VarNode* node) final {
-    ext_func_args_.push_back(node->name_hint());
+    ext_func_args_.push_back(GetRef<Var>(node));
    out_.clear();
-    out_.push_back({node->name_hint(), 0});
+    Output output;
+    output.name = node->name_hint();
+    out_.push_back(output);
  }

  void VisitExpr_(const TupleGetItemNode* op) final {
@@ -90,14 +92,14 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase {
          decl_stream << ", ";
        }
        first = false;
-        decl_stream << out.first;
+        decl_stream << out.name;
      }
    }

    // Analyze the output buffer
    auto type_node = call->checked_type().as<TensorTypeNode>();
-    CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32))
-        << "Only support single output tensor with float type";
+    CHECK(type_node);
+    const auto& dtype = GetDtypeString(type_node);
    std::string out = "buf_" + std::to_string(buf_idx_++);
    auto out_shape = GetShape(call->checked_type());
    int out_size = 1;
@@ -118,7 +120,12 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase {

    // Update output buffer
    out_.clear();
-    out_.push_back({out, out_size});
+    Output output;
+    output.name = out;
+    output.dtype = dtype;
+    output.need_copy = true;
+    output.size = out_size;
+    out_.push_back(output);
  }

  std::string JIT(void) {
@@ -213,13 +220,13 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase {
   */
  int buf_idx_{0};
  /*! \brief The arguments used by a wrapped function that calls DNNL kernels. */
-  std::vector<std::string> ext_func_args_;
+  Array<Var> ext_func_args_;
  /*! \brief statement of the function that will be compiled using DNNL kernels. */
  std::vector<std::string> ext_func_body;
  /*! \brief The declaration of intermeidate buffers. */
  std::vector<std::string> buf_decl_;
  /*! \brief The name of the the outputs. */
-  std::vector<std::pair<std::string, int>> out_;
+  std::vector<Output> out_;
 };

 /*!

--- a/tests/python/relay/test_external_codegen.py
+++ b/tests/python/relay/test_external_codegen.py
@@ -161,6 +161,23 @@ def test_extern_gcc_single_op():
    check_result(mod, {"x": x_data, "y": y_data}, (8, 8), x_data + y_data)


+def test_extern_gcc_single_op_int():  # int32 add offloaded to the external C codegen
+    x = relay.var('x', shape=(8, 8), dtype="int32")
+    y = relay.var('y', shape=(8, 8), dtype="int32")
+
+    x0 = relay.var('x0', shape=(8, 8), dtype="int32")
+    y0 = relay.var('y0', shape=(8, 8), dtype="int32")
+    z = x0 + y0
+    f = relay.Function([x0, y0], z)
+    f = set_external_func_attr(f, "ccompiler", "ccompiler_0")
+    call = relay.Call(f, [x, y])
+    mod = tvm.IRModule.from_expr(call)
+    # rand() lies in [0, 1), so casting it to int32 yields all zeros; draw real integers instead.
+    x_data = np.random.randint(-100, 100, size=(8, 8)).astype('int32')
+    y_data = np.random.randint(-100, 100, size=(8, 8)).astype('int32')
+    check_result(mod, {"x": x_data, "y": y_data}, (8, 8), x_data + y_data)
+
+
 def test_extern_gcc():
    x = relay.var('x', shape=(2, 2))
    y = relay.var('y', shape=(2, 2))
@@ -242,5 +259,6 @@ def test_extern_dnnl():
 if __name__ == "__main__":
    test_multi_node_subgraph()
    test_extern_gcc_single_op()
+    test_extern_gcc_single_op_int()
    test_extern_gcc()
    test_extern_dnnl()