/*!
 * Copyright (c) 2017 by Contributors
 * \brief Use external nnpack library call.
 */
#include <tvm/runtime/registry.h>
#include <tvm/runtime/util.h>
#include <dmlc/logging.h>
#include <nnpack.h>
#include <memory>
#include <mutex>
#include <vector>
#include "nnpack_utils.h"

namespace tvm {
namespace contrib {
using namespace runtime;

TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference")
    .set_body([](TVMArgs args, TVMRetValue *ret) {
      NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
      static std::once_flag flag;
      std::call_once(flag,
                     []() { CHECK_EQ(nnp_initialize(), nnp_status_success); });
      DLTensor *input = args[0];
      DLTensor *kernel = args[1];
      DLTensor *bias = nullptr;
      if (args[2].type_code() == kArrayHandle) {
        bias = args[2];
      }
      DLTensor *output = args[3];
      uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6],
               pad_left = args[7];
      nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
      uint64_t stride_width = args[8], stride_height = args[9];
      nnp_size stride_size{stride_width, stride_height};
      NNPackConfig(args[10]);

      uint64_t algo_ = args[11];
      nnp_convolution_algorithm algo =
          static_cast<nnp_convolution_algorithm>(algo_);
      CHECK_EQ(input->ndim, 4);
      CHECK_EQ(kernel->ndim, 4);
      if (bias) {
        CHECK_EQ(bias->ndim, 1);
      }
      CHECK_EQ(output->ndim, 4);
      CHECK_EQ(input->shape[1], kernel->shape[1]);
      CHECK_EQ(input->shape[0], output->shape[0]);
      size_t input_channels = input->shape[1];
      CHECK_EQ(output->shape[1], kernel->shape[0]);
      if (bias) {
        CHECK_EQ(output->shape[1], bias->shape[0]);
      }
      size_t output_channels = output->shape[1];
      nnp_size input_size{static_cast<size_t>(input->shape[2]),
                          static_cast<size_t>(input->shape[3])};
      nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
                           static_cast<size_t>(kernel->shape[3])};
      CHECK(input->strides == nullptr);
      CHECK(kernel->strides == nullptr);
      if (bias) {
        CHECK(bias->strides == nullptr);
      }

      CHECK(TypeMatch(input->dtype, kDLFloat, 32));
      CHECK(TypeMatch(kernel->dtype, kDLFloat, 32));
      if (bias) {
        CHECK(TypeMatch(bias->dtype, kDLFloat, 32));
      }
      CHECK(TypeMatch(output->dtype, kDLFloat, 32));

      // Allocate a zero-bias if we don't pass one in.
      std::unique_ptr<std::vector<float>> zero_bias;
      if (!bias) {
        zero_bias.reset(new std::vector<float>(output->shape[1], 0.0));
      }

      for (auto n = 0; n < input->shape[0]; ++n) {
        nnp_status status = nnp_convolution_inference(
            algo, nnp_convolution_transform_strategy_compute, input_channels,
            output_channels, input_size, input_padding, kernel_size,
            stride_size,
            static_cast<float *>(input->data) +
                n * input->shape[1] * input->shape[2] * input->shape[3],
            static_cast<float *>(kernel->data),
            bias ? static_cast<float *>(bias->data) : zero_bias->data(),
            static_cast<float *>(output->data) +
                n * output->shape[1] * output->shape[2] * output->shape[3],
            NULL, NULL, nnp_activation_identity, NULL, entry->threadpool,
            NULL);
        CHECK_EQ(status, nnp_status_success);
      }
    });
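// Packed argument layout shared by the two inference entry points
// (summarizing the checks in the bodies): args[0] input, args[1] kernel or
// transformed kernel, args[2] optional bias (may be null), args[3] output,
// all NCHW float32; args[4..7] padding (top, right, bottom, left),
// args[8..9] strides (width, height), args[10] thread count forwarded to
// NNPackConfig, args[11] the nnp_convolution_algorithm.
//
// A minimal caller sketch, assuming a runtime built with NNPACK support;
// tensor setup is omitted and the variable names are illustrative only:
//
//   const runtime::PackedFunc *conv =
//       runtime::Registry::Get("tvm.contrib.nnpack.convolution_inference");
//   CHECK(conv != nullptr);
//   (*conv)(input, kernel, bias, output, pad_top, pad_right, pad_bottom,
//           pad_left, stride_width, stride_height, nthreads, algo);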
TVM_REGISTER_GLOBAL(
    "tvm.contrib.nnpack.convolution_inference_without_weight_transform")
    .set_body([](TVMArgs args, TVMRetValue *ret) {
      NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
      static std::once_flag flag;
      std::call_once(flag,
                     []() { CHECK_EQ(nnp_initialize(), nnp_status_success); });
      DLTensor *input = args[0];
      DLTensor *transformed_kernel = args[1];
      DLTensor *bias = nullptr;
      if (args[2].type_code() == kArrayHandle) {
        bias = args[2];
      }
      DLTensor *output = args[3];
      uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6],
               pad_left = args[7];
      nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
      uint64_t stride_width = args[8], stride_height = args[9];
      nnp_size stride_size{stride_width, stride_height};
      NNPackConfig(args[10]);

      uint64_t algo_ = args[11];
      nnp_convolution_algorithm algo =
          static_cast<nnp_convolution_algorithm>(algo_);
      CHECK_EQ(input->ndim, 4);
      if (bias) {
        CHECK_EQ(bias->ndim, 1);
      }
      CHECK_EQ(output->ndim, 4);
      CHECK_EQ(input->shape[0], output->shape[0]);
      size_t input_channels = input->shape[1];
      if (bias) {
        CHECK_EQ(output->shape[1], bias->shape[0]);
      }
      size_t output_channels = output->shape[1];
      nnp_size input_size{static_cast<size_t>(input->shape[2]),
                          static_cast<size_t>(input->shape[3])};
      nnp_size kernel_size{3, 3};
      CHECK(input->strides == nullptr);
      CHECK(transformed_kernel->strides == nullptr);
      if (bias) {
        CHECK(bias->strides == nullptr);
      }

      CHECK(TypeMatch(input->dtype, kDLFloat, 32));
      CHECK(TypeMatch(transformed_kernel->dtype, kDLFloat, 32));
      if (bias) {
        CHECK(TypeMatch(bias->dtype, kDLFloat, 32));
      }
      CHECK(TypeMatch(output->dtype, kDLFloat, 32));

      // Allocate a zero-bias if we don't pass one in.
      std::unique_ptr<std::vector<float>> zero_bias;
      if (!bias) {
        zero_bias.reset(new std::vector<float>(output->shape[1], 0.0));
      }

      for (auto n = 0; n < input->shape[0]; ++n) {
        nnp_status status = nnp_convolution_inference(
            algo, nnp_convolution_transform_strategy_reuse, input_channels,
            output_channels, input_size, input_padding, kernel_size,
            stride_size,
            static_cast<float *>(input->data) +
                n * input->shape[1] * input->shape[2] * input->shape[3],
            static_cast<float *>(transformed_kernel->data),
            bias ? static_cast<float *>(bias->data) : zero_bias->data(),
            static_cast<float *>(output->data) +
                n * output->shape[1] * output->shape[2] * output->shape[3],
            NULL, NULL, nnp_activation_identity, NULL, entry->threadpool,
            NULL);
        CHECK_EQ(status, nnp_status_success);
      }
    });
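// Note: the "without_weight_transform" variant above hard-codes a 3x3
// kernel_size and expects args[1] to hold a kernel already transformed for
// `algo`, as produced by the "weight_transform" entry point registered
// below. It runs with nnp_convolution_transform_strategy_reuse, so the
// kernel transform cost is paid once up front instead of on every call.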
TVM_REGISTER_GLOBAL(
    "tvm.contrib.nnpack.convolution_inference_weight_transform")
    .set_body([](TVMArgs args, TVMRetValue *ret) {
      NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
      static std::once_flag flag;
      std::call_once(flag,
                     []() { CHECK_EQ(nnp_initialize(), nnp_status_success); });
      DLTensor *kernel = args[0];
      DLTensor *transformed_kernel = args[1];
      // Dummy sizes
      nnp_padding input_padding{1, 1, 1, 1};
      nnp_size stride_size{1, 1};
      nnp_size input_size{100, 100};

      NNPackConfig(args[2]);

      uint64_t algo_ = args[3];
      nnp_convolution_algorithm algo =
          static_cast<nnp_convolution_algorithm>(algo_);
      CHECK_EQ(kernel->ndim, 4);
      size_t input_channels = kernel->shape[1];
      size_t output_channels = kernel->shape[0];
      CHECK_EQ(kernel->shape[2], 3);
      CHECK_EQ(kernel->shape[3], 3);
      nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
                           static_cast<size_t>(kernel->shape[3])};
      CHECK(kernel->strides == nullptr);
      CHECK(TypeMatch(kernel->dtype, kDLFloat, 32));

      size_t transformed_kernel_size = 0;
      nnp_status status;
      // First call: query the required transformed-kernel buffer size.
      status = nnp_convolution_inference(
          algo, nnp_convolution_transform_strategy_precompute, input_channels,
          output_channels, input_size, input_padding, kernel_size, stride_size,
          nullptr, nullptr, nullptr, nullptr, nullptr,
          &transformed_kernel_size, nnp_activation_identity, nullptr,
          entry->threadpool, nullptr);
      CHECK_EQ(status, nnp_status_success);
      CHECK_LE(transformed_kernel_size, GetDataSize(*transformed_kernel));

      // Second call: write the transformed kernel into the output buffer.
      status = nnp_convolution_inference(
          algo, nnp_convolution_transform_strategy_precompute, input_channels,
          output_channels, input_size, input_padding, kernel_size, stride_size,
          nullptr, static_cast<float *>(kernel->data), nullptr, nullptr,
          static_cast<float *>(transformed_kernel->data),
          &transformed_kernel_size, nnp_activation_identity, nullptr,
          entry->threadpool, nullptr);
      CHECK_EQ(status, nnp_status_success);
    });

} // namespace contrib
} // namespace tvm
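// A minimal end-to-end sketch of the precompute/reuse flow, assuming a TVM
// runtime built with NNPACK; names and tensor setup are illustrative only:
//
//   const runtime::PackedFunc *xform = runtime::Registry::Get(
//       "tvm.contrib.nnpack.convolution_inference_weight_transform");
//   const runtime::PackedFunc *conv = runtime::Registry::Get(
//       "tvm.contrib.nnpack.convolution_inference_without_weight_transform");
//   (*xform)(kernel, transformed_kernel, nthreads, algo);  // once per model
//   (*conv)(input, transformed_kernel, bias, output,       // per inference
//           pad_top, pad_right, pad_bottom, pad_left,
//           stride_width, stride_height, nthreads, algo);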