/*!
 *  Copyright (c) 2017 by Contributors
 * \file split_host_device.cc
 * \brief Split device function from host.
 */
#include <tvm/ir.h>
#include <tvm/lowered_func.h>
#include <tvm/channel.h>
#include <tvm/ir_pass.h>
#include <tvm/ir_mutator.h>
#include <tvm/runtime/module.h>
#include <unordered_map>

namespace tvm {
namespace ir {

// use/def analysis, also delete unreferenced lets
class IRUseDefAnalysis : public IRMutator {
 public:
  Stmt Mutate_(const AttrStmt *op, const Stmt& s) final {
    if (op->attr_key == attr::thread_extent) {
      IterVar iv(op->node.node_);
      CHECK_NE(iv->thread_tag.length(), 0U);
      // thread_extent can appear multiple times
      // use the first appearance as def.
      if (!use_count_.count(iv->var.get())) {
        this->HandleDef(iv->var.get());
        thread_axis_.push_back(iv);
        thread_extent_.push_back(op->value);
      }

      Expr value = op->value;
      if (visit_thread_extent_) {
        value = this->Mutate(value);
      }
      Stmt body = this->Mutate(op->body);
      if (value.same_as(value) && body.same_as(body)) return s;
      return AttrStmt::make(op->node, op->attr_key, value, body);
    } else if (op->attr_key == attr::channel_write_scope ||
               op->attr_key == attr::channel_read_scope) {
      Channel ch(op->node.node_);
      if (!use_count_.count(ch->handle_var.get())) {
        this->HandleDef(ch->handle_var.get());
      }
      return IRMutator::Mutate_(op, s);
    } else {
      return IRMutator::Mutate_(op, s);
    }
  }

  Stmt Mutate_(const LetStmt *op, const Stmt& s) final {
    this->HandleDef(op->var.get());
    Stmt body = this->Mutate(op->body);
    // eliminate unreferenced let
    if (use_count_.at(op->var.get()) == 0 &&
        !HasSideEffect(op->value)) {
      return body;
    } else {
      Expr value = this->Mutate(op->value);
      if (body.same_as(op->body) &&
          value.same_as(op->value)) {
        return s;
      } else {
        return LetStmt::make(op->var, value, body);
      }
    }
  }

  Stmt Mutate_(const For *op, const Stmt& s) final {
    this->HandleDef(op->loop_var.get());
    return IRMutator::Mutate_(op, s);
  }

  Stmt Mutate_(const Allocate *op, const Stmt& s) final {
    this->HandleDef(op->buffer_var.get());
    return IRMutator::Mutate_(op, s);
  }

  Stmt Mutate_(const Store *op, const Stmt& s) final {
    this->HandleUse(op->buffer_var);
    return IRMutator::Mutate_(op, s);
  }

  Expr Mutate_(const Let *op, const Expr& e) final {
    this->HandleDef(op->var.get());
    Expr body = this->Mutate(op->body);
    // eliminate unreferenced let
    if (use_count_.at(op->var.get()) == 0 &&
        !HasSideEffect(op->value)) {
      return body;
    } else {
      Expr value = this->Mutate(op->value);
      if (body.same_as(op->body) &&
          value.same_as(op->value)) {
        return e;
      } else {
        return Let::make(op->var, value, body);
      }
    }
  }

  Expr Mutate_(const Variable *op, const Expr& e) final {
    this->HandleUse(e);
    return IRMutator::Mutate_(op, e);
  }

  Expr Mutate_(const Load *op, const Expr& e) final {
    this->HandleUse(op->buffer_var);
    return IRMutator::Mutate_(op, e);
  }

  void HandleDef(const Variable* v) {
    CHECK(!def_count_.count(v))
        << "variable " << v->name_hint
        << " has already been defined, the Stmt is not SSA";
    CHECK(!use_count_.count(v))
        << "variable " << v->name_hint
        << " has been used before definition!";
    use_count_[v] = 0;
    def_count_[v] = 1;
  }

  void HandleUse(const Expr& v) {
    CHECK(v.as<Variable>());
    Var var(v.node_);
    auto it = use_count_.find(var.get());
    if (it != use_count_.end()) {
      if (it->second >= 0) {
        ++it->second;
      }
    } else {
      undefined_.push_back(var);
      use_count_[var.get()] = -1;
    }
  }

  // The fields are publically readible to
  // be accessible to the users.
  bool visit_thread_extent_{true};
  Array<Var> undefined_;
  Array<IterVar> thread_axis_;
  Array<Expr> thread_extent_;
  std::unordered_map<const Variable*, int> use_count_;
  std::unordered_map<const Variable*, int> def_count_;
};

class HostDeviceSplitter : public IRMutator {
 public:
  Stmt Mutate_(const Allocate* op, const Stmt& s) final {
    handle_data_type_[op->buffer_var.get()] = make_const(op->type, 0);
    return IRMutator::Mutate_(op, s);
  }

  Stmt Mutate_(const AttrStmt *op, const Stmt& s) final {
    if (op->attr_key == attr::thread_extent ||
        op->attr_key == attr::pipeline_exec_scope) {
      return SplitDeviceFunc(s);
    }
    return IRMutator::Mutate_(op, s);
  }

  Array<LoweredFunc> Split(LoweredFunc f) {
    CHECK_EQ(f->func_type, kMixedFunc);
    for (auto kv : f->handle_data_type) {
      handle_data_type_[kv.first.get()] = kv.second;
    }
    name_ = f->name;
    std::shared_ptr<LoweredFuncNode> n =
        std::make_shared<LoweredFuncNode>(*f.operator->());
    n->body = this->Mutate(f->body);
    n->func_type = kHostFunc;
    Array<LoweredFunc> ret{LoweredFunc(n)};
    for (LoweredFunc x : device_funcs_) {
      ret.push_back(x);
    }
    return ret;
  }

 private:
  Stmt SplitDeviceFunc(Stmt body) {
    std::ostringstream os;
    os << name_ << "__kernel" << device_funcs_.size();
    std::shared_ptr<LoweredFuncNode> n = std::make_shared<LoweredFuncNode>();
    // isolate the device function.
    IRUseDefAnalysis m;
    m.visit_thread_extent_ = false;
    n->body = m.Mutate(body);
    n->name = os.str();
    n->func_type = kDeviceFunc;
    n->thread_axis = m.thread_axis_;
    // Strictly order the arguments: Var pointers, positional arguments.
    for (Var v : m.undefined_) {
      if (v.type().is_handle()) {
        n->args.push_back(v);
        // mark handle data type.
        auto it = handle_data_type_.find(v.get());
        if (it != handle_data_type_.end()) {
          n->handle_data_type.Set(v, it->second);
        }
      }
    }
    for (Var v : m.undefined_) {
      if (!v.type().is_handle()) {
        n->args.push_back(v);
      }
    }
    LoweredFunc f_device(n);
    Array<Expr> call_args;
    call_args.push_back(StringImm::make(f_device->name));
    for (Var arg : n->args) {
      call_args.push_back(arg);
    }
    for (Expr ext : m.thread_extent_) {
      call_args.push_back(ext);
    }
    device_funcs_.emplace_back(f_device);
    return Evaluate::make(Call::make(
        Int(32), intrinsic::tvm_call_packed,
        call_args, Call::Intrinsic));
  }

  // function name
  std::string name_;
  // the device functions
  std::vector<LoweredFunc> device_funcs_;
  std::unordered_map<const Variable*, Expr> handle_data_type_;
};


Array<Var> UndefinedVars(const Stmt& stmt, const Array<Var>& args) {
  IRUseDefAnalysis m;
  for (Var arg : args) {
    m.use_count_[arg.get()] = 0;
  }
  m.Mutate(stmt);
  return m.undefined_;
}

Array<LoweredFunc> SplitHostDevice(LoweredFunc func) {
  return HostDeviceSplitter().Split(func);
}

}  // namespace ir
}  // namespace tvm