/*!
 *  Copyright (c) 2018 by Contributors
 * \file relay/backend/compile_engine.cc
 * \brief Internal compilation engine.
 */
#include <tvm/schedule.h>
#include <tvm/packed_func_ext.h>
#include <tvm/operation.h>
#include <tvm/runtime/registry.h>
#include <tvm/relay/pass.h>
#include <tvm/relay/expr_functor.h>
#include <tvm/relay/op_attr_types.h>
#include <utility>
#include <limits>
#include <mutex>
#include <functional>
#include "compile_engine.h"

namespace tvm {
namespace relay {

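// Make a cache key from a primitive function and the compilation target.
// Lowering results are cached by the compile engine per such key.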
CCacheKey CCacheKeyNode::make(Function source_func, Target target) {
  auto n = make_node<CCacheKeyNode>();
  n->source_func = std::move(source_func);
  n->target = std::move(target);
  return CCacheKey(n);
}

// The schedule getter used by the compile engine.
// It visits a primitive function and creates its compute expressions and schedule.
class ScheduleGetter :
      public ExprFunctor<Array<Tensor>(const Expr&)> {
 public:
  explicit ScheduleGetter(Target target)
      : target_(target) {}

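  // Convert a Relay shape into the shape used to create placeholder tensors.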
  Array<IndexExpr> GetShape(const Array<IndexExpr>& shape) {
    // for now, we always use int32 shape when possible
    // even if the result of shape inference becomes int64.
    Array<IndexExpr> res;
    for (IndexExpr val : shape) {
      const int64_t* pval = as_const_int(val);
      if (pval != nullptr) {
        CHECK_LE(pval[0], std::numeric_limits<int32_t>::max());
        CHECK_GE(pval[0], std::numeric_limits<int32_t>::min());
        res.push_back(ir::IntImm::make(Int(32), *pval));
      } else {
        res.push_back(val);
      }
    }
    return res;
  }

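  // Create the compute expressions and schedule for a primitive function:
  // placeholders are created for every parameter, the body is lowered through
  // the FTVMCompute of each op, and the master op's FTVMSchedule builds the
  // final schedule. The funcs field of the returned CachedFunc is filled in later.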
  std::pair<Schedule, CachedFunc> Create(const Function& prim_func) {
    static auto fschedule =
        Op::GetAttr<FTVMSchedule>("FTVMSchedule");
    auto cache_node = make_node<CachedFuncNode>();
    cache_node->target = target_;
    for (Var param : prim_func->params) {
      Array<tvm::Tensor> inputs;
      if (const auto* ttype = param->checked_type().as<TensorTypeNode>()) {
        tvm::Tensor tensor = tvm::placeholder(
            GetShape(ttype->shape), ttype->dtype);
        cache_node->inputs.push_back(tensor);
        inputs.push_back(tensor);
      } else {
        // flatten tuple of tensor type.
        const auto* tuple_type = param->type_as<TupleTypeNode>();
        for (Type field : tuple_type->fields) {
          const auto* ttype = field.as<TensorTypeNode>();
          CHECK(ttype != nullptr);
          tvm::Tensor tensor = tvm::placeholder(
              GetShape(ttype->shape), ttype->dtype);
          cache_node->inputs.push_back(tensor);
          inputs.push_back(tensor);
        }
      }
      memo_[param] = inputs;
    }
    readable_name_stream_ << "fused";
    cache_node->outputs = this->VisitExpr(prim_func->body);
    cache_node->func_name = readable_name_stream_.str();
    CachedFunc cfunc(cache_node);
    CHECK(master_op_.defined());
    Schedule schedule = fschedule[master_op_](
        master_attrs_, cache_node->outputs, target_);
    return std::make_pair(schedule, cfunc);
  }

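  // Memoized dispatch: every sub-expression is lowered at most once.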
  Array<Tensor> VisitExpr(const Expr& expr) {
    auto it = memo_.find(expr);
    if (it != memo_.end()) {
      return it->second;
    } else {
      Array<Tensor> res = ExprFunctor::VisitExpr(expr);
      memo_[expr] = res;
      return res;
    }
  }

  Array<Tensor> VisitExpr_(const VarNode* op) final {
    LOG(FATAL) << "Free variable " << op->name_hint();
    return {};
  }

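  // Only scalar constants are supported; the value is embedded into a
  // zero-dimensional compute expression.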
  Array<Tensor> VisitExpr_(const ConstantNode* op) final {
    CHECK(op->is_scalar());
    void* data = op->data->data;
    DataType dtype = TVMType2Type(op->data->dtype);
    Tensor value = tvm::compute({}, [&](const Array<tvm::Var>&) {
        if (dtype == Int(32)) {
          return make_const(dtype, static_cast<const int32_t*>(data)[0]);
        } else if (dtype == Int(64)) {
          return make_const(dtype, static_cast<const int64_t*>(data)[0]);
        } else if (dtype == Float(32)) {
          return make_const(dtype, static_cast<const float*>(data)[0]);
        } else if (dtype == Float(64)) {
          return make_const(dtype, static_cast<const double*>(data)[0]);
        } else if (dtype == Bool()) {
          return make_const(dtype, static_cast<const uint8_t*>(data)[0]);
        } else {
          LOG(FATAL) << "not handled";
          return tvm::Expr();
        }
      });
    return {value};
  }

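  // Lower a call into a primitive op through its registered FTVMCompute and
  // record the op with the highest TOpPattern as the master op whose
  // FTVMSchedule will later drive scheduling.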
  Array<Tensor> VisitExpr_(const CallNode* call_node) final {
    static auto fcompute =
        Op::GetAttr<FTVMCompute>("FTVMCompute");
    static auto fpattern =
        Op::GetAttr<TOpPattern>("TOpPattern");

    Array<Tensor> inputs;
    int count_tuple = 0;
    for (Expr arg : call_node->args) {
      if (arg->checked_type().as<TupleTypeNode>()) {
        ++count_tuple;
      }
      for (Tensor tensor : VisitExpr(arg)) {
        inputs.push_back(tensor);
      }
    }
    if (count_tuple) {
      CHECK_EQ(call_node->args.size(), 1U)
          << "Only allow function with a single tuple input";
    }
    CHECK(call_node->op.as<OpNode>())
        << "Primitive function only allows call into primitive ops";
    Op op = Downcast<Op>(call_node->op);
    Array<Tensor> outputs = fcompute[op](
        call_node->attrs,
        inputs,
        call_node->checked_type(),
        target_);

    int op_pattern = fpattern[op];
    if (op_pattern >= kCommReduce) {
      CHECK(!master_op_.defined() || master_op_pattern_ < kCommReduce)
          << "Cannot have two complicated ops in a primitive function:"
          << " master=" << master_op_ << " current=" << op;
    }
    if (op_pattern >= master_op_pattern_) {
      master_op_ = op;
      master_attrs_ = call_node->attrs;
      master_op_pattern_ = op_pattern;
    }
    if (outputs.size() != 1) {
      const auto* tuple_type =
          call_node->checked_type().as<TupleTypeNode>();
      CHECK(tuple_type) << "Expect output to be a tuple type";
      CHECK_EQ(tuple_type->fields.size(), outputs.size());
    }
    readable_name_stream_ << '_' << op->name;
    return outputs;
  }

  Array<Tensor> VisitExpr_(const FunctionNode* op) final {
    LOG(FATAL) << "Sub-functions are not supported";
    return Array<Tensor>();
  }

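  // Let binding: remember the tensors of the bound value, then lower the body.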
  Array<Tensor> VisitExpr_(const LetNode* op) final {
    Array<Tensor> val = VisitExpr(op->value);
    CHECK(!memo_.count(op->var));
    memo_[op->var] = val;
    return VisitExpr(op->body);
  }

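  // A tuple of tensors is lowered to the flat array of its field tensors.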
  Array<Tensor> VisitExpr_(const TupleNode* op) final {
    Array<Tensor> fields;
    for (Expr field : op->fields) {
      CHECK(field->checked_type().as<TensorTypeNode>())
          << "Only allow Tuple of Tensor";
      Array<Tensor> res = VisitExpr(field);
      CHECK_EQ(res.size(), 1);
      fields.push_back(res[0]);
    }
    return fields;
  }

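  // Tuple field access selects the tensor that corresponds to the given index.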
  Array<Tensor> VisitExpr_(const TupleGetItemNode* op) final {
    const auto* tuple_type = op->tuple->type_as<TupleTypeNode>();
    Array<Tensor> tuple = VisitExpr(op->tuple);
    CHECK_EQ(tuple_type->fields.size(), tuple.size());
    CHECK_GE(op->index, 0);
    CHECK_LT(static_cast<size_t>(op->index), tuple.size());
    return {tuple[op->index]};
  }

 private:
  tvm::Target target_;
  // The master op (op with the highest pattern) that determines the schedule.
  Op master_op_;
  // Attributes of the master op.
  Attrs master_attrs_;
  // Pattern of the master op.
  int master_op_pattern_{0};
  std::ostringstream readable_name_stream_;
  std::unordered_map<Expr, Array<Tensor>, NodeHash, NodeEqual> memo_;
};


class CompileEngineImpl : public CompileEngineNode {
 public:
  // Lower the function.
  CachedFunc Lower(const CCacheKey& key) final {
    return LowerInternal(key)->cached_func;
  }

  // For now, build one module per function.
  PackedFunc JIT(const CCacheKey& key) final {
    CCacheValue value = LowerInternal(key);
    if (value->packed_func != nullptr) return value->packed_func;
    // build the function.
    if (const auto* f = runtime::Registry::Get("relay.backend.build")) {
      tvm::runtime::Module m = (*f)(value->cached_func->funcs, key->target);
      value->packed_func = m.GetFunction(value->cached_func->func_name);
    } else {
      LOG(FATAL) << "relay.backend.build is not registered";
    }
    return value->packed_func;
  }
  void Clear() final {
    cache_.clear();
  }
  // List all items in the cache.
  Array<NodeRef> ListItems() {
    std::lock_guard<std::mutex> lock(mutex_);
    Array<NodeRef> items;
    for (auto& kv : cache_) {
      items.push_back(kv.first);
      items.push_back(kv.second);
    }
    return items;
  }
  /*!
   * \brief Create schedule for target.
   * \param source_func The primitive function to be lowered.
   * \param target The target we want to create schedule for.
   * \return Pair of schedule and cache.
   *  The funcs field in cache is not yet populated.
   */
  std::pair<Schedule, CachedFunc> CreateSchedule(
      const Function& source_func, const Target& target) {
    return ScheduleGetter(target).Create(source_func);
  }

 private:
  // Implement lowering; the result is cached in cache_ keyed by the CCacheKey.
  CCacheValue LowerInternal(const CCacheKey& key) {
    std::lock_guard<std::mutex> lock(mutex_);
    CCacheValue value;
    auto it = cache_.find(key);
    if (it != cache_.end()) {
      it->second->use_count += 1;
      if (it->second->cached_func.defined()) return it->second;
      value = it->second;
    } else {
      value = CCacheValue(make_node<CCacheValueNode>());
      value->use_count = 0;
      cache_[key] = value;
    }
    // Enforce use of the target.
    TargetContext target_ctx(key->target);

    CHECK(!value->cached_func.defined());
    auto spair = CreateSchedule(key->source_func, key->target);
    auto cache_node = make_node<CachedFuncNode>(
        *(spair.second.operator->()));
    cache_node->func_name = GetUniqueName(cache_node->func_name);
    // NOTE: array will copy on write.
    Array<Tensor> all_args = cache_node->inputs;
    for (Tensor arg : cache_node->outputs) {
      all_args.push_back(arg);
    }
    // lower the function
    if (const auto* f = runtime::Registry::Get("relay.backend.lower")) {
      cache_node->funcs = (*f)(
          spair.first, all_args, cache_node->func_name, key->source_func);
    } else {
      LOG(FATAL) << "relay.backend.lower is not registered";
    }
    value->cached_func = CachedFunc(cache_node);
    return value;
  }
  /*!
   * \brief Get a unique function name from the given name.
   * \param name The original name.
   * \return An updated name that is unique.
   */
  std::string GetUniqueName(std::string name) {
    // Dots are not valid in generated function names; replace them with underscores.
    for (size_t i = 0; i < name.length(); ++i) {
      if (name[i] == '.') name[i] = '_';
    }
    while (true) {
      auto it = name_map_.find(name);
      if (it == name_map_.end()) {
        name_map_[name] = 1;
        return name;
      } else {
        std::ostringstream os;
        os << name << "_" << it->second;
        ++(it->second);
        name = os.str();
      }
    }
    return name;
  }
  /*! \brief compiler cache lock */
  std::mutex mutex_;
  /*! \brief internal name map to get a unique name */
  std::unordered_map<std::string, int> name_map_;
  /*! \brief internal compiler cache */
  std::unordered_map<CCacheKey, CCacheValue> cache_;
};

/*! \brief The global compile engine */
const CompileEngine& CompileEngine::Global() {
  // Intentionally allocate a raw pointer to avoid
  // deallocation during static destruction.
  static CompileEngine* inst = new CompileEngine(
      make_node<CompileEngineImpl>());
  return *inst;
}


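// Expose the compile engine to the frontend as packed functions in the global registry.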
TVM_REGISTER_GLOBAL("relay.backend._make_CCacheKey")
.set_body_typed<CCacheKey(Function, Target)>(CCacheKeyNode::make);

TVM_REGISTER_GLOBAL("relay.backend._CompileEngineGlobal")
.set_body_typed<CompileEngine()>([]() {
    return CompileEngine::Global();
  });

TVM_REGISTER_GLOBAL("relay.backend._CompileEngineClear")
.set_body_typed<void(const CompileEngine&)>([](CompileEngine self) {
    self->Clear();
  });

TVM_REGISTER_GLOBAL("relay.backend._CompileEngineLower")
.set_body_typed<CachedFunc(CompileEngine, CCacheKey)>(
    [](CompileEngine self, CCacheKey key) {
      return self->Lower(key);
    });

TVM_REGISTER_GLOBAL("relay.backend._CompileEngineJIT")
.set_body_typed<PackedFunc(CompileEngine, CCacheKey)>(
    [](CompileEngine self, CCacheKey key) {
      return self->JIT(key);
    });

TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListItems")
.set_body_typed<Array<NodeRef>(CompileEngine)>(
    [](CompileEngine self){
      return static_cast<CompileEngineImpl*>(self.operator->())->ListItems();
    });
}  // namespace relay
}  // namespace tvm