Commit 95f12e31 by Wei Chen, committed by Jared Roesch

[Relay][VM]VM Profiler (#3727)

* [Relay][VM]VM debugger

* Report mean/min/max for op duration

* Typos

* Lint

* Lint

* Lint

* Support build debug VM in CMake

* Lint

* Enable VM debug in unit test

* Disable debug vm test until new docker image is built

* Add device sync code

* Fix qnn unit test

* Disable vm debug by default

* Rename files

* Rename classes

* Fix comment

* Fix comment
parent c87ace7e
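
A minimal usage sketch of the profiler VM introduced by this commit, based on the Python API and the unit test in this diff (the ResNet workload and input shape are only illustrative):

import numpy as np
import tvm
from tvm import relay
from tvm.relay.testing import resnet

# Build a workload and compile it with the profiler-enabled VM compiler.
mod, params = resnet.get_workload()
compiler = relay.profiler_vm.VMCompilerProfiler()
vm = compiler.compile(mod, target='llvm')

# Initialize the VM on a context, load parameters, and run once.
vm.init(tvm.cpu())
vm.load_params(params)
data = np.random.rand(1, 3, 224, 224).astype('float32')
vm.invoke("main", [data])

# Per-op statistics: invoke count and sum/mean/min/max duration in microseconds.
print(vm.get_stat())
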
......@@ -131,10 +131,33 @@ file(GLOB COMPILER_SRCS
src/schedule/*.cc
)
file(GLOB_RECURSE RELAY_SRCS
src/relay/*.cc
file(GLOB_RECURSE RELAY_OP_SRCS
src/relay/op/*.cc
)
list(APPEND COMPILER_SRCS ${RELAY_SRCS})
file(GLOB_RECURSE RELAY_PASS_SRCS
src/relay/pass/*.cc
)
file(GLOB RELAY_BACKEND_SRCS
src/relay/backend/*.cc
src/relay/backend/vm/*.cc
)
file(GLOB_RECURSE RELAY_IR_SRCS
src/relay/ir/*.cc
)
file(GLOB_RECURSE RELAY_QNN_SRCS
src/relay/qnn/*.cc
)
list(APPEND COMPILER_SRCS ${RELAY_OP_SRCS})
list(APPEND COMPILER_SRCS ${RELAY_PASS_SRCS})
list(APPEND COMPILER_SRCS ${RELAY_BACKEND_SRCS})
list(APPEND COMPILER_SRCS ${RELAY_IR_SRCS})
list(APPEND COMPILER_SRCS ${RELAY_QNN_SRCS})
if(USE_VM_PROFILER)
message(STATUS "Build compiler with Relay VM profiler support...")
file(GLOB BACKEND_VM_PROFILER_SRCS src/relay/backend/vm/profiler/*.cc)
list(APPEND COMPILER_SRCS ${BACKEND_VM_PROFILER_SRCS})
endif(USE_VM_PROFILER)
file(GLOB DATATYPE_SRCS src/codegen/datatype/*.cc)
list(APPEND COMPILER_SRCS ${DATATYPE_SRCS})
......@@ -198,6 +221,12 @@ if(USE_GRAPH_RUNTIME)
endif(USE_GRAPH_RUNTIME_DEBUG)
endif(USE_GRAPH_RUNTIME)
if(USE_VM_PROFILER)
message(STATUS "Build with Relay VM profiler support...")
file(GLOB RUNTIME_VM_PROFILER_SRCS src/runtime/vm/profiler/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_VM_PROFILER_SRCS})
endif(USE_VM_PROFILER)
# Module rules
include(cmake/modules/VTA.cmake)
include(cmake/modules/CUDA.cmake)
......
......@@ -147,6 +147,7 @@ stage('Build') {
echo set\\(USE_GRAPH_RUNTIME ON\\) >> config.cmake
echo set\\(USE_STACKVM_RUNTIME ON\\) >> config.cmake
echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
echo set\\(USE_VM_PROFILER ON\\) >> config.cmake
echo set\\(USE_ANTLR ON\\) >> config.cmake
echo set\\(USE_BLAS openblas\\) >> config.cmake
echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
......@@ -164,6 +165,7 @@ stage('Build') {
echo set\\(USE_VULKAN ON\\) >> config.cmake
echo set\\(USE_MICRO ON\\) >> config.cmake
echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
echo set\\(USE_VM_PROFILER ON\\) >> config.cmake
echo set\\(CMAKE_CXX_COMPILER clang-7\\) >> config.cmake
echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
"""
......@@ -182,6 +184,7 @@ stage('Build') {
echo set\\(USE_SORT ON\\) >> config.cmake
echo set\\(USE_MICRO ON\\) >> config.cmake
echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
echo set\\(USE_VM_PROFILER ON\\) >> config.cmake
echo set\\(USE_LLVM llvm-config-8\\) >> config.cmake
echo set\\(USE_NNPACK ON\\) >> config.cmake
echo set\\(NNPACK_PATH /NNPACK/build/\\) >> config.cmake
......@@ -212,6 +215,7 @@ stage('Build') {
echo set\\(USE_SORT ON\\) >> config.cmake
echo set\\(USE_RPC ON\\) >> config.cmake
echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
echo set\\(USE_VM_PROFILER ON\\) >> config.cmake
echo set\\(USE_LLVM llvm-config-4.0\\) >> config.cmake
echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
......
......@@ -88,6 +88,9 @@ set(USE_GRAPH_RUNTIME ON)
# Whether enable additional graph debug functions
set(USE_GRAPH_RUNTIME_DEBUG OFF)
# Whether enable additional vm profiler functions
set(USE_VM_PROFILER OFF)
# Whether build with LLVM support
# Requires LLVM version >= 4.0
#
......
......@@ -375,8 +375,41 @@ struct VMFrame {
*/
class VirtualMachine : public runtime::ModuleNode {
public:
PackedFunc GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) final;
/*!
* \brief Get a PackedFunc from module.
*
* The PackedFunc may not be fully initialized;
* there might still be first-time running overhead when
* executing the function on certain devices.
* For benchmarking, run the function once beforehand to eliminate this overhead.
*
* \param name the name of the function.
* \param sptr_to_self The shared_ptr that points to this module node.
*
* \return PackedFunc(nullptr) when it is not available.
*
* \note The function will always remain valid.
* If the function needs resources from the module (e.g. late linking),
* it should capture sptr_to_self.
*/
virtual PackedFunc GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self);
/*!
* \brief Invoke a PackedFunction
*
* \param packed_index The offset of the PackedFunction in all functions.
* \param func The PackedFunction to be invoked.
* \param arg_count The number of arguments to the PackedFunction.
* \param output_size The number of outputs of the PackedFunction.
* \param args Arguments to the PackedFunction.
*
* \note The return value will be stored in the last output_size slots of args.
*/
virtual void InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count,
Index output_size, const std::vector<Object>& args);
virtual ~VirtualMachine() {}
const char* type_key() const final {
return "VirtualMachine";
......@@ -456,6 +489,10 @@ class VirtualMachine : public runtime::ModuleNode {
*/
void RunLoop();
/*! \brief Get device context for params.
*/
TVMContext GetParamsContext() const;
/*!
* \brief Load parameters from the parameter bytearray.
* \param params The binary file that contains parameters.
......@@ -478,9 +515,6 @@ class VirtualMachine : public runtime::ModuleNode {
*/
void InvokeGlobal(const VMFunction& func, const std::vector<Object>& args);
/*! \brief Get device context for params.
*/
TVMContext GetParamsContext() const;
/*! \brief The parameter name to data mapping. */
std::unordered_map<std::string, Object> params_;
......
......@@ -34,6 +34,7 @@ from . import debug
from . import param_dict
from . import feature
from .backend import vm
from .backend import profiler_vm
from .backend import serializer
from .backend import deserializer
from .backend import vmobj
......
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable, invalid-name
"""
The Relay Virtual Machine profiler.
Provides extra APIs for profiling vm execution.
"""
import tvm
from . import vm, _vm
def _update_target(target):
target = target if target else tvm.target.current_target()
if target is None:
raise ValueError("Target is not set in env or passed as argument.")
tgts = {}
if isinstance(target, (str, tvm.target.Target)):
dev_type = tvm.expr.IntImm("int32", tvm.nd.context(str(target)).device_type)
tgts[dev_type] = tvm.target.create(target)
elif isinstance(target, dict):
for dev, tgt in target.items():
dev_type = tvm.expr.IntImm("int32", tvm.nd.context(dev).device_type)
tgts[dev_type] = tvm.target.create(tgt)
else:
raise TypeError("target is expected to be str, tvm.target.Target, " +
"or dict of str to str/tvm.target.Target, but received " +
"{}".format(type(target)))
return tgts
class VMCompilerProfiler(vm.VMCompiler):
"""Build Relay module to run on VM runtime."""
def __init__(self):
super().__init__()
self.mod = _vm._VMCompilerProfiler()
self._compile = self.mod["compile"]
self._get_vm = self.mod["get_vm"]
def compile(self, mod, target=None, target_host=None):
"""
Parameters
----------
mod : relay.Module
The Relay module to build.
target : str, :any:`tvm.target.Target`, or dict of str (i.e.
device/context name) to str/tvm.target.Target, optional
For heterogeneous compilation, it is a dictionary indicating context
to target mapping. For homogeneous compilation, it is a build target.
target_host : str or :any:`tvm.target.Target`, optional
Host compilation target, if the target is a device target.
When TVM compiles device-specific programs such as CUDA,
we also need host (CPU) side code to interact with the driver
to set up the dimensions and parameters correctly.
target_host is used to specify the host-side codegen target.
By default, llvm is used if it is enabled;
otherwise a stackvm interpreter is used.
Returns
-------
vm : VirtualMachineProfiler
The profiler-enabled VM runtime.
"""
target = _update_target(target)
self._compile(mod, target, target_host)
return VirtualMachineProfiler(self._get_vm())
class VirtualMachineProfiler(vm.VirtualMachine):
"""Relay profile VM runtime."""
def __init__(self, mod):
super().__init__(mod)
self._get_stat = self.mod["get_stat"]
def get_stat(self):
return self._get_stat()
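
The compile docstring above accepts either a single build target or a per-device mapping, as handled by _update_target. A brief sketch of both forms (hedged: the dict key follows tvm.nd.context naming, and the compiler currently requires exactly one target):

# Homogeneous compilation: a single target string or tvm.target.Target.
vm = compiler.compile(mod, target='llvm')

# Equivalent mapping from device/context name to target; heterogeneous
# compilation is not supported yet, so only one entry is allowed.
vm = compiler.compile(mod, target={'cpu': 'llvm'})
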
......@@ -30,12 +30,17 @@
#include <tvm/relay/transform.h>
#include <tvm/runtime/vm.h>
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "../../../runtime/vm/naive_allocator.h"
#include "../../backend/compile_engine.h"
#include "../../pass/pass_util.h"
#include "compiler.h"
namespace tvm {
namespace relay {
......@@ -56,36 +61,6 @@ using namespace relay::transform;
// (@jroesch): VM passes, eventually declare as passes.
bool IsClosure(const Function& func);
template <typename T, typename U>
using NodeMap = std::unordered_map<T, U, NodeHash, NodeEqual>;
using TagMap = NodeMap<tvm::relay::Constructor, Index>;
using TagNameMap = std::unordered_map<size_t, tvm::relay::Constructor>;
using GlobalMap = NodeMap<GlobalVar, Index>;
using ConstMap = NodeMap<Constant, Index>;
using ConstTensorShapeMap = NodeMap<TensorType, std::pair<Index, NDArray>>;
using TargetsMap = Map<tvm::Integer, tvm::Target>;
struct VMCompilerContext {
// The module context for the compilation
Module module;
// Error reporter
ErrorReporter err_reporter;
// Map from a unique integer to ADT constructor tag
TagNameMap tag_index_map;
// Map from ADT constructor tag to a unique integer
TagMap tag_map;
// Map from global var to a unique integer
GlobalMap global_map;
// Map from Const object to its index in const pool
ConstMap const_map;
// Map from Const tensor shape to its index in const pool
ConstTensorShapeMap const_tensor_shape_map;
// List of lowered functions
std::vector<LoweredFunc> lowered_funcs;
// The functions that have been lowered.
std::unordered_map<LoweredFunc, size_t, NodeHash, NodeEqual> seen_funcs;
};
// Compute the constant pool, i.e a mapping from Constant node to constant index.
struct ConstantPool : ExprVisitor {
std::set<GlobalVar> visited;
......@@ -664,152 +639,131 @@ class VMFunctionCompiler : ExprFunctor<void(const Expr& expr)> {
};
class VMCompiler : public runtime::ModuleNode {
public:
PackedFunc GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) final {
if (name == "compile") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
CHECK_EQ(args.num_args, 3);
this->Compile(args[0], args[1], args[2]);
});
} else if (name == "get_vm") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
*rv = runtime::Module(vm_);
});
} else {
LOG(FATAL) << "Unknown packed function: " << name;
return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {});
}
PackedFunc VMCompiler::GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) {
if (name == "compile") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
CHECK_EQ(args.num_args, 3);
this->Compile(args[0], args[1], args[2]);
});
} else if (name == "get_vm") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
*rv = runtime::Module(vm_);
});
} else {
LOG(FATAL) << "Unknown packed function: " << name;
return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {});
}
}
const char* type_key() const final {
return "VMCompiler";
void VMCompiler::Compile(const Module& mod_ref,
const TargetsMap& targets,
const tvm::Target& target_host) {
CHECK_EQ(targets.size(), 1)
<< "Currently VM compiler doesn't support heterogeneous compilation";
InitVM();
targets_ = targets;
target_host_ = target_host;
// Run some optimizations first, this code should
// be moved to pass manager.
context_.module = OptimizeModule(mod_ref);
// Populate the global map.
//
// This maps global variables to a global index
// in the VMFunction table.
PopulateGlobalMap();
// Next we populate constant map.
auto constant_analysis_result = LayoutConstantPool(context_.module);
context_.const_map = std::get<0>(constant_analysis_result);
context_.const_tensor_shape_map = std::get<1>(constant_analysis_result);
// Next we get ready by allocating space for
// the global state.
vm_->functions.resize(context_.module->functions.size());
vm_->constants.resize(context_.const_map.size() + context_.const_tensor_shape_map.size());
for (auto pair : context_.const_map) {
vm_->constants[pair.second] = Object::Tensor(pair.first->data);
}
std::shared_ptr<VirtualMachine> GetVirtualMachine() const {
return vm_;
for (auto pair : context_.const_tensor_shape_map) {
vm_->constants[pair.second.first] = Object::Tensor(pair.second.second);
}
void Compile(const Module& mod_ref,
const TargetsMap& targets,
const tvm::Target& target_host) {
CHECK_EQ(targets.size(), 1)
<< "Currently VM compiler doesn't support heterogeneous compilation";
targets_ = targets;
target_host_ = target_host;
vm_ = std::make_shared<VirtualMachine>();
// Run some optimizations first, this code should
// be moved to pass manager.
context_.module = OptimizeModule(mod_ref);
// Populate the global map.
//
// This maps global variables to a global index
// in the VMFunction table.
PopulateGlobalMap();
for (auto named_func : context_.module->functions) {
auto gvar = named_func.first;
auto func = named_func.second;
VMFunctionCompiler func_compiler(&context_, targets_);
auto vm_func = func_compiler.Compile(gvar, func);
// Next we populate constant map.
auto constant_analysis_result = LayoutConstantPool(context_.module);
context_.const_map = std::get<0>(constant_analysis_result);
context_.const_tensor_shape_map = std::get<1>(constant_analysis_result);
// Next we get ready by allocating space for
// the global state.
vm_->functions.resize(context_.module->functions.size());
vm_->constants.resize(context_.const_map.size() + context_.const_tensor_shape_map.size());
for (auto pair : context_.const_map) {
vm_->constants[pair.second] = Object::Tensor(pair.first->data);
}
for (auto pair : context_.const_tensor_shape_map) {
vm_->constants[pair.second.first] = Object::Tensor(pair.second.second);
}
for (auto named_func : context_.module->functions) {
auto gvar = named_func.first;
auto func = named_func.second;
VMFunctionCompiler func_compiler(&context_, targets_);
auto vm_func = func_compiler.Compile(gvar, func);
size_t func_index = context_.global_map.at(gvar);
CHECK(func_index < vm_->functions.size());
vm_->functions[func_index] = vm_func;
}
size_t func_index = context_.global_map.at(gvar);
CHECK(func_index < vm_->functions.size());
vm_->functions[func_index] = vm_func;
}
#if USE_RELAY_DEBUG
for (auto vm_func : vm_->functions) {
DLOG(INFO) << vm_func << "-------------";
}
for (auto vm_func : vm_->functions) {
DLOG(INFO) << vm_func << "-------------";
}
#endif // USE_RELAY_DEBUG
LibraryCodegen();
LibraryCodegen();
for (auto gv : context_.global_map) {
vm_->global_map.insert({gv.first->name_hint, gv.second});
}
for (auto gv : context_.global_map) {
vm_->global_map.insert({gv.first->name_hint, gv.second});
}
}
protected:
Module OptimizeModule(const Module& mod) {
// TODO(@icemelon9): check number of targets and build config, add more optimization pass
transform::Sequential seq({transform::SimplifyInference(),
transform::ToANormalForm(),
transform::InlinePrimitives(),
transform::LambdaLift(),
transform::InlinePrimitives(),
transform::FuseOps()});
auto pass_ctx = transform::PassContext::Create();
tvm::With<relay::transform::PassContext> ctx(pass_ctx);
return seq(mod);
}
void PopulateGlobalMap() {
// First we populate global map.
size_t global_index = 0;
for (auto named_func : context_.module->functions) {
auto gvar = named_func.first;
context_.global_map.insert({gvar, global_index++});
}
}
Module VMCompiler::OptimizeModule(const Module& mod) {
// TODO(@icemelon9): check number of targets and build config, add more optimization pass
transform::Sequential seq({transform::SimplifyInference(),
transform::ToANormalForm(),
transform::InlinePrimitives(),
transform::LambdaLift(),
transform::InlinePrimitives(),
transform::FuseOps()});
auto pass_ctx = transform::PassContext::Create();
tvm::With<relay::transform::PassContext> ctx(pass_ctx);
return seq(mod);
}
void LibraryCodegen() {
auto const& lowered_funcs = context_.lowered_funcs;
if (lowered_funcs.size() == 0) {
return;
}
// TODO(@icemelon9): support heterogeneous targets
Target target;
for (auto kv : targets_) {
target = kv.second;
}
if (const auto* f = runtime::Registry::Get("relay.backend.build")) {
runtime::Module mod =
(*f)(tvm::Array<LoweredFunc>(lowered_funcs.begin(), lowered_funcs.end()), target,
target_host_);
CHECK(mod.operator->());
vm_->lib = mod;
} else {
LOG(FATAL) << "relay.backend.build is not registered";
}
size_t primitive_index = 0;
for (auto lfunc : lowered_funcs) {
vm_->primitive_map.insert({lfunc->name, primitive_index++});
}
void VMCompiler::PopulateGlobalMap() {
// First we populate global map.
size_t global_index = 0;
for (auto named_func : context_.module->functions) {
auto gvar = named_func.first;
context_.global_map.insert({gvar, global_index++});
}
}
protected:
/*! \brief Target devices. */
TargetsMap targets_;
/*! \brief Target host device. */
tvm::Target target_host_;
/*! \brief Global shared meta data */
VMCompilerContext context_;
/*! \brief Compiled virtual machine. */
std::shared_ptr<VirtualMachine> vm_;
};
void VMCompiler::LibraryCodegen() {
auto const& lowered_funcs = context_.lowered_funcs;
if (lowered_funcs.size() == 0) {
return;
}
// TODO(@icemelon9): support heterogeneous targets
Target target;
for (auto kv : targets_) {
target = kv.second;
}
if (const auto* f = runtime::Registry::Get("relay.backend.build")) {
runtime::Module mod =
(*f)(tvm::Array<LoweredFunc>(lowered_funcs.begin(), lowered_funcs.end()), target,
target_host_);
CHECK(mod.operator->());
vm_->lib = mod;
} else {
LOG(FATAL) << "relay.backend.build is not registered";
}
size_t primitive_index = 0;
for (auto lfunc : lowered_funcs) {
vm_->primitive_map.insert({lfunc->name, primitive_index++});
}
}
runtime::Module CreateVMCompiler() {
std::shared_ptr<VMCompiler> exec = std::make_shared<VMCompiler>();
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2019 by Contributors
* \file src/relay/backend/vm/compiler.h
* \brief A compiler from relay::Module to the VM byte code.
*/
#ifndef TVM_RELAY_BACKEND_VM_COMPILER_H_
#define TVM_RELAY_BACKEND_VM_COMPILER_H_
#include <tvm/relay/error.h>
#include <tvm/relay/expr_functor.h>
#include <tvm/relay/interpreter.h>
#include <tvm/logging.h>
#include <tvm/relay/transform.h>
#include <tvm/runtime/vm.h>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "../../../runtime/vm/profiler/vm.h"
#include "../../../runtime/vm/naive_allocator.h"
#include "../../backend/compile_engine.h"
#include "../../pass/pass_util.h"
namespace tvm {
namespace relay {
namespace vm {
using namespace tvm::runtime;
using namespace tvm::runtime::vm;
using namespace relay::transform;
template <typename T, typename U>
using NodeMap = std::unordered_map<T, U, NodeHash, NodeEqual>;
using TagMap = NodeMap<tvm::relay::Constructor, Index>;
using TagNameMap = std::unordered_map<size_t, tvm::relay::Constructor>;
using GlobalMap = NodeMap<GlobalVar, Index>;
using ConstMap = NodeMap<Constant, Index>;
using ConstTensorShapeMap = NodeMap<TensorType, std::pair<Index, NDArray>>;
using TargetsMap = Map<tvm::Integer, tvm::Target>;
struct VMCompilerContext {
// The module context for the compilation
Module module;
// Error reporter
ErrorReporter err_reporter;
// Map from a unique integer to ADT constructor tag
TagNameMap tag_index_map;
// Map from ADT constructor tag to a unique integer
TagMap tag_map;
// Map from global var to a unique integer
GlobalMap global_map;
// Map from Const object to its index in const pool
ConstMap const_map;
// Map from Const tensor shape to its index in const pool
ConstTensorShapeMap const_tensor_shape_map;
// List of lowered functions
std::vector<LoweredFunc> lowered_funcs;
// The functions that have been lowered.
std::unordered_map<LoweredFunc, size_t, NodeHash, NodeEqual> seen_funcs;
};
class VMCompiler : public runtime::ModuleNode {
public:
virtual ~VMCompiler() {}
virtual PackedFunc GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self);
const char* type_key() const {
return "VMCompiler";
}
std::shared_ptr<VirtualMachine> GetVirtualMachine() const {
return vm_;
}
virtual void InitVM() {
vm_ = std::make_shared<VirtualMachine>();
}
void Compile(const Module& mod_ref,
const TargetsMap& targets,
const tvm::Target& target_host);
protected:
Module OptimizeModule(const Module& mod);
void PopulateGlobalMap();
void LibraryCodegen();
protected:
/*! \brief Target devices. */
TargetsMap targets_;
/*! \brief Target host device. */
tvm::Target target_host_;
/*! \brief Global shared meta data */
VMCompilerContext context_;
/*! \brief Compiled virtual machine. */
std::shared_ptr<VirtualMachine> vm_;
};
} // namespace vm
} // namespace relay
} // namespace tvm
#endif // TVM_RELAY_BACKEND_VM_COMPILER_H_
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2019 by Contributors
* \file src/relay/backend/vm/profiler/compiler.cc
* \brief A compiler from relay::Module to the VM byte code.
*/
#include "../../../../runtime/vm/profiler/vm.h"
#include "../compiler.h"
namespace tvm {
namespace relay {
namespace vm {
class VMCompilerDebug : public VMCompiler {
public:
VMCompilerDebug() {}
void InitVM() override { vm_ = std::make_shared<VirtualMachineDebug>(); }
virtual ~VMCompilerDebug() {}
};
runtime::Module CreateVMCompilerDebug() {
std::shared_ptr<VMCompilerDebug> exec = std::make_shared<VMCompilerDebug>();
return runtime::Module(exec);
}
TVM_REGISTER_GLOBAL("relay._vm._VMCompilerProfiler")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = CreateVMCompilerDebug();
});
} // namespace vm
} // namespace relay
} // namespace tvm
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2019 by Contributors
* \file src/runtime/vm/profiler/vm.cc
* \brief The Relay debug virtual machine.
*/
#include <tvm/runtime/registry.h>
#include <tvm/runtime/vm.h>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "vm.h"
namespace tvm {
namespace runtime {
namespace vm {
PackedFunc VirtualMachineDebug::GetFunction(
const std::string& name, const std::shared_ptr<ModuleNode>& sptr_to_self) {
if (name == "get_stat") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
double total_duration = 0.0;
std::ostringstream os;
os << std::setw(30) << std::left << "#OpName"
<< "\t" << std::setw(10) << std::left << "#InvokeCount"
<< "\t"
<< "#Duration(us): Sum/Mean/Min/Max" << std::endl;
for (auto kv : op_durations) {
auto vals = op_durations[kv.first];
auto sum = std::accumulate(vals.begin(), vals.end(), 0.0);
auto mean = sum / static_cast<double>(vals.size());
auto min_value = *std::min_element(vals.begin(), vals.end());
auto max_value = *std::max_element(vals.begin(), vals.end());
os << std::setw(30) << std::left << packed_index_map[kv.first] << "\t"
<< std::setw(10) << std::left << op_invokes[kv.first] << "\t"
<< sum << "/" << mean << "/" << min_value << "/" << max_value << std::endl;
total_duration += sum;
}
os << "Total Duration " << total_duration << " us" << std::endl;
*rv = os.str();
});
} else if (name == "init") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
CHECK_EQ(args.size() % 2, 0);
std::vector<TVMContext> contexts;
for (int i = 0; i < args.size() / 2; ++i) {
TVMContext ctx;
int device_type = args[i * 2];
ctx.device_type = DLDeviceType(device_type);
ctx.device_id = args[i * 2 + 1];
contexts.push_back(ctx);
}
this->Init(contexts);
});
} else {
return VirtualMachine::GetFunction(name, sptr_to_self);
}
}
void VirtualMachineDebug::Init(const std::vector<TVMContext>& ctxs) {
VirtualMachine::Init(ctxs);
for (auto kv : primitive_map) {
packed_index_map[kv.second] = kv.first;
op_invokes[kv.second] = 0;
}
}
void VirtualMachineDebug::InvokePacked(Index packed_index,
const PackedFunc& func, Index arg_count,
Index output_size,
const std::vector<Object>& args) {
auto ctx = VirtualMachine::GetParamsContext();
auto op_begin = std::chrono::high_resolution_clock::now();
VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size,
args);
TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
auto op_end = std::chrono::high_resolution_clock::now();
double op_duration =
std::chrono::duration_cast<std::chrono::duration<double> >(op_end -
op_begin)
.count();
op_durations[packed_index].push_back(op_duration * 1e6);
op_invokes[packed_index] += 1;
}
} // namespace vm
} // namespace runtime
} // namespace tvm
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2019 by Contributors
* \file src/runtime/vm/profiler/vm.h
* \brief The Relay debug virtual machine.
*/
#ifndef TVM_RUNTIME_VM_PROFILER_VM_H_
#define TVM_RUNTIME_VM_PROFILER_VM_H_
#include <tvm/runtime/vm.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
namespace tvm {
namespace runtime {
namespace vm {
class VirtualMachineDebug : public VirtualMachine {
public:
VirtualMachineDebug() : VirtualMachine() {}
PackedFunc GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) final;
void InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count,
Index output_size, const std::vector<Object>& args) final;
~VirtualMachineDebug() {}
private:
void Init(const std::vector<TVMContext>& ctxs);
std::unordered_map<Index, std::string> packed_index_map;
std::unordered_map<Index, std::vector<double>> op_durations;
std::unordered_map<Index, int> op_invokes;
};
} // namespace vm
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_VM_PROFILER_VM_H_
......@@ -721,8 +721,9 @@ Object VirtualMachine::Invoke(const std::string& name, const std::vector<Object>
return Invoke(this->functions[func_index], args);
}
void InvokePacked(const PackedFunc& func, Index arg_count, Index output_size,
const std::vector<Object>& args) {
void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func,
Index arg_count, Index output_size,
const std::vector<Object>& args) {
size_t arity = 0;
for (Index i = 0; i < arg_count; i++) {
if (args[i].ptr_->tag == ObjectTag::kDatatype) {
......@@ -846,7 +847,7 @@ void VirtualMachine::RunLoop() {
for (Index i = 0; i < arity; ++i) {
args.push_back(ReadRegister(instr.packed_args[i]));
}
InvokePacked(func, arity, instr.output_size, args);
InvokePacked(instr.packed_index, func, arity, instr.output_size, args);
for (Index i = 0; i < instr.output_size; ++i) {
WriteRegister(instr.packed_args[instr.arity - instr.output_size + i],
args[instr.arity - instr.output_size + i]);
......
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
import tvm
import numpy as np
from nose.tools import nottest
from tvm import relay
from tvm.relay.testing import resnet
@nottest
def test_basic():
mod, params = resnet.get_workload()
compiler = relay.profiler_vm.VMCompilerProfiler()
target = 'llvm'
ctx = tvm.cpu()
vm = compiler.compile(mod, target)
vm.init(ctx)
vm.load_params(params)
data = np.random.rand(1, 3, 224, 224).astype('float32')
res = vm.invoke("main", [data])
print("\n{}".format(vm.get_stat()))
if __name__ == "__main__":
test_basic()