[RUNTIME] Add min_repeat_ms to time_evaluator (#2200)

b1188485 · Lianmin Zheng · Tianqi Chen · 1e78d41c · b1188485 · b1188485
Commit b1188485 authored Jan 01, 2019 by Lianmin Zheng Committed by Tianqi Chen Jan 01, 2019
8 changed files
--- a/python/tvm/autotvm/measure/measure.py
+++ b/python/tvm/autotvm/measure/measure.py
@@ -187,8 +187,10 @@ def measure_option(builder, runner):
    Note
    ----
    To make measurement results accurate, you should pick the correct value for the argument
-    `number` and `repeat` in Runner(). Using `min_repeat_ms` can dynamically adjusts `number`,
-    so it is recommended. The typical value for NVIDIA GPU is 100 ms.
+    `number` and `repeat` in Runner(). Some devices need a certain minimum running time to
+    "warm up," such as GPUs that need time to reach a performance power state.
+    Using `min_repeat_ms` can dynamically adjust `number`, so it is recommended.
+    The typical value for NVIDIA GPU is 150 ms.
    """
    from .measure_methods import LocalBuilder, LocalRunner


--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -140,20 +140,22 @@ class RPCRunner(Runner):
        The host address of RPC Tracker
    port: int
        The port of RPC Tracker
-    number : int, optional
-        Number of times to do measurement for tasking average
+    number: int
+        The number of times to run the generated code for taking average.
+        We call these runs one `repeat` of measurement.
    repeat : int, optional
-        Number of times to repeat the measurement.
+        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-    min_repeat_ms : float, optional
-        Minimum duration of a timer measurement in milliseconds.
-        When the run time of a measurement trial falls below this time, the
-        `number` parameter will be automatically increased.
-        Set this to improve the accuracy of perf measurement, e.g., when timers
-        are not precise enough to capture short-running tasks. This parameter is
-        also critical when devices need a certain minimum running time to "warm
-        up," such as GPUs that need time to reach a performance power state.
+        where the first one is warm up and will be discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
+    min_repeat_ms: int, optional
+        The minimum duration of one `repeat` in milliseconds.
+        By default, one `repeat` contains `number` runs. If this parameter is set,
+        the parameter `number` will be dynamically adjusted to meet the
+        minimum duration requirement of one `repeat`.
+        i.e., when the run time of one `repeat` falls below this time, the `number` parameter
+        will be automatically increased.
    cooldown_interval: float, optional
        The cool down interval between two measurements.
    check_correctness: bool, optional
@@ -177,7 +179,6 @@ class RPCRunner(Runner):
        self.number = number
        self.repeat = repeat
        self.min_repeat_ms = min_repeat_ms
-        self.cur_number = number

        self.ref_input = None
        self.ref_output = None
@@ -188,7 +189,6 @@ class RPCRunner(Runner):

    def set_task(self, task):
        self.task = task
-        self.cur_number = self.number

        if check_remote(task.target, self.key, self.host, self.port):
            logger.info("Get devices for measurement successfully!")
@@ -240,8 +240,9 @@ class RPCRunner(Runner):
                ret = self.executor.submit(run_through_rpc,
                                           measure_inp,
                                           build_res,
-                                           self.cur_number,
+                                           self.number,
                                           self.repeat,
+                                           self.min_repeat_ms,
                                           self.cooldown_interval,
                                           remote_args,
                                           self.ref_input,
@@ -256,32 +257,6 @@ class RPCRunner(Runner):
                else:
                    results.append(res)

-        # If some runs were too fast, do remeasure for them
-        # to meet the requirement of `min_repeat_ms`
-        remeasure = np.zeros((len(measure_inputs),), dtype=np.bool)
-        pre_number = next_number = self.cur_number
-        min_repeat_duration = self.min_repeat_ms / 1000.0
-        for i, res in enumerate(results):
-            if res.error_no == MeasureErrorNo.NO_ERROR:
-                if np.mean(res.costs) * pre_number <= min_repeat_duration:
-                    next_number = max(next_number,
-                                      int(np.ceil(min_repeat_duration / np.mean(res.costs))))
-                    remeasure[i] = True
-
-        if pre_number != next_number:
-            self.cur_number = next_number
-            msg = "increasing number to %d" % self.cur_number
-            logger.info(msg)
-
-            re_measure_inputs = [x for i, x in enumerate(measure_inputs) if remeasure[i]]
-            re_build_results = [x for i, x in enumerate(build_results) if remeasure[i]]
-            re_res = self.run(re_measure_inputs, re_build_results)
-            ct = 0
-            for i, rerun in enumerate(remeasure):
-                if rerun:
-                    results[i] = re_res[ct]
-                    ct += 1
-
        return results

 class LocalRunner(RPCRunner):
@@ -291,21 +266,22 @@ class LocalRunner(RPCRunner):
    ----------
    timeout: float
        The timeout of a compilation
-    number : int, optional
-        Number of times to do measurement for tasking average
+    number: int
+        The number of times to run the generated code for taking average.
+        We call these runs one `repeat` of measurement.
    repeat : int, optional
-        Number of times to repeat the measurement.
+        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-        each of which is the average of `number` test run.
-    min_repeat_ms : float, optional
-        Minimum duration of a timer measurement in milliseconds.
-        When the run time of a measurement trial falls below this time, the
-        `number` parameter will be automatically increased.
-        Set this to improve the accuracy of perf measurement, e.g., when timers
-        are not precise enough to capture short-running tasks. This parameter is
-        also critical when devices need a certain minimum running time to "warm
-        up," such as GPUs that need time to reach a performance power state.
+        where the first one is warm up and will be discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
+    min_repeat_ms: int, optional
+        The minimum duration of one `repeat` in milliseconds.
+        By default, one `repeat` contains `number` runs. If this parameter is set,
+        the parameter `number` will be dynamically adjusted to meet the
+        minimum duration requirement of one `repeat`.
+        i.e., when the run time of one `repeat` falls below this time, the `number` parameter
+        will be automatically increased.
    cooldown_interval: float, optional
        The cool down interval between two measurements.
    check_correctness: bool, optional
@@ -416,7 +392,7 @@ def android_ndk_build_func(measure_input, tmp_dir, **kwargs):


 def run_through_rpc(measure_input, build_result,
-                    number, repeat, cooldown_interval,
+                    number, repeat, min_repeat_ms, cooldown_interval,
                    remote_args, ref_input=None, ref_output=None):
    """Run a generated library through rpc

@@ -426,13 +402,22 @@ def run_through_rpc(measure_input, build_result,
        The raw measure input
    build_result: BuildResult
        The result returned from Builder. This contains the path to the generated library.
-    number : int, optional
-        Number of times to do measurement for tasking average
+    number: int
+        The number of times to run the generated code for taking average.
+        We call these runs one `repeat` of measurement.
    repeat : int, optional
-        Number of times to repeat the measurement.
+        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-        each of which is the average of `number` test run.
+        where the first one is warm up and will be discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
+    min_repeat_ms: int, optional
+        The minimum duration of one `repeat` in milliseconds.
+        By default, one `repeat` contains `number` runs. If this parameter is set,
+        the parameter `number` will be dynamically adjusted to meet the
+        minimum duration requirement of one `repeat`.
+        i.e., when the run time of one `repeat` falls below this time, the `number` parameter
+        will be automatically increased.
    cooldown_interval: float
        The cool down interval between two measurements
    remote_args: Tuple
@@ -454,14 +439,14 @@ def run_through_rpc(measure_input, build_result,
        func = remote.load_module(os.path.split(build_result.filename)[1])
        ctx = remote.context(str(measure_input.target), 0)
        time_f = func.time_evaluator(
-            func.entry_name, ctx, number=number, repeat=repeat)
+            func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms)

        # set input
        if ref_input:
            args = [nd.array(x, ctx=ctx) for x in ref_input]
        else:
            # create empty arrays on the remote device and copy them once.
-            # This can avoid some memory issues that make the measurment results unreliable.
+            # This can avoid some memory issues that make the measurement results unreliable.
            args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
            args = [nd.array(x, ctx=ctx) for x in args]
            ctx.sync()

--- a/python/tvm/module.py
+++ b/python/tvm/module.py
@@ -127,7 +127,7 @@ class Module(ModuleBase):
            kwargs.update({'options': ["-I" + path for path in find_include_path()]})
        fcompile(file_name, files, **kwargs)

-    def time_evaluator(self, func_name, ctx, number, repeat=1):
+    def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0):
        """Get an evaluator that measures time cost of running function.

        Parameters
@@ -139,26 +139,38 @@ class Module(ModuleBase):
            The context we should run this function on.

        number: int
-            The number of steps used in measuring each time interval
+            The number of times to run this function for taking average.
+            We call these runs one `repeat` of measurement.

        repeat: int, optional
-            Number of times to run the timer measurement
-            If repeat equals 3, then we will get 3 numbers in the ProfileResult.
+            The number of times to repeat the measurement.
+            In total, the function will be invoked (1 + number x repeat) times,
+            where the first one is warm up and will be discarded.
+            The returned result contains `repeat` costs,
+            each of which is an average of `number` costs.
+
+        min_repeat_ms: int, optional
+            The minimum duration of one `repeat` in milliseconds.
+            By default, one `repeat` contains `number` runs. If this parameter is set,
+            the parameter `number` will be dynamically adjusted to meet the
+            minimum duration requirement of one `repeat`.
+            i.e., when the run time of one `repeat` falls below this time, the `number` parameter
+            will be automatically increased.

        Note
        ----
-        The function will be invoked  repeat * number + 1 times,
+        The function will be invoked (1 + number x repeat) times,
        with the first call discarded in case there is lazy initialization.

        Returns
        -------
        ftimer : Function
-            The function that takes same argument as func
-            and return a float representing seconds per function call.
+            The function that takes same argument as func and returns a ProfileResult.
+            The ProfileResult reports `repeat` time costs in seconds.
        """
        try:
            feval = _RPCTimeEvaluator(
-                self, func_name, ctx.device_type, ctx.device_id, number, repeat)
+                self, func_name, ctx.device_type, ctx.device_id, number, repeat, min_repeat_ms)

            def evaluator(*args):
                """Internal wrapped evaluator."""

--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -124,10 +124,11 @@ class RPCModuleNode final : public ModuleNode {
  PackedFunc GetTimeEvaluator(const std::string& name,
                              TVMContext ctx,
                              int number,
-                              int repeat) {
+                              int repeat,
+                              int min_repeat_ms) {
    RPCFuncHandle handle = GetFuncHandle(name);
    if (handle == nullptr) return PackedFunc();
-    handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat);
+    handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat, min_repeat_ms);
    return WrapRemote(handle);
  }

@@ -203,10 +204,10 @@ TVM_REGISTER_GLOBAL("module._RPCTimeEvaluator")
    ctx.device_id = args[3];
    if (tkey == "rpc") {
      *rv = static_cast<RPCModuleNode*>(m.operator->())
-          ->GetTimeEvaluator(args[1], ctx, args[4], args[5]);
+          ->GetTimeEvaluator(args[1], ctx, args[4], args[5], args[6]);
    } else {
      *rv = WrapTimeEvaluator(
-          m.GetFunction(args[1], false), ctx, args[4], args[5]);
+          m.GetFunction(args[1], false), ctx, args[4], args[5], args[6]);
    }
  });


--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -13,6 +13,8 @@
 #include <chrono>
 #include <vector>
 #include <utility>
+#include <cmath>
+#include <algorithm>
 #include "rpc_session.h"
 #include "../../common/ring_buffer.h"

@@ -1002,9 +1004,9 @@ void RPCSession::CopyFromRemote(void* from,
 }

 RPCFuncHandle RPCSession::GetTimeEvaluator(
-    RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat) {
+    RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat, int min_repeat_ms) {
  return this->CallRemote(
-      RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat);
+      RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat, min_repeat_ms);
 }

 // Event handler functions
@@ -1138,7 +1140,7 @@ void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) {

 void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) {
  PackedFunc *pf = static_cast<PackedFunc*>(args[0].operator void*());
-  void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3]));
+  void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3], args[4]));
  delete pf;
  *rv = fhandle;
 }
@@ -1190,21 +1192,41 @@ void RPCSession::EventHandler::HandlePackedCall() {
  CHECK_EQ(state_, kRecvCode);
 }

-PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat) {
-  auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) {
+PackedFunc WrapTimeEvaluator(PackedFunc pf,
+                             TVMContext ctx,
+                             int number,
+                             int repeat,
+                             int min_repeat_ms) {
+  auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
    TVMRetValue temp;
    std::ostringstream os;
    // skip first time call, to activate lazy compilation components.
    pf.CallPacked(args, &temp);
    DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
+
    for (int i = 0; i < repeat; ++i) {
-      // start timing
-      auto tbegin = std::chrono::high_resolution_clock::now();
-      for (int i = 0; i < number; ++i) {
-        pf.CallPacked(args, &temp);
-      }
-      DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
-      auto tend = std::chrono::high_resolution_clock::now();
+      std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> tbegin, tend;
+      double duration_ms = 0.0;
+
+      do {
+        if (duration_ms > 0.0) {
+          number = static_cast<int>(
+              std::max((min_repeat_ms / (duration_ms / number) + 1),
+                       number * 1.618));   // 1.618 is an arbitrarily chosen growth factor
+        }
+
+        tbegin = std::chrono::high_resolution_clock::now();
+        // start timing
+        for (int i = 0; i < number; ++i) {
+          pf.CallPacked(args, &temp);
+        }
+        DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
+        tend = std::chrono::high_resolution_clock::now();
+
+        duration_ms = std::chrono::duration_cast<std::chrono::duration<double> >
+            (tend - tbegin).count() * 1000;
+      } while (duration_ms < min_repeat_ms);
+
      double speed = std::chrono::duration_cast<std::chrono::duration<double> >(
          tend - tbegin).count() / number;
      os.write(reinterpret_cast<char*>(&speed), sizeof(speed));

--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -151,14 +151,26 @@ class RPCSession {
   *
   * \param fhandle The function handle.
   * \param ctx The ctx to run measurement on.
-   * \param number How many steps to run in each time evaluation
-   * \param repeat How many times to repeat the timer
+   * \param number The number of times to run this function for taking average.
+          We call these runs one `repeat` of measurement.
+   * \param repeat The number of times to repeat the measurement.
+          In total, the function will be invoked (1 + number x repeat) times,
+          where the first one is warm up and will be discarded.
+          The returned result contains `repeat` costs,
+          each of which is an average of `number` costs.
+   * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
+          By default, one `repeat` contains `number` runs. If this parameter is set,
+          the parameter `number` will be dynamically adjusted to meet the
+          minimum duration requirement of one `repeat`.
+          i.e., when the run time of one `repeat` falls below this time,
+          the `number` parameter will be automatically increased.
   * \return A remote timer function
   */
  RPCFuncHandle GetTimeEvaluator(RPCFuncHandle fhandle,
                                 TVMContext ctx,
                                 int number,
-                                 int repeat);
+                                 int repeat,
+                                 int min_repeat_ms);
  /*!
   * \brief Call a remote defined system function with arguments.
   * \param fcode The function code.
@@ -221,13 +233,29 @@ class RPCSession {
 };

 /*!
- * \brief Wrap a timer function for a given packed function.
+ * \brief Wrap a timer function to measure the time cost of a given packed function.
 * \param f The function argument.
 * \param ctx The context.
- * \param number Number of steps in the inner iteration
- * \param repeat How many steps to repeat the time evaluation.
+ * \param number The number of times to run this function for taking average.
+          We call these runs one `repeat` of measurement.
+ * \param repeat The number of times to repeat the measurement.
+          In total, the function will be invoked (1 + number x repeat) times,
+          where the first one is warm up and will be discarded.
+          The returned result contains `repeat` costs,
+          each of which is an average of `number` costs.
+ * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
+          By default, one `repeat` contains `number` runs. If this parameter is set,
+          the parameter `number` will be dynamically adjusted to meet the
+          minimum duration requirement of one `repeat`.
+          i.e., when the run time of one `repeat` falls below this time,
+          the `number` parameter will be automatically increased.
+ * \return f_timer A timer function.
 */
-PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat);
+PackedFunc WrapTimeEvaluator(PackedFunc f,
+                             TVMContext ctx,
+                             int number,
+                             int repeat,
+                             int min_repeat_ms);

 /*!
 * \brief Create a Global RPC module that refers to the session.

--- a/tests/python/unittest/test_autotvm_measure.py
+++ b/tests/python/unittest/test_autotvm_measure.py
@@ -69,29 +69,9 @@ def test_check_correctness():
               callbacks=[_callback_wrong])


-def test_min_repeat_ms():
-    task, target = get_sample_task()
-
-    measure_option = autotvm.measure_option(
-        builder=autotvm.LocalBuilder(),
-        runner=autotvm.LocalRunner(number=1, min_repeat_ms=100)
-    )
-
-    def _callback(tuner, measure_inputs, measure_results):
-        for inp, res in zip(measure_inputs, measure_results):
-            if res.error_no != 0:
-                continue
-
-            assert 1000 * np.mean(res.costs) * \
-                   measure_option['runner'].cur_number >= 100
-
-    tuner = autotvm.tuner.RandomTuner(task)
-    tuner.tune(n_trial=5, measure_option=measure_option,
-               callbacks=[_callback])
-
 if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    test_task_tuner_without_measurement()
    test_check_correctness()
-    test_min_repeat_ms()
+
--- a/tests/python/unittest/test_runtime_measure.py
+++ b/tests/python/unittest/test_runtime_measure.py
+import time
+import ctypes
+
+import tvm
+from tvm.contrib.util import tempdir
+
+
+def test_min_repeat_ms():
+    tmp = tempdir()
+    filename = tmp.relpath("log")
+
+    @tvm.register_func
+    def my_debug(filename):
+        """one call lasts for 100 ms and writes one character to a file"""
+        time.sleep(0.1)
+        filename = ctypes.c_char_p(filename.value).value
+        with open(filename, "a") as fout:
+            fout.write("c")
+
+    X = tvm.compute((), lambda : tvm.call_packed("my_debug", filename))
+    s = tvm.create_schedule(X.op)
+    func = tvm.build(s, [X])
+
+    x = tvm.nd.empty((), dtype="int32")
+    ftimer = func.time_evaluator(func.entry_name, tvm.cpu(),
+                                 number=1, repeat=1)
+    ftimer(x)
+
+    with open(filename, "r") as fin:
+        ct = len(fin.readline())
+    
+    assert ct == 2
+
+
+    ftimer = func.time_evaluator(func.entry_name, tvm.cpu(),
+                                 number=1, repeat=1, min_repeat_ms=1000)
+    ftimer(x)
+
+    # make sure we get more than 10 calls
+    with open(filename, "r") as fin:
+        ct = len(fin.readline())
+
+    assert ct > 10 + 2
+        
+
+if __name__ == "__main__":
+    test_min_repeat_ms()
+