Commit b1188485 by Lianmin Zheng Committed by Tianqi Chen

[RUNTIME] Add min_repeat_ms to time_evaluator (#2200)

parent 1e78d41c
...@@ -187,8 +187,10 @@ def measure_option(builder, runner): ...@@ -187,8 +187,10 @@ def measure_option(builder, runner):
Note Note
---- ----
To make measurement results accurate, you should pick the correct value for the argument To make measurement results accurate, you should pick the correct value for the argument
`number` and `repeat` in Runner(). Using `min_repeat_ms` can dynamically adjusts `number`, `number` and `repeat` in Runner(). Some devices need a certain minimum running time to
so it is recommended. The typical value for NVIDIA GPU is 100 ms. "warm up," such as GPUs that need time to reach a performance power state.
Using `min_repeat_ms` can dynamically adjust `number`, so it is recommended.
The typical value for NVIDIA GPU is 150 ms.
""" """
from .measure_methods import LocalBuilder, LocalRunner from .measure_methods import LocalBuilder, LocalRunner
......
...@@ -140,20 +140,22 @@ class RPCRunner(Runner): ...@@ -140,20 +140,22 @@ class RPCRunner(Runner):
The host address of RPC Tracker The host address of RPC Tracker
port: int port: int
The port of RPC Tracker The port of RPC Tracker
number : int, optional number: int
Number of times to do measurement for tasking average The number of times to run the generated code for taking average.
We call these runs as one `repeat` of measurement.
repeat : int, optional repeat : int, optional
Number of times to repeat the measurement. The number of times to repeat the measurement.
In total, the generated code will be run (1 + number x repeat) times, In total, the generated code will be run (1 + number x repeat) times,
where the first one is warm up. The returned result contains `repeat` costs, where the first "1" is warm up and will be discarded.
min_repeat_ms : float, optional The returned result contains `repeat` costs,
Minimum duration of a timer measurement in milliseconds. each of which is an average of `number` costs.
When the run time of a measurement trial falls below this time, the min_repeat_ms: int, optional
`number` parameter will be automatically increased. The minimum duration of one `repeat` in milliseconds.
Set this to improve the accuracy of perf measurement, e.g., when timers By default, one `repeat` contains `number` runs. If this parameter is set,
are not precise enough to capture short-running tasks. This parameter is the parameters `number` will be dynamically adjusted to meet the
also critical when devices need a certain minimum running time to "warm minimum duration requirement of one `repeat`.
up," such as GPUs that need time to reach a performance power state. i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval: float, optional cooldown_interval: float, optional
The cool down interval between two measurements. The cool down interval between two measurements.
check_correctness: bool, optional check_correctness: bool, optional
...@@ -177,7 +179,6 @@ class RPCRunner(Runner): ...@@ -177,7 +179,6 @@ class RPCRunner(Runner):
self.number = number self.number = number
self.repeat = repeat self.repeat = repeat
self.min_repeat_ms = min_repeat_ms self.min_repeat_ms = min_repeat_ms
self.cur_number = number
self.ref_input = None self.ref_input = None
self.ref_output = None self.ref_output = None
...@@ -188,7 +189,6 @@ class RPCRunner(Runner): ...@@ -188,7 +189,6 @@ class RPCRunner(Runner):
def set_task(self, task): def set_task(self, task):
self.task = task self.task = task
self.cur_number = self.number
if check_remote(task.target, self.key, self.host, self.port): if check_remote(task.target, self.key, self.host, self.port):
logger.info("Get devices for measurement successfully!") logger.info("Get devices for measurement successfully!")
...@@ -240,8 +240,9 @@ class RPCRunner(Runner): ...@@ -240,8 +240,9 @@ class RPCRunner(Runner):
ret = self.executor.submit(run_through_rpc, ret = self.executor.submit(run_through_rpc,
measure_inp, measure_inp,
build_res, build_res,
self.cur_number, self.number,
self.repeat, self.repeat,
self.min_repeat_ms,
self.cooldown_interval, self.cooldown_interval,
remote_args, remote_args,
self.ref_input, self.ref_input,
...@@ -256,32 +257,6 @@ class RPCRunner(Runner): ...@@ -256,32 +257,6 @@ class RPCRunner(Runner):
else: else:
results.append(res) results.append(res)
# If some runs were too fast, do remeasure for them
# to meet the requirement of `min_repeat_ms`
remeasure = np.zeros((len(measure_inputs),), dtype=np.bool)
pre_number = next_number = self.cur_number
min_repeat_duration = self.min_repeat_ms / 1000.0
for i, res in enumerate(results):
if res.error_no == MeasureErrorNo.NO_ERROR:
if np.mean(res.costs) * pre_number <= min_repeat_duration:
next_number = max(next_number,
int(np.ceil(min_repeat_duration / np.mean(res.costs))))
remeasure[i] = True
if pre_number != next_number:
self.cur_number = next_number
msg = "increasing number to %d" % self.cur_number
logger.info(msg)
re_measure_inputs = [x for i, x in enumerate(measure_inputs) if remeasure[i]]
re_build_results = [x for i, x in enumerate(build_results) if remeasure[i]]
re_res = self.run(re_measure_inputs, re_build_results)
ct = 0
for i, rerun in enumerate(remeasure):
if rerun:
results[i] = re_res[ct]
ct += 1
return results return results
class LocalRunner(RPCRunner): class LocalRunner(RPCRunner):
...@@ -291,21 +266,22 @@ class LocalRunner(RPCRunner): ...@@ -291,21 +266,22 @@ class LocalRunner(RPCRunner):
---------- ----------
timeout: float timeout: float
The timeout of a compilation The timeout of a compilation
number : int, optional number: int
Number of times to do measurement for tasking average The number of times to run the generated code for taking average.
We call these runs as one `repeat` of measurement.
repeat : int, optional repeat : int, optional
Number of times to repeat the measurement. The number of times to repeat the measurement.
In total, the generated code will be run (1 + number x repeat) times, In total, the generated code will be run (1 + number x repeat) times,
where the first one is warm up. The returned result contains `repeat` costs, where the first one is warm up and will be discarded.
each of which is the average of `number` test run. The returned result contains `repeat` costs,
min_repeat_ms : float, optional each of which is an average of `number` costs.
Minimum duration of a timer measurement in milliseconds. min_repeat_ms: int, optional
When the run time of a measurement trial falls below this time, the The minimum duration of one `repeat` in milliseconds.
`number` parameter will be automatically increased. By default, one `repeat` contains `number` runs. If this parameter is set,
Set this to improve the accuracy of perf measurement, e.g., when timers the parameters `number` will be dynamically adjusted to meet the
are not precise enough to capture short-running tasks. This parameter is minimum duration requirement of one `repeat`.
also critical when devices need a certain minimum running time to "warm i.e., When the run time of one `repeat` falls below this time, the `number` parameter
up," such as GPUs that need time to reach a performance power state. will be automatically increased.
cooldown_interval: float, optional cooldown_interval: float, optional
The cool down interval between two measurements. The cool down interval between two measurements.
check_correctness: bool, optional check_correctness: bool, optional
...@@ -416,7 +392,7 @@ def android_ndk_build_func(measure_input, tmp_dir, **kwargs): ...@@ -416,7 +392,7 @@ def android_ndk_build_func(measure_input, tmp_dir, **kwargs):
def run_through_rpc(measure_input, build_result, def run_through_rpc(measure_input, build_result,
number, repeat, cooldown_interval, number, repeat, min_repeat_ms, cooldown_interval,
remote_args, ref_input=None, ref_output=None): remote_args, ref_input=None, ref_output=None):
"""Run a generated library through rpc """Run a generated library through rpc
...@@ -426,13 +402,22 @@ def run_through_rpc(measure_input, build_result, ...@@ -426,13 +402,22 @@ def run_through_rpc(measure_input, build_result,
The raw measure input The raw measure input
build_result: BuildResult build_result: BuildResult
The result returned from Builder. This contains the path to the generated library. The result returned from Builder. This contains the path to the generated library.
number : int, optional number: int
Number of times to do measurement for tasking average The number of times to run the generated code for taking average.
We call these runs as one `repeat` of measurement.
repeat : int, optional repeat : int, optional
Number of times to repeat the measurement. The number of times to repeat the measurement.
In total, the generated code will be run (1 + number x repeat) times, In total, the generated code will be run (1 + number x repeat) times,
where the first one is warm up. The returned result contains `repeat` costs, where the first one is warm up and will be discarded.
each of which is the average of `number` test run. The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameter `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
That is, when the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval: float cooldown_interval: float
The cool down interval between two measurements The cool down interval between two measurements
remote_args: Tuple remote_args: Tuple
...@@ -454,14 +439,14 @@ def run_through_rpc(measure_input, build_result, ...@@ -454,14 +439,14 @@ def run_through_rpc(measure_input, build_result,
func = remote.load_module(os.path.split(build_result.filename)[1]) func = remote.load_module(os.path.split(build_result.filename)[1])
ctx = remote.context(str(measure_input.target), 0) ctx = remote.context(str(measure_input.target), 0)
time_f = func.time_evaluator( time_f = func.time_evaluator(
func.entry_name, ctx, number=number, repeat=repeat) func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms)
# set input # set input
if ref_input: if ref_input:
args = [nd.array(x, ctx=ctx) for x in ref_input] args = [nd.array(x, ctx=ctx) for x in ref_input]
else: else:
# create empty arrays on the remote device and copy them once. # create empty arrays on the remote device and copy them once.
# This can avoid some memory issues that make the measurment results unreliable. # This can avoid some memory issues that make the measurement results unreliable.
args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info] args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
args = [nd.array(x, ctx=ctx) for x in args] args = [nd.array(x, ctx=ctx) for x in args]
ctx.sync() ctx.sync()
......
...@@ -127,7 +127,7 @@ class Module(ModuleBase): ...@@ -127,7 +127,7 @@ class Module(ModuleBase):
kwargs.update({'options': ["-I" + path for path in find_include_path()]}) kwargs.update({'options': ["-I" + path for path in find_include_path()]})
fcompile(file_name, files, **kwargs) fcompile(file_name, files, **kwargs)
def time_evaluator(self, func_name, ctx, number, repeat=1): def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0):
"""Get an evaluator that measures time cost of running function. """Get an evaluator that measures time cost of running function.
Parameters Parameters
...@@ -139,26 +139,38 @@ class Module(ModuleBase): ...@@ -139,26 +139,38 @@ class Module(ModuleBase):
The context we should run this function on. The context we should run this function on.
number: int number: int
The number of steps used in measuring each time interval The number of times to run this function for taking average.
We call these runs as one `repeat` of measurement.
repeat: int, optional repeat: int, optional
Number of times to run the timer measurement The number of times to repeat the measurement.
If repeat equals 3, then we will get 3 numbers in the ProfileResult. In total, the function will be invoked (1 + number x repeat) times,
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameter `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
That is, when the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
Note Note
---- ----
The function will be invoked repeat * number + 1 times, The function will be invoked (1 + number x repeat) times,
with the first call discarded in case there is lazy initialization. with the first call discarded in case there is lazy initialization.
Returns Returns
------- -------
ftimer : Function ftimer : Function
The function that takes same argument as func The function that takes same argument as func and returns a ProfileResult.
and return a float representing seconds per function call. The ProfileResult reports `repeat` time costs in seconds.
""" """
try: try:
feval = _RPCTimeEvaluator( feval = _RPCTimeEvaluator(
self, func_name, ctx.device_type, ctx.device_id, number, repeat) self, func_name, ctx.device_type, ctx.device_id, number, repeat, min_repeat_ms)
def evaluator(*args): def evaluator(*args):
"""Internal wrapped evaluator.""" """Internal wrapped evaluator."""
......
...@@ -124,10 +124,11 @@ class RPCModuleNode final : public ModuleNode { ...@@ -124,10 +124,11 @@ class RPCModuleNode final : public ModuleNode {
PackedFunc GetTimeEvaluator(const std::string& name, PackedFunc GetTimeEvaluator(const std::string& name,
TVMContext ctx, TVMContext ctx,
int number, int number,
int repeat) { int repeat,
int min_repeat_ms) {
RPCFuncHandle handle = GetFuncHandle(name); RPCFuncHandle handle = GetFuncHandle(name);
if (handle == nullptr) return PackedFunc(); if (handle == nullptr) return PackedFunc();
handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat); handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat, min_repeat_ms);
return WrapRemote(handle); return WrapRemote(handle);
} }
...@@ -203,10 +204,10 @@ TVM_REGISTER_GLOBAL("module._RPCTimeEvaluator") ...@@ -203,10 +204,10 @@ TVM_REGISTER_GLOBAL("module._RPCTimeEvaluator")
ctx.device_id = args[3]; ctx.device_id = args[3];
if (tkey == "rpc") { if (tkey == "rpc") {
*rv = static_cast<RPCModuleNode*>(m.operator->()) *rv = static_cast<RPCModuleNode*>(m.operator->())
->GetTimeEvaluator(args[1], ctx, args[4], args[5]); ->GetTimeEvaluator(args[1], ctx, args[4], args[5], args[6]);
} else { } else {
*rv = WrapTimeEvaluator( *rv = WrapTimeEvaluator(
m.GetFunction(args[1], false), ctx, args[4], args[5]); m.GetFunction(args[1], false), ctx, args[4], args[5], args[6]);
} }
}); });
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
#include <chrono> #include <chrono>
#include <vector> #include <vector>
#include <utility> #include <utility>
#include <cmath>
#include <algorithm>
#include "rpc_session.h" #include "rpc_session.h"
#include "../../common/ring_buffer.h" #include "../../common/ring_buffer.h"
...@@ -1002,9 +1004,9 @@ void RPCSession::CopyFromRemote(void* from, ...@@ -1002,9 +1004,9 @@ void RPCSession::CopyFromRemote(void* from,
} }
RPCFuncHandle RPCSession::GetTimeEvaluator( RPCFuncHandle RPCSession::GetTimeEvaluator(
RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat) { RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat, int min_repeat_ms) {
return this->CallRemote( return this->CallRemote(
RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat); RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat, min_repeat_ms);
} }
// Event handler functions // Event handler functions
...@@ -1138,7 +1140,7 @@ void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) { ...@@ -1138,7 +1140,7 @@ void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) {
void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) { void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) {
PackedFunc *pf = static_cast<PackedFunc*>(args[0].operator void*()); PackedFunc *pf = static_cast<PackedFunc*>(args[0].operator void*());
void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3])); void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3], args[4]));
delete pf; delete pf;
*rv = fhandle; *rv = fhandle;
} }
...@@ -1190,21 +1192,41 @@ void RPCSession::EventHandler::HandlePackedCall() { ...@@ -1190,21 +1192,41 @@ void RPCSession::EventHandler::HandlePackedCall() {
CHECK_EQ(state_, kRecvCode); CHECK_EQ(state_, kRecvCode);
} }
PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat) { PackedFunc WrapTimeEvaluator(PackedFunc pf,
auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) { TVMContext ctx,
int number,
int repeat,
int min_repeat_ms) {
auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
TVMRetValue temp; TVMRetValue temp;
std::ostringstream os; std::ostringstream os;
// skip first time call, to activate lazy compilation components. // skip first time call, to activate lazy compilation components.
pf.CallPacked(args, &temp); pf.CallPacked(args, &temp);
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
// start timing std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> tbegin, tend;
auto tbegin = std::chrono::high_resolution_clock::now(); double duration_ms = 0.0;
for (int i = 0; i < number; ++i) {
pf.CallPacked(args, &temp); do {
} if (duration_ms > 0.0) {
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); number = static_cast<int>(
auto tend = std::chrono::high_resolution_clock::now(); std::max((min_repeat_ms / (duration_ms / number) + 1),
number * 1.618)); // 1.618 is chosen by random
}
tbegin = std::chrono::high_resolution_clock::now();
// start timing
for (int i = 0; i < number; ++i) {
pf.CallPacked(args, &temp);
}
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
tend = std::chrono::high_resolution_clock::now();
duration_ms = std::chrono::duration_cast<std::chrono::duration<double> >
(tend - tbegin).count() * 1000;
} while (duration_ms < min_repeat_ms);
double speed = std::chrono::duration_cast<std::chrono::duration<double> >( double speed = std::chrono::duration_cast<std::chrono::duration<double> >(
tend - tbegin).count() / number; tend - tbegin).count() / number;
os.write(reinterpret_cast<char*>(&speed), sizeof(speed)); os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
......
...@@ -151,14 +151,26 @@ class RPCSession { ...@@ -151,14 +151,26 @@ class RPCSession {
* *
* \param fhandle The function handle. * \param fhandle The function handle.
* \param ctx The ctx to run measurement on. * \param ctx The ctx to run measurement on.
* \param number How many steps to run in each time evaluation * \param number The number of times to run this function for taking average.
* \param repeat How many times to repeat the timer We call these runs as one `repeat` of measurement.
* \param repeat The number of times to repeat the measurement.
In total, the function will be invoked (1 + number x repeat) times,
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
* \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameter `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
That is, when the run time of one `repeat` falls below this time,
the `number` parameter will be automatically increased.
* \return A remote timer function * \return A remote timer function
*/ */
RPCFuncHandle GetTimeEvaluator(RPCFuncHandle fhandle, RPCFuncHandle GetTimeEvaluator(RPCFuncHandle fhandle,
TVMContext ctx, TVMContext ctx,
int number, int number,
int repeat); int repeat,
int min_repeat_ms);
/*! /*!
* \brief Call a remote defined system function with arguments. * \brief Call a remote defined system function with arguments.
* \param fcode The function code. * \param fcode The function code.
...@@ -221,13 +233,29 @@ class RPCSession { ...@@ -221,13 +233,29 @@ class RPCSession {
}; };
/*! /*!
* \brief Wrap a timer function for a given packed function. * \brief Wrap a timer function to measure the time cost of a given packed function.
* \param f The function argument. * \param f The function argument.
* \param ctx The context. * \param ctx The context.
* \param number Number of steps in the inner iteration * \param number The number of times to run this function for taking average.
* \param repeat How many steps to repeat the time evaluation. We call these runs as one `repeat` of measurement.
* \param repeat The number of times to repeat the measurement.
In total, the function will be invoked (1 + number x repeat) times,
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
* \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameter `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
That is, when the run time of one `repeat` falls below this time,
the `number` parameter will be automatically increased.
* \return f_timer A timer function.
*/ */
PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat); PackedFunc WrapTimeEvaluator(PackedFunc f,
TVMContext ctx,
int number,
int repeat,
int min_repeat_ms);
/*! /*!
* \brief Create a Global RPC module that refers to the session. * \brief Create a Global RPC module that refers to the session.
......
...@@ -69,29 +69,9 @@ def test_check_correctness(): ...@@ -69,29 +69,9 @@ def test_check_correctness():
callbacks=[_callback_wrong]) callbacks=[_callback_wrong])
def test_min_repeat_ms():
    """Tune a sample task and check that every successful measurement
    spent at least 100 ms per `repeat`.

    The runner is created with ``number=1`` and ``min_repeat_ms=100``; the
    callback multiplies the mean reported cost (converted to milliseconds)
    by the runner's ``cur_number`` and asserts the product reaches 100.
    """
    task, _target = get_sample_task()

    local_builder = autotvm.LocalBuilder()
    local_runner = autotvm.LocalRunner(number=1, min_repeat_ms=100)
    measure_option = autotvm.measure_option(
        builder=local_builder,
        runner=local_runner,
    )

    def _callback(tuner, measure_inputs, measure_results):
        for _inp, res in zip(measure_inputs, measure_results):
            # Only successful measurements (error_no == 0) are checked.
            if res.error_no == 0:
                repeat_ms = 1000 * np.mean(res.costs) * \
                    measure_option['runner'].cur_number
                assert repeat_ms >= 100

    random_tuner = autotvm.tuner.RandomTuner(task)
    random_tuner.tune(n_trial=5,
                      measure_option=measure_option,
                      callbacks=[_callback])
if __name__ == '__main__': if __name__ == '__main__':
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
test_task_tuner_without_measurement() test_task_tuner_without_measurement()
test_check_correctness() test_check_correctness()
test_min_repeat_ms()
import time
import ctypes
import tvm
from tvm.contrib.util import tempdir
def test_min_repeat_ms():
    """Check that `min_repeat_ms` makes the time evaluator run the function more often.

    The built function calls the packed function "my_debug", which sleeps
    for 0.1 s and appends one character to a log file. No newline is ever
    written, so the length of the log's first line equals the number of
    invocations made so far.
    """
    # Keep the temp-dir object bound to a local for the whole test.
    # NOTE(review): presumably the directory is removed once this object is
    # released -- confirm against tvm.contrib.util.tempdir.
    workdir = tempdir()
    log_path = workdir.relpath("log")

    @tvm.register_func
    def my_debug(filename):
        """Sleep for 0.1 s, then append a single "c" to the given file."""
        time.sleep(0.1)
        raw_path = ctypes.c_char_p(filename.value).value
        with open(raw_path, "a") as sink:
            sink.write("c")

    def logged_calls():
        """Return the length of the first line of the log file."""
        with open(log_path, "r") as source:
            return len(source.readline())

    marker = tvm.compute((), lambda: tvm.call_packed("my_debug", log_path))
    sched = tvm.create_schedule(marker.op)
    built = tvm.build(sched, [marker])
    out = tvm.nd.empty((), dtype="int32")

    # number=1, repeat=1 and no minimum duration: the log must hold
    # exactly two characters afterwards.
    plain_timer = built.time_evaluator(built.entry_name, tvm.cpu(),
                                       number=1, repeat=1)
    plain_timer(out)
    assert logged_calls() == 2

    # Same settings, but one `repeat` has to last at least 1000 ms.
    padded_timer = built.time_evaluator(built.entry_name, tvm.cpu(),
                                        number=1, repeat=1, min_repeat_ms=1000)
    padded_timer(out)

    # make sure we get more than 10 calls
    assert logged_calls() > 10 + 2
if __name__ == "__main__":
    # Allow running this test file directly as a script.
    test_min_repeat_ms()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment