Commit b1188485 by Lianmin Zheng Committed by Tianqi Chen

[RUNTIME] Add min_repeat_ms to time_evaluator (#2200)

parent 1e78d41c
......@@ -187,8 +187,10 @@ def measure_option(builder, runner):
Note
----
To make measurement results accurate, you should pick the correct value for the argument
`number` and `repeat` in Runner(). Using `min_repeat_ms` can dynamically adjusts `number`,
so it is recommended. The typical value for NVIDIA GPU is 100 ms.
`number` and `repeat` in Runner(). Some devices need a certain minimum running time to
"warm up," such as GPUs that need time to reach a performance power state.
Using `min_repeat_ms` can dynamically adjusts `number`, so it is recommended.
The typical value for NVIDIA GPU is 150 ms.
"""
from .measure_methods import LocalBuilder, LocalRunner
......
......@@ -140,20 +140,22 @@ class RPCRunner(Runner):
The host address of RPC Tracker
port: int
The port of RPC Tracker
number : int, optional
Number of times to do measurement for tasking average
number: int
The number of times to run the generated code for taking average.
We call these runs as one `repeat` of measurement.
repeat : int, optional
Number of times to repeat the measurement.
The number of times to repeat the measurement.
In total, the generated code will be run (1 + number x repeat) times,
where the first one is warm up. The returned result contains `repeat` costs,
min_repeat_ms : float, optional
Minimum duration of a timer measurement in milliseconds.
When the run time of a measurement trial falls below this time, the
`number` parameter will be automatically increased.
Set this to improve the accuracy of perf measurement, e.g., when timers
are not precise enough to capture short-running tasks. This parameter is
also critical when devices need a certain minimum running time to "warm
up," such as GPUs that need time to reach a performance power state.
where the first "1" is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval: float, optional
The cool down interval between two measurements.
check_correctness: bool, optional
......@@ -177,7 +179,6 @@ class RPCRunner(Runner):
self.number = number
self.repeat = repeat
self.min_repeat_ms = min_repeat_ms
self.cur_number = number
self.ref_input = None
self.ref_output = None
......@@ -188,7 +189,6 @@ class RPCRunner(Runner):
def set_task(self, task):
self.task = task
self.cur_number = self.number
if check_remote(task.target, self.key, self.host, self.port):
logger.info("Get devices for measurement successfully!")
......@@ -240,8 +240,9 @@ class RPCRunner(Runner):
ret = self.executor.submit(run_through_rpc,
measure_inp,
build_res,
self.cur_number,
self.number,
self.repeat,
self.min_repeat_ms,
self.cooldown_interval,
remote_args,
self.ref_input,
......@@ -256,32 +257,6 @@ class RPCRunner(Runner):
else:
results.append(res)
# If some runs were too fast, do remeasure for them
# to meet the requirement of `min_repeat_ms`
remeasure = np.zeros((len(measure_inputs),), dtype=np.bool)
pre_number = next_number = self.cur_number
min_repeat_duration = self.min_repeat_ms / 1000.0
for i, res in enumerate(results):
if res.error_no == MeasureErrorNo.NO_ERROR:
if np.mean(res.costs) * pre_number <= min_repeat_duration:
next_number = max(next_number,
int(np.ceil(min_repeat_duration / np.mean(res.costs))))
remeasure[i] = True
if pre_number != next_number:
self.cur_number = next_number
msg = "increasing number to %d" % self.cur_number
logger.info(msg)
re_measure_inputs = [x for i, x in enumerate(measure_inputs) if remeasure[i]]
re_build_results = [x for i, x in enumerate(build_results) if remeasure[i]]
re_res = self.run(re_measure_inputs, re_build_results)
ct = 0
for i, rerun in enumerate(remeasure):
if rerun:
results[i] = re_res[ct]
ct += 1
return results
class LocalRunner(RPCRunner):
......@@ -291,21 +266,22 @@ class LocalRunner(RPCRunner):
----------
timeout: float
The timeout of a compilation
number : int, optional
Number of times to do measurement for tasking average
number: int
The number of times to run the generated code for taking average.
We call these runs as one `repeat` of measurement.
repeat : int, optional
Number of times to repeat the measurement.
The number of times to repeat the measurement.
In total, the generated code will be run (1 + number x repeat) times,
where the first one is warm up. The returned result contains `repeat` costs,
each of which is the average of `number` test run.
min_repeat_ms : float, optional
Minimum duration of a timer measurement in milliseconds.
When the run time of a measurement trial falls below this time, the
`number` parameter will be automatically increased.
Set this to improve the accuracy of perf measurement, e.g., when timers
are not precise enough to capture short-running tasks. This parameter is
also critical when devices need a certain minimum running time to "warm
up," such as GPUs that need time to reach a performance power state.
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval: float, optional
The cool down interval between two measurements.
check_correctness: bool, optional
......@@ -416,7 +392,7 @@ def android_ndk_build_func(measure_input, tmp_dir, **kwargs):
def run_through_rpc(measure_input, build_result,
number, repeat, cooldown_interval,
number, repeat, min_repeat_ms, cooldown_interval,
remote_args, ref_input=None, ref_output=None):
"""Run a generated library through rpc
......@@ -426,13 +402,22 @@ def run_through_rpc(measure_input, build_result,
The raw measure input
build_result: BuildResult
The result returned from Builder. This contains the path to the generated library.
number : int, optional
Number of times to do measurement for tasking average
number: int
The number of times to run the generated code for taking average.
We call these runs as one `repeat` of measurement.
repeat : int, optional
Number of times to repeat the measurement.
The number of times to repeat the measurement.
In total, the generated code will be run (1 + number x repeat) times,
where the first one is warm up. The returned result contains `repeat` costs,
each of which is the average of `number` test run.
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval: float
The cool down interval between two measurements
remote_args: Tuple
......@@ -454,14 +439,14 @@ def run_through_rpc(measure_input, build_result,
func = remote.load_module(os.path.split(build_result.filename)[1])
ctx = remote.context(str(measure_input.target), 0)
time_f = func.time_evaluator(
func.entry_name, ctx, number=number, repeat=repeat)
func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms)
# set input
if ref_input:
args = [nd.array(x, ctx=ctx) for x in ref_input]
else:
# create empty arrays on the remote device and copy them once.
# This can avoid some memory issues that make the measurment results unreliable.
# This can avoid some memory issues that make the measurement results unreliable.
args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
args = [nd.array(x, ctx=ctx) for x in args]
ctx.sync()
......
......@@ -127,7 +127,7 @@ class Module(ModuleBase):
kwargs.update({'options': ["-I" + path for path in find_include_path()]})
fcompile(file_name, files, **kwargs)
def time_evaluator(self, func_name, ctx, number, repeat=1):
def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0):
"""Get an evaluator that measures time cost of running function.
Parameters
......@@ -139,26 +139,38 @@ class Module(ModuleBase):
The context we should run this function on.
number: int
The number of steps used in measuring each time interval
The number of times to run this function for taking average.
We call these runs as one `repeat` of measurement.
repeat: int, optional
Number of times to run the timer measurement
If repeat equals 3, then we will get 3 numbers in the ProfileResult.
The number of times to repeat the measurement.
In total, the function will be invoked (1 + number x repeat) times,
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
Note
----
The function will be invoked repeat * number + 1 times,
The function will be invoked (1 + number x repeat) times,
with the first call discarded in case there is lazy initialization.
Returns
-------
ftimer : Function
The function that takes same argument as func
and return a float representing seconds per function call.
The function that takes same argument as func and returns a ProfileResult.
The ProfileResult reports `repeat` time costs in seconds.
"""
try:
feval = _RPCTimeEvaluator(
self, func_name, ctx.device_type, ctx.device_id, number, repeat)
self, func_name, ctx.device_type, ctx.device_id, number, repeat, min_repeat_ms)
def evaluator(*args):
"""Internal wrapped evaluator."""
......
......@@ -124,10 +124,11 @@ class RPCModuleNode final : public ModuleNode {
PackedFunc GetTimeEvaluator(const std::string& name,
TVMContext ctx,
int number,
int repeat) {
int repeat,
int min_repeat_ms) {
RPCFuncHandle handle = GetFuncHandle(name);
if (handle == nullptr) return PackedFunc();
handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat);
handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat, min_repeat_ms);
return WrapRemote(handle);
}
......@@ -203,10 +204,10 @@ TVM_REGISTER_GLOBAL("module._RPCTimeEvaluator")
ctx.device_id = args[3];
if (tkey == "rpc") {
*rv = static_cast<RPCModuleNode*>(m.operator->())
->GetTimeEvaluator(args[1], ctx, args[4], args[5]);
->GetTimeEvaluator(args[1], ctx, args[4], args[5], args[6]);
} else {
*rv = WrapTimeEvaluator(
m.GetFunction(args[1], false), ctx, args[4], args[5]);
m.GetFunction(args[1], false), ctx, args[4], args[5], args[6]);
}
});
......
......@@ -13,6 +13,8 @@
#include <chrono>
#include <vector>
#include <utility>
#include <cmath>
#include <algorithm>
#include "rpc_session.h"
#include "../../common/ring_buffer.h"
......@@ -1002,9 +1004,9 @@ void RPCSession::CopyFromRemote(void* from,
}
RPCFuncHandle RPCSession::GetTimeEvaluator(
RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat) {
RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat, int min_repeat_ms) {
return this->CallRemote(
RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat);
RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat, min_repeat_ms);
}
// Event handler functions
......@@ -1138,7 +1140,7 @@ void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) {
void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) {
PackedFunc *pf = static_cast<PackedFunc*>(args[0].operator void*());
void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3]));
void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3], args[4]));
delete pf;
*rv = fhandle;
}
......@@ -1190,21 +1192,41 @@ void RPCSession::EventHandler::HandlePackedCall() {
CHECK_EQ(state_, kRecvCode);
}
PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat) {
auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) {
PackedFunc WrapTimeEvaluator(PackedFunc pf,
TVMContext ctx,
int number,
int repeat,
int min_repeat_ms) {
auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
TVMRetValue temp;
std::ostringstream os;
// skip first time call, to activate lazy compilation components.
pf.CallPacked(args, &temp);
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
for (int i = 0; i < repeat; ++i) {
// start timing
auto tbegin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < number; ++i) {
pf.CallPacked(args, &temp);
}
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
auto tend = std::chrono::high_resolution_clock::now();
std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> tbegin, tend;
double duration_ms = 0.0;
do {
if (duration_ms > 0.0) {
number = static_cast<int>(
std::max((min_repeat_ms / (duration_ms / number) + 1),
number * 1.618)); // 1.618 is chosen by random
}
tbegin = std::chrono::high_resolution_clock::now();
// start timing
for (int i = 0; i < number; ++i) {
pf.CallPacked(args, &temp);
}
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
tend = std::chrono::high_resolution_clock::now();
duration_ms = std::chrono::duration_cast<std::chrono::duration<double> >
(tend - tbegin).count() * 1000;
} while (duration_ms < min_repeat_ms);
double speed = std::chrono::duration_cast<std::chrono::duration<double> >(
tend - tbegin).count() / number;
os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
......
......@@ -151,14 +151,26 @@ class RPCSession {
*
* \param fhandle The function handle.
* \param ctx The ctx to run measurement on.
* \param number How many steps to run in each time evaluation
* \param repeat How many times to repeat the timer
* \param number The number of times to run this function for taking average.
We call these runs as one `repeat` of measurement.
* \param repeat The number of times to repeat the measurement.
In total, the function will be invoked (1 + number x repeat) times,
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
* \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time,
the `number` parameter will be automatically increased.
* \return A remote timer function
*/
RPCFuncHandle GetTimeEvaluator(RPCFuncHandle fhandle,
TVMContext ctx,
int number,
int repeat);
int repeat,
int min_repeat_ms);
/*!
* \brief Call a remote defined system function with arguments.
* \param fcode The function code.
......@@ -221,13 +233,29 @@ class RPCSession {
};
/*!
* \brief Wrap a timer function for a given packed function.
* \brief Wrap a timer function to measure the time cost of a given packed function.
* \param f The function argument.
* \param ctx The context.
* \param number Number of steps in the inner iteration
* \param repeat How many steps to repeat the time evaluation.
* \param number The number of times to run this function for taking average.
We call these runs as one `repeat` of measurement.
* \param repeat The number of times to repeat the measurement.
In total, the function will be invoked (1 + number x repeat) times,
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
* \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time,
the `number` parameter will be automatically increased.
* \return f_timer A timer function.
*/
PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat);
PackedFunc WrapTimeEvaluator(PackedFunc f,
TVMContext ctx,
int number,
int repeat,
int min_repeat_ms);
/*!
* \brief Create a Global RPC module that refers to the session.
......
......@@ -69,29 +69,9 @@ def test_check_correctness():
callbacks=[_callback_wrong])
def test_min_repeat_ms():
task, target = get_sample_task()
measure_option = autotvm.measure_option(
builder=autotvm.LocalBuilder(),
runner=autotvm.LocalRunner(number=1, min_repeat_ms=100)
)
def _callback(tuner, measure_inputs, measure_results):
for inp, res in zip(measure_inputs, measure_results):
if res.error_no != 0:
continue
assert 1000 * np.mean(res.costs) * \
measure_option['runner'].cur_number >= 100
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=5, measure_option=measure_option,
callbacks=[_callback])
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
test_task_tuner_without_measurement()
test_check_correctness()
test_min_repeat_ms()
import time
import ctypes
import tvm
from tvm.contrib.util import tempdir
def test_min_repeat_ms():
tmp = tempdir()
filename = tmp.relpath("log")
@tvm.register_func
def my_debug(filename):
"""one call lasts for 100 ms and writes one character to a file"""
time.sleep(0.1)
filename = ctypes.c_char_p(filename.value).value
with open(filename, "a") as fout:
fout.write("c")
X = tvm.compute((), lambda : tvm.call_packed("my_debug", filename))
s = tvm.create_schedule(X.op)
func = tvm.build(s, [X])
x = tvm.nd.empty((), dtype="int32")
ftimer = func.time_evaluator(func.entry_name, tvm.cpu(),
number=1, repeat=1)
ftimer(x)
with open(filename, "r") as fin:
ct = len(fin.readline())
assert ct == 2
ftimer = func.time_evaluator(func.entry_name, tvm.cpu(),
number=1, repeat=1, min_repeat_ms=1000)
ftimer(x)
# make sure we get more than 10 calls
with open(filename, "r") as fin:
ct = len(fin.readline())
assert ct > 10 + 2
if __name__ == "__main__":
test_min_repeat_ms()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment