Unverified Commit bb1b7db3 by Zhao Wu Committed by GitHub

[ThreadPool] Solve ARM BIG.LITTLE heterogeneous multicores (#4747)

parent c21d1ee8
......@@ -133,34 +133,44 @@ class ThreadGroup::Impl {
#endif
}
if (exclude_worker0) { // master thread run task
#if defined(__ANDROID__)
SetFullCpuAffinity();
#else
// if we set TVM_BIND_MASTER_THREAD to be 1, we will bind master thread
// to core 0.
const char* bind_master_thread = getenv("TVM_BIND_MASTER_THREAD");
if (bind_master_thread && atoi(bind_master_thread) == 1) {
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
if (reverse) {
CPU_SET(sorted_order_[sorted_order_.size() - 1], &cpuset);
} else {
CPU_SET(sorted_order_[0], &cpuset);
}
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
}
pthread_atfork(nullptr, nullptr, ThreadGroup::Impl::SetFullCpuAffinity);
#endif
// Master thread will have free migration on needed cores.
// Typically, the OS will schedule the master thread to run at core 0,
// which is idle, when other workers are running.
// See the comment inside SetMasterThreadFullCpuAffinity function to get more detail.
SetMasterThreadFullCpuAffinity(reverse);
}
#endif
}
static void SetFullCpuAffinity() {
void SetMasterThreadFullCpuAffinity(bool reverse) {
#if defined(__linux__) || defined(__ANDROID__)
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
for (unsigned i = 0; i < std::thread::hardware_concurrency(); i++) {
CPU_SET(i, &cpuset);
// For example, we have 2xA72 + 4xA53 (id is 0 - 5, 4, 5 is A72 big core)
// And we use config_threadpool API to set we will only use 4xA53.
// The sorted_order will be [4, 5, 0, 1, 2, 3].
// When to call this API, we have spawn threads on little cores for other workers
// in SetAffinity function. And for tvm master thread, it should also run on little cores,
// not big cores (4, 5).
// Note: this works well on x86 too. Because x86 doesn't have BIG.LITTLE,
// our implementation will use kBig mode by default and will let master thread
// run on intended cores.
if (reverse) {
for (int i = 0; i < little_count_; ++i) {
CPU_SET(sorted_order_[sorted_order_.size() - i - 1], &cpuset);
}
} else {
int big_count = big_count_;
// Imagine our x86 has cores 0 - 7
// physical cores are 0 - 3, logical cores are 4 - 7, big_count_ is 8
// we wish we run on physical cores, not logical cores to avoid contention issue.
#if defined(_M_X64) || defined(__x86_64__)
big_count /= 2; // ignore hyper-threading
#endif
for (int i = 0; i < big_count; ++i) {
CPU_SET(sorted_order_[i], &cpuset);
}
}
#if defined(__ANDROID__)
sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment