[ThreadPool] Solve ARM BIG.LITTLE heterogeneous multicores (#4747)

bb1b7db3 · Zhao Wu · GitHub · c21d1ee8 · bb1b7db3
Unverified Commit bb1b7db3 authored Feb 04, 2020 by Zhao Wu Committed by GitHub Feb 03, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 31 additions and 21 deletions

src/runtime/threading_backend.cc
+31 -21

No files found.
--- a/src/runtime/threading_backend.cc
+++ b/src/runtime/threading_backend.cc
@@ -133,34 +133,44 @@ class ThreadGroup::Impl {
 #endif
    }
    if (exclude_worker0) {  // master thread run task
-#if defined(__ANDROID__)
-      SetFullCpuAffinity();
-#else
-      // if we set TVM_BIND_MASTER_THREAD to be 1, we will bind master thread
-      // to core 0.
-      const char* bind_master_thread = getenv("TVM_BIND_MASTER_THREAD");
-      if (bind_master_thread && atoi(bind_master_thread) == 1) {
-        cpu_set_t cpuset;
-        CPU_ZERO(&cpuset);
-        if (reverse) {
-          CPU_SET(sorted_order_[sorted_order_.size() - 1], &cpuset);
-        } else {
-          CPU_SET(sorted_order_[0], &cpuset);
-        }
-        pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
-      }
-      pthread_atfork(nullptr, nullptr, ThreadGroup::Impl::SetFullCpuAffinity);
-#endif
+      // Master thread will have free migration on needed cores.
+      // Typically, the OS will schedule the master thread to run at core 0,
+      // which is idle, when other workers are running.
+      // See the comment inside SetMasterThreadFullCpuAffinity function to get more detail.
+      SetMasterThreadFullCpuAffinity(reverse);
    }
 #endif
  }

-  static void SetFullCpuAffinity() {
+  void SetMasterThreadFullCpuAffinity(bool reverse) {
 #if defined(__linux__) || defined(__ANDROID__)
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
-    for (unsigned i = 0; i < std::thread::hardware_concurrency(); i++) {
-      CPU_SET(i, &cpuset);
+    // For example, we have 2xA72 + 4xA53 (id is 0 - 5, 4, 5 is A72 big core)
+    // And we use config_threadpool API to set we will only use 4xA53.
+    // The sorted_order will be [4, 5, 0, 1, 2, 3].
+    // When to call this API, we have spawn threads on little cores for other workers
+    // in SetAffinity function. And for tvm master thread, it should also run on little cores,
+    // not big cores (4, 5).
+
+    // Note: this works well on x86 too. Because x86 doesn't have BIG.LITTLE,
+    // our implementation will use kBig mode by default and will let master thread
+    // run on intended cores.
+    if (reverse) {
+      for (int i = 0; i < little_count_; ++i) {
+        CPU_SET(sorted_order_[sorted_order_.size() - i - 1], &cpuset);
+      }
+    } else {
+      int big_count = big_count_;
+      // Imagine our x86 has cores 0 - 7
+      // physical cores are 0 - 3, logical cores are 4 - 7, big_count_ is 8
+      // we wish we run on physical cores, not logical cores to avoid contention issue.
+#if defined(_M_X64) || defined(__x86_64__)
+      big_count /= 2;  // ignore hyper-threading
+#endif
+      for (int i = 0; i < big_count; ++i) {
+        CPU_SET(sorted_order_[i], &cpuset);
+      }
    }
 #if defined(__ANDROID__)
    sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset);