
Fix processor group binding under Windows.

Starting with Windows Build 20348, the behavior of the NUMA API has changed:
https://docs.microsoft.com/en-us/windows/win32/procthread/numa-support

The old code only worked because there was probably a limit on how many
cores/threads could reside within one NUMA node, and the OS created extra NUMA
nodes when necessary. However, the actual mechanism of core binding is
"Processor Groups" (https://docs.microsoft.com/en-us/windows/win32/procthread/processor-groups).
With a newer OS, one NUMA node can contain several such processor groups, so we
should consistently use the number of groups to bind the threads instead of
deriving the topology from the number of NUMA nodes.
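
For reference, a minimal standalone sketch of the enumeration (not part of this
patch; Windows-only, error handling and the dynamic API lookup that Stockfish
uses are left out). It counts NUMA nodes and processor groups the same way the
patched best_group() does, via GetLogicalProcessorInformationEx() and
Group.MaximumGroupCount:

#include <windows.h>
#include <cstdio>
#include <cstdlib>

int main() {

    // First call fails by design and reports the required buffer size.
    DWORD returnLength = 0;
    GetLogicalProcessorInformationEx(RelationAll, nullptr, &returnLength);

    auto* buffer = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)malloc(returnLength);
    if (!buffer || !GetLogicalProcessorInformationEx(RelationAll, buffer, &returnLength))
        return 1;

    int nodes = 0, groups = 0;
    DWORD byteOffset = 0;
    auto* ptr = buffer;
    while (byteOffset < returnLength)
    {
        if (ptr->Relationship == RelationNumaNode)
            nodes++;
        else if (ptr->Relationship == RelationGroup)
            groups += ptr->Group.MaximumGroupCount; // can exceed the node count on Build 20348+
        byteOffset += ptr->Size;
        ptr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((char*)ptr + ptr->Size);
    }
    free(buffer);

    std::printf("NUMA nodes: %d, processor groups: %d\n", nodes, groups);
    return 0;
}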

This change is required to spread threads over all cores on Windows 11 with a
3990X CPU, which has only 1 NUMA node containing 2 processor groups of 64
threads each.
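
For illustration only, and an assumption rather than code touched by this patch
(it is not the actual bindThisThread() implementation): binding a thread to a
chosen group on Windows goes through a GROUP_AFFINITY and
SetThreadGroupAffinity(), roughly as below; bind_to_group is a made-up helper
name:

#include <windows.h>
#include <cstring>

// Bind the calling thread to all logical processors of the given group.
bool bind_to_group(WORD group) {

    GROUP_AFFINITY affinity;
    std::memset(&affinity, 0, sizeof(affinity)); // Reserved fields must be zero
    affinity.Group = group;

    // One mask bit per logical processor in the group (at most 64 per group).
    DWORD count = GetActiveProcessorCount(group);
    affinity.Mask = count >= sizeof(KAFFINITY) * 8 ? ~KAFFINITY(0)
                                                   : (KAFFINITY(1) << count) - 1;

    return SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr) != 0;
}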

closes https://github.com/official-stockfish/Stockfish/pull/3787

No functional change.
noobpwnftw 2021-11-13 06:38:52 +08:00 committed by Joost VandeVondele
parent 1a5c21dc56
commit 9048ac00db
1 changed file with 12 additions and 12 deletions


@@ -502,7 +502,7 @@ void bindThisThread(size_t) {}
 int best_group(size_t idx) {
 
   int threads = 0;
-  int nodes = 0;
+  int groups = 0;
   int cores = 0;
   DWORD returnLength = 0;
   DWORD byteOffset = 0;
@@ -530,8 +530,8 @@ int best_group(size_t idx) {
 
   while (byteOffset < returnLength)
   {
-      if (ptr->Relationship == RelationNumaNode)
-          nodes++;
+      if (ptr->Relationship == RelationGroup)
+          groups += ptr->Group.MaximumGroupCount;
 
       else if (ptr->Relationship == RelationProcessorCore)
       {
@@ -546,23 +546,23 @@ int best_group(size_t idx) {
   free(buffer);
 
-  std::vector<int> groups;
+  std::vector<int> core_groups;
 
-  // Run as many threads as possible on the same node until core limit is
-  // reached, then move on filling the next node.
-  for (int n = 0; n < nodes; n++)
-      for (int i = 0; i < cores / nodes; i++)
-          groups.push_back(n);
+  // Run as many threads as possible on the same group until core limit is
+  // reached, then move on filling the next group.
+  for (int n = 0; n < groups; n++)
+      for (int i = 0; i < cores / groups; i++)
+          core_groups.push_back(n);
 
   // In case a core has more than one logical processor (we assume 2) and we
   // have still threads to allocate, then spread them evenly across available
-  // nodes.
+  // groups.
   for (int t = 0; t < threads - cores; t++)
-      groups.push_back(t % nodes);
+      core_groups.push_back(t % groups);
 
   // If we still have more threads than the total number of logical processors
   // then return -1 and let the OS to decide what to do.
-  return idx < groups.size() ? groups[idx] : -1;
+  return idx < core_groups.size() ? core_groups[idx] : -1;
 }
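
To make the new assignment policy easy to check, here is a hedged standalone
sketch (best_group_sketch is a made-up name, not Stockfish code) that repeats
the core_groups filling above and prints the thread-index-to-group mapping for
the topology from the commit message, 2 groups with 64 cores and 128 hardware
threads:

#include <cstdio>
#include <vector>

int best_group_sketch(size_t idx, int threads, int cores, int groups) {

    std::vector<int> core_groups;

    // Fill each group with cores / groups threads before moving to the next one.
    for (int n = 0; n < groups; n++)
        for (int i = 0; i < cores / groups; i++)
            core_groups.push_back(n);

    // Remaining threads (second logical processor per core) are spread evenly.
    for (int t = 0; t < threads - cores; t++)
        core_groups.push_back(t % groups);

    // More threads than logical processors: return -1 and let the OS decide.
    return idx < core_groups.size() ? core_groups[idx] : -1;
}

int main() {
    const int threads = 128, cores = 64, groups = 2;
    const size_t sample[] = {0, 31, 32, 63, 64, 127, 128};
    for (size_t idx : sample)
        std::printf("thread %3zu -> group %d\n",
                    idx, best_group_sketch(idx, threads, cores, groups));
    return 0;
}

With these numbers the first 64 indices are split 32/32 between the two groups,
the next 64 alternate between them, and index 128 exceeds the number of logical
processors and falls back to -1.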