1
0
Fork 0

Upon changing the number of threads, make sure all threads are bound

The heuristic to avoid thread binding if less than 8 threads are requested resulted in the first 7 threads not being bound.
The branch was verified to yield a roughly 13% speedup by @CoffeeOne on the appropriate hardware and OS, and an earlier version of this patch tested well on his machine:

http://tests.stockfishchess.org/tests/view/5a3693480ebc590ccbb8be5a
ELO: 9.24 +-4.6 (95%) LOS: 100.0%
Total: 5000 W: 634 L: 501 D: 3865

To make sure all threads (including mainThread) are bound as soon as the total number exceeds 7, recreate all threads on a change of thread number.
To do this, unify Threads::init, Threads::exit and Threads::set are unified in a single Threads::set function that goes through the needed steps.
The code includes several suggestions from @joergoster.

Fixes issue #1312 

No functional change
pull/1345/merge
Joost VandeVondele 2017-12-26 10:40:42 +01:00 committed by Marco Costalba
parent 2198cd0524
commit 1c50d8cbf5
5 changed files with 40 additions and 51 deletions

View File

@ -45,11 +45,11 @@ int main(int argc, char* argv[]) {
Pawns::init();
Tablebases::init(Options["SyzygyPath"]);
TT.resize(Options["Hash"]);
Threads.init(Options["Threads"]);
Threads.set(Options["Threads"]);
Search::clear(); // After threads are up
UCI::loop(argc, argv);
Threads.exit();
Threads.set(0);
return 0;
}

View File

@ -293,14 +293,6 @@ int get_group(size_t idx) {
void bindThisThread(size_t idx) {
// If OS already scheduled us on a different group than 0 then don't overwrite
// the choice, eventually we are one of many one-threaded processes running on
// some Windows NUMA hardware, for instance in fishtest. To make it simple,
// just check if running threads are below a threshold, in this case all this
// NUMA machinery is not needed.
if (Threads.size() < 8)
return;
// Use only local variables to be thread-safe
int group = get_group(idx);

View File

@ -173,13 +173,7 @@ void Search::clear() {
Time.availableNodes = 0;
TT.clear();
for (Thread* th : Threads)
th->clear();
Threads.main()->callsCnt = 0;
Threads.main()->previousScore = VALUE_INFINITE;
Threads.main()->previousTimeReduction = 1;
Threads.clear();
}

View File

@ -24,6 +24,7 @@
#include "movegen.h"
#include "search.h"
#include "thread.h"
#include "uci.h"
#include "syzygy/tbprobe.h"
ThreadPool Threads; // Global object
@ -35,7 +36,6 @@ ThreadPool Threads; // Global object
Thread::Thread(size_t n) : idx(n), stdThread(&Thread::idle_loop, this) {
wait_for_search_finished();
clear(); // Zero-init histories (based on std::array)
}
@ -92,7 +92,13 @@ void Thread::wait_for_search_finished() {
void Thread::idle_loop() {
WinProcGroup::bindThisThread(idx);
// If OS already scheduled us on a different group than 0 then don't overwrite
// the choice, eventually we are one of many one-threaded processes running on
// some Windows NUMA hardware, for instance in fishtest. To make it simple,
// just check if running threads are below a threshold, in this case all this
// NUMA machinery is not needed.
if (Options["Threads"] >= 8)
WinProcGroup::bindThisThread(idx);
while (true)
{
@ -110,41 +116,39 @@ void Thread::idle_loop() {
}
}
/// ThreadPool::init() creates and launches the threads that will go
/// immediately to sleep in idle_loop. We cannot use the constructor because
/// Threads is a static object and we need a fully initialized engine at
/// this point due to allocation of Endgames in the Thread constructor.
void ThreadPool::init(size_t requested) {
push_back(new MainThread(0));
set(requested);
}
/// ThreadPool::exit() terminates threads before the program exits. Cannot be
/// done in the destructor because threads must be terminated before deleting
/// any static object, so before main() returns.
void ThreadPool::exit() {
main()->wait_for_search_finished();
set(0);
}
/// ThreadPool::set() creates/destroys threads to match the requested number
/// ThreadPool::set() creates/destroys threads to match the requested number.
/// Created and launced threads wil go immediately to sleep in idle_loop.
/// Upon resizing, threads are recreated to allow for binding if necessary.
void ThreadPool::set(size_t requested) {
while (size() < requested)
push_back(new Thread(size()));
if (size() > 0) { // destroy any existing thread(s)
main()->wait_for_search_finished();
while (size() > requested)
delete back(), pop_back();
while (size() > 0)
delete back(), pop_back();
}
if (requested > 0) { // create new thread(s)
push_back(new MainThread(0));
while (size() < requested)
push_back(new Thread(size()));
clear();
}
}
/// ThreadPool::clear() sets threadPool data to initial values.
void ThreadPool::clear() {
for (Thread* th : *this)
th->clear();
main()->callsCnt = 0;
main()->previousScore = VALUE_INFINITE;
main()->previousTimeReduction = 1;
}
/// ThreadPool::start_thinking() wakes up main thread waiting in idle_loop() and
/// returns immediately. Main thread will wake up other threads and start the search.
@ -181,7 +185,7 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
// is shared by threads but is accessed in read-only mode.
StateInfo tmp = setupStates->back();
for (Thread* th : Threads)
for (Thread* th : *this)
{
th->nodes = th->tbHits = 0;
th->rootDepth = th->completedDepth = DEPTH_ZERO;

View File

@ -96,9 +96,8 @@ struct MainThread : public Thread {
struct ThreadPool : public std::vector<Thread*> {
void init(size_t); // No constructor and destructor, threads rely on globals that should
void exit(); // be initialized and valid during the whole thread lifetime.
void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
void clear();
void set(size_t);
MainThread* main() const { return static_cast<MainThread*>(front()); }