Add support for Windows large pages

For users that set the needed privilege "Lock Pages in Memory", large pages will be automatically enabled (see Readme.md). This expert setting might improve speed by 5% - 30%, depending on the hardware, the number of threads and hash size; the gain is larger for large hashes, large numbers of threads and NUMA. If the operating system cannot allocate large pages (easier after a reboot), default allocation is used automatically. The engine log provides details. closes https://github.com/official-stockfish/Stockfish/pull/2656 fixes https://github.com/official-stockfish/Stockfish/issues/2619 No functional change
2020-05-04 20:49:27 +03:00 · 2020-05-04 20:49:27 +03:00 · d4763424d2
parent 86ee4eb84d
commit d4763424d2
5 changed files with 120 additions and 2 deletions
--- a/Readme.md
+++ b/Readme.md
@ -42,7 +42,7 @@ Currently, Stockfish has the following UCI options:
    this equal to the number of CPU cores available.

  * #### Hash
-    The size of the hash table in MB.
+    The size of the hash table in MB. It is recommended to set Hash after setting Threads.

  * #### Clear Hash
    Clear the hash table.
@ -138,6 +138,30 @@ more compact than Nalimov tablebases, while still storing all information
 needed for optimal play and in addition being able to take into account
 the 50-move rule.

+## Large Pages
+
+Stockfish supports large pages on Linux and Windows. Large pages make
+the hash access more efficient, improving the engine speed, especially
+on large hash sizes. Typical increases are 5..10% in terms of nps, but
+speed increases up to 30% have been measured. The support is
+automatic. Stockfish attempts to use large pages when available and
+will fall back to regular memory allocation when this is not the case.
+
+### Support on Linux
+
+Large page support on Linux is provided by the Linux kernel's
+transparent huge pages functionality. Typically, transparent huge pages
+are already enabled and no configuration is needed.
+
+### Support on Windows
+
+The use of large pages requires the "Lock Pages in Memory" privilege. See
+[Enable the Lock Pages in Memory Option (Windows)](https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows)
+for how to enable this privilege. Logging out and back in may be needed
+afterwards. Due to memory fragmentation, it may not always be
+possible to allocate large pages even when enabled. A reboot
+might alleviate this problem. To determine whether large pages
+are in use, see the engine log.

 ## Compiling Stockfish yourself from the sources

--- a/src/main.cpp
+++ b/src/main.cpp
@ -49,6 +49,7 @@ int main(int argc, char* argv[]) {

  UCI::loop(argc, argv);

+  TT.resize(0);
  Threads.set(0);
  return 0;
 }
--- a/src/misc.cpp
+++ b/src/misc.cpp
@ -309,6 +309,69 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
  return mem;
 }

+#elif defined(_WIN64)
+
+static void* aligned_ttmem_alloc_large_pages(size_t allocSize) { // large-page allocation attempt for allocSize bytes; returns nullptr when an earlier step fails, otherwise whatever VirtualAlloc returned

+  HANDLE hProcessToken { };
+  LUID luid { };
+  void* mem = nullptr; // result; only assigned by the large-page VirtualAlloc below

+  const size_t largePageSize = GetLargePageMinimum();
+  if (!largePageSize) // a zero page size is treated as "large pages unavailable"
+      return nullptr;

+  // We need SeLockMemoryPrivilege, so try to enable it for the process
+  if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken))
+      return nullptr;

+  if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &luid))
+  {
+      TOKEN_PRIVILEGES tp { };
+      TOKEN_PRIVILEGES prevTp { }; // receives the previous privilege state so it can be restored afterwards
+      DWORD prevTpLen = 0;

+      tp.PrivilegeCount = 1;
+      tp.Privileges[0].Luid = luid;
+      tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;

+      // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds,
+      // we still need to query GetLastError() to ensure that the privileges were actually obtained...
+      if (AdjustTokenPrivileges(
+              hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, &prevTpLen) &&
+          GetLastError() == ERROR_SUCCESS)
+      {
+          // round up size to a multiple of largePageSize (the mask form assumes it is a power of two) and allocate
+          allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1);
+          mem = VirtualAlloc(
+              NULL, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE);

+          // privilege no longer needed, restore previous state (the result of this call is not checked)
+          AdjustTokenPrivileges(hProcessToken, FALSE, &prevTp, 0, NULL, NULL);
+      }
+  }

+  CloseHandle(hProcessToken); // reached on every path where OpenProcessToken succeeded

+  return mem;
+}
+
+void* aligned_ttmem_alloc(size_t allocSize, void*& mem) { // allocates allocSize bytes; the resulting pointer is both stored in mem and returned

+  // try to allocate large pages first
+  mem = aligned_ttmem_alloc_large_pages(allocSize);
+  if (mem) // report on sync_cout whether the large-page attempt produced a block
+      sync_cout << "info string Hash table allocation: Windows large pages used." << sync_endl;
+  else
+      sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl;

+  // fall back to regular, page aligned, allocation if necessary
+  if (!mem)
+      mem = VirtualAlloc(NULL, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); // result is not checked here; it is handed to the caller as is

+  return mem;
+}
+
 #else

 void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
@ -322,6 +385,28 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {

 #endif

+/// aligned_ttmem_free will free the previously allocated ttmem; mem is the pointer that aligned_ttmem_alloc stored in its mem argument
+#if defined(_WIN64)

+void aligned_ttmem_free(void* mem) {

+  if (!VirtualFree(mem, 0, MEM_RELEASE)) // a failed release is fatal: report the error and terminate
+  {
+      DWORD err = GetLastError();
+      std::cerr << "Failed to free transposition table. Error code: 0x" <<
+          std::hex << err << std::dec << std::endl; // print the code in hex, then switch the stream back to decimal
+      exit(EXIT_FAILURE);
+  }
+}

+#else

+void aligned_ttmem_free(void *mem) {
+  free(mem); // builds without _WIN64 release the block with free()
+}

+#endif
+

 namespace WinProcGroup {

--- a/src/misc.h
+++ b/src/misc.h
@ -34,6 +34,7 @@ const std::string compiler_info();
 void prefetch(void* addr);
 void start_logger(const std::string& fname);
 void* aligned_ttmem_alloc(size_t size, void*& mem);
+void aligned_ttmem_free(void* mem);

 void dbg_hit_on(bool b);
 void dbg_hit_on(bool c, bool b);
--- a/src/tt.cpp
+++ b/src/tt.cpp
@ -63,7 +63,14 @@ void TranspositionTable::resize(size_t mbSize) {

  Threads.main()->wait_for_search_finished();

-  free(mem);
+  if (mem)
+      aligned_ttmem_free(mem);
+
+  if (!mbSize)
+  {
+      mem = nullptr;
+      return;
+  }

  clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
  table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));