diff --git a/.github/ci/libcxx17.imp b/.github/ci/libcxx17.imp index 7bdcf5bc2de..d3a262b54e8 100644 --- a/.github/ci/libcxx17.imp +++ b/.github/ci/libcxx17.imp @@ -7,6 +7,7 @@ { include: [ "<__fwd/sstream.h>", private, "", public ] }, { include: [ "<__fwd/streambuf.h>", private, "", public ] }, { include: [ "<__fwd/string_view.h>", private, "", public ] }, + { include: [ "<__system_error/errc.h>", private, "", public ] }, # Mappings for includes between public headers { include: [ "", public, "", public ] }, diff --git a/src/Makefile b/src/Makefile index 45f38b01322..5119b615f6b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -63,7 +63,7 @@ HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \ nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \ nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \ search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \ - tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h + tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h OBJS = $(notdir $(SRCS:.cpp=.o)) diff --git a/src/engine.cpp b/src/engine.cpp index e8da24aa9e8..3fc27223a09 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -18,15 +18,15 @@ #include "engine.h" +#include #include +#include #include #include +#include #include #include #include -#include -#include -#include #include "evaluate.h" #include "misc.h" @@ -48,10 +48,14 @@ constexpr auto StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - Engine::Engine(std::string path) : binaryDirectory(CommandLine::get_binary_directory(path)), + numaContext(NumaConfig::from_system()), states(new std::deque(1)), - networks(NN::Networks( - NN::NetworkBig({EvalFileDefaultNameBig, "None", ""}, NN::EmbeddedNNUEType::BIG), - NN::NetworkSmall({EvalFileDefaultNameSmall, "None", ""}, NN::EmbeddedNNUEType::SMALL))) { + threads(), + networks( + numaContext, + NN::Networks( + NN::NetworkBig({EvalFileDefaultNameBig, "None", ""}, NN::EmbeddedNNUEType::BIG), + NN::NetworkSmall({EvalFileDefaultNameSmall, "None", ""}, NN::EmbeddedNNUEType::SMALL))) { pos.set(StartFEN, false, &states->back()); capSq = SQ_NONE; } @@ -74,7 +78,7 @@ void Engine::stop() { threads.stop = true; } void Engine::search_clear() { wait_for_search_finished(); - tt.clear(options["Threads"]); + tt.clear(threads); threads.clear(); // @TODO wont work with multiple instances @@ -124,11 +128,35 @@ void Engine::set_position(const std::string& fen, const std::vector // modifiers -void Engine::resize_threads() { threads.set({options, threads, tt, networks}, updateContext); } +void Engine::set_numa_config_from_option(const std::string& o) { + if (o == "auto" || o == "system") + { + numaContext.set_numa_config(NumaConfig::from_system()); + } + else if (o == "none") + { + numaContext.set_numa_config(NumaConfig{}); + } + else + { + numaContext.set_numa_config(NumaConfig::from_string(o)); + } + + // Force reallocation of threads in case affinities need to change. + resize_threads(); +} + +void Engine::resize_threads() { + threads.wait_for_search_finished(); + threads.set(numaContext.get_numa_config(), {options, threads, tt, networks}, updateContext); + + // Reallocate the hash with the new threadpool size + set_tt_size(options["Hash"]); +} void Engine::set_tt_size(size_t mb) { wait_for_search_finished(); - tt.resize(mb, options["Threads"]); + tt.resize(mb, threads); } void Engine::set_ponderhit(bool b) { threads.main_manager()->ponder = b; } @@ -136,28 +164,35 @@ void Engine::set_ponderhit(bool b) { threads.main_manager()->ponder = b; } // network related void Engine::verify_networks() const { - networks.big.verify(options["EvalFile"]); - networks.small.verify(options["EvalFileSmall"]); + networks->big.verify(options["EvalFile"]); + networks->small.verify(options["EvalFileSmall"]); } void Engine::load_networks() { - load_big_network(options["EvalFile"]); - load_small_network(options["EvalFileSmall"]); + networks.modify_and_replicate([this](NN::Networks& networks_) { + networks_.big.load(binaryDirectory, options["EvalFile"]); + networks_.small.load(binaryDirectory, options["EvalFileSmall"]); + }); + threads.clear(); } void Engine::load_big_network(const std::string& file) { - networks.big.load(binaryDirectory, file); + networks.modify_and_replicate( + [this, &file](NN::Networks& networks_) { networks_.big.load(binaryDirectory, file); }); threads.clear(); } void Engine::load_small_network(const std::string& file) { - networks.small.load(binaryDirectory, file); + networks.modify_and_replicate( + [this, &file](NN::Networks& networks_) { networks_.small.load(binaryDirectory, file); }); threads.clear(); } void Engine::save_network(const std::pair, std::string> files[2]) { - networks.big.save(files[0].first); - networks.small.save(files[1].first); + networks.modify_and_replicate([&files](NN::Networks& networks_) { + networks_.big.save(files[0].first); + networks_.small.save(files[1].first); + }); } // utility functions @@ -169,7 +204,7 @@ void Engine::trace_eval() const { verify_networks(); - sync_cout << "\n" << Eval::trace(p, networks) << sync_endl; + sync_cout << "\n" << Eval::trace(p, *networks) << sync_endl; } OptionsMap& Engine::get_options() { return options; } @@ -184,4 +219,21 @@ std::string Engine::visualize() const { return ss.str(); } +std::vector> Engine::get_bound_thread_count_by_numa_node() const { + auto counts = threads.get_bound_thread_count_by_numa_node(); + const NumaConfig& cfg = numaContext.get_numa_config(); + std::vector> ratios; + NumaIndex n = 0; + for (; n < counts.size(); ++n) + ratios.emplace_back(counts[n], cfg.num_cpus_in_numa_node(n)); + if (!counts.empty()) + for (; n < cfg.num_numa_nodes(); ++n) + ratios.emplace_back(0, cfg.num_cpus_in_numa_node(n)); + return ratios; +} + +std::string Engine::get_numa_config_as_string() const { + return numaContext.get_numa_config().to_string(); +} + } diff --git a/src/engine.h b/src/engine.h index 64a814cb4aa..91a8a96b0dc 100644 --- a/src/engine.h +++ b/src/engine.h @@ -35,6 +35,7 @@ #include "thread.h" #include "tt.h" #include "ucioption.h" +#include "numa.h" namespace Stockfish { @@ -47,6 +48,13 @@ class Engine { using InfoIter = Search::InfoIteration; Engine(std::string path = ""); + + // Can't be movable due to components holding backreferences to fields + Engine(const Engine&) = delete; + Engine(Engine&&) = delete; + Engine& operator=(const Engine&) = delete; + Engine& operator=(Engine&&) = delete; + ~Engine() { wait_for_search_finished(); } std::uint64_t perft(const std::string& fen, Depth depth, bool isChess960); @@ -63,6 +71,7 @@ class Engine { // modifiers + void set_numa_config_from_option(const std::string& o); void resize_threads(); void set_tt_size(size_t mb); void set_ponderhit(bool); @@ -83,23 +92,27 @@ class Engine { // utility functions - void trace_eval() const; - OptionsMap& get_options(); - std::string fen() const; - void flip(); - std::string visualize() const; + void trace_eval() const; + OptionsMap& get_options(); + std::string fen() const; + void flip(); + std::string visualize() const; + std::vector> get_bound_thread_count_by_numa_node() const; + std::string get_numa_config_as_string() const; private: const std::string binaryDirectory; + NumaReplicationContext numaContext; + Position pos; StateListPtr states; Square capSq; - OptionsMap options; - ThreadPool threads; - TranspositionTable tt; - Eval::NNUE::Networks networks; + OptionsMap options; + ThreadPool threads; + TranspositionTable tt; + NumaReplicated networks; Search::SearchManager::UpdateContext updateContext; }; diff --git a/src/misc.cpp b/src/misc.cpp index 58f804204b2..d48b75e1c28 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -48,6 +48,7 @@ using fun8_t = bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGE #endif #include +#include #include #include #include @@ -56,6 +57,7 @@ using fun8_t = bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGE #include #include #include +#include #include "types.h" @@ -592,129 +594,6 @@ void aligned_large_pages_free(void* mem) { std_aligned_free(mem); } #endif -namespace WinProcGroup { - -#ifndef _WIN32 - -void bind_this_thread(size_t) {} - -#else - -namespace { -// Retrieves logical processor information using Windows-specific -// API and returns the best node id for the thread with index idx. Original -// code from Texel by Peter Ă–sterlund. -int best_node(size_t idx) { - - int threads = 0; - int nodes = 0; - int cores = 0; - DWORD returnLength = 0; - DWORD byteOffset = 0; - - // Early exit if the needed API is not available at runtime - HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); - auto fun1 = (fun1_t) (void (*)()) GetProcAddress(k32, "GetLogicalProcessorInformationEx"); - if (!fun1) - return -1; - - // First call to GetLogicalProcessorInformationEx() to get returnLength. - // We expect the call to fail due to null buffer. - if (fun1(RelationAll, nullptr, &returnLength)) - return -1; - - // Once we know returnLength, allocate the buffer - SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *buffer, *ptr; - ptr = buffer = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*) malloc(returnLength); - - // Second call to GetLogicalProcessorInformationEx(), now we expect to succeed - if (!fun1(RelationAll, buffer, &returnLength)) - { - free(buffer); - return -1; - } - - while (byteOffset < returnLength) - { - if (ptr->Relationship == RelationNumaNode) - nodes++; - - else if (ptr->Relationship == RelationProcessorCore) - { - cores++; - threads += (ptr->Processor.Flags == LTP_PC_SMT) ? 2 : 1; - } - - assert(ptr->Size); - byteOffset += ptr->Size; - ptr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*) (((char*) ptr) + ptr->Size); - } - - free(buffer); - - std::vector groups; - - // Run as many threads as possible on the same node until the core limit is - // reached, then move on to filling the next node. - for (int n = 0; n < nodes; n++) - for (int i = 0; i < cores / nodes; i++) - groups.push_back(n); - - // In case a core has more than one logical processor (we assume 2) and we - // still have threads to allocate, spread them evenly across available nodes. - for (int t = 0; t < threads - cores; t++) - groups.push_back(t % nodes); - - // If we still have more threads than the total number of logical processors - // then return -1 and let the OS to decide what to do. - return idx < groups.size() ? groups[idx] : -1; -} -} - - -// Sets the group affinity of the current thread -void bind_this_thread(size_t idx) { - - // Use only local variables to be thread-safe - int node = best_node(idx); - - if (node == -1) - return; - - // Early exit if the needed API are not available at runtime - HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); - auto fun2 = fun2_t((void (*)()) GetProcAddress(k32, "GetNumaNodeProcessorMaskEx")); - auto fun3 = fun3_t((void (*)()) GetProcAddress(k32, "SetThreadGroupAffinity")); - auto fun4 = fun4_t((void (*)()) GetProcAddress(k32, "GetNumaNodeProcessorMask2")); - auto fun5 = fun5_t((void (*)()) GetProcAddress(k32, "GetMaximumProcessorGroupCount")); - - if (!fun2 || !fun3) - return; - - if (!fun4 || !fun5) - { - GROUP_AFFINITY affinity; - if (fun2(node, &affinity)) // GetNumaNodeProcessorMaskEx - fun3(GetCurrentThread(), &affinity, nullptr); // SetThreadGroupAffinity - } - else - { - // If a numa node has more than one processor group, we assume they are - // sized equal and we spread threads evenly across the groups. - USHORT elements, returnedElements; - elements = fun5(); // GetMaximumProcessorGroupCount - GROUP_AFFINITY* affinity = (GROUP_AFFINITY*) malloc(elements * sizeof(GROUP_AFFINITY)); - if (fun4(node, affinity, elements, &returnedElements)) // GetNumaNodeProcessorMask2 - fun3(GetCurrentThread(), &affinity[idx % returnedElements], - nullptr); // SetThreadGroupAffinity - free(affinity); - } -} - -#endif - -} // namespace WinProcGroup - #ifdef _WIN32 #include #define GETCWD _getcwd @@ -723,6 +602,15 @@ void bind_this_thread(size_t idx) { #define GETCWD getcwd #endif +size_t str_to_size_t(const std::string& s) { + size_t value; + auto result = std::from_chars(s.data(), s.data() + s.size(), value); + + if (result.ec != std::errc()) + std::exit(EXIT_FAILURE); + + return value; +} std::string CommandLine::get_binary_directory(std::string argv0) { std::string pathSeparator; diff --git a/src/misc.h b/src/misc.h index 3a905dfab49..99cbecfdd2c 100644 --- a/src/misc.h +++ b/src/misc.h @@ -24,10 +24,12 @@ #include #include #include +#include #include #include #include #include +#include #define stringify2(x) #x #define stringify(x) stringify2(x) @@ -50,6 +52,8 @@ void* aligned_large_pages_alloc(size_t size); // nop if mem == nullptr void aligned_large_pages_free(void* mem); +size_t str_to_size_t(const std::string& s); + // Deleter for automating release of memory area template struct AlignedDeleter { @@ -73,6 +77,31 @@ using AlignedPtr = std::unique_ptr>; template using LargePagePtr = std::unique_ptr>; +struct PipeDeleter { + void operator()(FILE* file) const { + if (file != nullptr) + { + pclose(file); + } + } +}; + +#if defined(__linux__) + +inline std::optional get_system_command_output(const std::string& command) { + std::unique_ptr pipe(popen(command.c_str(), "r")); + if (!pipe) + return std::nullopt; + + std::string result; + char buffer[1024]; + while (fgets(buffer, sizeof(buffer), pipe.get()) != nullptr) + result += buffer; + + return result; +} + +#endif void dbg_hit_on(bool cond, int slot = 0); void dbg_mean_of(int64_t value, int slot = 0); @@ -88,6 +117,24 @@ inline TimePoint now() { .count(); } +inline std::vector split(const std::string& s, const std::string& delimiter) { + size_t begin = 0; + std::vector res; + + for (;;) + { + const size_t end = s.find(delimiter, begin); + if (end == std::string::npos) + break; + + res.emplace_back(s.substr(begin, end - begin)); + begin = end + delimiter.size(); + } + + res.emplace_back(s.substr(begin)); + + return res; +} enum SyncCout { IO_LOCK, @@ -194,15 +241,6 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) { #endif } -// Under Windows it is not possible for a process to run on more than one -// logical processor group. This usually means being limited to using max 64 -// cores. To overcome this, some special platform-specific API should be -// called to set group affinity for each thread. Original code from Texel by -// Peter Ă–sterlund. -namespace WinProcGroup { -void bind_this_thread(size_t idx); -} - struct CommandLine { public: diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp index de2c7eca6d5..db864fcd384 100644 --- a/src/nnue/network.cpp +++ b/src/nnue/network.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,47 @@ bool write_parameters(std::ostream& stream, const T& reference) { } // namespace Detail +template +Network::Network(const Network& other) : + evalFile(other.evalFile), + embeddedType(other.embeddedType) { + if (other.featureTransformer) + { + Detail::initialize(featureTransformer); + *featureTransformer = *other.featureTransformer; + } + for (std::size_t i = 0; i < LayerStacks; ++i) + { + if (other.network[i]) + { + Detail::initialize(network[i]); + *(network[i]) = *(other.network[i]); + } + } +} + +template +Network& +Network::operator=(const Network& other) { + evalFile = other.evalFile; + embeddedType = other.embeddedType; + + if (other.featureTransformer) + { + Detail::initialize(featureTransformer); + *featureTransformer = *other.featureTransformer; + } + for (std::size_t i = 0; i < LayerStacks; ++i) + { + if (other.network[i]) + { + Detail::initialize(network[i]); + *(network[i]) = *(other.network[i]); + } + } + + return *this; +} template void Network::load(const std::string& rootDirectory, std::string evalfilePath) { diff --git a/src/nnue/network.h b/src/nnue/network.h index 23f56663094..f0ccfafcb4c 100644 --- a/src/nnue/network.h +++ b/src/nnue/network.h @@ -50,6 +50,12 @@ class Network { evalFile(file), embeddedType(type) {} + Network(const Network& other); + Network(Network&& other) = default; + + Network& operator=(const Network& other); + Network& operator=(Network&& other) = default; + void load(const std::string& rootDirectory, std::string evalfilePath); bool save(const std::optional& filename) const; diff --git a/src/numa.h b/src/numa.h new file mode 100644 index 00000000000..c04292daf01 --- /dev/null +++ b/src/numa.h @@ -0,0 +1,904 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef NUMA_H_INCLUDED +#define NUMA_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// We support linux very well, but we explicitly do NOT support Android, partially because +// there are potential issues with `lscpu`, `popen` availability, and partially because +// there's no NUMA environments running Android and there probably won't be. +#if defined(__linux__) && !defined(__ANDROID__) + #if !defined(_GNU_SOURCE) + #define _GNU_SOURCE + #endif + #include +#elif defined(_WIN32) + +// On Windows each processor group can have up to 64 processors. +// https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups +static constexpr size_t WIN_PROCESSOR_GROUP_SIZE = 64; + + #if !defined(NOMINMAX) + #define NOMINMAX + #endif + #include + +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadselectedcpusetmasks +using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT); + +// https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-setthreadgroupaffinity +using SetThreadGroupAffinity_t = BOOL (*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY); + +#endif + +#include "misc.h" + +namespace Stockfish { + +using CpuIndex = size_t; +using NumaIndex = size_t; + +inline const CpuIndex SYSTEM_THREADS_NB = + std::max(1, std::thread::hardware_concurrency()); + +// We want to abstract the purpose of storing the numa node index somewhat. +// Whoever is using this does not need to know the specifics of the replication +// machinery to be able to access NUMA replicated memory. +class NumaReplicatedAccessToken { + public: + NumaReplicatedAccessToken() : + n(0) {} + + explicit NumaReplicatedAccessToken(NumaIndex idx) : + n(idx) {} + + NumaIndex get_numa_index() const { return n; } + + private: + NumaIndex n; +}; + +// Designed as immutable, because there is no good reason to alter an already existing config +// in a way that doesn't require recreating it completely, and it would be complex and expensive +// to maintain class invariants. +// The CPU (processor) numbers always correspond to the actual numbering used by the system. +// NOTE: the numbering is only valid within the process, as for example on Windows +// every process gets a "virtualized" set of processors that respects the current affinity +// The NUMA node numbers MAY NOT correspond to the system's numbering of the NUMA nodes. +// In particular, empty nodes may be removed, or the user may create custom nodes. +// It is guaranteed that NUMA nodes are NOT empty, i.e. every node exposed by NumaConfig +// has at least one processor assigned. +// +// Until Stockfish doesn't support exceptions all places where an exception should be thrown +// are replaced by std::exit. +class NumaConfig { + public: + NumaConfig() : + highestCpuIndex(0), + customAffinity(false) { + const auto numCpus = SYSTEM_THREADS_NB; + add_cpu_range_to_node(NumaIndex{0}, CpuIndex{0}, numCpus - 1); + } + + static std::set get_process_affinity() { + std::set cpus; + + // For unsupported systems, or in case of a soft error, we may assume all processors + // are available for use. + [[maybe_unused]] auto set_to_all_cpus = [&]() { + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + cpus.insert(c); + }; + +#if defined(__linux__) && !defined(__ANDROID__) + + // cpu_set_t by default holds 1024 entries. This may not be enough soon, + // but there is no easy way to determine how many threads there actually is. + // In this case we just choose a reasonable upper bound. + static constexpr CpuIndex MaxNumCpus = 1024 * 64; + + cpu_set_t* mask = CPU_ALLOC(MaxNumCpus); + if (mask == nullptr) + std::exit(EXIT_FAILURE); + + const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus); + + CPU_ZERO_S(masksize, mask); + + const int status = sched_getaffinity(0, masksize, mask); + + if (status != 0) + { + CPU_FREE(mask); + std::exit(EXIT_FAILURE); + } + + for (CpuIndex c = 0; c < MaxNumCpus; ++c) + if (CPU_ISSET_S(c, masksize, mask)) + cpus.insert(c); + + CPU_FREE(mask); + +#elif defined(_WIN32) + + // Windows is problematic and weird due to multiple ways of setting affinity, processor groups, + // and behaviour changes between versions. It's unclear if we can support this feature + // on Windows in the same way we do on Linux. + // Apparently when affinity is set via either start /affinity or msys2 taskset + // the function GetNumaProcessorNodeEx completely disregards the processors that we do not + // have affinity more. Moreover, the indices are shifted to start from 0, indicating that Windows + // is providing a whole new mapping of processors to this process. This is problematic in some cases + // but it at least allows us to [probably] support this affinity restriction feature by default. + // So overall, Windows appears to "virtualize" a set of processors and processor groups for every + // process. It's unclear if this assignment can change while the process is running. + // std::thread::hardware_concurrency() returns the number of processors that's consistent + // with GetNumaProcessorNodeEx, so we can just add all of them. + + set_to_all_cpus(); + +#else + + // For other systems we assume the process is allowed to execute on all processors. + set_to_all_cpus(); + +#endif + + return cpus; + } + + // This function queries the system for the mapping of processors to NUMA nodes. + // On Linux we utilize `lscpu` to avoid libnuma. + // On Windows we utilize GetNumaProcessorNodeEx, which has its quirks, see + // comment for Windows implementation of get_process_affinity + static NumaConfig from_system(bool respectProcessAffinity = true) { + NumaConfig cfg = empty(); + + std::set allowedCpus; + + if (respectProcessAffinity) + allowedCpus = get_process_affinity(); + else + { + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + allowedCpus.insert(c); + } + + auto is_cpu_allowed = [&](CpuIndex c) { return allowedCpus.count(c) == 1; }; + +#if defined(__linux__) && !defined(__ANDROID__) + + // On Linux things are straightforward, since there's no processor groups and + // any thread can be scheduled on all processors. + // This command produces output in the following form + // CPU NODE + // 0 0 + // 1 0 + // 2 1 + // 3 1 + // + // On some systems it may use '-' to signify no NUMA node, in which case we assume it's in node 0. + auto lscpuOpt = get_system_command_output("lscpu -e=cpu,node"); + if (lscpuOpt.has_value()) + { + + std::istringstream ss(*lscpuOpt); + + // skip the list header + ss.ignore(std::numeric_limits::max(), '\n'); + + while (true) + { + CpuIndex c; + NumaIndex n; + + ss >> c; + + if (!ss) + break; + + ss >> n; + + if (!ss) + { + ss.clear(); + std::string dummy; + ss >> dummy; + n = 0; + } + + if (is_cpu_allowed(c)) + cfg.add_cpu_to_node(n, c); + } + } + else + { + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + if (is_cpu_allowed(c)) + cfg.add_cpu_to_node(NumaIndex{0}, c); + } + +#elif defined(_WIN32) + + // Since Windows 11 and Windows Server 2022 thread affinities can span + // processor groups and can be set as such by a new WinAPI function. + static const bool CanAffinitySpanProcessorGroups = []() { + HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); + auto SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t( + (void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks")); + return SetThreadSelectedCpuSetMasks_f != nullptr; + }(); + + WORD numProcGroups = GetActiveProcessorGroupCount(); + for (WORD procGroup = 0; procGroup < numProcGroups; ++procGroup) + { + for (BYTE number = 0; number < WIN_PROCESSOR_GROUP_SIZE; ++number) + { + PROCESSOR_NUMBER procnum; + procnum.Group = procGroup; + procnum.Number = number; + procnum.Reserved = 0; + USHORT nodeNumber; + + // When start /affinity or taskset was used to run this process with restricted affinity + // GetNumaProcessorNodeEx will NOT correspond to the system's processor setup, instead + // it appears to follow a completely new processor assignment, made specifically for this process, + // in which processors that this process has affinity for are remapped, and only those are remapped, + // to form a new set of processors. In other words, we can only get processors + // which we have affinity for this way. This means that the behaviour for + // `respectProcessAffinity == false` may be unexpected when affinity is set from outside, + // while the behaviour for `respectProcessAffinity == true` is given by default. + const BOOL status = GetNumaProcessorNodeEx(&procnum, &nodeNumber); + const CpuIndex c = static_cast(procGroup) * WIN_PROCESSOR_GROUP_SIZE + + static_cast(number); + if (status != 0 && nodeNumber != std::numeric_limits::max() + && is_cpu_allowed(c)) + { + cfg.add_cpu_to_node(nodeNumber, c); + } + } + } + + // Split the NUMA nodes to be contained within a group if necessary. + // This is needed between Windows 10 Build 20348 and Windows 11, because + // the new NUMA allocation behaviour was introduced while there was + // still no way to set thread affinity spanning multiple processor groups. + // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support + if (!CanAffinitySpanProcessorGroups) + { + NumaConfig splitCfg = empty(); + + NumaIndex splitNodeIndex = 0; + for (const auto& cpus : cfg.nodes) + { + if (cpus.empty()) + continue; + + size_t lastProcGroupIndex = *(cpus.begin()) / WIN_PROCESSOR_GROUP_SIZE; + for (CpuIndex c : cpus) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + if (procGroupIndex != lastProcGroupIndex) + { + splitNodeIndex += 1; + lastProcGroupIndex = procGroupIndex; + } + splitCfg.add_cpu_to_node(splitNodeIndex, c); + } + splitNodeIndex += 1; + } + + cfg = std::move(splitCfg); + } + +#else + + // Fallback for unsupported systems. + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + if (is_cpu_allowed(c)) + cfg.add_cpu_to_node(NumaIndex{0}, c); + +#endif + + // We have to ensure no empty NUMA nodes persist. + cfg.remove_empty_numa_nodes(); + + return cfg; + } + + // ':'-separated numa nodes + // ','-separated cpu indices + // supports "first-last" range syntax for cpu indices + // For example "0-15,128-143:16-31,144-159:32-47,160-175:48-63,176-191" + static NumaConfig from_string(const std::string& s) { + NumaConfig cfg = empty(); + + NumaIndex n = 0; + for (auto&& nodeStr : split(s, ":")) + { + bool addedAnyCpuInThisNode = false; + + for (const std::string& cpuStr : split(nodeStr, ",")) + { + if (cpuStr.empty()) + continue; + + auto parts = split(cpuStr, "-"); + if (parts.size() == 1) + { + const CpuIndex c = CpuIndex{str_to_size_t(parts[0])}; + if (!cfg.add_cpu_to_node(n, c)) + std::exit(EXIT_FAILURE); + } + else if (parts.size() == 2) + { + const CpuIndex cfirst = CpuIndex{str_to_size_t(parts[0])}; + const CpuIndex clast = CpuIndex{str_to_size_t(parts[1])}; + + if (!cfg.add_cpu_range_to_node(n, cfirst, clast)) + std::exit(EXIT_FAILURE); + } + else + { + std::exit(EXIT_FAILURE); + } + + addedAnyCpuInThisNode = true; + } + + if (addedAnyCpuInThisNode) + n += 1; + } + + cfg.customAffinity = true; + + return cfg; + } + + NumaConfig(const NumaConfig&) = delete; + NumaConfig(NumaConfig&&) = default; + NumaConfig& operator=(const NumaConfig&) = delete; + NumaConfig& operator=(NumaConfig&&) = default; + + bool is_cpu_assigned(CpuIndex n) const { return nodeByCpu.count(n) == 1; } + + NumaIndex num_numa_nodes() const { return nodes.size(); } + + CpuIndex num_cpus_in_numa_node(NumaIndex n) const { + assert(n < nodes.size()); + return nodes[n].size(); + } + + CpuIndex num_cpus() const { return nodeByCpu.size(); } + + bool requires_memory_replication() const { return customAffinity || nodes.size() > 1; } + + std::string to_string() const { + std::string str; + + bool isFirstNode = true; + for (auto&& cpus : nodes) + { + if (!isFirstNode) + str += ":"; + + bool isFirstSet = true; + auto rangeStart = cpus.begin(); + for (auto it = cpus.begin(); it != cpus.end(); ++it) + { + auto next = std::next(it); + if (next == cpus.end() || *next != *it + 1) + { + // cpus[i] is at the end of the range (may be of size 1) + if (!isFirstSet) + str += ","; + + const CpuIndex last = *it; + + if (it != rangeStart) + { + const CpuIndex first = *rangeStart; + + str += std::to_string(first); + str += "-"; + str += std::to_string(last); + } + else + str += std::to_string(last); + + rangeStart = next; + isFirstSet = false; + } + } + + isFirstNode = false; + } + + return str; + } + + bool suggests_binding_threads(CpuIndex numThreads) const { + // If we can reasonably determine that the threads can't be contained + // by the OS within the first NUMA node then we advise distributing + // and binding threads. When the threads are not bound we can only use + // NUMA memory replicated objects from the first node, so when the OS + // has to schedule on other nodes we lose performance. + // We also suggest binding if there's enough threads to distribute among nodes + // with minimal disparity. + // We try to ignore small nodes, in particular the empty ones. + + // If the affinity set by the user does not match the affinity given by the OS + // then binding is necessary to ensure the threads are running on correct processors. + if (customAffinity) + return true; + + // We obviously can't distribute a single thread, so a single thread should never be bound. + if (numThreads <= 1) + return false; + + size_t largestNodeSize = 0; + for (auto&& cpus : nodes) + if (cpus.size() > largestNodeSize) + largestNodeSize = cpus.size(); + + auto is_node_small = [largestNodeSize](const std::set& node) { + static constexpr double SmallNodeThreshold = 0.6; + return static_cast(node.size()) / static_cast(largestNodeSize) + <= SmallNodeThreshold; + }; + + size_t numNotSmallNodes = 0; + for (auto&& cpus : nodes) + if (!is_node_small(cpus)) + numNotSmallNodes += 1; + + return (numThreads > largestNodeSize / 2 || numThreads >= numNotSmallNodes * 4) + && nodes.size() > 1; + } + + std::vector distribute_threads_among_numa_nodes(CpuIndex numThreads) const { + std::vector ns; + + if (nodes.size() == 1) + { + // special case for when there's no NUMA nodes + // doesn't buy us much, but let's keep the default path simple + ns.resize(numThreads, NumaIndex{0}); + } + else + { + std::vector occupation(nodes.size(), 0); + for (CpuIndex c = 0; c < numThreads; ++c) + { + NumaIndex bestNode{0}; + float bestNodeFill = std::numeric_limits::max(); + for (NumaIndex n = 0; n < nodes.size(); ++n) + { + float fill = + static_cast(occupation[n] + 1) / static_cast(nodes[n].size()); + // NOTE: Do we want to perhaps fill the first available node up to 50% first before considering other nodes? + // Probably not, because it would interfere with running multiple instances. We basically shouldn't + // favor any particular node. + if (fill < bestNodeFill) + { + bestNode = n; + bestNodeFill = fill; + } + } + ns.emplace_back(bestNode); + occupation[bestNode] += 1; + } + } + + return ns; + } + + NumaReplicatedAccessToken bind_current_thread_to_numa_node(NumaIndex n) const { + if (n >= nodes.size() || nodes[n].size() == 0) + std::exit(EXIT_FAILURE); + +#if defined(__linux__) && !defined(__ANDROID__) + + cpu_set_t* mask = CPU_ALLOC(highestCpuIndex + 1); + if (mask == nullptr) + std::exit(EXIT_FAILURE); + + const size_t masksize = CPU_ALLOC_SIZE(highestCpuIndex + 1); + + CPU_ZERO_S(masksize, mask); + + for (CpuIndex c : nodes[n]) + CPU_SET_S(c, masksize, mask); + + const int status = sched_setaffinity(0, masksize, mask); + + CPU_FREE(mask); + + if (status != 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. + // This is defensive, allowed because this code is not performance critical. + sched_yield(); + +#elif defined(_WIN32) + + // Requires Windows 11. No good way to set thread affinity spanning processor groups before that. + HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); + auto SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t( + (void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks")); + auto SetThreadGroupAffinity_f = + SetThreadGroupAffinity_t((void (*)()) GetProcAddress(k32, "SetThreadGroupAffinity")); + + if (SetThreadSelectedCpuSetMasks_f != nullptr) + { + // Only available on Windows 11 and Windows Server 2022 onwards. + const USHORT numProcGroups = + ((highestCpuIndex + 1) + WIN_PROCESSOR_GROUP_SIZE - 1) / WIN_PROCESSOR_GROUP_SIZE; + auto groupAffinities = std::make_unique(numProcGroups); + std::memset(groupAffinities.get(), 0, sizeof(GROUP_AFFINITY) * numProcGroups); + for (WORD i = 0; i < numProcGroups; ++i) + groupAffinities[i].Group = i; + + for (CpuIndex c : nodes[n]) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE; + groupAffinities[procGroupIndex].Mask |= KAFFINITY(1) << idxWithinProcGroup; + } + + HANDLE hThread = GetCurrentThread(); + + const BOOL status = + SetThreadSelectedCpuSetMasks_f(hThread, groupAffinities.get(), numProcGroups); + if (status == 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. + // This is defensive, allowed because this code is not performance critical. + SwitchToThread(); + } + else if (SetThreadGroupAffinity_f != nullptr) + { + // On earlier windows version (since windows 7) we can't run a single thread + // on multiple processor groups, so we need to restrict the group. + // We assume the group of the first processor listed for this node. + // Processors from outside this group will not be assigned for this thread. + // Normally this won't be an issue because windows used to assign NUMA nodes + // such that they can't span processor groups. However, since Windows 10 Build 20348 + // the behaviour changed, so there's a small window of versions between this and Windows 11 + // that might exhibit problems with not all processors being utilized. + // We handle this in NumaConfig::from_system by manually splitting the nodes when + // we detect that there's no function to set affinity spanning processor nodes. + // This is required because otherwise our thread distribution code may produce + // suboptimal results. + // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support + GROUP_AFFINITY affinity; + std::memset(&affinity, 0, sizeof(GROUP_AFFINITY)); + affinity.Group = static_cast(n); + // We use an ordered set so we're guaranteed to get the smallest cpu number here. + const size_t forcedProcGroupIndex = *(nodes[n].begin()) / WIN_PROCESSOR_GROUP_SIZE; + for (CpuIndex c : nodes[n]) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE; + // We skip processors that are not in the same proccessor group. + // If everything was set up correctly this will never be an issue, + // but we have to account for bad NUMA node specification. + if (procGroupIndex != forcedProcGroupIndex) + continue; + + affinity.Mask |= KAFFINITY(1) << idxWithinProcGroup; + } + + HANDLE hThread = GetCurrentThread(); + + const BOOL status = SetThreadGroupAffinity_f(hThread, &affinity, nullptr); + if (status == 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. + // This is defensive, allowed because this code is not performance critical. + SwitchToThread(); + } + +#endif + + return NumaReplicatedAccessToken(n); + } + + template + void execute_on_numa_node(NumaIndex n, FuncT&& f) const { + std::thread th([this, &f, n]() { + bind_current_thread_to_numa_node(n); + std::forward(f)(); + }); + + th.join(); + } + + private: + std::vector> nodes; + std::map nodeByCpu; + CpuIndex highestCpuIndex; + + bool customAffinity; + + static NumaConfig empty() { return NumaConfig(EmptyNodeTag{}); } + + struct EmptyNodeTag {}; + + NumaConfig(EmptyNodeTag) : + highestCpuIndex(0), + customAffinity(false) {} + + void remove_empty_numa_nodes() { + std::vector> newNodes; + for (auto&& cpus : nodes) + if (!cpus.empty()) + newNodes.emplace_back(std::move(cpus)); + nodes = std::move(newNodes); + } + + // Returns true if successful + // Returns false if failed, i.e. when the cpu is already present + // strong guarantee, the structure remains unmodified + bool add_cpu_to_node(NumaIndex n, CpuIndex c) { + if (is_cpu_assigned(c)) + return false; + + while (nodes.size() <= n) + nodes.emplace_back(); + + nodes[n].insert(c); + nodeByCpu[c] = n; + + if (c > highestCpuIndex) + highestCpuIndex = c; + + return true; + } + + // Returns true if successful + // Returns false if failed, i.e. when any of the cpus is already present + // strong guarantee, the structure remains unmodified + bool add_cpu_range_to_node(NumaIndex n, CpuIndex cfirst, CpuIndex clast) { + for (CpuIndex c = cfirst; c <= clast; ++c) + if (is_cpu_assigned(c)) + return false; + + while (nodes.size() <= n) + nodes.emplace_back(); + + for (CpuIndex c = cfirst; c <= clast; ++c) + { + nodes[n].insert(c); + nodeByCpu[c] = n; + } + + if (clast > highestCpuIndex) + highestCpuIndex = clast; + + return true; + } +}; + +class NumaReplicationContext; + +// Instances of this class are tracked by the NumaReplicationContext instance +// NumaReplicationContext informs all tracked instances whenever NUMA configuration changes. +class NumaReplicatedBase { + public: + NumaReplicatedBase(NumaReplicationContext& ctx); + + NumaReplicatedBase(const NumaReplicatedBase&) = delete; + NumaReplicatedBase(NumaReplicatedBase&& other) noexcept; + + NumaReplicatedBase& operator=(const NumaReplicatedBase&) = delete; + NumaReplicatedBase& operator=(NumaReplicatedBase&& other) noexcept; + + virtual void on_numa_config_changed() = 0; + virtual ~NumaReplicatedBase(); + + const NumaConfig& get_numa_config() const; + + private: + NumaReplicationContext* context; +}; + +// We force boxing with a unique_ptr. If this becomes an issue due to added indirection we +// may need to add an option for a custom boxing type. +// When the NUMA config changes the value stored at the index 0 is replicated to other nodes. +template +class NumaReplicated: public NumaReplicatedBase { + public: + using ReplicatorFuncType = std::function; + + NumaReplicated(NumaReplicationContext& ctx) : + NumaReplicatedBase(ctx) { + replicate_from(T{}); + } + + NumaReplicated(NumaReplicationContext& ctx, T&& source) : + NumaReplicatedBase(ctx) { + replicate_from(std::move(source)); + } + + NumaReplicated(const NumaReplicated&) = delete; + NumaReplicated(NumaReplicated&& other) noexcept : + NumaReplicatedBase(std::move(other)), + instances(std::exchange(other.instances, {})) {} + + NumaReplicated& operator=(const NumaReplicated&) = delete; + NumaReplicated& operator=(NumaReplicated&& other) noexcept { + NumaReplicatedBase::operator=(*this, std::move(other)); + instances = std::exchange(other.instances, {}); + + return *this; + } + + NumaReplicated& operator=(T&& source) { + replicate_from(std::move(source)); + + return *this; + } + + ~NumaReplicated() override = default; + + const T& operator[](NumaReplicatedAccessToken token) const { + assert(token.get_numa_index() < instances.size()); + return *(instances[token.get_numa_index()]); + } + + const T& operator*() const { return *(instances[0]); } + + const T* operator->() const { return instances[0].get(); } + + template + void modify_and_replicate(FuncT&& f) { + auto source = std::move(instances[0]); + std::forward(f)(*source); + replicate_from(std::move(*source)); + } + + void on_numa_config_changed() override { + // Use the first one as the source. It doesn't matter which one we use, because they all must + // be identical, but the first one is guaranteed to exist. + auto source = std::move(instances[0]); + replicate_from(std::move(*source)); + } + + private: + std::vector> instances; + + void replicate_from(T&& source) { + instances.clear(); + + const NumaConfig& cfg = get_numa_config(); + if (cfg.requires_memory_replication()) + { + for (NumaIndex n = 0; n < cfg.num_numa_nodes(); ++n) + { + cfg.execute_on_numa_node( + n, [this, &source]() { instances.emplace_back(std::make_unique(source)); }); + } + } + else + { + assert(cfg.num_numa_nodes() == 1); + // We take advantage of the fact that replication is not required + // and reuse the source value, avoiding one copy operation. + instances.emplace_back(std::make_unique(std::move(source))); + } + } +}; + +class NumaReplicationContext { + public: + NumaReplicationContext(NumaConfig&& cfg) : + config(std::move(cfg)) {} + + NumaReplicationContext(const NumaReplicationContext&) = delete; + NumaReplicationContext(NumaReplicationContext&&) = delete; + + NumaReplicationContext& operator=(const NumaReplicationContext&) = delete; + NumaReplicationContext& operator=(NumaReplicationContext&&) = delete; + + ~NumaReplicationContext() { + // The context must outlive replicated objects + if (!trackedReplicatedObjects.empty()) + std::exit(EXIT_FAILURE); + } + + void attach(NumaReplicatedBase* obj) { + assert(trackedReplicatedObjects.count(obj) == 0); + trackedReplicatedObjects.insert(obj); + } + + void detach(NumaReplicatedBase* obj) { + assert(trackedReplicatedObjects.count(obj) == 1); + trackedReplicatedObjects.erase(obj); + } + + // oldObj may be invalid at this point + void move_attached([[maybe_unused]] NumaReplicatedBase* oldObj, NumaReplicatedBase* newObj) { + assert(trackedReplicatedObjects.count(oldObj) == 1); + assert(trackedReplicatedObjects.count(newObj) == 0); + trackedReplicatedObjects.erase(oldObj); + trackedReplicatedObjects.insert(newObj); + } + + void set_numa_config(NumaConfig&& cfg) { + config = std::move(cfg); + for (auto&& obj : trackedReplicatedObjects) + obj->on_numa_config_changed(); + } + + const NumaConfig& get_numa_config() const { return config; } + + private: + NumaConfig config; + + // std::set uses std::less by default, which is required for pointer comparison to be defined. + std::set trackedReplicatedObjects; +}; + +inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicationContext& ctx) : + context(&ctx) { + context->attach(this); +} + +inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicatedBase&& other) noexcept : + context(std::exchange(other.context, nullptr)) { + context->move_attached(&other, this); +} + +inline NumaReplicatedBase& NumaReplicatedBase::operator=(NumaReplicatedBase&& other) noexcept { + context = std::exchange(other.context, nullptr); + + context->move_attached(&other, this); + + return *this; +} + +inline NumaReplicatedBase::~NumaReplicatedBase() { + if (context != nullptr) + context->detach(this); +} + +inline const NumaConfig& NumaReplicatedBase::get_numa_config() const { + return context->get_numa_config(); +} + +} // namespace Stockfish + + +#endif // #ifndef NUMA_H_INCLUDED diff --git a/src/search.cpp b/src/search.cpp index 0dbc6a3a5db..c074e3421ea 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -137,15 +137,17 @@ void update_all_stats(const Position& pos, Search::Worker::Worker(SharedState& sharedState, std::unique_ptr sm, - size_t thread_id) : + size_t thread_id, + NumaReplicatedAccessToken token) : // Unpack the SharedState struct into member variables thread_idx(thread_id), + numaAccessToken(token), manager(std::move(sm)), options(sharedState.options), threads(sharedState.threads), tt(sharedState.tt), networks(sharedState.networks), - refreshTable(networks) { + refreshTable(networks[token]) { clear(); } @@ -428,7 +430,7 @@ void Search::Worker::iterative_deepening() { skill.pick_best(rootMoves, multiPV); // Use part of the gained time from a previous stable move for the current move - for (Thread* th : threads) + for (auto&& th : threads) { totBestMoveChanges += th->worker->bestMoveChanges; th->worker->bestMoveChanges = 0; @@ -510,7 +512,7 @@ void Search::Worker::clear() { for (size_t i = 1; i < reductions.size(); ++i) reductions[i] = int((19.90 + std::log(size_t(options["Threads"])) / 2) * std::log(i)); - refreshTable.clear(networks); + refreshTable.clear(networks[numaAccessToken]); } @@ -576,9 +578,9 @@ Value Search::Worker::search( // Step 2. Check for aborted search and immediate draw if (threads.stop.load(std::memory_order_relaxed) || pos.is_draw(ss->ply) || ss->ply >= MAX_PLY) - return (ss->ply >= MAX_PLY && !ss->inCheck) - ? evaluate(networks, pos, refreshTable, thisThread->optimism[us]) - : value_draw(thisThread->nodes); + return (ss->ply >= MAX_PLY && !ss->inCheck) ? evaluate( + networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]) + : value_draw(thisThread->nodes); // Step 3. Mate distance pruning. Even if we mate at the next move our score // would be at best mate_in(ss->ply + 1), but if alpha is already bigger because @@ -706,7 +708,7 @@ Value Search::Worker::search( { // Providing the hint that this node's accumulator will be used often // brings significant Elo gain (~13 Elo). - Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); + Eval::NNUE::hint_common_parent_position(pos, networks[numaAccessToken], refreshTable); unadjustedStaticEval = eval = ss->staticEval; } else if (ss->ttHit) @@ -714,9 +716,10 @@ Value Search::Worker::search( // Never assume anything about values stored in TT unadjustedStaticEval = tte->eval(); if (unadjustedStaticEval == VALUE_NONE) - unadjustedStaticEval = evaluate(networks, pos, refreshTable, thisThread->optimism[us]); + unadjustedStaticEval = + evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]); else if (PvNode) - Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); + Eval::NNUE::hint_common_parent_position(pos, networks[numaAccessToken], refreshTable); ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); @@ -726,7 +729,8 @@ Value Search::Worker::search( } else { - unadjustedStaticEval = evaluate(networks, pos, refreshTable, thisThread->optimism[us]); + unadjustedStaticEval = + evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]); ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); // Static evaluation is saved as it was before adjustment by correction history @@ -892,7 +896,7 @@ Value Search::Worker::search( } } - Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); + Eval::NNUE::hint_common_parent_position(pos, networks[numaAccessToken], refreshTable); } moves_loop: // When in check, search starts here @@ -1441,7 +1445,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, // Step 2. Check for an immediate draw or maximum ply reached if (pos.is_draw(ss->ply) || ss->ply >= MAX_PLY) return (ss->ply >= MAX_PLY && !ss->inCheck) - ? evaluate(networks, pos, refreshTable, thisThread->optimism[us]) + ? evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]) : VALUE_DRAW; assert(0 <= ss->ply && ss->ply < MAX_PLY); @@ -1476,7 +1480,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, unadjustedStaticEval = tte->eval(); if (unadjustedStaticEval == VALUE_NONE) unadjustedStaticEval = - evaluate(networks, pos, refreshTable, thisThread->optimism[us]); + evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]); ss->staticEval = bestValue = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); @@ -1488,10 +1492,11 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, else { // In case of null move search, use previous static eval with a different sign - unadjustedStaticEval = (ss - 1)->currentMove != Move::null() - ? evaluate(networks, pos, refreshTable, thisThread->optimism[us]) - : -(ss - 1)->staticEval; - ss->staticEval = bestValue = + unadjustedStaticEval = + (ss - 1)->currentMove != Move::null() + ? evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]) + : -(ss - 1)->staticEval; + ss->staticEval = bestValue = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); } diff --git a/src/search.h b/src/search.h index 6e5b22bda32..a61f253c005 100644 --- a/src/search.h +++ b/src/search.h @@ -32,19 +32,17 @@ #include "misc.h" #include "movepick.h" +#include "nnue/network.h" +#include "nnue/nnue_accumulator.h" +#include "numa.h" #include "position.h" #include "score.h" #include "syzygy/tbprobe.h" #include "timeman.h" #include "types.h" -#include "nnue/nnue_accumulator.h" namespace Stockfish { -namespace Eval::NNUE { -struct Networks; -} - // Different node types, used as a template parameter enum NodeType { NonPV, @@ -133,19 +131,19 @@ struct LimitsType { // The UCI stores the uci options, thread pool, and transposition table. // This struct is used to easily forward data to the Search::Worker class. struct SharedState { - SharedState(const OptionsMap& optionsMap, - ThreadPool& threadPool, - TranspositionTable& transpositionTable, - const Eval::NNUE::Networks& nets) : + SharedState(const OptionsMap& optionsMap, + ThreadPool& threadPool, + TranspositionTable& transpositionTable, + const NumaReplicated& nets) : options(optionsMap), threads(threadPool), tt(transpositionTable), networks(nets) {} - const OptionsMap& options; - ThreadPool& threads; - TranspositionTable& tt; - const Eval::NNUE::Networks& networks; + const OptionsMap& options; + ThreadPool& threads; + TranspositionTable& tt; + const NumaReplicated& networks; }; class Worker; @@ -236,7 +234,7 @@ class NullSearchManager: public ISearchManager { // of the search history, and storing data required for the search. class Worker { public: - Worker(SharedState&, std::unique_ptr, size_t); + Worker(SharedState&, std::unique_ptr, size_t, NumaReplicatedAccessToken); // Called at instantiation to initialize Reductions tables // Reset histories, usually before a new game @@ -293,7 +291,8 @@ class Worker { Depth rootDepth, completedDepth; Value rootDelta; - size_t thread_idx; + size_t thread_idx; + NumaReplicatedAccessToken numaAccessToken; // Reductions lookup table initialized at startup std::array reductions; // [depth or moveNumber] @@ -303,10 +302,10 @@ class Worker { Tablebases::Config tbConfig; - const OptionsMap& options; - ThreadPool& threads; - TranspositionTable& tt; - const Eval::NNUE::Networks& networks; + const OptionsMap& options; + ThreadPool& threads; + TranspositionTable& tt; + const NumaReplicated& networks; // Used by NNUE Eval::NNUE::AccumulatorCaches refreshTable; diff --git a/src/thread.cpp b/src/thread.cpp index 8724cb49cd1..5893f4b6d07 100644 --- a/src/thread.cpp +++ b/src/thread.cpp @@ -22,19 +22,17 @@ #include #include #include +#include #include #include -#include -#include "misc.h" #include "movegen.h" #include "search.h" #include "syzygy/tbprobe.h" #include "timeman.h" -#include "tt.h" #include "types.h" -#include "ucioption.h" #include "uci.h" +#include "ucioption.h" namespace Stockfish { @@ -42,13 +40,24 @@ namespace Stockfish { // in idle_loop(). Note that 'searching' and 'exit' should be already set. Thread::Thread(Search::SharedState& sharedState, std::unique_ptr sm, - size_t n) : - worker(std::make_unique(sharedState, std::move(sm), n)), + size_t n, + OptionalThreadToNumaNodeBinder binder) : idx(n), nthreads(sharedState.options["Threads"]), stdThread(&Thread::idle_loop, this) { wait_for_search_finished(); + + run_custom_job([this, &binder, &sharedState, &sm, n]() { + // Use the binder to [maybe] bind the threads to a NUMA node before doing + // the Worker allocation. + // Ideally we would also allocate the SearchManager here, but that's minor. + this->numaAccessToken = binder(); + this->worker = + std::make_unique(sharedState, std::move(sm), n, this->numaAccessToken); + }); + + wait_for_search_finished(); } @@ -66,12 +75,15 @@ Thread::~Thread() { // Wakes up the thread that will start the search void Thread::start_searching() { - mutex.lock(); - searching = true; - mutex.unlock(); // Unlock before notifying saves a few CPU-cycles - cv.notify_one(); // Wake up the thread in idle_loop() + assert(worker != nullptr); + run_custom_job([this]() { worker->start_searching(); }); } +// Wakes up the thread that will start the search +void Thread::clear_worker() { + assert(worker != nullptr); + run_custom_job([this]() { worker->clear(); }); +} // Blocks on the condition variable // until the thread has finished searching. @@ -81,20 +93,20 @@ void Thread::wait_for_search_finished() { cv.wait(lk, [&] { return !searching; }); } +void Thread::run_custom_job(std::function f) { + { + std::unique_lock lk(mutex); + cv.wait(lk, [&] { return !searching; }); + jobFunc = std::move(f); + searching = true; + } + cv.notify_one(); +} // Thread gets parked here, blocked on the // condition variable, when it has no work to do. void Thread::idle_loop() { - - // If OS already scheduled us on a different group than 0 then don't overwrite - // the choice, eventually we are one of many one-threaded processes running on - // some Windows NUMA hardware, for instance in fishtest. To make it simple, - // just check if running threads are below a threshold, in this case, all this - // NUMA machinery is not needed. - if (nthreads > 8) - WinProcGroup::bind_this_thread(idx); - while (true) { std::unique_lock lk(mutex); @@ -105,9 +117,13 @@ void Thread::idle_loop() { if (exit) return; + std::function job = std::move(jobFunc); + jobFunc = nullptr; + lk.unlock(); - worker->start_searching(); + if (job) + job(); } } @@ -121,49 +137,82 @@ uint64_t ThreadPool::tb_hits() const { return accumulate(&Search::Worker::tbHits // Creates/destroys threads to match the requested number. // Created and launched threads will immediately go to sleep in idle_loop. // Upon resizing, threads are recreated to allow for binding if necessary. -void ThreadPool::set(Search::SharedState sharedState, +void ThreadPool::set(const NumaConfig& numaConfig, + Search::SharedState sharedState, const Search::SearchManager::UpdateContext& updateContext) { if (threads.size() > 0) // destroy any existing thread(s) { main_thread()->wait_for_search_finished(); - while (threads.size() > 0) - delete threads.back(), threads.pop_back(); + threads.clear(); + + boundThreadToNumaNode.clear(); } const size_t requested = sharedState.options["Threads"]; if (requested > 0) // create new thread(s) { - auto manager = std::make_unique(updateContext); - threads.push_back(new Thread(sharedState, std::move(manager), 0)); + // Binding threads may be problematic when there's multiple NUMA nodes and + // multiple Stockfish instances running. In particular, if each instance + // runs a single thread then they would all be mapped to the first NUMA node. + // This is undesirable, and so the default behaviour (i.e. when the user does not + // change the NumaConfig UCI setting) is to not bind the threads to processors + // unless we know for sure that we span NUMA nodes and replication is required. + const std::string numaPolicy(sharedState.options["NumaPolicy"]); + const bool doBindThreads = [&]() { + if (numaPolicy == "none") + return false; + + if (numaPolicy == "auto") + return numaConfig.suggests_binding_threads(requested); + + // numaPolicy == "system", or explicitly set by the user + return true; + }(); + + boundThreadToNumaNode = doBindThreads + ? numaConfig.distribute_threads_among_numa_nodes(requested) + : std::vector{}; while (threads.size() < requested) { - auto null_manager = std::make_unique(); - threads.push_back(new Thread(sharedState, std::move(null_manager), threads.size())); + const size_t threadId = threads.size(); + const NumaIndex numaId = doBindThreads ? boundThreadToNumaNode[threadId] : 0; + auto manager = threadId == 0 ? std::unique_ptr( + std::make_unique(updateContext)) + : std::make_unique(); + + // When not binding threads we want to force all access to happen + // from the same NUMA node, because in case of NUMA replicated memory + // accesses we don't want to trash cache in case the threads get scheduled + // on the same NUMA node. + auto binder = doBindThreads ? OptionalThreadToNumaNodeBinder(numaConfig, numaId) + : OptionalThreadToNumaNodeBinder(numaId); + + threads.emplace_back( + std::make_unique(sharedState, std::move(manager), threadId, binder)); } clear(); main_thread()->wait_for_search_finished(); - - // Reallocate the hash with the new threadpool size - sharedState.tt.resize(sharedState.options["Hash"], requested); } } // Sets threadPool data to initial values void ThreadPool::clear() { - - for (Thread* th : threads) - th->worker->clear(); - if (threads.size() == 0) return; + for (auto&& th : threads) + th->clear_worker(); + + for (auto&& th : threads) + th->wait_for_search_finished(); + main_manager()->callsCnt = 0; main_manager()->bestPreviousScore = VALUE_INFINITE; main_manager()->bestPreviousAverageScore = VALUE_INFINITE; @@ -172,6 +221,17 @@ void ThreadPool::clear() { main_manager()->tm.clear(); } +void ThreadPool::run_on_thread(size_t threadId, std::function f) { + assert(threads.size() > threadId); + threads[threadId]->run_custom_job(std::move(f)); +} + +void ThreadPool::wait_on_thread(size_t threadId) { + assert(threads.size() > threadId); + threads[threadId]->wait_for_search_finished(); +} + +size_t ThreadPool::num_threads() const { return threads.size(); } // Wakes up main thread waiting in idle_loop() and // returns immediately. Main thread will wake up other threads and start the search. @@ -216,31 +276,36 @@ void ThreadPool::start_thinking(const OptionsMap& options, // be deduced from a fen string, so set() clears them and they are set from // setupStates->back() later. The rootState is per thread, earlier states are shared // since they are read-only. - for (Thread* th : threads) + for (auto&& th : threads) { - th->worker->limits = limits; - th->worker->nodes = th->worker->tbHits = th->worker->nmpMinPly = - th->worker->bestMoveChanges = 0; - th->worker->rootDepth = th->worker->completedDepth = 0; - th->worker->rootMoves = rootMoves; - th->worker->rootPos.set(pos.fen(), pos.is_chess960(), &th->worker->rootState); - th->worker->rootState = setupStates->back(); - th->worker->tbConfig = tbConfig; + th->run_custom_job([&]() { + th->worker->limits = limits; + th->worker->nodes = th->worker->tbHits = th->worker->nmpMinPly = + th->worker->bestMoveChanges = 0; + th->worker->rootDepth = th->worker->completedDepth = 0; + th->worker->rootMoves = rootMoves; + th->worker->rootPos.set(pos.fen(), pos.is_chess960(), &th->worker->rootState); + th->worker->rootState = setupStates->back(); + th->worker->tbConfig = tbConfig; + }); } + for (auto&& th : threads) + th->wait_for_search_finished(); + main_thread()->start_searching(); } Thread* ThreadPool::get_best_thread() const { - Thread* bestThread = threads.front(); + Thread* bestThread = threads.front().get(); Value minScore = VALUE_NONE; std::unordered_map votes( 2 * std::min(size(), bestThread->worker->rootMoves.size())); // Find the minimum score of all threads - for (Thread* th : threads) + for (auto&& th : threads) minScore = std::min(minScore, th->worker->rootMoves[0].score); // Vote according to score and depth, and select the best thread @@ -248,10 +313,10 @@ Thread* ThreadPool::get_best_thread() const { return (th->worker->rootMoves[0].score - minScore + 14) * int(th->worker->completedDepth); }; - for (Thread* th : threads) - votes[th->worker->rootMoves[0].pv[0]] += thread_voting_value(th); + for (auto&& th : threads) + votes[th->worker->rootMoves[0].pv[0]] += thread_voting_value(th.get()); - for (Thread* th : threads) + for (auto&& th : threads) { const auto bestThreadScore = bestThread->worker->rootMoves[0].score; const auto newThreadScore = th->worker->rootMoves[0].score; @@ -272,26 +337,26 @@ Thread* ThreadPool::get_best_thread() const { // Note that we make sure not to pick a thread with truncated-PV for better viewer experience. const bool betterVotingValue = - thread_voting_value(th) * int(newThreadPV.size() > 2) + thread_voting_value(th.get()) * int(newThreadPV.size() > 2) > thread_voting_value(bestThread) * int(bestThreadPV.size() > 2); if (bestThreadInProvenWin) { // Make sure we pick the shortest mate / TB conversion if (newThreadScore > bestThreadScore) - bestThread = th; + bestThread = th.get(); } else if (bestThreadInProvenLoss) { // Make sure we pick the shortest mated / TB conversion if (newThreadInProvenLoss && newThreadScore < bestThreadScore) - bestThread = th; + bestThread = th.get(); } else if (newThreadInProvenWin || newThreadInProvenLoss || (newThreadScore > VALUE_TB_LOSS_IN_MAX_PLY && (newThreadMoveVote > bestThreadMoveVote || (newThreadMoveVote == bestThreadMoveVote && betterVotingValue)))) - bestThread = th; + bestThread = th.get(); } return bestThread; @@ -302,7 +367,7 @@ Thread* ThreadPool::get_best_thread() const { // Will be invoked by main thread after it has started searching void ThreadPool::start_searching() { - for (Thread* th : threads) + for (auto&& th : threads) if (th != threads.front()) th->start_searching(); } @@ -312,9 +377,28 @@ void ThreadPool::start_searching() { void ThreadPool::wait_for_search_finished() const { - for (Thread* th : threads) + for (auto&& th : threads) if (th != threads.front()) th->wait_for_search_finished(); } +std::vector ThreadPool::get_bound_thread_count_by_numa_node() const { + std::vector counts; + + if (!boundThreadToNumaNode.empty()) + { + NumaIndex highestNumaNode = 0; + for (NumaIndex n : boundThreadToNumaNode) + if (n > highestNumaNode) + highestNumaNode = n; + + counts.resize(highestNumaNode + 1, 0); + + for (NumaIndex n : boundThreadToNumaNode) + counts[n] += 1; + } + + return counts; +} + } // namespace Stockfish diff --git a/src/thread.h b/src/thread.h index 223652aec99..102b229907b 100644 --- a/src/thread.h +++ b/src/thread.h @@ -26,10 +26,12 @@ #include #include #include +#include #include "position.h" #include "search.h" #include "thread_win32_osx.h" +#include "numa.h" namespace Stockfish { @@ -37,6 +39,32 @@ namespace Stockfish { class OptionsMap; using Value = int; +// Sometimes we don't want to actually bind the threads, but the recipent still +// needs to think it runs on *some* NUMA node, such that it can access structures +// that rely on NUMA node knowledge. This class encapsulates this optional process +// such that the recipent does not need to know whether the binding happened or not. +class OptionalThreadToNumaNodeBinder { + public: + OptionalThreadToNumaNodeBinder(NumaIndex n) : + numaConfig(nullptr), + numaId(n) {} + + OptionalThreadToNumaNodeBinder(const NumaConfig& cfg, NumaIndex n) : + numaConfig(&cfg), + numaId(n) {} + + NumaReplicatedAccessToken operator()() const { + if (numaConfig != nullptr) + return numaConfig->bind_current_thread_to_numa_node(numaId); + else + return NumaReplicatedAccessToken(numaId); + } + + private: + const NumaConfig* numaConfig; + NumaIndex numaId; +}; + // Abstraction of a thread. It contains a pointer to the worker and a native thread. // After construction, the native thread is started with idle_loop() // waiting for a signal to start searching. @@ -44,22 +72,35 @@ using Value = int; // the search is finished, it goes back to idle_loop() waiting for a new signal. class Thread { public: - Thread(Search::SharedState&, std::unique_ptr, size_t); + Thread(Search::SharedState&, + std::unique_ptr, + size_t, + OptionalThreadToNumaNodeBinder); virtual ~Thread(); - void idle_loop(); - void start_searching(); + void idle_loop(); + void start_searching(); + void clear_worker(); + void run_custom_job(std::function f); + + // Thread has been slightly altered to allow running custom jobs, so + // this name is no longer correct. However, this class (and ThreadPool) + // require further work to make them properly generic while maintaining + // appropriate specificity regarding search, from the point of view of an + // outside user, so renaming of this function in left for whenever that happens. void wait_for_search_finished(); size_t id() const { return idx; } std::unique_ptr worker; + std::function jobFunc; private: - std::mutex mutex; - std::condition_variable cv; - size_t idx, nthreads; - bool exit = false, searching = true; // Set before starting std::thread - NativeThread stdThread; + std::mutex mutex; + std::condition_variable cv; + size_t idx, nthreads; + bool exit = false, searching = true; // Set before starting std::thread + NativeThread stdThread; + NumaReplicatedAccessToken numaAccessToken; }; @@ -67,31 +108,44 @@ class Thread { // parking and, most importantly, launching a thread. All the access to threads // is done through this class. class ThreadPool { - public: + ThreadPool() {} + ~ThreadPool() { // destroy any existing thread(s) if (threads.size() > 0) { main_thread()->wait_for_search_finished(); - while (threads.size() > 0) - delete threads.back(), threads.pop_back(); + threads.clear(); } } - void start_thinking(const OptionsMap&, Position&, StateListPtr&, Search::LimitsType); - void clear(); - void set(Search::SharedState, const Search::SearchManager::UpdateContext&); + ThreadPool(const ThreadPool&) = delete; + ThreadPool(ThreadPool&&) = delete; + + ThreadPool& operator=(const ThreadPool&) = delete; + ThreadPool& operator=(ThreadPool&&) = delete; + + void start_thinking(const OptionsMap&, Position&, StateListPtr&, Search::LimitsType); + void run_on_thread(size_t threadId, std::function f); + void wait_on_thread(size_t threadId); + size_t num_threads() const; + void clear(); + void set(const NumaConfig& numaConfig, + Search::SharedState, + const Search::SearchManager::UpdateContext&); Search::SearchManager* main_manager(); - Thread* main_thread() const { return threads.front(); } + Thread* main_thread() const { return threads.front().get(); } uint64_t nodes_searched() const; uint64_t tb_hits() const; Thread* get_best_thread() const; void start_searching(); void wait_for_search_finished() const; + std::vector get_bound_thread_count_by_numa_node() const; + std::atomic_bool stop, abortedSearch, increaseDepth; auto cbegin() const noexcept { return threads.cbegin(); } @@ -102,13 +156,14 @@ class ThreadPool { auto empty() const noexcept { return threads.empty(); } private: - StateListPtr setupStates; - std::vector threads; + StateListPtr setupStates; + std::vector> threads; + std::vector boundThreadToNumaNode; uint64_t accumulate(std::atomic Search::Worker::*member) const { uint64_t sum = 0; - for (Thread* th : threads) + for (auto&& th : threads) sum += (th->worker.get()->*member).load(std::memory_order_relaxed); return sum; } diff --git a/src/tt.cpp b/src/tt.cpp index 3f5b9d4d9b0..79274f525b9 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -23,10 +23,10 @@ #include #include #include -#include -#include #include "misc.h" +#include "syzygy/tbprobe.h" +#include "thread.h" namespace Stockfish { @@ -74,7 +74,7 @@ uint8_t TTEntry::relative_age(const uint8_t generation8) const { // Sets the size of the transposition table, // measured in megabytes. Transposition table consists // of clusters and each cluster consists of ClusterSize number of TTEntry. -void TranspositionTable::resize(size_t mbSize, int threadCount) { +void TranspositionTable::resize(size_t mbSize, ThreadPool& threads) { aligned_large_pages_free(table); clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster); @@ -86,32 +86,29 @@ void TranspositionTable::resize(size_t mbSize, int threadCount) { exit(EXIT_FAILURE); } - clear(threadCount); + clear(threads); } // Initializes the entire transposition table to zero, // in a multi-threaded way. -void TranspositionTable::clear(size_t threadCount) { - std::vector threads; +void TranspositionTable::clear(ThreadPool& threads) { + const size_t threadCount = threads.num_threads(); - for (size_t idx = 0; idx < size_t(threadCount); ++idx) + for (size_t i = 0; i < threadCount; ++i) { - threads.emplace_back([this, idx, threadCount]() { - // Thread binding gives faster search on systems with a first-touch policy - if (threadCount > 8) - WinProcGroup::bind_this_thread(idx); - + threads.run_on_thread(i, [this, i, threadCount]() { // Each thread will zero its part of the hash table - const size_t stride = size_t(clusterCount / threadCount), start = size_t(stride * idx), - len = idx != size_t(threadCount) - 1 ? stride : clusterCount - start; + const size_t stride = clusterCount / threadCount; + const size_t start = stride * i; + const size_t len = i + 1 != threadCount ? stride : clusterCount - start; std::memset(&table[start], 0, len * sizeof(Cluster)); }); } - for (std::thread& th : threads) - th.join(); + for (size_t i = 0; i < threadCount; ++i) + threads.wait_on_thread(i); } diff --git a/src/tt.h b/src/tt.h index 7cc876fb9ea..3b09ec4e1d9 100644 --- a/src/tt.h +++ b/src/tt.h @@ -63,6 +63,7 @@ struct TTEntry { int16_t eval16; }; +class ThreadPool; // A TranspositionTable is an array of Cluster, of size clusterCount. Each // cluster consists of ClusterSize number of TTEntry. Each non-empty TTEntry @@ -102,8 +103,8 @@ class TranspositionTable { TTEntry* probe(const Key key, bool& found) const; int hashfull() const; - void resize(size_t mbSize, int threadCount); - void clear(size_t threadCount); + void resize(size_t mbSize, ThreadPool& threads); + void clear(ThreadPool& threads); TTEntry* first_entry(const Key key) const { return &table[mul_hi64(key, clusterCount)].entry[0]; diff --git a/src/uci.cpp b/src/uci.cpp index cb686a027db..ab0dae3946e 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -60,7 +60,16 @@ UCIEngine::UCIEngine(int argc, char** argv) : options["Debug Log File"] << Option("", [](const Option& o) { start_logger(o); }); - options["Threads"] << Option(1, 1, 1024, [this](const Option&) { engine.resize_threads(); }); + options["NumaPolicy"] << Option("auto", [this](const Option& o) { + engine.set_numa_config_from_option(o); + print_numa_config_information(); + print_thread_binding_information(); + }); + + options["Threads"] << Option(1, 1, 1024, [this](const Option&) { + engine.resize_threads(); + print_thread_binding_information(); + }); options["Hash"] << Option(16, 1, MaxHashMB, [this](const Option& o) { engine.set_tt_size(o); }); @@ -123,8 +132,15 @@ void UCIEngine::loop() { engine.set_ponderhit(false); else if (token == "uci") + { sync_cout << "id name " << engine_info(true) << "\n" - << engine.get_options() << "\nuciok" << sync_endl; + << engine.get_options() << sync_endl; + + print_numa_config_information(); + print_thread_binding_information(); + + sync_cout << "uciok" << sync_endl; + } else if (token == "setoption") setoption(is); @@ -177,6 +193,28 @@ void UCIEngine::loop() { } while (token != "quit" && cli.argc == 1); // The command-line arguments are one-shot } +void UCIEngine::print_numa_config_information() const { + auto cfgStr = engine.get_numa_config_as_string(); + sync_cout << "info string Available Processors: " << cfgStr << sync_endl; +} + +void UCIEngine::print_thread_binding_information() const { + auto boundThreadsByNode = engine.get_bound_thread_count_by_numa_node(); + if (!boundThreadsByNode.empty()) + { + sync_cout << "info string NUMA Node Thread Binding: "; + bool isFirst = true; + for (auto&& [current, total] : boundThreadsByNode) + { + if (!isFirst) + std::cout << ":"; + std::cout << current << "/" << total; + isFirst = false; + } + std::cout << sync_endl; + } +} + Search::LimitsType UCIEngine::parse_limits(std::istream& is) { Search::LimitsType limits; std::string token; diff --git a/src/uci.h b/src/uci.h index 55d580f9727..bac62bb90c0 100644 --- a/src/uci.h +++ b/src/uci.h @@ -42,6 +42,9 @@ class UCIEngine { void loop(); + void print_numa_config_information() const; + void print_thread_binding_information() const; + static int to_cp(Value v, const Position& pos); static std::string format_score(const Score& s); static std::string square(Square s); diff --git a/src/ucioption.cpp b/src/ucioption.cpp index e1ffe546525..4819a68db73 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -118,6 +118,8 @@ bool Option::operator==(const char* s) const { return !CaseInsensitiveLess()(currentValue, s) && !CaseInsensitiveLess()(s, currentValue); } +bool Option::operator!=(const char* s) const { return !(*this == s); } + // Inits options and assigns idx in the correct printing order diff --git a/src/ucioption.h b/src/ucioption.h index b575d1646e6..16d46696145 100644 --- a/src/ucioption.h +++ b/src/ucioption.h @@ -67,6 +67,7 @@ class Option { operator int() const; operator std::string() const; bool operator==(const char*) const; + bool operator!=(const char*) const; friend std::ostream& operator<<(std::ostream&, const OptionsMap&);