diff --git a/src/RubiChess.h b/src/RubiChess.h index 7652036a..8ccb66a5 100644 --- a/src/RubiChess.h +++ b/src/RubiChess.h @@ -779,8 +779,8 @@ unsigned char AlgebraicToIndex(string s); string IndexToAlgebraic(int i); void BitboardDraw(U64 b); U64 getTime(); -void bind_thread(int index); string numa_configuration(); +int GetNumOfNumaNodes(); string CurrentWorkingDir(); #ifdef _WIN32 void* my_large_malloc(size_t s); @@ -883,6 +883,7 @@ class NnueArchitecture virtual unsigned int GetAccumulationSize() = 0; virtual unsigned int GetPsqtAccumulationSize() = 0; virtual size_t GetNetworkFilesize() = 0; + virtual NnueArchitecture* Clone() = 0; #ifdef STATISTICS virtual void SwapInputNeurons(unsigned int i1, unsigned int i2) = 0; virtual void Statistics(bool verbose, bool sort) = 0; @@ -891,7 +892,6 @@ class NnueArchitecture extern NnueType NnueReady; -extern NnueArchitecture* NnueCurrentArch; class NnueLayer @@ -1757,6 +1757,7 @@ class chessposition // The following members (almost) don't need an init int seldepth; int sc; + NnueArchitecture* NnueArch; U64 nodespermove[0x10000]; // init in prepare only for thread #0 chessmovelist captureslist[MAXDEPTH]; chessmovelist quietslist[MAXDEPTH]; @@ -2150,6 +2151,7 @@ class engine chessposition rootposition; int Threads; int oldThreads; + int numOfNumaNodes; workingthread *sthread; ponderstate_t pondersearch; int ponderhitbonus; @@ -2393,6 +2395,7 @@ class workingthread int index; int depth; int lastCompleteDepth; + NnueArchitecture* NnueArch; #ifdef NNUELEARN PackedSfenValue* psvbuffer; PackedSfenValue* psv; @@ -2401,9 +2404,10 @@ class workingthread int chunkstate[2]; U64 rndseed; #endif - uint64_t bottompadding[6]; + uint64_t bottompadding[5]; + void bind_thread(); void idle_loop() { - bind_thread(index); + bind_thread(); while (true) { unique_lock lk(mtx); diff --git a/src/engine.cpp b/src/engine.cpp index 4c2c6d00..6dbc1bca 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -169,6 +169,7 @@ engine::engine(compilerinfo *c) // fixed 1.953.125 ~ 0.5 microseconds resolution => nps overflow at 9.444.732.965.738 nodes (~26h at 100Mnps, ~163h at 16Mnps) #endif rootposition.resetStats(); + numOfNumaNodes = GetNumOfNumaNodes(); } engine::~engine() @@ -216,15 +217,21 @@ void engine::registerOptions() ucioptions.Register(&LimitNps, "LimitNps", ucispin, "0", 0, INT_MAX, nullptr); } + +void initNumaNetworks(workingthread* thr) +{ + thr->NnueArch = (thr->index < en.numOfNumaNodes ? thr->rootpos->NnueArch->Clone() : nullptr); +} + void initThread(workingthread* thr) { void* buffer = allocalign64(sizeof(chessposition)); chessposition* pos = thr->pos = new(buffer) chessposition; pos->pwnhsh.setSize(); - pos->accumulation = NnueCurrentArch ? NnueCurrentArch->CreateAccumulationStack() : nullptr; - pos->psqtAccumulation = NnueCurrentArch ? NnueCurrentArch->CreatePsqtAccumulationStack() : nullptr; - if (NnueCurrentArch) - NnueCurrentArch->CreateAccumulationCache(pos); + pos->NnueArch = thr->NnueArch; + pos->accumulation = pos->NnueArch->CreateAccumulationStack(); + pos->psqtAccumulation = pos->NnueArch->CreatePsqtAccumulationStack(); + pos->NnueArch->CreateAccumulationCache(pos); } void cleanupThread(workingthread* thr) @@ -237,6 +244,8 @@ void cleanupThread(workingthread* thr) if (pos->accucache.psqtaccumulation) freealigned64(pos->accucache.psqtaccumulation); pos->~chessposition(); + if (thr->index < en.numOfNumaNodes) + freealigned64(thr->NnueArch); } void engine::allocThreads() @@ -268,6 +277,13 @@ void engine::allocThreads() for (int i = 0; i < Threads; i++) { sthread[i].init(i, &rootposition); + sthread[i].run_job(initNumaNetworks); + } + for (int i = 0; i < Threads; i++) + sthread[i].wait_for_work_finished(); + for (int i = 0; i < Threads; i++) + { + sthread[i].NnueArch = sthread[i % numOfNumaNodes].NnueArch; sthread[i].run_job(initThread); } resetStats(); @@ -893,9 +909,7 @@ void prepareSearch(chessposition* pos, chessposition* rootpos) int startIndex = PREROOTMOVES - framesToCopy + 1; memcpy(&pos->prerootmovestack[startIndex], &rootpos->prerootmovestack[startIndex], framesToCopy * sizeof(chessmovestack)); memcpy(&pos->prerootmovecode[startIndex], &rootpos->prerootmovecode[startIndex], framesToCopy * sizeof(uint32_t)); - - if (NnueCurrentArch) - NnueCurrentArch->ResetAccumulationCache(pos); + pos->NnueArch->ResetAccumulationCache(pos); } #ifdef NNUELEARN diff --git a/src/nnue.cpp b/src/nnue.cpp index 2d3373b2..e05c52f1 100644 --- a/src/nnue.cpp +++ b/src/nnue.cpp @@ -72,7 +72,6 @@ static constexpr int KingBucket[64] = { // Global objects // NnueType NnueReady = NnueDisabled; -NnueArchitecture* NnueCurrentArch; // The network architecture V1 class NnueArchitectureV1 : public NnueArchitecture { @@ -206,6 +205,14 @@ class NnueArchitectureV1 : public NnueArchitecture { size_t GetNetworkFilesize() { return networkfilesize; } + NnueArchitecture* Clone() { + char *buffer = (char*)allocalign64(sizeof(NnueArchitectureV1)); + NnueArchitectureV1* NewNnueArch = new(buffer) NnueArchitectureV1; + NewNnueArch->NnueFt = NnueFt; + NewNnueArch->LayerStack[0] = LayerStack[0]; + return NewNnueArch; + } + #ifdef STATISTICS void SwapInputNeurons(unsigned int i1, unsigned int i2) { // not supported for V1 @@ -376,6 +383,15 @@ class NnueArchitectureV5 : public NnueArchitecture { size_t GetNetworkFilesize() { return networkfilesize; } + NnueArchitecture* Clone() { + char* buffer = (char*)allocalign64(sizeof(NnueArchitectureV5)); + NnueArchitectureV5* NewNnueArch = new(buffer) NnueArchitectureV5; + NewNnueArch->NnueFt = NnueFt; + for (unsigned int i = 0; i < NnueLayerStacks; i++) + NewNnueArch->LayerStack[i] = LayerStack[i]; + return NewNnueArch; + } + #ifdef STATISTICS void SwapInputNeurons(unsigned int i1, unsigned int i2) { if (i1 >= NnueFtHalfdims / 2 || i2 >= NnueFtHalfdims / 2) { @@ -806,8 +822,8 @@ template GetFeatureWeight(); - int32_t* psqtweight = NnueCurrentArch->GetFeaturePsqtWeight(); + int16_t* weight = NnueArch->GetFeatureWeight(); + int32_t* psqtweight = NnueArch->GetFeaturePsqtWeight(); #ifdef USE_SIMD constexpr unsigned int numRegs = (NUM_REGS > NnueFtHalfdims * 16 / SIMD_WIDTH ? NnueFtHalfdims * 16 / SIMD_WIDTH : NUM_REGS); @@ -1007,8 +1023,8 @@ template GetFeatureWeight(); - int32_t* psqtweight = NnueCurrentArch->GetFeaturePsqtWeight(); + int16_t* weight = NnueArch->GetFeatureWeight(); + int32_t* psqtweight = NnueArch->GetFeaturePsqtWeight(); #ifdef USE_SIMD constexpr unsigned int numRegs = (NUM_REGS > NnueFtHalfdims * 16 / SIMD_WIDTH ? NnueFtHalfdims * 16 / SIMD_WIDTH : NUM_REGS); @@ -1249,13 +1265,13 @@ int chessposition::Transform(clipped_t *output, int bucket) int chessposition::NnueGetEval() { - return NnueCurrentArch->GetEval(this); + return NnueArch->GetEval(this); } void chessposition::NnueSpeculativeEval() { - NnueCurrentArch->SpeculativeEval(this); + NnueArch->SpeculativeEval(this); } @@ -2041,22 +2057,22 @@ void NnueSqrClippedRelu::Propagate(int32_t* input, clipped_t* output) // void NnueInit() { - NnueCurrentArch = nullptr; } void NnueRemove() { - if (NnueCurrentArch) { - freealigned64(NnueCurrentArch); - NnueCurrentArch = nullptr; + if (en.rootposition.NnueArch) { + freealigned64(en.rootposition.NnueArch); + en.rootposition.NnueArch = nullptr; } } bool NnueReadNet(NnueNetsource* nr) { NnueType oldnt = NnueReady; - unsigned int oldaccumulationsize = (NnueCurrentArch ? NnueCurrentArch->GetAccumulationSize() : 0); - unsigned int oldpsqtaccumulationsize = (NnueCurrentArch ? NnueCurrentArch->GetPsqtAccumulationSize() : 0); + NnueArchitecture* NnueArch = en.rootposition.NnueArch; + unsigned int oldaccumulationsize = (NnueArch ? NnueArch->GetAccumulationSize() : 0); + unsigned int oldpsqtaccumulationsize = (NnueArch ? NnueArch->GetPsqtAccumulationSize() : 0); NnueReady = NnueDisabled; @@ -2080,6 +2096,7 @@ bool NnueReadNet(NnueNetsource* nr) bool bpz; int leb128dim = 0; char* buffer; + NnueArchitecture* NnueCurrentArch; switch (version) { case NNUEFILEVERSIONROTATE: bpz = true; @@ -2178,6 +2195,7 @@ bool NnueReadNet(NnueNetsource* nr) || oldaccumulationsize != NnueCurrentArch->GetAccumulationSize() || oldpsqtaccumulationsize != NnueCurrentArch->GetPsqtAccumulationSize()) { + en.rootposition.NnueArch = NnueCurrentArch; en.allocThreads(); } @@ -2230,6 +2248,7 @@ static int xFlate(bool compress, unsigned char* in, unsigned char** out, size_t void NnueWriteNet(vector args) { + NnueArchitecture* NnueArch = en.rootposition.NnueArch; size_t ci = 0; size_t cs = args.size(); string NnueNetPath = "export.nnue"; @@ -2256,7 +2275,7 @@ void NnueWriteNet(vector args) if (sort) #ifdef STATISTICS - NnueCurrentArch->Statistics(false, true); + NnueArch->Statistics(false, true); #else cout << "Cannot sort input features. This needs STATISTICS collection enabled.\n"; #endif @@ -2272,18 +2291,18 @@ void NnueWriteNet(vector args) } if (rescale) - NnueCurrentArch->RescaleLastLayer(rescale); + NnueArch->RescaleLastLayer(rescale); - uint32_t fthash = NnueCurrentArch->GetFtHash(); - uint32_t nethash = NnueCurrentArch->GetHash(); + uint32_t fthash = NnueArch->GetFtHash(); + uint32_t nethash = NnueArch->GetHash(); uint32_t filehash = (fthash ^ nethash); - uint32_t version = NnueCurrentArch->GetFileVersion(); - string sarchitecture = NnueCurrentArch->GetArchDescription(); + uint32_t version = NnueArch->GetFileVersion(); + string sarchitecture = NnueArch->GetArchDescription(); uint32_t size = (uint32_t)sarchitecture.size(); NnueNetsource nr; - nr.readbuffersize = 3 * sizeof(uint32_t) + size + NnueCurrentArch->GetNetworkFilesize(); + nr.readbuffersize = 3 * sizeof(uint32_t) + size + NnueArch->GetNetworkFilesize(); nr.readbuffer = (unsigned char*)allocalign64(nr.readbuffersize); nr.next = nr.readbuffer; @@ -2293,8 +2312,8 @@ void NnueWriteNet(vector args) nr.write((unsigned char*)&sarchitecture[0], size); nr.write((unsigned char*)&fthash, sizeof(uint32_t)); - NnueCurrentArch->WriteFeatureWeights(&nr, leb128); - NnueCurrentArch->WriteWeights(&nr, nethash); + NnueArch->WriteFeatureWeights(&nr, leb128); + NnueArch->WriteWeights(&nr, nethash); size_t insize = nr.next - nr.readbuffer; @@ -2406,7 +2425,7 @@ bool NnueNetsource::open() if (!openOk) guiCom << "info string The network " + en.GetNnueNetPath() + " seems corrupted or format is not supported.\n"; else - guiCom << "info string Reading network " + en.GetNnueNetPath() + " successful. Using NNUE (" + NnueCurrentArch->GetArchName() + ").\n"; + guiCom << "info string Reading network " + en.GetNnueNetPath() + " successful. Using NNUE (" + en.rootposition.NnueArch->GetArchName() + ").\n"; cleanup: #ifndef NNUEINCLUDED diff --git a/src/utils.cpp b/src/utils.cpp index 325b0b9c..1c43cc8a 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -502,6 +502,14 @@ string chessposition::AlgebraicFromShort(string s) // // Credits for this code go to Kieren Pearson / Halogen // + +int GetNumOfNumaNodes() +{ + if (numa_available() == -1) + return 1; + return max(1, numa_max_node() + 1); +} + vector get_cpu_masks_per_numa_node() { vector node_cpu_masks; @@ -537,15 +545,16 @@ vector get_cpu_masks_per_numa_node() return node_cpu_masks; } -void bind_thread(int index) +void workingthread::bind_thread() { - static vector mapping = get_cpu_masks_per_numa_node(); - if (mapping.size() == 0) + int numanodes = en.numOfNumaNodes; + if (numanodes < 2) return; + static vector mapping = get_cpu_masks_per_numa_node(); // Use a random start node for better distribution of multiple instances - static int randomOffset = getTime() % mapping.size(); - size_t node = (index + randomOffset) % mapping.size(); + static int randomOffset = getTime() % numanodes; + size_t node = (index + randomOffset) % numanodes; pthread_t handle = pthread_self(); pthread_setaffinity_np(handle, sizeof(cpu_set_t), &mapping[node]); } @@ -559,7 +568,7 @@ string numa_configuration() } #else -void bind_thread(int index) +void workingthread::bind_thread() { } @@ -567,6 +576,11 @@ string numa_configuration() { return ""; } + +int GetNumOfNumaNodes() +{ + return 1; +} #endif