Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions src/RubiChess.h
Original file line number Diff line number Diff line change
Expand Up @@ -779,8 +779,8 @@ unsigned char AlgebraicToIndex(string s);
string IndexToAlgebraic(int i);
void BitboardDraw(U64 b);
U64 getTime();
void bind_thread(int index);
string numa_configuration();
int GetNumOfNumaNodes();
string CurrentWorkingDir();
#ifdef _WIN32
void* my_large_malloc(size_t s);
Expand Down Expand Up @@ -883,6 +883,7 @@ class NnueArchitecture
virtual unsigned int GetAccumulationSize() = 0;
virtual unsigned int GetPsqtAccumulationSize() = 0;
virtual size_t GetNetworkFilesize() = 0;
virtual NnueArchitecture* Clone() = 0;
#ifdef STATISTICS
virtual void SwapInputNeurons(unsigned int i1, unsigned int i2) = 0;
virtual void Statistics(bool verbose, bool sort) = 0;
Expand All @@ -891,7 +892,6 @@ class NnueArchitecture


extern NnueType NnueReady;
extern NnueArchitecture* NnueCurrentArch;


class NnueLayer
Expand Down Expand Up @@ -1757,6 +1757,7 @@ class chessposition
// The following members (almost) don't need an init
int seldepth;
int sc;
NnueArchitecture* NnueArch;
U64 nodespermove[0x10000]; // init in prepare only for thread #0
chessmovelist captureslist[MAXDEPTH];
chessmovelist quietslist[MAXDEPTH];
Expand Down Expand Up @@ -2150,6 +2151,7 @@ class engine
chessposition rootposition;
int Threads;
int oldThreads;
int numOfNumaNodes;
workingthread *sthread;
ponderstate_t pondersearch;
int ponderhitbonus;
Expand Down Expand Up @@ -2393,6 +2395,7 @@ class workingthread
int index;
int depth;
int lastCompleteDepth;
NnueArchitecture* NnueArch;
#ifdef NNUELEARN
PackedSfenValue* psvbuffer;
PackedSfenValue* psv;
Expand All @@ -2401,9 +2404,10 @@ class workingthread
int chunkstate[2];
U64 rndseed;
#endif
uint64_t bottompadding[6];
uint64_t bottompadding[5];
void bind_thread();
void idle_loop() {
bind_thread(index);
bind_thread();
while (true)
{
unique_lock<mutex> lk(mtx);
Expand Down
28 changes: 21 additions & 7 deletions src/engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ engine::engine(compilerinfo *c)
// fixed 1.953.125 ~ 0.5 microseconds resolution => nps overflow at 9.444.732.965.738 nodes (~26h at 100Mnps, ~163h at 16Mnps)
#endif
rootposition.resetStats();
numOfNumaNodes = GetNumOfNumaNodes();
}

engine::~engine()
Expand Down Expand Up @@ -216,15 +217,21 @@ void engine::registerOptions()
ucioptions.Register(&LimitNps, "LimitNps", ucispin, "0", 0, INT_MAX, nullptr);
}


// Per-thread job: the first thread of each NUMA node (index < numOfNumaNodes)
// gets its own clone of the root position's network; every other thread gets
// nullptr here and is pointed at a node-local clone later in allocThreads().
void initNumaNetworks(workingthread* thr)
{
    if (thr->index < en.numOfNumaNodes)
        thr->NnueArch = thr->rootpos->NnueArch->Clone();
    else
        thr->NnueArch = nullptr;
}

void initThread(workingthread* thr)
{
void* buffer = allocalign64(sizeof(chessposition));
chessposition* pos = thr->pos = new(buffer) chessposition;
pos->pwnhsh.setSize();
pos->accumulation = NnueCurrentArch ? NnueCurrentArch->CreateAccumulationStack() : nullptr;
pos->psqtAccumulation = NnueCurrentArch ? NnueCurrentArch->CreatePsqtAccumulationStack() : nullptr;
if (NnueCurrentArch)
NnueCurrentArch->CreateAccumulationCache(pos);
pos->NnueArch = thr->NnueArch;
pos->accumulation = pos->NnueArch->CreateAccumulationStack();
pos->psqtAccumulation = pos->NnueArch->CreatePsqtAccumulationStack();
pos->NnueArch->CreateAccumulationCache(pos);
}

void cleanupThread(workingthread* thr)
Expand All @@ -237,6 +244,8 @@ void cleanupThread(workingthread* thr)
if (pos->accucache.psqtaccumulation)
freealigned64(pos->accucache.psqtaccumulation);
pos->~chessposition();
if (thr->index < en.numOfNumaNodes)
freealigned64(thr->NnueArch);
}

void engine::allocThreads()
Expand Down Expand Up @@ -268,6 +277,13 @@ void engine::allocThreads()
for (int i = 0; i < Threads; i++)
{
sthread[i].init(i, &rootposition);
sthread[i].run_job(initNumaNetworks);
}
for (int i = 0; i < Threads; i++)
sthread[i].wait_for_work_finished();
for (int i = 0; i < Threads; i++)
{
sthread[i].NnueArch = sthread[i % numOfNumaNodes].NnueArch;
sthread[i].run_job(initThread);
}
resetStats();
Expand Down Expand Up @@ -893,9 +909,7 @@ void prepareSearch(chessposition* pos, chessposition* rootpos)
int startIndex = PREROOTMOVES - framesToCopy + 1;
memcpy(&pos->prerootmovestack[startIndex], &rootpos->prerootmovestack[startIndex], framesToCopy * sizeof(chessmovestack));
memcpy(&pos->prerootmovecode[startIndex], &rootpos->prerootmovecode[startIndex], framesToCopy * sizeof(uint32_t));

if (NnueCurrentArch)
NnueCurrentArch->ResetAccumulationCache(pos);
pos->NnueArch->ResetAccumulationCache(pos);
}

#ifdef NNUELEARN
Expand Down
65 changes: 42 additions & 23 deletions src/nnue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ static constexpr int KingBucket[64] = {
// Global objects
//
NnueType NnueReady = NnueDisabled;
NnueArchitecture* NnueCurrentArch;

// The network architecture V1
class NnueArchitectureV1 : public NnueArchitecture {
Expand Down Expand Up @@ -206,6 +205,14 @@ class NnueArchitectureV1 : public NnueArchitecture {
size_t GetNetworkFilesize() {
return networkfilesize;
}
// Copy this V1 architecture into freshly allocated 64-byte-aligned storage
// via placement new (the clone is released with freealigned64() at cleanup).
NnueArchitecture* Clone() {
    void* mem = allocalign64(sizeof(NnueArchitectureV1));
    NnueArchitectureV1* copy = new(mem) NnueArchitectureV1;
    // V1 has a single layer stack, so only bucket 0 is copied.
    copy->NnueFt = NnueFt;
    copy->LayerStack[0] = LayerStack[0];
    return copy;
}

#ifdef STATISTICS
void SwapInputNeurons(unsigned int i1, unsigned int i2) {
// not supported for V1
Expand Down Expand Up @@ -376,6 +383,15 @@ class NnueArchitectureV5 : public NnueArchitecture {
size_t GetNetworkFilesize() {
return networkfilesize;
}
// Copy this V5 architecture into 64-byte-aligned storage via placement new;
// used to give each NUMA node its own network instance (see initNumaNetworks).
NnueArchitecture* Clone() {
char* buffer = (char*)allocalign64(sizeof(NnueArchitectureV5<NnueFtOutputdims>));
NnueArchitectureV5<NnueFtOutputdims>* NewNnueArch = new(buffer) NnueArchitectureV5<NnueFtOutputdims>;
// Member-wise copy: the feature transformer plus every layer-stack bucket.
NewNnueArch->NnueFt = NnueFt;
for (unsigned int i = 0; i < NnueLayerStacks; i++)
NewNnueArch->LayerStack[i] = LayerStack[i];
return NewNnueArch;
}

#ifdef STATISTICS
void SwapInputNeurons(unsigned int i1, unsigned int i2) {
if (i1 >= NnueFtHalfdims / 2 || i2 >= NnueFtHalfdims / 2) {
Expand Down Expand Up @@ -806,8 +822,8 @@ template <NnueType Nt, Color c, unsigned int NnueFtHalfdims, unsigned int NnuePs
chainindex++;
}

int16_t* weight = NnueCurrentArch->GetFeatureWeight();
int32_t* psqtweight = NnueCurrentArch->GetFeaturePsqtWeight();
int16_t* weight = NnueArch->GetFeatureWeight();
int32_t* psqtweight = NnueArch->GetFeaturePsqtWeight();

#ifdef USE_SIMD
constexpr unsigned int numRegs = (NUM_REGS > NnueFtHalfdims * 16 / SIMD_WIDTH ? NnueFtHalfdims * 16 / SIMD_WIDTH : NUM_REGS);
Expand Down Expand Up @@ -1007,8 +1023,8 @@ template <NnueType Nt, Color c, unsigned int NnueFtHalfdims, unsigned int NnuePs

memcpy(cachedpiece00, piece00, sizeof(piece00));

int16_t* weight = NnueCurrentArch->GetFeatureWeight();
int32_t* psqtweight = NnueCurrentArch->GetFeaturePsqtWeight();
int16_t* weight = NnueArch->GetFeatureWeight();
int32_t* psqtweight = NnueArch->GetFeaturePsqtWeight();

#ifdef USE_SIMD
constexpr unsigned int numRegs = (NUM_REGS > NnueFtHalfdims * 16 / SIMD_WIDTH ? NnueFtHalfdims * 16 / SIMD_WIDTH : NUM_REGS);
Expand Down Expand Up @@ -1249,13 +1265,13 @@ int chessposition::Transform(clipped_t *output, int bucket)

int chessposition::NnueGetEval()
{
return NnueCurrentArch->GetEval(this);
return NnueArch->GetEval(this);
}


void chessposition::NnueSpeculativeEval()
{
NnueCurrentArch->SpeculativeEval(this);
NnueArch->SpeculativeEval(this);
}


Expand Down Expand Up @@ -2041,22 +2057,22 @@ void NnueSqrClippedRelu<dims>::Propagate(int32_t* input, clipped_t* output)
//
void NnueInit()
{
NnueCurrentArch = nullptr;
}

void NnueRemove()
{
if (NnueCurrentArch) {
freealigned64(NnueCurrentArch);
NnueCurrentArch = nullptr;
if (en.rootposition.NnueArch) {
freealigned64(en.rootposition.NnueArch);
en.rootposition.NnueArch = nullptr;
}
}

bool NnueReadNet(NnueNetsource* nr)
{
NnueType oldnt = NnueReady;
unsigned int oldaccumulationsize = (NnueCurrentArch ? NnueCurrentArch->GetAccumulationSize() : 0);
unsigned int oldpsqtaccumulationsize = (NnueCurrentArch ? NnueCurrentArch->GetPsqtAccumulationSize() : 0);
NnueArchitecture* NnueArch = en.rootposition.NnueArch;
unsigned int oldaccumulationsize = (NnueArch ? NnueArch->GetAccumulationSize() : 0);
unsigned int oldpsqtaccumulationsize = (NnueArch ? NnueArch->GetPsqtAccumulationSize() : 0);

NnueReady = NnueDisabled;

Expand All @@ -2080,6 +2096,7 @@ bool NnueReadNet(NnueNetsource* nr)
bool bpz;
int leb128dim = 0;
char* buffer;
NnueArchitecture* NnueCurrentArch;
switch (version) {
case NNUEFILEVERSIONROTATE:
bpz = true;
Expand Down Expand Up @@ -2178,6 +2195,7 @@ bool NnueReadNet(NnueNetsource* nr)
|| oldaccumulationsize != NnueCurrentArch->GetAccumulationSize()
|| oldpsqtaccumulationsize != NnueCurrentArch->GetPsqtAccumulationSize())
{
en.rootposition.NnueArch = NnueCurrentArch;
en.allocThreads();
}

Expand Down Expand Up @@ -2230,6 +2248,7 @@ static int xFlate(bool compress, unsigned char* in, unsigned char** out, size_t

void NnueWriteNet(vector<string> args)
{
NnueArchitecture* NnueArch = en.rootposition.NnueArch;
size_t ci = 0;
size_t cs = args.size();
string NnueNetPath = "export.nnue";
Expand All @@ -2256,7 +2275,7 @@ void NnueWriteNet(vector<string> args)

if (sort)
#ifdef STATISTICS
NnueCurrentArch->Statistics(false, true);
NnueArch->Statistics(false, true);
#else
cout << "Cannot sort input features. This needs STATISTICS collection enabled.\n";
#endif
Expand All @@ -2272,18 +2291,18 @@ void NnueWriteNet(vector<string> args)
}

if (rescale)
NnueCurrentArch->RescaleLastLayer(rescale);
NnueArch->RescaleLastLayer(rescale);

uint32_t fthash = NnueCurrentArch->GetFtHash();
uint32_t nethash = NnueCurrentArch->GetHash();
uint32_t fthash = NnueArch->GetFtHash();
uint32_t nethash = NnueArch->GetHash();
uint32_t filehash = (fthash ^ nethash);

uint32_t version = NnueCurrentArch->GetFileVersion();
string sarchitecture = NnueCurrentArch->GetArchDescription();
uint32_t version = NnueArch->GetFileVersion();
string sarchitecture = NnueArch->GetArchDescription();
uint32_t size = (uint32_t)sarchitecture.size();

NnueNetsource nr;
nr.readbuffersize = 3 * sizeof(uint32_t) + size + NnueCurrentArch->GetNetworkFilesize();
nr.readbuffersize = 3 * sizeof(uint32_t) + size + NnueArch->GetNetworkFilesize();
nr.readbuffer = (unsigned char*)allocalign64(nr.readbuffersize);
nr.next = nr.readbuffer;

Expand All @@ -2293,8 +2312,8 @@ void NnueWriteNet(vector<string> args)
nr.write((unsigned char*)&sarchitecture[0], size);
nr.write((unsigned char*)&fthash, sizeof(uint32_t));

NnueCurrentArch->WriteFeatureWeights(&nr, leb128);
NnueCurrentArch->WriteWeights(&nr, nethash);
NnueArch->WriteFeatureWeights(&nr, leb128);
NnueArch->WriteWeights(&nr, nethash);

size_t insize = nr.next - nr.readbuffer;

Expand Down Expand Up @@ -2406,7 +2425,7 @@ bool NnueNetsource::open()
if (!openOk)
guiCom << "info string The network " + en.GetNnueNetPath() + " seems corrupted or format is not supported.\n";
else
guiCom << "info string Reading network " + en.GetNnueNetPath() + " successful. Using NNUE (" + NnueCurrentArch->GetArchName() + ").\n";
guiCom << "info string Reading network " + en.GetNnueNetPath() + " successful. Using NNUE (" + en.rootposition.NnueArch->GetArchName() + ").\n";

cleanup:
#ifndef NNUEINCLUDED
Expand Down
26 changes: 20 additions & 6 deletions src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,14 @@ string chessposition::AlgebraicFromShort(string s)
//
// Credits for this code go to Kieren Pearson / Halogen
//

// Number of NUMA nodes reported by libnuma; falls back to a single node
// when the NUMA API is not usable on this system.
int GetNumOfNumaNodes()
{
    const bool numaUsable = (numa_available() != -1);
    return numaUsable ? max(1, numa_max_node() + 1) : 1;
}

vector<cpu_set_t> get_cpu_masks_per_numa_node()
{
vector<cpu_set_t> node_cpu_masks;
Expand Down Expand Up @@ -537,15 +545,16 @@ vector<cpu_set_t> get_cpu_masks_per_numa_node()
return node_cpu_masks;
}

void bind_thread(int index)
void workingthread::bind_thread()
{
static vector<cpu_set_t> mapping = get_cpu_masks_per_numa_node();
if (mapping.size() == 0)
int numanodes = en.numOfNumaNodes;
if (numanodes < 2)
return;

static vector<cpu_set_t> mapping = get_cpu_masks_per_numa_node();
// Use a random start node for better distribution of multiple instances
static int randomOffset = getTime() % mapping.size();
size_t node = (index + randomOffset) % mapping.size();
static int randomOffset = getTime() % numanodes;
size_t node = (index + randomOffset) % numanodes;
pthread_t handle = pthread_self();
pthread_setaffinity_np(handle, sizeof(cpu_set_t), &mapping[node]);
}
Expand All @@ -559,14 +568,19 @@ string numa_configuration()
}

#else
void bind_thread(int index)
// Non-Linux build: binding worker threads to NUMA nodes is a no-op here.
void workingthread::bind_thread()
{
}

// Non-Linux build: no NUMA topology to report, so return an empty string.
string numa_configuration()
{
return "";
}

// Non-Linux build: always report a single (implicit) NUMA node.
int GetNumOfNumaNodes()
{
return 1;
}
#endif


Expand Down