Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions src/RubiChess.h
Original file line number Diff line number Diff line change
Expand Up @@ -779,8 +779,8 @@ unsigned char AlgebraicToIndex(string s);
string IndexToAlgebraic(int i);
void BitboardDraw(U64 b);
U64 getTime();
void bind_thread(int index);
string numa_configuration();
int GetNumOfNumaNodes();
string CurrentWorkingDir();
#ifdef _WIN32
void* my_large_malloc(size_t s);
Expand Down Expand Up @@ -883,6 +883,7 @@ class NnueArchitecture
virtual unsigned int GetAccumulationSize() = 0;
virtual unsigned int GetPsqtAccumulationSize() = 0;
virtual size_t GetNetworkFilesize() = 0;
virtual NnueArchitecture* Clone() = 0;
#ifdef STATISTICS
virtual void SwapInputNeurons(unsigned int i1, unsigned int i2) = 0;
virtual void Statistics(bool verbose, bool sort) = 0;
Expand All @@ -891,7 +892,6 @@ class NnueArchitecture


extern NnueType NnueReady;
extern NnueArchitecture* NnueCurrentArch;


class NnueLayer
Expand Down Expand Up @@ -1757,6 +1757,7 @@ class chessposition
// The following members (almost) don't need an init
int seldepth;
int sc;
NnueArchitecture* NnueArch;
U64 nodespermove[0x10000]; // init in prepare only for thread #0
chessmovelist captureslist[MAXDEPTH];
chessmovelist quietslist[MAXDEPTH];
Expand Down Expand Up @@ -2150,6 +2151,7 @@ class engine
chessposition rootposition;
int Threads;
int oldThreads;
int numOfNumaNodes;
workingthread *sthread;
ponderstate_t pondersearch;
int ponderhitbonus;
Expand Down Expand Up @@ -2393,6 +2395,7 @@ class workingthread
int index;
int depth;
int lastCompleteDepth;
NnueArchitecture* NnueArch;
#ifdef NNUELEARN
PackedSfenValue* psvbuffer;
PackedSfenValue* psv;
Expand All @@ -2401,9 +2404,10 @@ class workingthread
int chunkstate[2];
U64 rndseed;
#endif
uint64_t bottompadding[6];
uint64_t bottompadding[5];
void bind_thread();
void idle_loop() {
bind_thread(index);
bind_thread();
while (true)
{
unique_lock<mutex> lk(mtx);
Expand Down
28 changes: 21 additions & 7 deletions src/engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ engine::engine(compilerinfo *c)
// fixed 1.953.125 ~ 0.5 microseconds resolution => nps overflow at 9.444.732.965.738 nodes (~26h at 100Mnps, ~163h at 16Mnps)
#endif
rootposition.resetStats();
numOfNumaNodes = GetNumOfNumaNodes();
}

engine::~engine()
Expand Down Expand Up @@ -216,15 +217,21 @@ void engine::registerOptions()
ucioptions.Register(&LimitNps, "LimitNps", ucispin, "0", 0, INT_MAX, nullptr);
}


// Per-thread job: the first thread of each NUMA node (index < numOfNumaNodes)
// gets its own clone of the root position's network; every other thread gets
// nullptr here and is pointed at a node-local clone later in allocThreads().
void initNumaNetworks(workingthread* thr)
{
    if (thr->index < en.numOfNumaNodes)
        thr->NnueArch = thr->rootpos->NnueArch->Clone();
    else
        thr->NnueArch = nullptr;
}

void initThread(workingthread* thr)
{
void* buffer = allocalign64(sizeof(chessposition));
chessposition* pos = thr->pos = new(buffer) chessposition;
pos->pwnhsh.setSize();
pos->accumulation = NnueCurrentArch ? NnueCurrentArch->CreateAccumulationStack() : nullptr;
pos->psqtAccumulation = NnueCurrentArch ? NnueCurrentArch->CreatePsqtAccumulationStack() : nullptr;
if (NnueCurrentArch)
NnueCurrentArch->CreateAccumulationCache(pos);
pos->NnueArch = thr->NnueArch;
pos->accumulation = pos->NnueArch->CreateAccumulationStack();
pos->psqtAccumulation = pos->NnueArch->CreatePsqtAccumulationStack();
pos->NnueArch->CreateAccumulationCache(pos);
}

void cleanupThread(workingthread* thr)
Expand All @@ -237,6 +244,8 @@ void cleanupThread(workingthread* thr)
if (pos->accucache.psqtaccumulation)
freealigned64(pos->accucache.psqtaccumulation);
pos->~chessposition();
if (thr->index < en.numOfNumaNodes)
freealigned64(thr->NnueArch);
}

void engine::allocThreads()
Expand Down Expand Up @@ -268,6 +277,13 @@ void engine::allocThreads()
for (int i = 0; i < Threads; i++)
{
sthread[i].init(i, &rootposition);
sthread[i].run_job(initNumaNetworks);
}
for (int i = 0; i < Threads; i++)
sthread[i].wait_for_work_finished();
for (int i = 0; i < Threads; i++)
{
sthread[i].NnueArch = sthread[i % numOfNumaNodes].NnueArch;
sthread[i].run_job(initThread);
}
resetStats();
Expand Down Expand Up @@ -893,9 +909,7 @@ void prepareSearch(chessposition* pos, chessposition* rootpos)
int startIndex = PREROOTMOVES - framesToCopy + 1;
memcpy(&pos->prerootmovestack[startIndex], &rootpos->prerootmovestack[startIndex], framesToCopy * sizeof(chessmovestack));
memcpy(&pos->prerootmovecode[startIndex], &rootpos->prerootmovecode[startIndex], framesToCopy * sizeof(uint32_t));

if (NnueCurrentArch)
NnueCurrentArch->ResetAccumulationCache(pos);
pos->NnueArch->ResetAccumulationCache(pos);
}

#ifdef NNUELEARN
Expand Down
65 changes: 42 additions & 23 deletions src/nnue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ static constexpr int KingBucket[64] = {
// Global objects
//
NnueType NnueReady = NnueDisabled;
NnueArchitecture* NnueCurrentArch;

// The network architecture V1
class NnueArchitectureV1 : public NnueArchitecture {
Expand Down Expand Up @@ -206,6 +205,14 @@ class NnueArchitectureV1 : public NnueArchitecture {
size_t GetNetworkFilesize() {
return networkfilesize;
}
// Copy this V1 architecture into freshly allocated 64-byte-aligned storage
// via placement new (the clone is released with freealigned64() at cleanup).
NnueArchitecture* Clone() {
    void* mem = allocalign64(sizeof(NnueArchitectureV1));
    NnueArchitectureV1* copy = new(mem) NnueArchitectureV1;
    // V1 has a single layer stack, so only bucket 0 is copied.
    copy->NnueFt = NnueFt;
    copy->LayerStack[0] = LayerStack[0];
    return copy;
}

#ifdef STATISTICS
void SwapInputNeurons(unsigned int i1, unsigned int i2) {
// not supported for V1
Expand Down Expand Up @@ -376,6 +383,15 @@ class NnueArchitectureV5 : public NnueArchitecture {
size_t GetNetworkFilesize() {
return networkfilesize;
}
// Copy this V5 architecture into 64-byte-aligned storage via placement new;
// used to give each NUMA node its own network instance (see initNumaNetworks).
NnueArchitecture* Clone() {
char* buffer = (char*)allocalign64(sizeof(NnueArchitectureV5<NnueFtOutputdims>));
NnueArchitectureV5<NnueFtOutputdims>* NewNnueArch = new(buffer) NnueArchitectureV5<NnueFtOutputdims>;
// Member-wise copy: the feature transformer plus every layer-stack bucket.
NewNnueArch->NnueFt = NnueFt;
for (unsigned int i = 0; i < NnueLayerStacks; i++)
NewNnueArch->LayerStack[i] = LayerStack[i];
return NewNnueArch;
}

#ifdef STATISTICS
void SwapInputNeurons(unsigned int i1, unsigned int i2) {
if (i1 >= NnueFtHalfdims / 2 || i2 >= NnueFtHalfdims / 2) {
Expand Down Expand Up @@ -806,8 +822,8 @@ template <NnueType Nt, Color c, unsigned int NnueFtHalfdims, unsigned int NnuePs
chainindex++;
}

int16_t* weight = NnueCurrentArch->GetFeatureWeight();
int32_t* psqtweight = NnueCurrentArch->GetFeaturePsqtWeight();
int16_t* weight = NnueArch->GetFeatureWeight();
int32_t* psqtweight = NnueArch->GetFeaturePsqtWeight();

#ifdef USE_SIMD
constexpr unsigned int numRegs = (NUM_REGS > NnueFtHalfdims * 16 / SIMD_WIDTH ? NnueFtHalfdims * 16 / SIMD_WIDTH : NUM_REGS);
Expand Down Expand Up @@ -1007,8 +1023,8 @@ template <NnueType Nt, Color c, unsigned int NnueFtHalfdims, unsigned int NnuePs

memcpy(cachedpiece00, piece00, sizeof(piece00));

int16_t* weight = NnueCurrentArch->GetFeatureWeight();
int32_t* psqtweight = NnueCurrentArch->GetFeaturePsqtWeight();
int16_t* weight = NnueArch->GetFeatureWeight();
int32_t* psqtweight = NnueArch->GetFeaturePsqtWeight();

#ifdef USE_SIMD
constexpr unsigned int numRegs = (NUM_REGS > NnueFtHalfdims * 16 / SIMD_WIDTH ? NnueFtHalfdims * 16 / SIMD_WIDTH : NUM_REGS);
Expand Down Expand Up @@ -1249,13 +1265,13 @@ int chessposition::Transform(clipped_t *output, int bucket)

int chessposition::NnueGetEval()
{
return NnueCurrentArch->GetEval(this);
return NnueArch->GetEval(this);
}


void chessposition::NnueSpeculativeEval()
{
NnueCurrentArch->SpeculativeEval(this);
NnueArch->SpeculativeEval(this);
}


Expand Down Expand Up @@ -2041,22 +2057,22 @@ void NnueSqrClippedRelu<dims>::Propagate(int32_t* input, clipped_t* output)
//
void NnueInit()
{
NnueCurrentArch = nullptr;
}

void NnueRemove()
{
if (NnueCurrentArch) {
freealigned64(NnueCurrentArch);
NnueCurrentArch = nullptr;
if (en.rootposition.NnueArch) {
freealigned64(en.rootposition.NnueArch);
en.rootposition.NnueArch = nullptr;
}
}

bool NnueReadNet(NnueNetsource* nr)
{
NnueType oldnt = NnueReady;
unsigned int oldaccumulationsize = (NnueCurrentArch ? NnueCurrentArch->GetAccumulationSize() : 0);
unsigned int oldpsqtaccumulationsize = (NnueCurrentArch ? NnueCurrentArch->GetPsqtAccumulationSize() : 0);
NnueArchitecture* NnueArch = en.rootposition.NnueArch;
unsigned int oldaccumulationsize = (NnueArch ? NnueArch->GetAccumulationSize() : 0);
unsigned int oldpsqtaccumulationsize = (NnueArch ? NnueArch->GetPsqtAccumulationSize() : 0);

NnueReady = NnueDisabled;

Expand All @@ -2080,6 +2096,7 @@ bool NnueReadNet(NnueNetsource* nr)
bool bpz;
int leb128dim = 0;
char* buffer;
NnueArchitecture* NnueCurrentArch;
switch (version) {
case NNUEFILEVERSIONROTATE:
bpz = true;
Expand Down Expand Up @@ -2178,6 +2195,7 @@ bool NnueReadNet(NnueNetsource* nr)
|| oldaccumulationsize != NnueCurrentArch->GetAccumulationSize()
|| oldpsqtaccumulationsize != NnueCurrentArch->GetPsqtAccumulationSize())
{
en.rootposition.NnueArch = NnueCurrentArch;
en.allocThreads();
}

Expand Down Expand Up @@ -2230,6 +2248,7 @@ static int xFlate(bool compress, unsigned char* in, unsigned char** out, size_t

void NnueWriteNet(vector<string> args)
{
NnueArchitecture* NnueArch = en.rootposition.NnueArch;
size_t ci = 0;
size_t cs = args.size();
string NnueNetPath = "export.nnue";
Expand All @@ -2256,7 +2275,7 @@ void NnueWriteNet(vector<string> args)

if (sort)
#ifdef STATISTICS
NnueCurrentArch->Statistics(false, true);
NnueArch->Statistics(false, true);
#else
cout << "Cannot sort input features. This needs STATISTICS collection enabled.\n";
#endif
Expand All @@ -2272,18 +2291,18 @@ void NnueWriteNet(vector<string> args)
}

if (rescale)
NnueCurrentArch->RescaleLastLayer(rescale);
NnueArch->RescaleLastLayer(rescale);

uint32_t fthash = NnueCurrentArch->GetFtHash();
uint32_t nethash = NnueCurrentArch->GetHash();
uint32_t fthash = NnueArch->GetFtHash();
uint32_t nethash = NnueArch->GetHash();
uint32_t filehash = (fthash ^ nethash);

uint32_t version = NnueCurrentArch->GetFileVersion();
string sarchitecture = NnueCurrentArch->GetArchDescription();
uint32_t version = NnueArch->GetFileVersion();
string sarchitecture = NnueArch->GetArchDescription();
uint32_t size = (uint32_t)sarchitecture.size();

NnueNetsource nr;
nr.readbuffersize = 3 * sizeof(uint32_t) + size + NnueCurrentArch->GetNetworkFilesize();
nr.readbuffersize = 3 * sizeof(uint32_t) + size + NnueArch->GetNetworkFilesize();
nr.readbuffer = (unsigned char*)allocalign64(nr.readbuffersize);
nr.next = nr.readbuffer;

Expand All @@ -2293,8 +2312,8 @@ void NnueWriteNet(vector<string> args)
nr.write((unsigned char*)&sarchitecture[0], size);
nr.write((unsigned char*)&fthash, sizeof(uint32_t));

NnueCurrentArch->WriteFeatureWeights(&nr, leb128);
NnueCurrentArch->WriteWeights(&nr, nethash);
NnueArch->WriteFeatureWeights(&nr, leb128);
NnueArch->WriteWeights(&nr, nethash);

size_t insize = nr.next - nr.readbuffer;

Expand Down Expand Up @@ -2406,7 +2425,7 @@ bool NnueNetsource::open()
if (!openOk)
guiCom << "info string The network " + en.GetNnueNetPath() + " seems corrupted or format is not supported.\n";
else
guiCom << "info string Reading network " + en.GetNnueNetPath() + " successful. Using NNUE (" + NnueCurrentArch->GetArchName() + ").\n";
guiCom << "info string Reading network " + en.GetNnueNetPath() + " successful. Using NNUE (" + en.rootposition.NnueArch->GetArchName() + ").\n";

cleanup:
#ifndef NNUEINCLUDED
Expand Down
26 changes: 20 additions & 6 deletions src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,14 @@ string chessposition::AlgebraicFromShort(string s)
//
// Credits for this code go to Kieren Pearson / Halogen
//

// Number of NUMA nodes reported by libnuma; falls back to a single node
// when the NUMA API is not usable on this system.
int GetNumOfNumaNodes()
{
    const bool numaUsable = (numa_available() != -1);
    return numaUsable ? max(1, numa_max_node() + 1) : 1;
}

vector<cpu_set_t> get_cpu_masks_per_numa_node()
{
vector<cpu_set_t> node_cpu_masks;
Expand Down Expand Up @@ -537,15 +545,16 @@ vector<cpu_set_t> get_cpu_masks_per_numa_node()
return node_cpu_masks;
}

void bind_thread(int index)
void workingthread::bind_thread()
{
static vector<cpu_set_t> mapping = get_cpu_masks_per_numa_node();
if (mapping.size() == 0)
int numanodes = en.numOfNumaNodes;
if (numanodes < 2)
return;

static vector<cpu_set_t> mapping = get_cpu_masks_per_numa_node();
// Use a random start node for better distribution of multiple instances
static int randomOffset = getTime() % mapping.size();
size_t node = (index + randomOffset) % mapping.size();
static int randomOffset = getTime() % numanodes;
size_t node = (index + randomOffset) % numanodes;
pthread_t handle = pthread_self();
pthread_setaffinity_np(handle, sizeof(cpu_set_t), &mapping[node]);
}
Expand All @@ -559,14 +568,19 @@ string numa_configuration()
}

#else
void bind_thread(int index)
// Non-Linux build: binding worker threads to NUMA nodes is a no-op here.
void workingthread::bind_thread()
{
}

// Non-Linux build: no NUMA topology to report, so return an empty string.
string numa_configuration()
{
return "";
}

// Non-Linux build: always report a single (implicit) NUMA node.
int GetNumOfNumaNodes()
{
return 1;
}
#endif


Expand Down