From f450d9eda5bc1d0e1b0957c3175eeefb56f70db2 Mon Sep 17 00:00:00 2001 From: Andreas Matthies Date: Wed, 13 Aug 2025 08:45:20 +0200 Subject: [PATCH 1/8] Try to implement speedup of transformation using smarter shifting and multiplication. Original patch by cj5716, idea by Andrew Grant (don't mess with credits these days). Several things missing for now and bench is still wrong. --- src/RubiChess.h | 2 +- src/nnue.cpp | 41 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/src/RubiChess.h b/src/RubiChess.h index 839e2391..8cf5ebbb 100644 --- a/src/RubiChess.h +++ b/src/RubiChess.h @@ -21,7 +21,7 @@ #define NNUEDEFAULT nn-f05142b28f-20250520.nnue // enable this switch for faster SSE2 code using 16bit integers -#define FASTSSE2 +//#define FASTSSE2 // Enable to get statistical values about various search features //#define STATISTICS diff --git a/src/nnue.cpp b/src/nnue.cpp index 2980739c..c25a06e1 100644 --- a/src/nnue.cpp +++ b/src/nnue.cpp @@ -520,11 +520,15 @@ typedef __m128i bias_vec_t; #define vec_set_16(a) _mm512_set1_epi16(a) #define vec_max_16(a,b) _mm512_max_epi16(a,b) #define vec_min_16(a,b) _mm512_min_epi16(a,b) -#define vec_mul_16(a,b) _mm512_mullo_epi16(a,b) +#define vec_mulhi_16(a,b) _mm512_mulhi_epi16(a,b) +#define vec_slli_16(a,b) _mm512_slli_epi16(a,b) +#define vec_packus_16(a,b) _mm512_packus_epi16(a,b) +#if 0 inline ft_vec_t vec_msb_pack_16(ft_vec_t a, ft_vec_t b) { ft_vec_t compacted = _mm512_packs_epi16(_mm512_srli_epi16(a, 7), _mm512_srli_epi16(b, 7)); return _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), compacted); } +#endif #define vec_add_16(a,b) _mm512_add_epi16(a,b) #define vec_sub_16(a,b) _mm512_sub_epi16(a,b) #define vec_packs(a,b) _mm512_packs_epi16(a,b) @@ -554,11 +558,15 @@ typedef __m128i bias_vec_t; #define vec_set_16(a) _mm256_set1_epi16(a) #define vec_max_16(a,b) _mm256_max_epi16(a,b) #define vec_min_16(a,b) _mm256_min_epi16(a,b) -#define vec_mul_16(a,b) 
_mm256_mullo_epi16(a,b) +#define vec_mulhi_16(a,b) _mm256_mulhi_epi16(a,b) +#define vec_slli_16(a,b) _mm256_slli_epi16(a,b) +#define vec_packus_16(a,b) _mm256_packus_epi16(a,b) +#if 0 inline ft_vec_t vec_msb_pack_16(ft_vec_t a, ft_vec_t b) { ft_vec_t compacted = _mm256_packs_epi16(_mm256_srli_epi16(a, 7), _mm256_srli_epi16(b, 7)); return _mm256_permute4x64_epi64(compacted, 0xd8); } +#endif #define vec_add_16(a,b) _mm256_add_epi16(a,b) #define vec_sub_16(a,b) _mm256_sub_epi16(a,b) #define vec_packs(a,b) _mm256_packs_epi16(a,b) @@ -1168,13 +1176,14 @@ int chessposition::Transform(clipped_t *output, int bucket) { const unsigned int numChunks = NnueFtHalfdims / 2 / MAXCHUNKSIZE; ft_vec_t Zero = vec_zero(); - ft_vec_t One = vec_set_16(127); + ft_vec_t One = vec_set_16(127 * 2); const ft_vec_t* in0 = (ft_vec_t*)(acm + perspectives[p] * NnueFtHalfdims); const ft_vec_t* in1 = (ft_vec_t*)(acm + perspectives[p] * NnueFtHalfdims + NnueFtHalfdims / 2); ftout_vec_t* out = (ftout_vec_t*)&output[offset]; for (unsigned int i = 0; i < numChunks; i++) { +#if 0 const ft_vec_t sum0a = vec_max_16(vec_min_16(in0[i * 2 + 0], One), Zero); const ft_vec_t sum0b = vec_max_16(vec_min_16(in0[i * 2 + 1], One), Zero); const ft_vec_t sum1a = vec_max_16(vec_min_16(in1[i * 2 + 0], One), Zero); @@ -1190,6 +1199,23 @@ int chessposition::Transform(clipped_t *output, int bucket) out[i * 2 + 1] = shftb; #else out[i] = vec_msb_pack_16(pa, pb); +#endif +#else +#ifdef USE_SSE2 + const int shift = 7; +#else // NEON + const int shift = 6; +#endif + const ft_vec_t sum0a = vec_slli_16(vec_max_16(vec_min_16(in0[i * 2 + 0], One), Zero), shift); + const ft_vec_t sum0b = vec_slli_16(vec_max_16(vec_min_16(in0[i * 2 + 1], One), Zero), shift); + const ft_vec_t sum1a = vec_min_16(in1[i * 2 + 0], One); + const ft_vec_t sum1b = vec_min_16(in1[i * 2 + 1], One); + + const ft_vec_t pa = vec_mulhi_16(sum0a, sum1a); + const ft_vec_t pb = vec_mulhi_16(sum0b, sum1b); + + out[i] = vec_packus_16(pa, pb); + #endif } } @@ -1332,7 
+1358,9 @@ bool NnueFeatureTransformer::ReadFeatureWeights( else okay = okay && nr->read((unsigned char*)src_16, ftdims * sizeof(int16_t)); - memcpy(bias, src_16, ftdims * sizeof(int16_t)); + //memcpy(bias, src_16, ftdims * sizeof(int16_t)); + for (i = 0; i < ftdims; i++) + bias[i] = src_16[i] * 2; // read weights isLeb128 = testLeb128(nr); @@ -1351,7 +1379,10 @@ bool NnueFeatureTransformer::ReadFeatureWeights( } } - memcpy(weight, src_16, inputdims * ftdims * sizeof(int16_t)); + //memcpy(weight, src_16, inputdims * ftdims * sizeof(int16_t)); + for (i = 0; i < inputdims * ftdims; i++) + weight[i] = src_16[i] * 2; + free(src_16); if (psqtbuckets) From 43b86c3dc860403618fce650809070ad196c3daf Mon Sep 17 00:00:00 2001 From: Matthies Date: Wed, 13 Aug 2025 16:24:35 +0200 Subject: [PATCH 2/8] Permute the weights when loading the network. Bench: 5085407 --- src/RubiChess.h | 19 ++++++++++++++++++- src/nnue.cpp | 10 ++++++---- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/RubiChess.h b/src/RubiChess.h index 8cf5ebbb..61b7a089 100644 --- a/src/RubiChess.h +++ b/src/RubiChess.h @@ -21,7 +21,7 @@ #define NNUEDEFAULT nn-f05142b28f-20250520.nnue // enable this switch for faster SSE2 code using 16bit integers -//#define FASTSSE2 +#define FASTSSE2 // Enable to get statistical values about various search features //#define STATISTICS @@ -932,6 +932,23 @@ class NnueFeatureTransformer : public NnueLayer uint32_t GetHash() { return NNUEINPUTSLICEHASH ^ (ftdims * 2); }; + int permutedWeightIndex(int i, bool reverse = false) + { +#if defined(USE_AVX512) + const int permuteindex[] = { 0, 2, 4, 6, 1, 3, 5, 7 }; + const int reversepermuteindex[] = { 0, 4, 1, 5, 2, 6, 3, 7 }; +#elif defined(USE_AVX2) + const int permuteindex[] = {0, 2, 1, 3, 4, 6, 5, 7}; + const int reversepermuteindex[] = { 0, 2, 1, 3, 4, 6, 5, 7 }; +#else + const int permuteindex[] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + const int reversepermuteindex[] = { 0, 1, 2, 3, 4, 5, 6, 7 }; +#endif + int block = (i / 
64) * 64; + int chunk = (i % 64) / 8; + int permutedindex = (reverse ? reversepermuteindex[chunk] : permuteindex[chunk]) * 8 + (i % 8); + return block + permutedindex; + } #ifdef STATISTICS void SwapWeights(unsigned int i1, unsigned int i2) { int16_t bias_temp = bias[i1]; diff --git a/src/nnue.cpp b/src/nnue.cpp index c25a06e1..49196332 100644 --- a/src/nnue.cpp +++ b/src/nnue.cpp @@ -1358,9 +1358,9 @@ bool NnueFeatureTransformer::ReadFeatureWeights( else okay = okay && nr->read((unsigned char*)src_16, ftdims * sizeof(int16_t)); - //memcpy(bias, src_16, ftdims * sizeof(int16_t)); + // Scale and permute for (i = 0; i < ftdims; i++) - bias[i] = src_16[i] * 2; + bias[permutedWeightIndex(i)] = src_16[i] * 2; // read weights isLeb128 = testLeb128(nr); @@ -1379,9 +1379,11 @@ bool NnueFeatureTransformer::ReadFeatureWeights( } } - //memcpy(weight, src_16, inputdims * ftdims * sizeof(int16_t)); + // Scale and permute for (i = 0; i < inputdims * ftdims; i++) - weight[i] = src_16[i] * 2; + { + weight[permutedWeightIndex(i)] = src_16[i] * 2; + } free(src_16); From ceaf716629e832b989122467a4eafe5cec44ce41 Mon Sep 17 00:00:00 2001 From: Matthies Date: Wed, 13 Aug 2025 18:07:06 +0200 Subject: [PATCH 3/8] Fix other archs. Implement writing. 
Bench: 5085407 --- src/RubiChess.h | 4 ++-- src/nnue.cpp | 36 +++++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/RubiChess.h b/src/RubiChess.h index 61b7a089..1a961de4 100644 --- a/src/RubiChess.h +++ b/src/RubiChess.h @@ -935,8 +935,8 @@ class NnueFeatureTransformer : public NnueLayer int permutedWeightIndex(int i, bool reverse = false) { #if defined(USE_AVX512) - const int permuteindex[] = { 0, 2, 4, 6, 1, 3, 5, 7 }; - const int reversepermuteindex[] = { 0, 4, 1, 5, 2, 6, 3, 7 }; + const int permuteindex[] = { 0, 4, 1, 5, 2, 6, 3, 7 }; + const int reversepermuteindex[] = { 0, 2, 4, 6, 1, 3, 5, 7 }; #elif defined(USE_AVX2) const int permuteindex[] = {0, 2, 1, 3, 4, 6, 5, 7}; const int reversepermuteindex[] = { 0, 2, 1, 3, 4, 6, 5, 7 }; diff --git a/src/nnue.cpp b/src/nnue.cpp index 49196332..2eaf13ef 100644 --- a/src/nnue.cpp +++ b/src/nnue.cpp @@ -595,11 +595,13 @@ typedef __m128i ft_vec_t, ftout_vec_t, psqt_vec_t; #define vec_set_16(a) _mm_set1_epi16(a) #define vec_max_16(a,b) _mm_max_epi16(a,b) #define vec_min_16(a,b) _mm_min_epi16(a,b) -#define vec_mul_16(a,b) _mm_mullo_epi16(a,b) +#define vec_mulhi_16(a,b) _mm_mulhi_epi16(a,b) +#define vec_slli_16(a,b) _mm_slli_epi16(a,b) +#define vec_packus_16(a,b) _mm_packus_epi16(a,b) #define vec_add_16(a,b) _mm_add_epi16(a,b) #define vec_sub_16(a,b) _mm_sub_epi16(a,b) #define vec_packs(a,b) _mm_packs_epi16(a,b) -#define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7)) +//#define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7)) #define vec_zero_psqt() _mm_setzero_si128() #define vec_add_psqt_32(a,b) _mm_add_epi32(a,b) #define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b) @@ -1248,9 +1250,9 @@ int chessposition::Transform(clipped_t *output, int bucket) for (unsigned int i = 0; i < NnueFtHalfdims / 2; i++) { int16_t sum0 = *(acm + perspectives[p] * NnueFtHalfdims + i); int16_t sum1 = *(acm + perspectives[p] * 
NnueFtHalfdims + NnueFtHalfdims / 2 + i); - sum0 = max((int16_t)0, min((int16_t)127, sum0)); - sum1 = max((int16_t)0, min((int16_t)127, sum1)); - output[offset + i] = sum0 * sum1 / 128; + sum0 = max((int16_t)0, min((int16_t)(127 * 2), sum0)); + sum1 = max((int16_t)0, min((int16_t)(127 * 2), sum1)); + output[offset + i] = sum0 * sum1 / 512; } } #endif @@ -1452,16 +1454,32 @@ bool writeLeb128(NnueNetsource* nr, IntType* in, size_t count) template void NnueFeatureTransformer::WriteFeatureWeights(NnueNetsource* nr, bool leb128) { + // we need some buffers for unscaled and unpermuted weights written to network file + int16_t* scaledweight = (int16_t*)calloc(inputdims * ftdims, sizeof(int16_t)); + int16_t* scaledbias = (int16_t*)calloc(ftdims, sizeof(int16_t)); + if (!scaledweight || !scaledbias) + return; + + // Scale and permute + int i; + for (i = 0; i < ftdims; i++) + scaledbias[permutedWeightIndex(i, true)] = bias[i] / 2; + for (i = 0; i < inputdims * ftdims; i++) + scaledweight[permutedWeightIndex(i, true)] = weight[i] / 2; + if (leb128) { - writeLeb128(nr, bias, ftdims); - writeLeb128(nr, weight, inputdims * ftdims); + writeLeb128(nr, scaledbias, ftdims); + writeLeb128(nr, scaledweight, inputdims * ftdims); writeLeb128(nr, psqtWeights, inputdims * psqtbuckets); } else { - nr->write((unsigned char*)bias, ftdims * sizeof(int16_t)); - nr->write((unsigned char*)weight, inputdims * ftdims * sizeof(int16_t)); + nr->write((unsigned char*)scaledbias, ftdims * sizeof(int16_t)); + nr->write((unsigned char*)scaledweight, inputdims * ftdims * sizeof(int16_t)); nr->write((unsigned char*)psqtWeights, inputdims * psqtbuckets * sizeof(int32_t)); } + + free(scaledweight); + free(scaledbias); } From be1bb633a830439a66d193384cea12bb94f4796c Mon Sep 17 00:00:00 2001 From: Matthies Date: Wed, 13 Aug 2025 18:20:03 +0200 Subject: [PATCH 4/8] Cleanup. 
Bench: 5085407 --- src/nnue.cpp | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/src/nnue.cpp b/src/nnue.cpp index 2eaf13ef..5c950ba7 100644 --- a/src/nnue.cpp +++ b/src/nnue.cpp @@ -523,12 +523,6 @@ typedef __m128i bias_vec_t; #define vec_mulhi_16(a,b) _mm512_mulhi_epi16(a,b) #define vec_slli_16(a,b) _mm512_slli_epi16(a,b) #define vec_packus_16(a,b) _mm512_packus_epi16(a,b) -#if 0 -inline ft_vec_t vec_msb_pack_16(ft_vec_t a, ft_vec_t b) { - ft_vec_t compacted = _mm512_packs_epi16(_mm512_srli_epi16(a, 7), _mm512_srli_epi16(b, 7)); - return _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), compacted); -} -#endif #define vec_add_16(a,b) _mm512_add_epi16(a,b) #define vec_sub_16(a,b) _mm512_sub_epi16(a,b) #define vec_packs(a,b) _mm512_packs_epi16(a,b) @@ -561,12 +555,6 @@ typedef __m128i bias_vec_t; #define vec_mulhi_16(a,b) _mm256_mulhi_epi16(a,b) #define vec_slli_16(a,b) _mm256_slli_epi16(a,b) #define vec_packus_16(a,b) _mm256_packus_epi16(a,b) -#if 0 -inline ft_vec_t vec_msb_pack_16(ft_vec_t a, ft_vec_t b) { - ft_vec_t compacted = _mm256_packs_epi16(_mm256_srli_epi16(a, 7), _mm256_srli_epi16(b, 7)); - return _mm256_permute4x64_epi64(compacted, 0xd8); -} -#endif #define vec_add_16(a,b) _mm256_add_epi16(a,b) #define vec_sub_16(a,b) _mm256_sub_epi16(a,b) #define vec_packs(a,b) _mm256_packs_epi16(a,b) @@ -601,7 +589,6 @@ typedef __m128i ft_vec_t, ftout_vec_t, psqt_vec_t; #define vec_add_16(a,b) _mm_add_epi16(a,b) #define vec_sub_16(a,b) _mm_sub_epi16(a,b) #define vec_packs(a,b) _mm_packs_epi16(a,b) -//#define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7)) #define vec_zero_psqt() _mm_setzero_si128() #define vec_add_psqt_32(a,b) _mm_add_epi32(a,b) #define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b) @@ -1185,24 +1172,6 @@ int chessposition::Transform(clipped_t *output, int bucket) ftout_vec_t* out = (ftout_vec_t*)&output[offset]; for (unsigned int i = 0; i < numChunks; i++) { -#if 0 - const 
ft_vec_t sum0a = vec_max_16(vec_min_16(in0[i * 2 + 0], One), Zero); - const ft_vec_t sum0b = vec_max_16(vec_min_16(in0[i * 2 + 1], One), Zero); - const ft_vec_t sum1a = vec_max_16(vec_min_16(in1[i * 2 + 0], One), Zero); - const ft_vec_t sum1b = vec_max_16(vec_min_16(in1[i * 2 + 1], One), Zero); - - const ft_vec_t pa = vec_mul_16(sum0a, sum1a); - const ft_vec_t pb = vec_mul_16(sum0b, sum1b); -#ifdef USE_FASTSSE2 - const ft_vec_t shfta = _mm_srli_epi16(pa, 7); - const ft_vec_t shftb = _mm_srli_epi16(pb, 7); - - out[i * 2] = shfta; - out[i * 2 + 1] = shftb; -#else - out[i] = vec_msb_pack_16(pa, pb); -#endif -#else #ifdef USE_SSE2 const int shift = 7; #else // NEON @@ -1217,8 +1186,6 @@ int chessposition::Transform(clipped_t *output, int bucket) const ft_vec_t pb = vec_mulhi_16(sum0b, sum1b); out[i] = vec_packus_16(pa, pb); - -#endif } } else { From 4f8b63c4d3673d19f4b1121b5a496324b1495544 Mon Sep 17 00:00:00 2001 From: Matthies Date: Wed, 13 Aug 2025 21:16:34 +0200 Subject: [PATCH 5/8] Fixes for old network arch. SSE2 still doesn't work. 
--- src/RubiChess.h | 22 ++++------------------ src/RubiChess.vcxproj | 4 ++-- src/nnue.cpp | 36 ++++++++++++++++++++++++++++++------ 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/src/RubiChess.h b/src/RubiChess.h index 1a961de4..25f481df 100644 --- a/src/RubiChess.h +++ b/src/RubiChess.h @@ -51,7 +51,7 @@ //#define NNUELEARN // Enable this to enable NNUE debug output -//#define NNUEDEBUG +#define NNUEDEBUG // Enable this to compile support for asserts including stack trace // MSVC only, link with DbgHelp.lib @@ -876,6 +876,9 @@ class NnueArchitecture virtual unsigned int GetAccumulationSize() = 0; virtual unsigned int GetPsqtAccumulationSize() = 0; virtual size_t GetNetworkFilesize() = 0; + virtual int GetFtWeightUpscale() = 0; + virtual int GetPermutedWeightIndex(int i, bool reverse = false) = 0; + #ifdef STATISTICS virtual void SwapInputNeurons(unsigned int i1, unsigned int i2) = 0; virtual void Statistics(bool verbose, bool sort) = 0; @@ -932,23 +935,6 @@ class NnueFeatureTransformer : public NnueLayer uint32_t GetHash() { return NNUEINPUTSLICEHASH ^ (ftdims * 2); }; - int permutedWeightIndex(int i, bool reverse = false) - { -#if defined(USE_AVX512) - const int permuteindex[] = { 0, 4, 1, 5, 2, 6, 3, 7 }; - const int reversepermuteindex[] = { 0, 2, 4, 6, 1, 3, 5, 7 }; -#elif defined(USE_AVX2) - const int permuteindex[] = {0, 2, 1, 3, 4, 6, 5, 7}; - const int reversepermuteindex[] = { 0, 2, 1, 3, 4, 6, 5, 7 }; -#else - const int permuteindex[] = { 0, 1, 2, 3, 4, 5, 6, 7 }; - const int reversepermuteindex[] = { 0, 1, 2, 3, 4, 5, 6, 7 }; -#endif - int block = (i / 64) * 64; - int chunk = (i % 64) / 8; - int permutedindex = (reverse ? 
reversepermuteindex[chunk] : permuteindex[chunk]) * 8 + (i % 8); - return block + permutedindex; - } #ifdef STATISTICS void SwapWeights(unsigned int i1, unsigned int i2) { int16_t bias_temp = bias[i1]; diff --git a/src/RubiChess.vcxproj b/src/RubiChess.vcxproj index 7a8f6ef9..18d2ca51 100644 --- a/src/RubiChess.vcxproj +++ b/src/RubiChess.vcxproj @@ -188,7 +188,7 @@ Level3 Disabled - _DEBUG;_CONSOLE;%(PreprocessorDefinitions);USE_SSE2;USE_SSSE3;USE_POPCNT;USE_BMI1;USE_AVX2;USE_ZLIB + _DEBUG;_CONSOLE;%(PreprocessorDefinitions);USE_SSE2;USE_SSSE3x;USE_POPCNT;USE_BMI1;USE_AVX2x;USE_ZLIB Console @@ -322,7 +322,7 @@ MaxSpeed true true - _CONSOLE;%(PreprocessorDefinitions);USE_SSE2;USE_SSSE3;USE_POPCNT;USE_BMI1;USE_AVX2;USE_ZLIB + _CONSOLE;%(PreprocessorDefinitions);USE_SSE2;USE_SSSE3x;USE_POPCNT;USE_BMI1;USE_AVX2x;USE_ZLIB Speed MultiThreaded diff --git a/src/nnue.cpp b/src/nnue.cpp index 5c950ba7..ec9babf7 100644 --- a/src/nnue.cpp +++ b/src/nnue.cpp @@ -206,6 +206,13 @@ class NnueArchitectureV1 : public NnueArchitecture { size_t GetNetworkFilesize() { return networkfilesize; } + int GetFtWeightUpscale() { + return 1; + } + int GetPermutedWeightIndex(int i, bool reverse = false) { + return i; + } + #ifdef STATISTICS void SwapInputNeurons(unsigned int i1, unsigned int i2) { // not supported for V1 @@ -376,6 +383,25 @@ class NnueArchitectureV5 : public NnueArchitecture { size_t GetNetworkFilesize() { return networkfilesize; } + int GetFtWeightUpscale() { + return 2; + } + int GetPermutedWeightIndex(int i, bool reverse = false) { +#if defined(USE_AVX512) + const int permuteindex[] = { 0, 4, 1, 5, 2, 6, 3, 7 }; + const int reversepermuteindex[] = { 0, 2, 4, 6, 1, 3, 5, 7 }; +#elif defined(USE_AVX2) + const int permuteindex[] = { 0, 2, 1, 3, 4, 6, 5, 7 }; + const int reversepermuteindex[] = { 0, 2, 1, 3, 4, 6, 5, 7 }; +#else + const int permuteindex[] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + const int reversepermuteindex[] = { 0, 1, 2, 3, 4, 5, 6, 7 }; +#endif + int block = (i / 64) 
* 64; + int chunk = (i % 64) / 8; + int permutedindex = (reverse ? reversepermuteindex[chunk] : permuteindex[chunk]) * 8 + (i % 8); + return block + permutedindex; + } #ifdef STATISTICS void SwapInputNeurons(unsigned int i1, unsigned int i2) { if (i1 >= NnueFtHalfdims / 2 || i2 >= NnueFtHalfdims / 2) { @@ -1329,7 +1355,7 @@ bool NnueFeatureTransformer::ReadFeatureWeights( // Scale and permute for (i = 0; i < ftdims; i++) - bias[permutedWeightIndex(i)] = src_16[i] * 2; + bias[NnueCurrentArch->GetPermutedWeightIndex(i)] = src_16[i] * NnueCurrentArch->GetFtWeightUpscale(); // read weights isLeb128 = testLeb128(nr); @@ -1350,9 +1376,7 @@ bool NnueFeatureTransformer::ReadFeatureWeights( // Scale and permute for (i = 0; i < inputdims * ftdims; i++) - { - weight[permutedWeightIndex(i)] = src_16[i] * 2; - } + weight[NnueCurrentArch->GetPermutedWeightIndex(i)] = src_16[i] * NnueCurrentArch->GetFtWeightUpscale(); free(src_16); @@ -1430,9 +1454,9 @@ void NnueFeatureTransformer::WriteFeatureWeights // Scale and permute int i; for (i = 0; i < ftdims; i++) - scaledbias[permutedWeightIndex(i, true)] = bias[i] / 2; + scaledbias[NnueCurrentArch->GetPermutedWeightIndex(i, true)] = bias[i] / NnueCurrentArch->GetFtWeightUpscale(); for (i = 0; i < inputdims * ftdims; i++) - scaledweight[permutedWeightIndex(i, true)] = weight[i] / 2; + scaledweight[NnueCurrentArch->GetPermutedWeightIndex(i, true)] = weight[i] / NnueCurrentArch->GetFtWeightUpscale(); if (leb128) { writeLeb128(nr, scaledbias, ftdims); From 541c04f454081522f4125008997446981fe66116 Mon Sep 17 00:00:00 2001 From: Matthies Date: Thu, 14 Aug 2025 12:52:15 +0200 Subject: [PATCH 6/8] Fix for fast SSE2. 
Bench: 5085407 --- src/RubiChess.h | 2 +- src/nnue.cpp | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/RubiChess.h b/src/RubiChess.h index 25f481df..ec70f87c 100644 --- a/src/RubiChess.h +++ b/src/RubiChess.h @@ -51,7 +51,7 @@ //#define NNUELEARN // Enable this to enable NNUE debug output -#define NNUEDEBUG +//#define NNUEDEBUG // Enable this to compile support for asserts including stack trace // MSVC only, link with DbgHelp.lib diff --git a/src/nnue.cpp b/src/nnue.cpp index ec9babf7..5df8a725 100644 --- a/src/nnue.cpp +++ b/src/nnue.cpp @@ -1210,8 +1210,12 @@ int chessposition::Transform(clipped_t *output, int bucket) const ft_vec_t pa = vec_mulhi_16(sum0a, sum1a); const ft_vec_t pb = vec_mulhi_16(sum0b, sum1b); - +#ifdef USE_FASTSSE2 + out[i * 2] = _mm_max_epi16(pa, Zero); + out[i * 2 + 1] = _mm_max_epi16(pb, Zero); +#else out[i] = vec_packus_16(pa, pb); +#endif } } else { From 370674d4a18a80dcd3b2482c00d4d28b2b358797 Mon Sep 17 00:00:00 2001 From: Matthies Date: Thu, 14 Aug 2025 16:48:19 +0200 Subject: [PATCH 7/8] Fix for ARM. 
Bench: 5085407 --- src/nnue.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/nnue.cpp b/src/nnue.cpp index 5df8a725..02398ed9 100644 --- a/src/nnue.cpp +++ b/src/nnue.cpp @@ -653,13 +653,9 @@ typedef int8x16_t sprsin_vec_t; #define vec_set_16(a) vdupq_n_s16(a) #define vec_max_16(a,b) vmaxq_s16(a,b) #define vec_min_16(a,b) vminq_s16(a,b) -#define vec_mul_16(a,b) vmulq_s16(a,b) -inline ft_vec_t vec_msb_pack_16(ft_vec_t a, ft_vec_t b) { - const int8x8_t shifta = vshrn_n_s16(a, 7); - const int8x8_t shiftb = vshrn_n_s16(b, 7); - const int8x16_t compacted = vcombine_s8(shifta, shiftb); - return *(ft_vec_t*)&compacted; -} +#define vec_mulhi_16(a,b) vqdmulhq_s16(a,b) +#define vec_slli_16(a,b) vshlq_s16(a,vec_set_16(b)) +#define vec_packus_16(a,b) (ft_vec_t)(vcombine_u8(vqmovun_s16(a), vqmovun_s16(b))) #define vec_add_16(a,b) vaddq_s16(a,b) #define vec_sub_16(a,b) vsubq_s16(a,b) #define vec_packs(a,b) vcombine_s8(vqmovn_s16(a),vqmovn_s16(b)) From 94c86461e8aa19ba26aa86b972c068f2e7eaf5b7 Mon Sep 17 00:00:00 2001 From: Matthies Date: Thu, 14 Aug 2025 18:50:32 +0200 Subject: [PATCH 8/8] Fix warning. Bench: 5085407 --- src/nnue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnue.cpp b/src/nnue.cpp index 02398ed9..5c89b26c 100644 --- a/src/nnue.cpp +++ b/src/nnue.cpp @@ -210,7 +210,7 @@ class NnueArchitectureV1 : public NnueArchitecture { return 1; } int GetPermutedWeightIndex(int i, bool reverse = false) { - return i; + return (reverse ? i : i); } #ifdef STATISTICS