diff --git a/src/RubiChess.h b/src/RubiChess.h
index 839e2391..ec70f87c 100644
--- a/src/RubiChess.h
+++ b/src/RubiChess.h
@@ -876,6 +876,9 @@ class NnueArchitecture
     virtual unsigned int GetAccumulationSize() = 0;
     virtual unsigned int GetPsqtAccumulationSize() = 0;
     virtual size_t GetNetworkFilesize() = 0;
+    virtual int GetFtWeightUpscale() = 0;
+    virtual int GetPermutedWeightIndex(int i, bool reverse = false) = 0;
+
 #ifdef STATISTICS
     virtual void SwapInputNeurons(unsigned int i1, unsigned int i2) = 0;
     virtual void Statistics(bool verbose, bool sort) = 0;
diff --git a/src/RubiChess.vcxproj b/src/RubiChess.vcxproj
index 7a8f6ef9..18d2ca51 100644
--- a/src/RubiChess.vcxproj
+++ b/src/RubiChess.vcxproj
@@ -188,7 +188,7 @@
     <ClCompile>
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions);USE_SSE2;USE_SSSE3;USE_POPCNT;USE_BMI1;USE_AVX2;USE_ZLIB</PreprocessorDefinitions>
+      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions);USE_SSE2;USE_SSSE3x;USE_POPCNT;USE_BMI1;USE_AVX2x;USE_ZLIB</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -322,7 +322,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>_CONSOLE;%(PreprocessorDefinitions);USE_SSE2;USE_SSSE3;USE_POPCNT;USE_BMI1;USE_AVX2;USE_ZLIB</PreprocessorDefinitions>
+      <PreprocessorDefinitions>_CONSOLE;%(PreprocessorDefinitions);USE_SSE2;USE_SSSE3x;USE_POPCNT;USE_BMI1;USE_AVX2x;USE_ZLIB</PreprocessorDefinitions>
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
     </ClCompile>
diff --git a/src/nnue.cpp b/src/nnue.cpp
index 2980739c..5c89b26c 100644
--- a/src/nnue.cpp
+++ b/src/nnue.cpp
@@ -206,6 +206,13 @@ class NnueArchitectureV1 : public NnueArchitecture {
     size_t GetNetworkFilesize() {
         return networkfilesize;
     }
+    int GetFtWeightUpscale() {
+        return 1;
+    }
+    int GetPermutedWeightIndex(int i, bool reverse = false) {
+        return (reverse ? i : i);
+    }
+
 #ifdef STATISTICS
     void SwapInputNeurons(unsigned int i1, unsigned int i2) {
         // not supported for V1
@@ -376,6 +383,25 @@ class NnueArchitectureV5 : public NnueArchitecture {
     size_t GetNetworkFilesize() {
         return networkfilesize;
     }
+    int GetFtWeightUpscale() {
+        return 2;
+    }
+    int GetPermutedWeightIndex(int i, bool reverse = false) {
+#if defined(USE_AVX512)
+        const int permuteindex[] = { 0, 4, 1, 5, 2, 6, 3, 7 };
+        const int reversepermuteindex[] = { 0, 2, 4, 6, 1, 3, 5, 7 };
+#elif defined(USE_AVX2)
+        const int permuteindex[] = { 0, 2, 1, 3, 4, 6, 5, 7 };
+        const int reversepermuteindex[] = { 0, 2, 1, 3, 4, 6, 5, 7 };
+#else
+        const int permuteindex[] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+        const int reversepermuteindex[] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+#endif
+        int block = (i / 64) * 64;
+        int chunk = (i % 64) / 8;
+        int permutedindex = (reverse ? reversepermuteindex[chunk] : permuteindex[chunk]) * 8 + (i % 8);
+        return block + permutedindex;
+    }
 #ifdef STATISTICS
     void SwapInputNeurons(unsigned int i1, unsigned int i2) {
         if (i1 >= NnueFtHalfdims / 2 || i2 >= NnueFtHalfdims / 2) {
@@ -520,11 +546,9 @@ typedef __m128i bias_vec_t;
 #define vec_set_16(a) _mm512_set1_epi16(a)
 #define vec_max_16(a,b) _mm512_max_epi16(a,b)
 #define vec_min_16(a,b) _mm512_min_epi16(a,b)
-#define vec_mul_16(a,b) _mm512_mullo_epi16(a,b)
-inline ft_vec_t vec_msb_pack_16(ft_vec_t a, ft_vec_t b) {
-    ft_vec_t compacted = _mm512_packs_epi16(_mm512_srli_epi16(a, 7), _mm512_srli_epi16(b, 7));
-    return _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), compacted);
-}
+#define vec_mulhi_16(a,b) _mm512_mulhi_epi16(a,b)
+#define vec_slli_16(a,b) _mm512_slli_epi16(a,b)
+#define vec_packus_16(a,b) _mm512_packus_epi16(a,b)
 #define vec_add_16(a,b) _mm512_add_epi16(a,b)
 #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
 #define vec_packs(a,b) _mm512_packs_epi16(a,b)
@@ -554,11 +578,9 @@ typedef __m128i bias_vec_t;
 #define vec_set_16(a) _mm256_set1_epi16(a)
 #define vec_max_16(a,b) _mm256_max_epi16(a,b)
 #define vec_min_16(a,b) _mm256_min_epi16(a,b)
-#define vec_mul_16(a,b) _mm256_mullo_epi16(a,b)
-inline ft_vec_t vec_msb_pack_16(ft_vec_t a, ft_vec_t b) {
-    ft_vec_t compacted = _mm256_packs_epi16(_mm256_srli_epi16(a, 7), _mm256_srli_epi16(b, 7));
-    return _mm256_permute4x64_epi64(compacted, 0xd8);
-}
+#define vec_mulhi_16(a,b) _mm256_mulhi_epi16(a,b)
+#define vec_slli_16(a,b) _mm256_slli_epi16(a,b)
+#define vec_packus_16(a,b) _mm256_packus_epi16(a,b)
 #define vec_add_16(a,b) _mm256_add_epi16(a,b)
 #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
 #define vec_packs(a,b) _mm256_packs_epi16(a,b)
@@ -587,11 +609,12 @@ typedef __m128i ft_vec_t, ftout_vec_t, psqt_vec_t;
 #define vec_set_16(a) _mm_set1_epi16(a)
 #define vec_max_16(a,b) _mm_max_epi16(a,b)
 #define vec_min_16(a,b) _mm_min_epi16(a,b)
-#define vec_mul_16(a,b) _mm_mullo_epi16(a,b)
+#define vec_mulhi_16(a,b) _mm_mulhi_epi16(a,b)
+#define vec_slli_16(a,b) _mm_slli_epi16(a,b)
+#define vec_packus_16(a,b) _mm_packus_epi16(a,b)
 #define vec_add_16(a,b) _mm_add_epi16(a,b)
 #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
 #define vec_packs(a,b) _mm_packs_epi16(a,b)
-#define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7))
 #define vec_zero_psqt() _mm_setzero_si128()
 #define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
 #define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
@@ -630,13 +653,9 @@ typedef int8x16_t sprsin_vec_t;
 #define vec_set_16(a) vdupq_n_s16(a)
 #define vec_max_16(a,b) vmaxq_s16(a,b)
 #define vec_min_16(a,b) vminq_s16(a,b)
-#define vec_mul_16(a,b) vmulq_s16(a,b)
-inline ft_vec_t vec_msb_pack_16(ft_vec_t a, ft_vec_t b) {
-    const int8x8_t shifta = vshrn_n_s16(a, 7);
-    const int8x8_t shiftb = vshrn_n_s16(b, 7);
-    const int8x16_t compacted = vcombine_s8(shifta, shiftb);
-    return *(ft_vec_t*)&compacted;
-}
+#define vec_mulhi_16(a,b) vqdmulhq_s16(a,b)
+#define vec_slli_16(a,b) vshlq_s16(a,vec_set_16(b))
+#define vec_packus_16(a,b) (ft_vec_t)(vcombine_u8(vqmovun_s16(a), vqmovun_s16(b)))
 #define vec_add_16(a,b) vaddq_s16(a,b)
 #define vec_sub_16(a,b) vsubq_s16(a,b)
 #define vec_packs(a,b) vcombine_s8(vqmovn_s16(a),vqmovn_s16(b))
@@ -1168,28 +1187,30 @@ int chessposition::Transform(clipped_t *output, int bucket)
     {
         const unsigned int numChunks = NnueFtHalfdims / 2 / MAXCHUNKSIZE;
         ft_vec_t Zero = vec_zero();
-        ft_vec_t One = vec_set_16(127);
+        ft_vec_t One = vec_set_16(127 * 2);
         const ft_vec_t* in0 = (ft_vec_t*)(acm + perspectives[p] * NnueFtHalfdims);
         const ft_vec_t* in1 = (ft_vec_t*)(acm + perspectives[p] * NnueFtHalfdims + NnueFtHalfdims / 2);
         ftout_vec_t* out = (ftout_vec_t*)&output[offset];
 
         for (unsigned int i = 0; i < numChunks; i++) {
-            const ft_vec_t sum0a = vec_max_16(vec_min_16(in0[i * 2 + 0], One), Zero);
-            const ft_vec_t sum0b = vec_max_16(vec_min_16(in0[i * 2 + 1], One), Zero);
-            const ft_vec_t sum1a = vec_max_16(vec_min_16(in1[i * 2 + 0], One), Zero);
-            const ft_vec_t sum1b = vec_max_16(vec_min_16(in1[i * 2 + 1], One), Zero);
+#ifdef USE_SSE2
+            const int shift = 7;
+#else // NEON
+            const int shift = 6;
+#endif
+            const ft_vec_t sum0a = vec_slli_16(vec_max_16(vec_min_16(in0[i * 2 + 0], One), Zero), shift);
+            const ft_vec_t sum0b = vec_slli_16(vec_max_16(vec_min_16(in0[i * 2 + 1], One), Zero), shift);
+            const ft_vec_t sum1a = vec_min_16(in1[i * 2 + 0], One);
+            const ft_vec_t sum1b = vec_min_16(in1[i * 2 + 1], One);
 
-            const ft_vec_t pa = vec_mul_16(sum0a, sum1a);
-            const ft_vec_t pb = vec_mul_16(sum0b, sum1b);
+            const ft_vec_t pa = vec_mulhi_16(sum0a, sum1a);
+            const ft_vec_t pb = vec_mulhi_16(sum0b, sum1b);
 
 #ifdef USE_FASTSSE2
-            const ft_vec_t shfta = _mm_srli_epi16(pa, 7);
-            const ft_vec_t shftb = _mm_srli_epi16(pb, 7);
-
-            out[i * 2] = shfta;
-            out[i * 2 + 1] = shftb;
+            out[i * 2] = _mm_max_epi16(pa, Zero);
+            out[i * 2 + 1] = _mm_max_epi16(pb, Zero);
 #else
-            out[i] = vec_msb_pack_16(pa, pb);
+            out[i] = vec_packus_16(pa, pb);
 #endif
         }
     }
@@ -1222,9 +1243,9 @@
         for (unsigned int i = 0; i < NnueFtHalfdims / 2; i++) {
             int16_t sum0 = *(acm + perspectives[p] * NnueFtHalfdims + i);
             int16_t sum1 = *(acm + perspectives[p] * NnueFtHalfdims + NnueFtHalfdims / 2 + i);
-            sum0 = max((int16_t)0, min((int16_t)127, sum0));
-            sum1 = max((int16_t)0, min((int16_t)127, sum1));
-            output[offset + i] = sum0 * sum1 / 128;
+            sum0 = max((int16_t)0, min((int16_t)(127 * 2), sum0));
+            sum1 = max((int16_t)0, min((int16_t)(127 * 2), sum1));
+            output[offset + i] = sum0 * sum1 / 512;
         }
     }
 #endif
@@ -1332,7 +1353,9 @@ bool NnueFeatureTransformer::ReadFeatureWeights(
     else
         okay = okay && nr->read((unsigned char*)src_16, ftdims * sizeof(int16_t));
 
-    memcpy(bias, src_16, ftdims * sizeof(int16_t));
+    // Scale and permute
+    for (i = 0; i < ftdims; i++)
+        bias[NnueCurrentArch->GetPermutedWeightIndex(i)] = src_16[i] * NnueCurrentArch->GetFtWeightUpscale();
 
     // read weights
     isLeb128 = testLeb128(nr);
@@ -1351,7 +1374,10 @@
         }
     }
 
-    memcpy(weight, src_16, inputdims * ftdims * sizeof(int16_t));
+    // Scale and permute
+    for (i = 0; i < inputdims * ftdims; i++)
+        weight[NnueCurrentArch->GetPermutedWeightIndex(i)] = src_16[i] * NnueCurrentArch->GetFtWeightUpscale();
+
     free(src_16);
 
     if (psqtbuckets)
@@ -1419,16 +1445,32 @@ bool writeLeb128(NnueNetsource* nr, IntType* in, size_t count)
 template
 void NnueFeatureTransformer::WriteFeatureWeights(NnueNetsource* nr, bool leb128)
 {
+    // we need some buffers for unscaled and unpermuted weights written to network file
+    int16_t* scaledweight = (int16_t*)calloc(inputdims * ftdims, sizeof(int16_t));
+    int16_t* scaledbias = (int16_t*)calloc(ftdims, sizeof(int16_t));
+    if (!scaledweight || !scaledbias)
+        return;
+
+    // Scale and permute
+    int i;
+    for (i = 0; i < ftdims; i++)
+        scaledbias[NnueCurrentArch->GetPermutedWeightIndex(i, true)] = bias[i] / NnueCurrentArch->GetFtWeightUpscale();
+    for (i = 0; i < inputdims * ftdims; i++)
+        scaledweight[NnueCurrentArch->GetPermutedWeightIndex(i, true)] = weight[i] / NnueCurrentArch->GetFtWeightUpscale();
+
     if (leb128)
     {
-        writeLeb128(nr, bias, ftdims);
-        writeLeb128(nr, weight, inputdims * ftdims);
+        writeLeb128(nr, scaledbias, ftdims);
+        writeLeb128(nr, scaledweight, inputdims * ftdims);
         writeLeb128(nr, psqtWeights, inputdims * psqtbuckets);
     }
     else
     {
-        nr->write((unsigned char*)bias, ftdims * sizeof(int16_t));
-        nr->write((unsigned char*)weight, inputdims * ftdims * sizeof(int16_t));
+        nr->write((unsigned char*)scaledbias, ftdims * sizeof(int16_t));
+        nr->write((unsigned char*)scaledweight, inputdims * ftdims * sizeof(int16_t));
         nr->write((unsigned char*)psqtWeights, inputdims * psqtbuckets * sizeof(int32_t));
     }
+
+    free(scaledweight);
+    free(scaledbias);
 }