Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion stl/inc/xutility
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ _STL_DISABLE_CLANG_WARNINGS

#define _VECTORIZED_ADJACENT_FIND _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
#define _VECTORIZED_BITSET_FROM_STRING _VECTORIZED_FOR_X64_X86
#define _VECTORIZED_BITSET_TO_STRING _VECTORIZED_FOR_X64_X86
#define _VECTORIZED_BITSET_TO_STRING _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
#define _VECTORIZED_COUNT _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
#define _VECTORIZED_FIND _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
#define _VECTORIZED_FIND_END _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
Expand Down
115 changes: 97 additions & 18 deletions stl/src/vector_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11069,15 +11069,70 @@ __declspec(noalias) bool __stdcall __std_includes_less_8u(

} // extern "C"

#ifndef _M_ARM64
namespace {
namespace _Bitset_to_string {
#ifdef _M_ARM64EC
using _Traits_1_avx = void;
using _Traits_1_sse = void;
using _Traits_2_avx = void;
using _Traits_2_sse = void;
#else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv
#if defined(_M_ARM64) || defined(_M_ARM64EC)
struct _Traits_1_neon {
using _Value_type = uint16_t;
using _Vec_t = uint8x16_t;
static void _Exit_vectorized() noexcept {}

// Store one 16-byte vector of output characters to _Dest (vst1q_u8 has no alignment requirement).
static void _Out(void* const _Dest, const _Vec_t _Elems) noexcept {
vst1q_u8(static_cast<uint8_t*>(_Dest), _Elems);
}

// Broadcast the byte _Val to all 16 lanes of a NEON vector (used for the fill characters).
static _Vec_t _Set(const uint8_t _Val) noexcept {
return vdupq_n_u8(_Val);
}

static _Vec_t _Load_constant() {
// We do not omit static here, despite DevCom-11055227, because codegen is worse - see DevCom-11056805.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually wonder if this doesn't belong in a comment now (and below), but I thought I would explain why I didn't avoid static here (and below).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm 💯 sure these things are good to comment, and it is not too verbose to comment every occurrence like you did.

static constexpr uint32_t _Idx_arr[4] = {0x01010101, 0x01010101, 0x00000000, 0x00000000};
const auto _Idx = vld1q_u8(reinterpret_cast<const uint8_t*>(_Idx_arr));
return _Idx;
}

static _Vec_t __forceinline _Step(
const uint16_t _Val, const _Vec_t _Px0, const _Vec_t _Px1, const _Vec_t _Idx) noexcept {
const auto _Vx0 = vdupq_n_u16(_Val);
const auto _Vx1 = vqtbl1q_u8(vreinterpretq_u8_u16(_Vx0), _Idx);
Copy link
Contributor

@AlexGuteniev AlexGuteniev Mar 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But is this the most efficient approach?
If vqtbl is not very efficient, there may be another approach.
Building the vector from two halves — broadcasting the _Val & 0xFF and _Val >> 8 bytes separately — may work.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TBL is performant on Neoverse cores, e.g. on Neoverse N2 it has 2c latency and throughput of 2 (it executes on all vector pipelines). The TBL approach (once we've loaded the indices) uses one DUP, and one TBL instruction. Whereas non-TBL would require two DUPs and one additional instruction to combine the two vectors.

It can be wise to avoid TBL when optimizing for AArch64 little core CPUs, but I don't think we care much about these here.

const auto _Msk = vandq_u8(_Vx1, vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080)));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is compiler capable of moving out of the loop this constant, but not the other constant? 👀

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes (for some reason) - you can see in DevCom-11056873 that v20 is not loaded inside the loop:

|$LL7@std_bitset|
        ldrh        w8,[x1],#2
        ldr         q17,[x10]
        dup         v16.8h,w8
        tbl         v16.16b,{v16.16b},v17.16b
        and         v16.16b,v16.16b,v20.16b
        cmeq        v16.16b,v16.16b,v21.16b
        bsl         v16.16b,v19.16b,v18.16b
        str         q16,[x9,#-0x10]!
        cmp         x9,x11
        bne         |$LL7@std_bitset|

It's almost as though the compiler is willing to hoist only one Neon constant per loop (and no more!)

const auto _Ex0 = vceqq_u8(_Msk, vdupq_n_u8(0));
const auto _Ex1 = vbslq_u8(_Ex0, _Px0, _Px1);
return _Ex1;
}
};

struct _Traits_2_neon {
using _Value_type = uint8_t;
using _Vec_t = uint16x8_t;
static void _Exit_vectorized() noexcept {}

// Store one vector of eight 16-bit output characters to _Dest (vst1q_u16 has no alignment requirement).
static void _Out(void* const _Dest, const _Vec_t _Elems) noexcept {
vst1q_u16(static_cast<uint16_t*>(_Dest), _Elems);
}

// Broadcast the 16-bit _Val to all 8 lanes of a NEON vector (used for the fill characters).
static _Vec_t _Set(const uint16_t _Val) noexcept {
return vdupq_n_u16(_Val);
}

static _Vec_t _Load_constant() {
// We do not omit static here, despite DevCom-11055227, because codegen is worse - see DevCom-11056805.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I especially dislike that constant has different meaning in _Traits_1_neon and _Traits_2_neon.
But different argument naming should be sufficient to draw attention to this (worked for me).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it's not ideal... but hopefully the argument naming softens the blow

static constexpr uint64_t _Wx_arr[2] = {0x0010002000400080, 0x0001000200040008};
const auto _Wx = vld1q_u64(_Wx_arr);
return vreinterpretq_u16_u64(_Wx);
}

// Expand the 8 bits of _Val into eight 16-bit characters: lanes whose bit is clear
// produce _Px0, lanes whose bit is set produce _Px1. _Wx is the hoisted per-lane
// single-bit mask constant produced by _Load_constant().
static _Vec_t __forceinline _Step(
const uint8_t _Val, const _Vec_t _Px0, const _Vec_t _Px1, const _Vec_t _Wx) noexcept {
const auto _Vx0 = vdupq_n_u16(static_cast<uint16_t>(_Val)); // broadcast the source byte to every lane
const auto _Msk = vandq_u16(_Vx0, _Wx); // isolate one distinct bit per lane
const auto _Ex0 = vceqq_u16(_Msk, vdupq_n_u16(0)); // all-ones in lanes whose bit was clear
const auto _Ex1 = vbslq_u16(_Ex0, _Px0, _Px1); // select _Px0 where clear, _Px1 where set
return _Ex1;
}
};
#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
struct _Traits_avx {
static void _Out(void* const _Dest, const __m256i _Elems) noexcept {
_mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Elems);
Expand All @@ -11086,6 +11141,10 @@ namespace {
// TRANSITION, DevCom-10331414: clear the upper YMM state after the AVX code path.
static void _Exit_vectorized() noexcept {
_mm256_zeroupper();
}

// No hoisted vector constant is needed on x64/x86; returns a dummy value that
// _Step accepts as an unnamed parameter and ignores.
static int _Load_constant() {
return 0;
}
};

struct _Traits_sse {
Expand All @@ -11094,6 +11153,10 @@ namespace {
}

static void _Exit_vectorized() noexcept {}

// No hoisted vector constant is needed on x64/x86; returns a dummy value that
// _Step accepts as an unnamed parameter and ignores.
static int _Load_constant() {
return 0;
}
};

struct _Traits_1_avx : _Traits_avx {
Expand All @@ -11103,7 +11166,8 @@ namespace {
return _mm256_broadcastb_epi8(_mm_cvtsi32_si128(_Val));
}

static __m256i __forceinline _Step(const uint32_t _Val, const __m256i _Px0, const __m256i _Px1) noexcept {
static __m256i __forceinline _Step(
const uint32_t _Val, const __m256i _Px0, const __m256i _Px1, int) noexcept {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 =
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x01010101, 0x02020202, 0x03030303));
Expand All @@ -11123,7 +11187,8 @@ namespace {
return _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128());
}

static __m128i __forceinline _Step(const uint16_t _Val, const __m128i _Px0, const __m128i _Px1) noexcept {
static __m128i __forceinline _Step(
const uint16_t _Val, const __m128i _Px0, const __m128i _Px1, int) noexcept {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 =
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
Expand All @@ -11141,7 +11206,8 @@ namespace {
return _mm256_broadcastw_epi16(_mm_cvtsi32_si128(_Val));
}

static __m256i __forceinline _Step(const uint16_t _Val, const __m256i _Px0, const __m256i _Px1) noexcept {
static __m256i __forceinline _Step(
const uint16_t _Val, const __m256i _Px0, const __m256i _Px1, int) noexcept {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 =
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
Expand All @@ -11162,14 +11228,16 @@ namespace {
return _mm_set1_epi16(_Val);
}

static __m128i __forceinline _Step(const uint8_t _Val, const __m128i _Px0, const __m128i _Px1) noexcept {
// Expand the 8 bits of _Val into eight 16-bit characters: lanes whose bit is clear
// produce _Px0, lanes whose bit is set produce _Px1. The unnamed int parameter is
// the unused _Load_constant() placeholder.
static __m128i __forceinline _Step(
const uint8_t _Val, const __m128i _Px0, const __m128i _Px1, int) noexcept {
const __m128i _Vx = _mm_set1_epi16(_Val); // broadcast the source byte to all eight 16-bit lanes
const __m128i _Msk = _mm_and_si128(_Vx, _mm_set_epi64x(0x0001000200040008, 0x0010002000400080)); // one distinct bit per lane
const __m128i _Ex0 = _mm_cmpeq_epi16(_Msk, _mm_setzero_si128()); // all-ones in lanes whose bit was clear
const __m128i _Ex1 = _mm_blendv_epi8(_Px1, _Px0, _Ex0); // select _Px0 where clear, _Px1 where set
return _Ex1;
}
};
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^

template <class _Traits, class _Elem>
void __stdcall _Impl(
Expand All @@ -11178,14 +11246,18 @@ namespace {

const auto _Px0 = _Traits::_Set(_Elem0);
const auto _Px1 = _Traits::_Set(_Elem1);

// TRANSITION: DevCom-11056873
const auto _Constant = _Traits::_Load_constant();

if (_Size_bits >= _Step_size_bits) {
_Elem* _Pos = _Dest + _Size_bits;
_Size_bits &= _Step_size_bits - 1;
_Elem* const _Stop_at = _Dest + _Size_bits;
do {
typename _Traits::_Value_type _Val;
memcpy(&_Val, _Src, sizeof(_Val));
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1);
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1, _Constant);
_Pos -= _Step_size_bits;
_Traits::_Out(_Pos, _Elems);
_Advance_bytes(_Src, sizeof(_Val));
Expand All @@ -11195,7 +11267,7 @@ namespace {
if (_Size_bits > 0) {
typename _Traits::_Value_type _Val;
memcpy(&_Val, _Src, sizeof(_Val));
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1);
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1, _Constant);
_Elem _Tmp[_Step_size_bits];
_Traits::_Out(_Tmp, _Elems);
const _Elem* const _Tmpd = _Tmp + (_Step_size_bits - _Size_bits);
Expand All @@ -11204,25 +11276,23 @@ namespace {

_Traits::_Exit_vectorized(); // TRANSITION, DevCom-10331414
}
#endif // ^^^ !defined(_M_ARM64EC) ^^^

#if !defined(_M_ARM64) && !defined(_M_ARM64EC)
template <class _Avx_traits, class _Sse_traits, class _Elem>
void __stdcall _Dispatch(_Elem* const _Dest, const void* const _Src, const size_t _Size_bits,
const _Elem _Elem0, const _Elem _Elem1) noexcept {
#ifndef _M_ARM64EC
if (_Use_avx2() && _Size_bits >= 256) {
_Impl<_Avx_traits>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
} else if (_Use_sse42()) {
_Impl<_Sse_traits>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
} else
#endif // ^^^ !defined(_M_ARM64EC) ^^^
{
} else {
const auto _Arr = reinterpret_cast<const uint8_t*>(_Src);
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
_Dest[_Size_bits - 1 - _Ix] = ((_Arr[_Ix >> 3] >> (_Ix & 7)) & 1) != 0 ? _Elem1 : _Elem0;
}
}
}
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
} // namespace _Bitset_to_string
} // unnamed namespace

Expand All @@ -11231,17 +11301,26 @@ extern "C" {
// Convert the bit pattern at _Src (_Size_bits bits) to a narrow-character string in
// _Dest, writing _Elem0 for clear bits and _Elem1 for set bits. On ARM64/ARM64EC the
// NEON implementation is called directly; otherwise _Dispatch selects among the
// AVX2, SSE4.2, and scalar fallback paths at runtime.
__declspec(noalias) void __stdcall __std_bitset_to_string_1(
char* const _Dest, const void* const _Src, const size_t _Size_bits, const char _Elem0, const char _Elem1) noexcept {
using namespace _Bitset_to_string;
#if defined(_M_ARM64) || defined(_M_ARM64EC)
_Impl<_Traits_1_neon>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
_Dispatch<_Traits_1_avx, _Traits_1_sse>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
}

// Convert the bit pattern at _Src (_Size_bits bits) to a wide-character string in
// _Dest, writing _Elem0 for clear bits and _Elem1 for set bits. On ARM64/ARM64EC the
// NEON implementation is called directly; otherwise _Dispatch selects among the
// AVX2, SSE4.2, and scalar fallback paths at runtime.
__declspec(noalias) void __stdcall __std_bitset_to_string_2(wchar_t* const _Dest, const void* const _Src,
const size_t _Size_bits, const wchar_t _Elem0, const wchar_t _Elem1) noexcept {
using namespace _Bitset_to_string;
#if defined(_M_ARM64) || defined(_M_ARM64EC)
_Impl<_Traits_2_neon>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
_Dispatch<_Traits_2_avx, _Traits_2_sse>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
}

} // extern "C"

#ifndef _M_ARM64
namespace {
namespace _Bitset_from_string {
#ifdef _M_ARM64EC
Expand Down