From d1744fa3bf18de9cd22be9dbf79ec0fa75510571 Mon Sep 17 00:00:00 2001 From: Francisco Geiman Thiesen Date: Tue, 2 Dec 2025 13:23:36 -0800 Subject: [PATCH 1/5] Implement batched random integer generation for shuffle Implements the algorithm from Brackett-Rozinsky & Lemire's paper "Batched Ranged Random Integer Generation" to reduce RNG calls in std::shuffle and ranges::shuffle for 64-bit URNGs like mt19937_64. The batched approach extracts multiple bounded random integers from a single 64-bit random word, using only multiplication (no division) in the common case. This reduces the number of RNG calls by approximately half for arrays with fewer than 2^32 elements. Resolves #5736 --- stl/inc/algorithm | 267 +++++++++++++++++- .../tests/P0896R4_ranges_alg_shuffle/test.cpp | 120 ++++++++ 2 files changed, 379 insertions(+), 8 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index e95528c127d..260a2a95f02 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -6096,6 +6096,92 @@ namespace ranges { #endif // _HAS_CXX20 #endif // _HAS_CXX17 +// Batched random integer generation for shuffle optimization. +// From Nevin Brackett-Rozinsky and Daniel Lemire, "Batched Ranged Random Integer Generation", +// Software: Practice and Experience 55(1), 2025. +// +// This algorithm extracts multiple bounded random integers from a single 64-bit random word, +// using only multiplication (no division) in the common case. + +template +struct _Batched_rng_from_urng { + // Batched random generation is only beneficial for 64-bit RNGs with full range. + // It requires the RNG to produce values in [0, 2^64 - 1]. + using _Urng_result = _Invoke_result_t<_Urng&>; + + static constexpr bool _Has_full_64bit_range = sizeof(_Urng_result) >= sizeof(uint64_t) + && is_unsigned_v<_Urng_result> && (_Urng::min) () == 0 + && (_Urng::max) () == (numeric_limits::max) (); + + // Threshold bounds for batch sizes based on array size. 
+ // These are derived from the paper to minimize expected cost per random value. + // Batch size k requires product of k consecutive bounds <= 2^64. + static constexpr uint64_t _Bound_for_batch_6 = 21; // (21)^6 = 85,766,121 < 2^64 (very conservative) + static constexpr uint64_t _Bound_for_batch_5 = 73; // (73)^5 = 2,073,071,593 < 2^64 + static constexpr uint64_t _Bound_for_batch_4 = 302; // (302)^4 = 8,318,169,616 < 2^64 + static constexpr uint64_t _Bound_for_batch_3 = 2642; // (2642)^3 = 18,441,593,288 < 2^64 + static constexpr uint64_t _Bound_for_batch_2 = 4294967296; // 2^32, for batch of 2 + + _Urng& _Ref; + + explicit _Batched_rng_from_urng(_Urng& _Func) noexcept : _Ref(_Func) {} + + // Generate a single bounded random value in [0, _Bound) using Lemire's method. + _NODISCARD _Diff _Single_bounded(_Diff _Bound) { + _Unsigned128 _Product{_Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), + _Product._Word[1])}; + auto _Leftover = _Product._Word[0]; + + if (_Leftover < static_cast(_Bound)) { + const uint64_t _Threshold = (0 - static_cast(_Bound)) % static_cast(_Bound); + while (_Leftover < _Threshold) { + _Product = _Unsigned128{_Base128::_UMul128( + static_cast(_Ref()), static_cast(_Bound), _Product._Word[1])}; + _Leftover = _Product._Word[0]; + } + } + + return static_cast<_Diff>(_Product._Word[1]); + } + + // Generate two bounded random values from a single 64-bit random word. + // The bounds are (n+1) and (n+2) for Fisher-Yates shuffle positions _Target_index and _Target_index+1. 
+ void _Batch_2(_Diff* _Results, _Diff _Bound1, _Diff _Bound2) { + const uint64_t _B1 = static_cast(_Bound1); + const uint64_t _B2 = static_cast(_Bound2); + const uint64_t _Product_bound = _B1 * _B2; + + uint64_t _Random_word = static_cast(_Ref()); + + _Unsigned128 _Prod1{_Base128::_UMul128(_Random_word, _B1, _Prod1._Word[1])}; + _Results[0] = static_cast<_Diff>(_Prod1._Word[1]); + uint64_t _Leftover1 = _Prod1._Word[0]; + + _Unsigned128 _Prod2{_Base128::_UMul128(_Leftover1, _B2, _Prod2._Word[1])}; + _Results[1] = static_cast<_Diff>(_Prod2._Word[1]); + uint64_t _Leftover = _Prod2._Word[0]; + + // Rejection sampling: check if leftover is below threshold. + if (_Leftover < _Product_bound) { + const uint64_t _Threshold = (0 - _Product_bound) % _Product_bound; + while (_Leftover < _Threshold) { + _Random_word = static_cast(_Ref()); + + _Prod1 = _Unsigned128{_Base128::_UMul128(_Random_word, _B1, _Prod1._Word[1])}; + _Results[0] = static_cast<_Diff>(_Prod1._Word[1]); + _Leftover1 = _Prod1._Word[0]; + + _Prod2 = _Unsigned128{_Base128::_UMul128(_Leftover1, _B2, _Prod2._Word[1])}; + _Results[1] = static_cast<_Diff>(_Prod2._Word[1]); + _Leftover = _Prod2._Word[0]; + } + } + } + + _Batched_rng_from_urng(const _Batched_rng_from_urng&) = delete; + _Batched_rng_from_urng& operator=(const _Batched_rng_from_urng&) = delete; +}; + template class _Rng_from_urng_v2 { // wrap a URNG as an RNG public: @@ -6439,11 +6525,91 @@ void _Random_shuffle1(_RanIt _First, _RanIt _Last, _RngFn& _RngFunc) { } } +// Batched shuffle implementation for 64-bit URNGs with full range. +// Uses batched random generation to reduce RNG calls. 
+template +void _Random_shuffle_batched(_RanIt _First, _RanIt _Last, _Urng& _Func) { + // shuffle [_First, _Last) using batched random generation + _STD _Adl_verify_range(_First, _Last); + auto _UFirst = _STD _Get_unwrapped(_First); + const auto _ULast = _STD _Get_unwrapped(_Last); + if (_UFirst == _ULast) { + return; + } + + using _Diff = _Iter_diff_t<_RanIt>; + _Batched_rng_from_urng<_Diff, _Urng> _BatchedRng(_Func); + + auto _UTarget = _UFirst; + _Diff _Target_index = 1; + + // Process pairs using batched generation when beneficial. + // Batch of 2 is beneficial when bounds fit in 32 bits (product fits in 64 bits). + while (_UTarget != _ULast) { + ++_UTarget; + if (_UTarget == _ULast) { + break; + } + + const _Diff _Bound1 = _Target_index + 1; // bound for current position + const _Diff _Bound2 = _Target_index + 2; // bound for next position + + // Check if we can batch: both bounds and their product must fit safely. + // Use batch of 2 when the larger bound is <= 2^32 (product fits in 64 bits). + if (static_cast(_Bound2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { + auto _UTarget_next = _UTarget; + ++_UTarget_next; + + if (_UTarget_next != _ULast) { + // Generate two random indices in one batch. + _Diff _Offsets[2]; + _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); + + _STL_ASSERT(0 <= _Offsets[0] && _Offsets[0] <= _Target_index, "random value out of range"); + _STL_ASSERT(0 <= _Offsets[1] && _Offsets[1] <= _Target_index + 1, "random value out of range"); + + // Perform first swap. + if (_Offsets[0] != _Target_index) { + swap(*_UTarget, *(_UFirst + _Offsets[0])); // intentional ADL + } + + // Advance to next position and perform second swap. + ++_UTarget; + ++_Target_index; + + if (_Offsets[1] != _Target_index) { + swap(*_UTarget, *(_UFirst + _Offsets[1])); // intentional ADL + } + + ++_UTarget; + ++_Target_index; + continue; + } + } + + // Fall back to single generation for this position. 
+ const _Diff _Off = _BatchedRng._Single_bounded(_Bound1); + _STL_ASSERT(0 <= _Off && _Off <= _Target_index, "random value out of range"); + if (_Off != _Target_index) { + swap(*_UTarget, *(_UFirst + _Off)); // intentional ADL + } + + ++_UTarget; + ++_Target_index; + } +} + _EXPORT_STD template void shuffle(_RanIt _First, _RanIt _Last, _Urng&& _Func) { // shuffle [_First, _Last) using URNG _Func using _Urng0 = remove_reference_t<_Urng>; - _Rng_from_urng_v2<_Iter_diff_t<_RanIt>, _Urng0> _RngFunc(_Func); - _STD _Random_shuffle1(_First, _Last, _RngFunc); + + // Use batched shuffle when the URNG produces full 64-bit range values. + if constexpr (_Batched_rng_from_urng<_Iter_diff_t<_RanIt>, _Urng0>::_Has_full_64bit_range) { + _STD _Random_shuffle_batched(_First, _Last, _Func); + } else { + _Rng_from_urng_v2<_Iter_diff_t<_RanIt>, _Urng0> _RngFunc(_Func); + _STD _Random_shuffle1(_First, _Last, _RngFunc); + } } #if _HAS_CXX20 @@ -6455,20 +6621,37 @@ namespace ranges { _STATIC_CALL_OPERATOR _It operator()(_It _First, _Se _Last, _Urng&& _Func) _CONST_CALL_OPERATOR { _STD _Adl_verify_range(_First, _Last); - _Rng_from_urng_v2, remove_reference_t<_Urng>> _RngFunc(_Func); - auto _UResult = _Shuffle_unchecked( - _RANGES _Unwrap_iter<_Se>(_STD move(_First)), _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _RngFunc); + using _Urng0 = remove_reference_t<_Urng>; + using _Diff = iter_difference_t<_It>; - _STD _Seek_wrapped(_First, _STD move(_UResult)); + // Use batched shuffle when the URNG produces full 64-bit range values. 
+ if constexpr (_Batched_rng_from_urng<_Diff, _Urng0>::_Has_full_64bit_range) { + auto _UResult = _Shuffle_unchecked_batched( + _RANGES _Unwrap_iter<_Se>(_STD move(_First)), _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _Func); + _STD _Seek_wrapped(_First, _STD move(_UResult)); + } else { + _Rng_from_urng_v2<_Diff, _Urng0> _RngFunc(_Func); + auto _UResult = _Shuffle_unchecked( + _RANGES _Unwrap_iter<_Se>(_STD move(_First)), _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _RngFunc); + _STD _Seek_wrapped(_First, _STD move(_UResult)); + } return _First; } template requires permutable> && uniform_random_bit_generator> _STATIC_CALL_OPERATOR borrowed_iterator_t<_Rng> operator()(_Rng&& _Range, _Urng&& _Func) _CONST_CALL_OPERATOR { - _Rng_from_urng_v2, remove_reference_t<_Urng>> _RngFunc(_Func); + using _Urng0 = remove_reference_t<_Urng>; + using _Diff = range_difference_t<_Rng>; - return _RANGES _Rewrap_iterator(_Range, _Shuffle_unchecked(_Ubegin(_Range), _Uend(_Range), _RngFunc)); + // Use batched shuffle when the URNG produces full 64-bit range values. + if constexpr (_Batched_rng_from_urng<_Diff, _Urng0>::_Has_full_64bit_range) { + return _RANGES _Rewrap_iterator( + _Range, _Shuffle_unchecked_batched(_Ubegin(_Range), _Uend(_Range), _Func)); + } else { + _Rng_from_urng_v2<_Diff, _Urng0> _RngFunc(_Func); + return _RANGES _Rewrap_iterator(_Range, _Shuffle_unchecked(_Ubegin(_Range), _Uend(_Range), _RngFunc)); + } } private: @@ -6496,6 +6679,74 @@ namespace ranges { } return _Target; } + + // Batched shuffle implementation for ranges. 
+ template + _NODISCARD static _It _Shuffle_unchecked_batched(_It _First, const _Se _Last, _Urng& _Func) { + // shuffle [_First, _Last) using batched random generation + _STL_INTERNAL_STATIC_ASSERT(random_access_iterator<_It>); + _STL_INTERNAL_STATIC_ASSERT(sentinel_for<_Se, _It>); + _STL_INTERNAL_STATIC_ASSERT(permutable<_It>); + + if (_First == _Last) { + return _First; + } + + using _Diff = iter_difference_t<_It>; + _Batched_rng_from_urng<_Diff, _Urng> _BatchedRng(_Func); + + auto _Target = _First; + _Diff _Target_index = 1; + + // Process pairs using batched generation when beneficial. + while (_Target != _Last) { + ++_Target; + if (_Target == _Last) { + break; + } + + const _Diff _Bound1 = _Target_index + 1; + const _Diff _Bound2 = _Target_index + 2; + + if (static_cast(_Bound2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { + auto _Target_next = _Target; + ++_Target_next; + + if (_Target_next != _Last) { + _Diff _Offsets[2]; + _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); + + _STL_ASSERT(0 <= _Offsets[0] && _Offsets[0] <= _Target_index, "random value out of range"); + _STL_ASSERT(0 <= _Offsets[1] && _Offsets[1] <= _Target_index + 1, "random value out of range"); + + if (_Offsets[0] != _Target_index) { + _RANGES iter_swap(_Target, _First + _Offsets[0]); + } + + ++_Target; + ++_Target_index; + + if (_Offsets[1] != _Target_index) { + _RANGES iter_swap(_Target, _First + _Offsets[1]); + } + + ++_Target; + ++_Target_index; + continue; + } + } + + const _Diff _Off = _BatchedRng._Single_bounded(_Bound1); + _STL_ASSERT(0 <= _Off && _Off <= _Target_index, "random value out of range"); + if (_Off != _Target_index) { + _RANGES iter_swap(_Target, _First + _Off); + } + + ++_Target; + ++_Target_index; + } + return _Target; + } }; _EXPORT_STD inline constexpr _Shuffle_fn shuffle; diff --git a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp index 6c2bb00b8a3..18fd2f7fd45 100644 --- 
a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp +++ b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp @@ -6,15 +6,18 @@ #include #include #include +#include #include #include #include +#include #include using namespace std; const unsigned int seed = random_device{}(); mt19937 gen{seed}; +mt19937_64 gen64{seed}; // 64-bit generator for batched random path // Validate dangling story static_assert(same_as{}, gen)), ranges::dangling>); @@ -72,8 +75,125 @@ void test_urbg() { // COMPILE-ONLY ranges::shuffle(arr, RandGen{}); } +// Test that shuffle produces a valid permutation for various sizes. +// This exercises both the batched path (for 64-bit RNGs) and the fallback path. +void test_shuffle_permutation() { + // Test with 64-bit generator (batched random path) + { + vector v(100); + iota(v.begin(), v.end(), 0); + vector original = v; + + shuffle(v.begin(), v.end(), gen64); + + // Verify it's still a permutation + vector sorted_v = v; + sort(sorted_v.begin(), sorted_v.end()); + assert(sorted_v == original); + } + + // Test with ranges::shuffle and 64-bit generator + { + vector v(100); + iota(v.begin(), v.end(), 0); + vector original = v; + + ranges::shuffle(v, gen64); + + // Verify it's still a permutation + vector sorted_v = v; + sort(sorted_v.begin(), sorted_v.end()); + assert(sorted_v == original); + } + + // Test with 32-bit generator (non-batched path) + { + vector v(100); + iota(v.begin(), v.end(), 0); + vector original = v; + + shuffle(v.begin(), v.end(), gen); + + // Verify it's still a permutation + vector sorted_v = v; + sort(sorted_v.begin(), sorted_v.end()); + assert(sorted_v == original); + } + + // Test with ranges::shuffle and 32-bit generator + { + vector v(100); + iota(v.begin(), v.end(), 0); + vector original = v; + + ranges::shuffle(v, gen); + + // Verify it's still a permutation + vector sorted_v = v; + sort(sorted_v.begin(), sorted_v.end()); + assert(sorted_v == original); + } +} + +// Test edge cases for shuffle +void 
test_shuffle_edge_cases() { + // Empty range + { + vector v; + shuffle(v.begin(), v.end(), gen64); + assert(v.empty()); + } + + // Single element + { + vector v = {42}; + shuffle(v.begin(), v.end(), gen64); + assert(v.size() == 1); + assert(v[0] == 42); + } + + // Two elements + { + vector v = {1, 2}; + vector original = v; + shuffle(v.begin(), v.end(), gen64); + sort(v.begin(), v.end()); + assert(v == original); + } + + // Three elements (odd count, tests batching boundary) + { + vector v = {1, 2, 3}; + vector original = v; + shuffle(v.begin(), v.end(), gen64); + sort(v.begin(), v.end()); + assert(v == original); + } + + // Four elements (even count) + { + vector v = {1, 2, 3, 4}; + vector original = v; + shuffle(v.begin(), v.end(), gen64); + sort(v.begin(), v.end()); + assert(v == original); + } + + // Large array to ensure batching is effective + { + vector v(10000); + iota(v.begin(), v.end(), 0); + vector original = v; + shuffle(v.begin(), v.end(), gen64); + sort(v.begin(), v.end()); + assert(v == original); + } +} + int main() { printf("Using seed: %u\n", seed); test_random(); + test_shuffle_permutation(); + test_shuffle_edge_cases(); } From 3df88561ae112ac3f4a64d699d600e7bbfacde0d Mon Sep 17 00:00:00 2001 From: Francisco Geiman Thiesen Date: Tue, 2 Dec 2025 19:19:17 -0800 Subject: [PATCH 2/5] Address review feedback: extract URNG range check to variable template Move _Has_full_64bit_range check out of _Batched_rng_from_urng class to reduce template instantiation overhead. Use is_same_v and _Max_limit as suggested by reviewer. --- stl/inc/algorithm | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 260a2a95f02..c5b92f1d2cf 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -6103,15 +6103,14 @@ namespace ranges { // This algorithm extracts multiple bounded random integers from a single 64-bit random word, // using only multiplication (no division) in the common case. 
+// Check if a URNG has full 64-bit range [0, 2^64 - 1]. +// Batched random generation is only beneficial for such RNGs. +template +constexpr bool _Urng_has_full_64bit_range = + is_same_v<_Invoke_result_t<_Urng&>, uint64_t> && (_Urng::min)() == 0 && (_Urng::max)() == _Max_limit(); + template struct _Batched_rng_from_urng { - // Batched random generation is only beneficial for 64-bit RNGs with full range. - // It requires the RNG to produce values in [0, 2^64 - 1]. - using _Urng_result = _Invoke_result_t<_Urng&>; - - static constexpr bool _Has_full_64bit_range = sizeof(_Urng_result) >= sizeof(uint64_t) - && is_unsigned_v<_Urng_result> && (_Urng::min) () == 0 - && (_Urng::max) () == (numeric_limits::max) (); // Threshold bounds for batch sizes based on array size. // These are derived from the paper to minimize expected cost per random value. @@ -6604,7 +6603,7 @@ void shuffle(_RanIt _First, _RanIt _Last, _Urng&& _Func) { // shuffle [_First, _ using _Urng0 = remove_reference_t<_Urng>; // Use batched shuffle when the URNG produces full 64-bit range values. - if constexpr (_Batched_rng_from_urng<_Iter_diff_t<_RanIt>, _Urng0>::_Has_full_64bit_range) { + if constexpr (_Urng_has_full_64bit_range<_Urng0>) { _STD _Random_shuffle_batched(_First, _Last, _Func); } else { _Rng_from_urng_v2<_Iter_diff_t<_RanIt>, _Urng0> _RngFunc(_Func); @@ -6625,7 +6624,7 @@ namespace ranges { using _Diff = iter_difference_t<_It>; // Use batched shuffle when the URNG produces full 64-bit range values. - if constexpr (_Batched_rng_from_urng<_Diff, _Urng0>::_Has_full_64bit_range) { + if constexpr (_Urng_has_full_64bit_range<_Urng0>) { auto _UResult = _Shuffle_unchecked_batched( _RANGES _Unwrap_iter<_Se>(_STD move(_First)), _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _Func); _STD _Seek_wrapped(_First, _STD move(_UResult)); @@ -6645,7 +6644,7 @@ namespace ranges { using _Diff = range_difference_t<_Rng>; // Use batched shuffle when the URNG produces full 64-bit range values. 
- if constexpr (_Batched_rng_from_urng<_Diff, _Urng0>::_Has_full_64bit_range) { + if constexpr (_Urng_has_full_64bit_range<_Urng0>) { return _RANGES _Rewrap_iterator( _Range, _Shuffle_unchecked_batched(_Ubegin(_Range), _Uend(_Range), _Func)); } else { From 8aa184ab394fc54f3f15bdb05bcb79200de9d2dd Mon Sep 17 00:00:00 2001 From: Francisco Geiman Thiesen Date: Tue, 2 Dec 2025 19:49:01 -0800 Subject: [PATCH 3/5] Apply clang-format --- stl/inc/algorithm | 16 ++++++++-------- .../tests/P0896R4_ranges_alg_shuffle/test.cpp | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index c5b92f1d2cf..a19e4332658 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -6107,7 +6107,7 @@ namespace ranges { // Batched random generation is only beneficial for such RNGs. template constexpr bool _Urng_has_full_64bit_range = - is_same_v<_Invoke_result_t<_Urng&>, uint64_t> && (_Urng::min)() == 0 && (_Urng::max)() == _Max_limit(); + is_same_v<_Invoke_result_t<_Urng&>, uint64_t> && (_Urng::min) () == 0 && (_Urng::max) () == _Max_limit(); template struct _Batched_rng_from_urng { @@ -6127,14 +6127,14 @@ struct _Batched_rng_from_urng { // Generate a single bounded random value in [0, _Bound) using Lemire's method. 
_NODISCARD _Diff _Single_bounded(_Diff _Bound) { - _Unsigned128 _Product{_Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), - _Product._Word[1])}; + _Unsigned128 _Product{ + _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _Product._Word[1])}; auto _Leftover = _Product._Word[0]; if (_Leftover < static_cast(_Bound)) { const uint64_t _Threshold = (0 - static_cast(_Bound)) % static_cast(_Bound); while (_Leftover < _Threshold) { - _Product = _Unsigned128{_Base128::_UMul128( + _Product = _Unsigned128{_Base128::_UMul128( static_cast(_Ref()), static_cast(_Bound), _Product._Word[1])}; _Leftover = _Product._Word[0]; } @@ -6146,8 +6146,8 @@ struct _Batched_rng_from_urng { // Generate two bounded random values from a single 64-bit random word. // The bounds are (n+1) and n for Fisher-Yates shuffle positions _Target_index and _Target_index-1. void _Batch_2(_Diff* _Results, _Diff _Bound1, _Diff _Bound2) { - const uint64_t _B1 = static_cast(_Bound1); - const uint64_t _B2 = static_cast(_Bound2); + const uint64_t _B1 = static_cast(_Bound1); + const uint64_t _B2 = static_cast(_Bound2); const uint64_t _Product_bound = _B1 * _B2; uint64_t _Random_word = static_cast(_Ref()); @@ -6630,8 +6630,8 @@ namespace ranges { _STD _Seek_wrapped(_First, _STD move(_UResult)); } else { _Rng_from_urng_v2<_Diff, _Urng0> _RngFunc(_Func); - auto _UResult = _Shuffle_unchecked( - _RANGES _Unwrap_iter<_Se>(_STD move(_First)), _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _RngFunc); + auto _UResult = _Shuffle_unchecked(_RANGES _Unwrap_iter<_Se>(_STD move(_First)), + _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _RngFunc); _STD _Seek_wrapped(_First, _STD move(_UResult)); } return _First; diff --git a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp index 18fd2f7fd45..0d8082be24f 100644 --- a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp +++ b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp @@ -154,7 +154,7 @@ void 
test_shuffle_edge_cases() { // Two elements { - vector v = {1, 2}; + vector v = {1, 2}; vector original = v; shuffle(v.begin(), v.end(), gen64); sort(v.begin(), v.end()); @@ -163,7 +163,7 @@ void test_shuffle_edge_cases() { // Three elements (odd count, tests batching boundary) { - vector v = {1, 2, 3}; + vector v = {1, 2, 3}; vector original = v; shuffle(v.begin(), v.end(), gen64); sort(v.begin(), v.end()); @@ -172,7 +172,7 @@ void test_shuffle_edge_cases() { // Four elements (even count) { - vector v = {1, 2, 3, 4}; + vector v = {1, 2, 3, 4}; vector original = v; shuffle(v.begin(), v.end(), gen64); sort(v.begin(), v.end()); From 86c0c0a3ab014b9d86cd13077b69dea0b2c3035d Mon Sep 17 00:00:00 2001 From: Francisco Geiman Thiesen Date: Fri, 27 Feb 2026 14:35:14 -0800 Subject: [PATCH 4/5] `<algorithm>`: Fix batched random integer generation for `shuffle()` Fix two bugs in the batched shuffle implementation from PR #5932: 1. `_Unsigned128` brace-initialization zeroed the high word of the multiplication result (the random index), making all indices 0 and shuffle deterministic. Replace with separate `uint64_t` variables for high and low words in both `_Single_bounded` and `_Batch_2`. 2. Loop advancement in `_Random_shuffle_batched` and `_Shuffle_unchecked_batched` had extra `++_UTarget` increments causing elements to be skipped. Restructure to `for` loops matching the original `_Random_shuffle1` pattern. Add regression test for GH-6112 and shuffle quality tests adapted from Lemire's cpp_batched_random test suite (uniformity, coverage, pair distribution at start/end positions). 
Co-Authored-By: Claude --- stl/inc/algorithm | 73 +++---- .../tests/P0896R4_ranges_alg_shuffle/test.cpp | 182 ++++++++++++++++++ 2 files changed, 208 insertions(+), 47 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 200e23cd5fc..2951808f96a 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -6209,20 +6209,17 @@ struct _Batched_rng_from_urng { // Generate a single bounded random value in [0, _Bound) using Lemire's method. _NODISCARD _Diff _Single_bounded(_Diff _Bound) { - _Unsigned128 _Product{ - _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _Product._Word[1])}; - auto _Leftover = _Product._Word[0]; + uint64_t _High; + uint64_t _Leftover = _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _High); if (_Leftover < static_cast(_Bound)) { const uint64_t _Threshold = (0 - static_cast(_Bound)) % static_cast(_Bound); while (_Leftover < _Threshold) { - _Product = _Unsigned128{_Base128::_UMul128( - static_cast(_Ref()), static_cast(_Bound), _Product._Word[1])}; - _Leftover = _Product._Word[0]; + _Leftover = _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _High); } } - return static_cast<_Diff>(_Product._Word[1]); + return static_cast<_Diff>(_High); } // Generate two bounded random values from a single 64-bit random word. 
@@ -6234,13 +6231,13 @@ struct _Batched_rng_from_urng { uint64_t _Random_word = static_cast(_Ref()); - _Unsigned128 _Prod1{_Base128::_UMul128(_Random_word, _B1, _Prod1._Word[1])}; - _Results[0] = static_cast<_Diff>(_Prod1._Word[1]); - uint64_t _Leftover1 = _Prod1._Word[0]; + uint64_t _High1; + uint64_t _Leftover1 = _Base128::_UMul128(_Random_word, _B1, _High1); + _Results[0] = static_cast<_Diff>(_High1); - _Unsigned128 _Prod2{_Base128::_UMul128(_Leftover1, _B2, _Prod2._Word[1])}; - _Results[1] = static_cast<_Diff>(_Prod2._Word[1]); - uint64_t _Leftover = _Prod2._Word[0]; + uint64_t _High2; + uint64_t _Leftover = _Base128::_UMul128(_Leftover1, _B2, _High2); + _Results[1] = static_cast<_Diff>(_High2); // Rejection sampling: check if leftover is below threshold. if (_Leftover < _Product_bound) { @@ -6248,13 +6245,11 @@ struct _Batched_rng_from_urng { while (_Leftover < _Threshold) { _Random_word = static_cast(_Ref()); - _Prod1 = _Unsigned128{_Base128::_UMul128(_Random_word, _B1, _Prod1._Word[1])}; - _Results[0] = static_cast<_Diff>(_Prod1._Word[1]); - _Leftover1 = _Prod1._Word[0]; + _Leftover1 = _Base128::_UMul128(_Random_word, _B1, _High1); + _Results[0] = static_cast<_Diff>(_High1); - _Prod2 = _Unsigned128{_Base128::_UMul128(_Leftover1, _B2, _Prod2._Word[1])}; - _Results[1] = static_cast<_Diff>(_Prod2._Word[1]); - _Leftover = _Prod2._Word[0]; + _Leftover = _Base128::_UMul128(_Leftover1, _B2, _High2); + _Results[1] = static_cast<_Diff>(_High2); } } } @@ -6626,22 +6621,18 @@ void _Random_shuffle_batched(_RanIt _First, _RanIt _Last, _Urng& _Func) { // Process pairs using batched generation when beneficial. // Batch of 2 is beneficial when bounds fit in 32 bits (product fits in 64 bits). 
- while (_UTarget != _ULast) { - ++_UTarget; - if (_UTarget == _ULast) { - break; - } - + for (; ++_UTarget != _ULast; ++_Target_index) { // randomly place an element from [_First, _Target] at _Target const _Diff _Bound1 = _Target_index + 1; // bound for current position - const _Diff _Bound2 = _Target_index + 2; // bound for next position // Check if we can batch: both bounds and their product must fit safely. // Use batch of 2 when the larger bound is <= 2^32 (product fits in 64 bits). - if (static_cast(_Bound2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { + if (static_cast(_Target_index + 2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { auto _UTarget_next = _UTarget; ++_UTarget_next; if (_UTarget_next != _ULast) { + const _Diff _Bound2 = _Target_index + 2; // bound for next position + // Generate two random indices in one batch. _Diff _Offsets[2]; _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); @@ -6662,9 +6653,7 @@ void _Random_shuffle_batched(_RanIt _First, _RanIt _Last, _Urng& _Func) { swap(*_UTarget, *(_UFirst + _Offsets[1])); // intentional ADL } - ++_UTarget; - ++_Target_index; - continue; + continue; // The for-loop's ++_UTarget and ++_Target_index handle the next advance. } } @@ -6674,9 +6663,6 @@ void _Random_shuffle_batched(_RanIt _First, _RanIt _Last, _Urng& _Func) { if (_Off != _Target_index) { swap(*_UTarget, *(_UFirst + _Off)); // intentional ADL } - - ++_UTarget; - ++_Target_index; } } @@ -6780,20 +6766,18 @@ namespace ranges { _Diff _Target_index = 1; // Process pairs using batched generation when beneficial. 
- while (_Target != _Last) { - ++_Target; - if (_Target == _Last) { - break; - } - + for (; ++_Target != _Last; ++_Target_index) { + // randomly place an element from [_First, _Target] at _Target const _Diff _Bound1 = _Target_index + 1; - const _Diff _Bound2 = _Target_index + 2; - if (static_cast(_Bound2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { + if (static_cast(_Target_index + 2) + <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { auto _Target_next = _Target; ++_Target_next; if (_Target_next != _Last) { + const _Diff _Bound2 = _Target_index + 2; + _Diff _Offsets[2]; _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); @@ -6811,9 +6795,7 @@ namespace ranges { _RANGES iter_swap(_Target, _First + _Offsets[1]); } - ++_Target; - ++_Target_index; - continue; + continue; // The for-loop's ++_Target and ++_Target_index handle the next advance. } } @@ -6822,9 +6804,6 @@ namespace ranges { if (_Off != _Target_index) { _RANGES iter_swap(_Target, _First + _Off); } - - ++_Target; - ++_Target_index; } return _Target; } diff --git a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp index 0d8082be24f..f0d3a5257fc 100644 --- a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp +++ b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp @@ -190,10 +190,192 @@ void test_shuffle_edge_cases() { } } +// Regression test: consecutive shuffles with the same URNG must produce different results. +// The batched random implementation had a bug where _Unsigned128 brace-initialization zeroed the high word +// of the multiplication result, making all random indices 0 and the shuffle deterministic. 
+void test_gh_6112() { + // Test std::shuffle with mt19937_64 (exercises the batched random path) + { + mt19937_64 urng(6); + vector v(100); + + iota(v.begin(), v.end(), 0); + shuffle(v.begin(), v.end(), urng); + const auto first_shuffle = v; + + iota(v.begin(), v.end(), 0); + shuffle(v.begin(), v.end(), urng); + + assert(v != first_shuffle); // "should be vanishingly impossible" for these to be equal + } + + // Test ranges::shuffle with mt19937_64 + { + mt19937_64 urng(6); + vector v(100); + + iota(v.begin(), v.end(), 0); + ranges::shuffle(v, urng); + const auto first_shuffle = v; + + iota(v.begin(), v.end(), 0); + ranges::shuffle(v, urng); + + assert(v != first_shuffle); + } +} + +// Shuffle quality tests adapted from Lemire's cpp_batched_random test suite. +// These verify that the shuffle produces a proper uniform random permutation, +// not just a valid permutation. + +// Every element must be able to reach every position over many trials. +void test_everyone_can_move_everywhere() { + constexpr size_t size = 64; + constexpr size_t trials = size * size; // 4096 trials; probability of missing any (pos,val): ~e^-64 + + // Test std::shuffle with mt19937_64 (batched path) + { + mt19937_64 urng(42); + vector input(size); + vector seen(size * size, 0); // seen[position * size + value] + + for (size_t trial = 0; trial < trials; ++trial) { + iota(input.begin(), input.end(), 0); + shuffle(input.begin(), input.end(), urng); + for (size_t i = 0; i < size; ++i) { + seen[i * size + static_cast(input[i])] = 1; + } + } + + for (size_t pos = 0; pos < size; ++pos) { + for (size_t val = 0; val < size; ++val) { + assert(seen[pos * size + val]); + } + } + } + + // Test ranges::shuffle with mt19937_64 (batched path) + { + mt19937_64 urng(42); + vector input(size); + vector seen(size * size, 0); + + for (size_t trial = 0; trial < trials; ++trial) { + iota(input.begin(), input.end(), 0); + ranges::shuffle(input, urng); + for (size_t i = 0; i < size; ++i) { + seen[i * size + 
static_cast(input[i])] = 1; + } + } + + for (size_t pos = 0; pos < size; ++pos) { + for (size_t val = 0; val < size; ++val) { + assert(seen[pos * size + val]); + } + } + } +} + +// Check that the distribution of values across positions is roughly uniform. +// Uses the "relative gap" metric: (max_count - min_count) / mean_count < 0.6. +void test_uniformity() { + constexpr size_t size = 32; + constexpr size_t trials = size * size * 16; // 16384 trials; expected count per cell = 512 + + mt19937_64 urng(42); + vector input(size); + vector count(size * size, 0); // count[position * size + value] + + for (size_t trial = 0; trial < trials; ++trial) { + iota(input.begin(), input.end(), 0); + shuffle(input.begin(), input.end(), urng); + for (size_t i = 0; i < size; ++i) { + ++count[i * size + static_cast(input[i])]; + } + } + + size_t overall_min = SIZE_MAX; + size_t overall_max = 0; + size_t total = 0; + + for (size_t cell = 0; cell < size * size; ++cell) { + total += count[cell]; + if (count[cell] > overall_max) { + overall_max = count[cell]; + } + if (count[cell] < overall_min) { + overall_min = count[cell]; + } + } + + const double mean = static_cast(total) / static_cast(size * size); + const double relative_gap = static_cast(overall_max - overall_min) / mean; + + assert(relative_gap < 0.6); +} + +// Every distinct pair of values must be able to appear at the first two positions. 
+void test_any_possible_pair_at_start() { + constexpr size_t size = 32; + constexpr size_t trials = size * size * size; // 32768 trials; expected count per pair ~33 + + mt19937_64 urng(42); + vector input(size); + vector seen(size * size, 0); // seen[first * size + second] + + for (size_t trial = 0; trial < trials; ++trial) { + iota(input.begin(), input.end(), 0); + shuffle(input.begin(), input.end(), urng); + seen[static_cast(input[0]) * size + static_cast(input[1])] = 1; + } + + for (size_t i = 0; i < size; ++i) { + for (size_t j = 0; j < size; ++j) { + if (i == j) { + assert(!seen[i * size + j]); // same value can't occupy both positions + } else { + assert(seen[i * size + j]); // every distinct pair must appear + } + } + } +} + +// Every distinct pair of values must be able to appear at the last two positions. +void test_any_possible_pair_at_end() { + constexpr size_t size = 32; + constexpr size_t trials = size * size * size; + + mt19937_64 urng(42); + vector input(size); + vector seen(size * size, 0); + + for (size_t trial = 0; trial < trials; ++trial) { + iota(input.begin(), input.end(), 0); + shuffle(input.begin(), input.end(), urng); + seen[static_cast(input[size - 2]) * size + static_cast(input[size - 1])] = 1; + } + + for (size_t i = 0; i < size; ++i) { + for (size_t j = 0; j < size; ++j) { + if (i == j) { + assert(!seen[i * size + j]); + } else { + assert(seen[i * size + j]); + } + } + } +} + int main() { printf("Using seed: %u\n", seed); test_random(); test_shuffle_permutation(); test_shuffle_edge_cases(); + test_gh_6112(); + test_everyone_can_move_everywhere(); + test_uniformity(); + test_any_possible_pair_at_start(); + test_any_possible_pair_at_end(); } From 4f4a627ff183986772b5bab833414be7f3c2ec63 Mon Sep 17 00:00:00 2001 From: Francisco Geiman Thiesen Date: Mon, 2 Mar 2026 09:12:38 -0800 Subject: [PATCH 5/5] `<algorithm>`: Match Lemire's batched shuffle implementation Align the batched random shuffle with Daniel Lemire's reference implementation
(github.com/lemire/cpp_batched_random). The previous version only used batch-of-2 and did not use the threshold constants in loop conditions. This adopts the full cascade (batch 1-6) with backward Fisher-Yates iteration, matching Lemire's design exactly. --- stl/inc/algorithm | 301 ++++++++++++++++++++++++++-------------------- 1 file changed, 168 insertions(+), 133 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 2951808f96a..d41b0095ae6 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -6193,65 +6193,39 @@ constexpr bool _Urng_has_full_64bit_range = template struct _Batched_rng_from_urng { - - // Threshold bounds for batch sizes based on array size. - // These are derived from the paper to minimize expected cost per random value. - // Batch size k requires product of k consecutive bounds <= 2^64. - static constexpr uint64_t _Bound_for_batch_6 = 21; // (21)^6 = 85,766,121 < 2^64 / (very conservative) - static constexpr uint64_t _Bound_for_batch_5 = 73; // (73)^5 = 2,073,071,593 < 2^64 - static constexpr uint64_t _Bound_for_batch_4 = 302; // (302)^4 = 8,319,430,096 < 2^64 - static constexpr uint64_t _Bound_for_batch_3 = 2642; // (2642)^3 = 18,454,249,288 < 2^64 - static constexpr uint64_t _Bound_for_batch_2 = 4294967296; // 2^32, for batch of 2 - _Urng& _Ref; explicit _Batched_rng_from_urng(_Urng& _Func) noexcept : _Ref(_Func) {} - // Generate a single bounded random value in [0, _Bound) using Lemire's method. - _NODISCARD _Diff _Single_bounded(_Diff _Bound) { + // Generate _K bounded random indices from a single 64-bit random word. + // Uses backward Fisher-Yates bounds: _N, _N-1, ..., _N-_K+1. + // _Results[j] is uniform in [0, _N-j). + // + // _Bound is a conservative upper bound on the product of the _K bounds, + // used to skip the threshold computation on the fast path. Returns the + // (possibly tightened) bound for the caller to pass to the next call. 
+ uint64_t _Batch(_Diff* _Results, uint64_t _N, uint64_t _K, uint64_t _Bound) { + uint64_t _Rand = static_cast(_Ref()); uint64_t _High; - uint64_t _Leftover = _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _High); - - if (_Leftover < static_cast(_Bound)) { - const uint64_t _Threshold = (0 - static_cast(_Bound)) % static_cast(_Bound); - while (_Leftover < _Threshold) { - _Leftover = _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _High); - } + for (uint64_t _J = 0; _J < _K; ++_J) { + _Rand = _Base128::_UMul128(_Rand, _N - _J, _High); + _Results[_J] = static_cast<_Diff>(_High); } - - return static_cast<_Diff>(_High); - } - - // Generate two bounded random values from a single 64-bit random word. - // The bounds are (n+1) and n for Fisher-Yates shuffle positions _Target_index and _Target_index-1. - void _Batch_2(_Diff* _Results, _Diff _Bound1, _Diff _Bound2) { - const uint64_t _B1 = static_cast(_Bound1); - const uint64_t _B2 = static_cast(_Bound2); - const uint64_t _Product_bound = _B1 * _B2; - - uint64_t _Random_word = static_cast(_Ref()); - - uint64_t _High1; - uint64_t _Leftover1 = _Base128::_UMul128(_Random_word, _B1, _High1); - _Results[0] = static_cast<_Diff>(_High1); - - uint64_t _High2; - uint64_t _Leftover = _Base128::_UMul128(_Leftover1, _B2, _High2); - _Results[1] = static_cast<_Diff>(_High2); - - // Rejection sampling: check if leftover is below threshold. 
- if (_Leftover < _Product_bound) { - const uint64_t _Threshold = (0 - _Product_bound) % _Product_bound; - while (_Leftover < _Threshold) { - _Random_word = static_cast(_Ref()); - - _Leftover1 = _Base128::_UMul128(_Random_word, _B1, _High1); - _Results[0] = static_cast<_Diff>(_High1); - - _Leftover = _Base128::_UMul128(_Leftover1, _B2, _High2); - _Results[1] = static_cast<_Diff>(_High2); + if (_Rand < _Bound) { + _Bound = _N; + for (uint64_t _J = 1; _J < _K; ++_J) { + _Bound *= _N - _J; + } + const uint64_t _Threshold = (0 - _Bound) % _Bound; + while (_Rand < _Threshold) { + _Rand = static_cast(_Ref()); + for (uint64_t _J = 0; _J < _K; ++_J) { + _Rand = _Base128::_UMul128(_Rand, _N - _J, _High); + _Results[_J] = static_cast<_Diff>(_High); + } } } + return _Bound; } _Batched_rng_from_urng(const _Batched_rng_from_urng&) = delete; @@ -6602,66 +6576,90 @@ void _Random_shuffle1(_RanIt _First, _RanIt _Last, _RngFn& _RngFunc) { } // Batched shuffle implementation for 64-bit URNGs with full range. -// Uses batched random generation to reduce RNG calls. +// Uses backward Fisher-Yates (Durstenfeld) with batched random generation +// to reduce URNG calls. Adapted from Lemire's cpp_batched_random. template void _Random_shuffle_batched(_RanIt _First, _RanIt _Last, _Urng& _Func) { - // shuffle [_First, _Last) using batched random generation _STD _Adl_verify_range(_First, _Last); auto _UFirst = _STD _Get_unwrapped(_First); const auto _ULast = _STD _Get_unwrapped(_Last); - if (_UFirst == _ULast) { + + using _Diff = _Iter_diff_t<_RanIt>; + auto _Remaining = static_cast(_ULast - _UFirst); + if (_Remaining <= 1) { return; } - using _Diff = _Iter_diff_t<_RanIt>; _Batched_rng_from_urng<_Diff, _Urng> _BatchedRng(_Func); + _Diff _Offsets[7]; - auto _UTarget = _UFirst; - _Diff _Target_index = 1; - - // Process pairs using batched generation when beneficial. - // Batch of 2 is beneficial when bounds fit in 32 bits (product fits in 64 bits). 
- for (; ++_UTarget != _ULast; ++_Target_index) { // randomly place an element from [_First, _Target] at _Target - const _Diff _Bound1 = _Target_index + 1; // bound for current position - - // Check if we can batch: both bounds and their product must fit safely. - // Use batch of 2 when the larger bound is <= 2^32 (product fits in 64 bits). - if (static_cast(_Target_index + 2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { - auto _UTarget_next = _UTarget; - ++_UTarget_next; - - if (_UTarget_next != _ULast) { - const _Diff _Bound2 = _Target_index + 2; // bound for next position - - // Generate two random indices in one batch. - _Diff _Offsets[2]; - _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); + // Process one element at a time while bounds are too large for batching. + for (; _Remaining > (uint64_t{1} << 30); --_Remaining) { + _BatchedRng._Batch(_Offsets, _Remaining, 1, _Remaining); + swap(*(_UFirst + _Remaining - 1), *(_UFirst + _Offsets[0])); // intentional ADL + } - _STL_ASSERT(0 <= _Offsets[0] && _Offsets[0] <= _Target_index, "random value out of range"); - _STL_ASSERT(0 <= _Offsets[1] && _Offsets[1] <= _Target_index + 1, "random value out of range"); + // Batch of 2: product of 2 consecutive bounds fits in 64 bits. + { + uint64_t _Bound = uint64_t{1} << 60; + for (; _Remaining > (uint64_t{1} << 19); _Remaining -= 2) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 2, _Bound); + for (uint64_t _J = 0; _J < 2; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL + } + } + } - // Perform first swap. - if (_Offsets[0] != _Target_index) { - swap(*_UTarget, *(_UFirst + _Offsets[0])); // intentional ADL - } + // Batch of 3. 
+ { + uint64_t _Bound = uint64_t{1} << 57; + for (; _Remaining > (uint64_t{1} << 14); _Remaining -= 3) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 3, _Bound); + for (uint64_t _J = 0; _J < 3; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL + } + } + } - // Advance to next position and perform second swap. - ++_UTarget; - ++_Target_index; + // Batch of 4. + { + uint64_t _Bound = uint64_t{1} << 56; + for (; _Remaining > (uint64_t{1} << 11); _Remaining -= 4) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 4, _Bound); + for (uint64_t _J = 0; _J < 4; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL + } + } + } - if (_Offsets[1] != _Target_index) { - swap(*_UTarget, *(_UFirst + _Offsets[1])); // intentional ADL - } + // Batch of 5. + { + uint64_t _Bound = uint64_t{1} << 55; + for (; _Remaining > (uint64_t{1} << 9); _Remaining -= 5) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 5, _Bound); + for (uint64_t _J = 0; _J < 5; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL + } + } + } - continue; // The for-loop's ++_UTarget and ++_Target_index handle the next advance. + // Batch of 6. + { + uint64_t _Bound = uint64_t{1} << 54; + for (; _Remaining > 6; _Remaining -= 6) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 6, _Bound); + for (uint64_t _J = 0; _J < 6; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL } } + } - // Fall back to single generation for this position. - const _Diff _Off = _BatchedRng._Single_bounded(_Bound1); - _STL_ASSERT(0 <= _Off && _Off <= _Target_index, "random value out of range"); - if (_Off != _Target_index) { - swap(*_UTarget, *(_UFirst + _Off)); // intentional ADL + // Final: remaining <= 6 elements, handle in one batch. 
+ if (_Remaining > 1) { + const auto _K = _Remaining - 1; + _BatchedRng._Batch(_Offsets, _Remaining, _K, 720); + for (uint64_t _J = 0; _J < _K; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL } } } @@ -6748,64 +6746,101 @@ namespace ranges { } // Batched shuffle implementation for ranges. + // Uses backward Fisher-Yates (Durstenfeld) with batched random generation. template _NODISCARD static _It _Shuffle_unchecked_batched(_It _First, const _Se _Last, _Urng& _Func) { - // shuffle [_First, _Last) using batched random generation _STL_INTERNAL_STATIC_ASSERT(random_access_iterator<_It>); _STL_INTERNAL_STATIC_ASSERT(sentinel_for<_Se, _It>); _STL_INTERNAL_STATIC_ASSERT(permutable<_It>); - if (_First == _Last) { - return _First; + using _Diff = iter_difference_t<_It>; + const auto _Count = static_cast<_Diff>(_Last - _First); + auto _Remaining = static_cast(_Count); + if (_Remaining <= 1) { + return _First + _Count; } - using _Diff = iter_difference_t<_It>; _Batched_rng_from_urng<_Diff, _Urng> _BatchedRng(_Func); + _Diff _Offsets[7]; + + // Process one element at a time while bounds are too large for batching. + for (; _Remaining > (uint64_t{1} << 30); --_Remaining) { + _BatchedRng._Batch(_Offsets, _Remaining, 1, _Remaining); + _RANGES iter_swap( + _First + static_cast<_Diff>(_Remaining - 1), _First + static_cast<_Diff>(_Offsets[0])); + } + + // Batch of 2. + { + uint64_t _Bound = uint64_t{1} << 60; + for (; _Remaining > (uint64_t{1} << 19); _Remaining -= 2) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 2, _Bound); + for (uint64_t _J = 0; _J < 2; ++_J) { + _RANGES iter_swap(_First + static_cast<_Diff>(_Remaining - _J - 1), + _First + static_cast<_Diff>(_Offsets[_J])); + } + } + } - auto _Target = _First; - _Diff _Target_index = 1; - - // Process pairs using batched generation when beneficial. 
- for (; ++_Target != _Last; ++_Target_index) { - // randomly place an element from [_First, _Target] at _Target - const _Diff _Bound1 = _Target_index + 1; - - if (static_cast(_Target_index + 2) - <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { - auto _Target_next = _Target; - ++_Target_next; - - if (_Target_next != _Last) { - const _Diff _Bound2 = _Target_index + 2; - - _Diff _Offsets[2]; - _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); - - _STL_ASSERT(0 <= _Offsets[0] && _Offsets[0] <= _Target_index, "random value out of range"); - _STL_ASSERT(0 <= _Offsets[1] && _Offsets[1] <= _Target_index + 1, "random value out of range"); - - if (_Offsets[0] != _Target_index) { - _RANGES iter_swap(_Target, _First + _Offsets[0]); - } + // Batch of 3. + { + uint64_t _Bound = uint64_t{1} << 57; + for (; _Remaining > (uint64_t{1} << 14); _Remaining -= 3) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 3, _Bound); + for (uint64_t _J = 0; _J < 3; ++_J) { + _RANGES iter_swap(_First + static_cast<_Diff>(_Remaining - _J - 1), + _First + static_cast<_Diff>(_Offsets[_J])); + } + } + } - ++_Target; - ++_Target_index; + // Batch of 4. + { + uint64_t _Bound = uint64_t{1} << 56; + for (; _Remaining > (uint64_t{1} << 11); _Remaining -= 4) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 4, _Bound); + for (uint64_t _J = 0; _J < 4; ++_J) { + _RANGES iter_swap(_First + static_cast<_Diff>(_Remaining - _J - 1), + _First + static_cast<_Diff>(_Offsets[_J])); + } + } + } - if (_Offsets[1] != _Target_index) { - _RANGES iter_swap(_Target, _First + _Offsets[1]); - } + // Batch of 5. 
+ { + uint64_t _Bound = uint64_t{1} << 55; + for (; _Remaining > (uint64_t{1} << 9); _Remaining -= 5) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 5, _Bound); + for (uint64_t _J = 0; _J < 5; ++_J) { + _RANGES iter_swap(_First + static_cast<_Diff>(_Remaining - _J - 1), + _First + static_cast<_Diff>(_Offsets[_J])); + } + } + } - continue; // The for-loop's ++_Target and ++_Target_index handle the next advance. + // Batch of 6. + { + uint64_t _Bound = uint64_t{1} << 54; + for (; _Remaining > 6; _Remaining -= 6) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 6, _Bound); + for (uint64_t _J = 0; _J < 6; ++_J) { + _RANGES iter_swap(_First + static_cast<_Diff>(_Remaining - _J - 1), + _First + static_cast<_Diff>(_Offsets[_J])); } } + } - const _Diff _Off = _BatchedRng._Single_bounded(_Bound1); - _STL_ASSERT(0 <= _Off && _Off <= _Target_index, "random value out of range"); - if (_Off != _Target_index) { - _RANGES iter_swap(_Target, _First + _Off); + // Final: remaining <= 6 elements, handle in one batch. + if (_Remaining > 1) { + const auto _K = _Remaining - 1; + _BatchedRng._Batch(_Offsets, _Remaining, _K, 720); + for (uint64_t _J = 0; _J < _K; ++_J) { + _RANGES iter_swap( + _First + static_cast<_Diff>(_Remaining - _J - 1), _First + static_cast<_Diff>(_Offsets[_J])); } } - return _Target; + + return _First + _Count; } };