From d1744fa3bf18de9cd22be9dbf79ec0fa75510571 Mon Sep 17 00:00:00 2001 From: Francisco Geiman Thiesen Date: Tue, 2 Dec 2025 13:23:36 -0800 Subject: [PATCH 1/5] Implement batched random integer generation for shuffle Implements the algorithm from Brackett-Rozinsky & Lemire's paper "Batched Ranged Random Integer Generation" to reduce RNG calls in std::shuffle and ranges::shuffle for 64-bit URNGs like mt19937_64. The batched approach extracts multiple bounded random integers from a single 64-bit random word, using only multiplication (no division) in the common case. This reduces the number of RNG calls by approximately half for arrays with fewer than 2^32 elements. Resolves #5736 --- stl/inc/algorithm | 267 +++++++++++++++++- .../tests/P0896R4_ranges_alg_shuffle/test.cpp | 120 ++++++++ 2 files changed, 379 insertions(+), 8 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index e95528c127d..260a2a95f02 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -6096,6 +6096,92 @@ namespace ranges { #endif // _HAS_CXX20 #endif // _HAS_CXX17 +// Batched random integer generation for shuffle optimization. +// From Nevin Brackett-Rozinsky and Daniel Lemire, "Batched Ranged Random Integer Generation", +// Software: Practice and Experience 55(1), 2025. +// +// This algorithm extracts multiple bounded random integers from a single 64-bit random word, +// using only multiplication (no division) in the common case. + +template +struct _Batched_rng_from_urng { + // Batched random generation is only beneficial for 64-bit RNGs with full range. + // It requires the RNG to produce values in [0, 2^64 - 1]. + using _Urng_result = _Invoke_result_t<_Urng&>; + + static constexpr bool _Has_full_64bit_range = sizeof(_Urng_result) >= sizeof(uint64_t) + && is_unsigned_v<_Urng_result> && (_Urng::min) () == 0 + && (_Urng::max) () == (numeric_limits::max) (); + + // Threshold bounds for batch sizes based on array size. 
+ // These are derived from the paper to minimize expected cost per random value. + // Batch size k requires product of k consecutive bounds <= 2^64. + static constexpr uint64_t _Bound_for_batch_6 = 21; // (21)^6 = 85,766,121 < 2^64 (very conservative) + static constexpr uint64_t _Bound_for_batch_5 = 73; // (73)^5 = 2,073,071,593 < 2^64 + static constexpr uint64_t _Bound_for_batch_4 = 302; // (302)^4 = 8,318,169,616 < 2^64 + static constexpr uint64_t _Bound_for_batch_3 = 2642; // (2642)^3 = 18,441,593,288 < 2^64 + static constexpr uint64_t _Bound_for_batch_2 = 4294967296; // 2^32, for batch of 2 + + _Urng& _Ref; + + explicit _Batched_rng_from_urng(_Urng& _Func) noexcept : _Ref(_Func) {} + + // Generate a single bounded random value in [0, _Bound) using Lemire's method. + _NODISCARD _Diff _Single_bounded(_Diff _Bound) { + _Unsigned128 _Product{_Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), + _Product._Word[1])}; + auto _Leftover = _Product._Word[0]; + + if (_Leftover < static_cast(_Bound)) { + const uint64_t _Threshold = (0 - static_cast(_Bound)) % static_cast(_Bound); + while (_Leftover < _Threshold) { + _Product = _Unsigned128{_Base128::_UMul128( + static_cast(_Ref()), static_cast(_Bound), _Product._Word[1])}; + _Leftover = _Product._Word[0]; + } + } + + return static_cast<_Diff>(_Product._Word[1]); + } + + // Generate two bounded random values from a single 64-bit random word. + // The bounds are (n+1) and (n+2) for Fisher-Yates shuffle positions _Target_index and _Target_index+1. 
+ void _Batch_2(_Diff* _Results, _Diff _Bound1, _Diff _Bound2) { + const uint64_t _B1 = static_cast(_Bound1); + const uint64_t _B2 = static_cast(_Bound2); + const uint64_t _Product_bound = _B1 * _B2; + + uint64_t _Random_word = static_cast(_Ref()); + + _Unsigned128 _Prod1{_Base128::_UMul128(_Random_word, _B1, _Prod1._Word[1])}; + _Results[0] = static_cast<_Diff>(_Prod1._Word[1]); + uint64_t _Leftover1 = _Prod1._Word[0]; + + _Unsigned128 _Prod2{_Base128::_UMul128(_Leftover1, _B2, _Prod2._Word[1])}; + _Results[1] = static_cast<_Diff>(_Prod2._Word[1]); + uint64_t _Leftover = _Prod2._Word[0]; + + // Rejection sampling: check if leftover is below threshold. + if (_Leftover < _Product_bound) { + const uint64_t _Threshold = (0 - _Product_bound) % _Product_bound; + while (_Leftover < _Threshold) { + _Random_word = static_cast(_Ref()); + + _Prod1 = _Unsigned128{_Base128::_UMul128(_Random_word, _B1, _Prod1._Word[1])}; + _Results[0] = static_cast<_Diff>(_Prod1._Word[1]); + _Leftover1 = _Prod1._Word[0]; + + _Prod2 = _Unsigned128{_Base128::_UMul128(_Leftover1, _B2, _Prod2._Word[1])}; + _Results[1] = static_cast<_Diff>(_Prod2._Word[1]); + _Leftover = _Prod2._Word[0]; + } + } + } + + _Batched_rng_from_urng(const _Batched_rng_from_urng&) = delete; + _Batched_rng_from_urng& operator=(const _Batched_rng_from_urng&) = delete; +}; + template class _Rng_from_urng_v2 { // wrap a URNG as an RNG public: @@ -6439,11 +6525,91 @@ void _Random_shuffle1(_RanIt _First, _RanIt _Last, _RngFn& _RngFunc) { } } +// Batched shuffle implementation for 64-bit URNGs with full range. +// Uses batched random generation to reduce RNG calls. 
+template +void _Random_shuffle_batched(_RanIt _First, _RanIt _Last, _Urng& _Func) { + // shuffle [_First, _Last) using batched random generation + _STD _Adl_verify_range(_First, _Last); + auto _UFirst = _STD _Get_unwrapped(_First); + const auto _ULast = _STD _Get_unwrapped(_Last); + if (_UFirst == _ULast) { + return; + } + + using _Diff = _Iter_diff_t<_RanIt>; + _Batched_rng_from_urng<_Diff, _Urng> _BatchedRng(_Func); + + auto _UTarget = _UFirst; + _Diff _Target_index = 1; + + // Process pairs using batched generation when beneficial. + // Batch of 2 is beneficial when bounds fit in 32 bits (product fits in 64 bits). + while (_UTarget != _ULast) { + ++_UTarget; + if (_UTarget == _ULast) { + break; + } + + const _Diff _Bound1 = _Target_index + 1; // bound for current position + const _Diff _Bound2 = _Target_index + 2; // bound for next position + + // Check if we can batch: both bounds and their product must fit safely. + // Use batch of 2 when the larger bound is <= 2^32 (product fits in 64 bits). + if (static_cast(_Bound2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { + auto _UTarget_next = _UTarget; + ++_UTarget_next; + + if (_UTarget_next != _ULast) { + // Generate two random indices in one batch. + _Diff _Offsets[2]; + _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); + + _STL_ASSERT(0 <= _Offsets[0] && _Offsets[0] <= _Target_index, "random value out of range"); + _STL_ASSERT(0 <= _Offsets[1] && _Offsets[1] <= _Target_index + 1, "random value out of range"); + + // Perform first swap. + if (_Offsets[0] != _Target_index) { + swap(*_UTarget, *(_UFirst + _Offsets[0])); // intentional ADL + } + + // Advance to next position and perform second swap. + ++_UTarget; + ++_Target_index; + + if (_Offsets[1] != _Target_index) { + swap(*_UTarget, *(_UFirst + _Offsets[1])); // intentional ADL + } + + ++_UTarget; + ++_Target_index; + continue; + } + } + + // Fall back to single generation for this position. 
+ const _Diff _Off = _BatchedRng._Single_bounded(_Bound1); + _STL_ASSERT(0 <= _Off && _Off <= _Target_index, "random value out of range"); + if (_Off != _Target_index) { + swap(*_UTarget, *(_UFirst + _Off)); // intentional ADL + } + + ++_UTarget; + ++_Target_index; + } +} + _EXPORT_STD template void shuffle(_RanIt _First, _RanIt _Last, _Urng&& _Func) { // shuffle [_First, _Last) using URNG _Func using _Urng0 = remove_reference_t<_Urng>; - _Rng_from_urng_v2<_Iter_diff_t<_RanIt>, _Urng0> _RngFunc(_Func); - _STD _Random_shuffle1(_First, _Last, _RngFunc); + + // Use batched shuffle when the URNG produces full 64-bit range values. + if constexpr (_Batched_rng_from_urng<_Iter_diff_t<_RanIt>, _Urng0>::_Has_full_64bit_range) { + _STD _Random_shuffle_batched(_First, _Last, _Func); + } else { + _Rng_from_urng_v2<_Iter_diff_t<_RanIt>, _Urng0> _RngFunc(_Func); + _STD _Random_shuffle1(_First, _Last, _RngFunc); + } } #if _HAS_CXX20 @@ -6455,20 +6621,37 @@ namespace ranges { _STATIC_CALL_OPERATOR _It operator()(_It _First, _Se _Last, _Urng&& _Func) _CONST_CALL_OPERATOR { _STD _Adl_verify_range(_First, _Last); - _Rng_from_urng_v2, remove_reference_t<_Urng>> _RngFunc(_Func); - auto _UResult = _Shuffle_unchecked( - _RANGES _Unwrap_iter<_Se>(_STD move(_First)), _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _RngFunc); + using _Urng0 = remove_reference_t<_Urng>; + using _Diff = iter_difference_t<_It>; - _STD _Seek_wrapped(_First, _STD move(_UResult)); + // Use batched shuffle when the URNG produces full 64-bit range values. 
+ if constexpr (_Batched_rng_from_urng<_Diff, _Urng0>::_Has_full_64bit_range) { + auto _UResult = _Shuffle_unchecked_batched( + _RANGES _Unwrap_iter<_Se>(_STD move(_First)), _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _Func); + _STD _Seek_wrapped(_First, _STD move(_UResult)); + } else { + _Rng_from_urng_v2<_Diff, _Urng0> _RngFunc(_Func); + auto _UResult = _Shuffle_unchecked( + _RANGES _Unwrap_iter<_Se>(_STD move(_First)), _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _RngFunc); + _STD _Seek_wrapped(_First, _STD move(_UResult)); + } return _First; } template requires permutable> && uniform_random_bit_generator> _STATIC_CALL_OPERATOR borrowed_iterator_t<_Rng> operator()(_Rng&& _Range, _Urng&& _Func) _CONST_CALL_OPERATOR { - _Rng_from_urng_v2, remove_reference_t<_Urng>> _RngFunc(_Func); + using _Urng0 = remove_reference_t<_Urng>; + using _Diff = range_difference_t<_Rng>; - return _RANGES _Rewrap_iterator(_Range, _Shuffle_unchecked(_Ubegin(_Range), _Uend(_Range), _RngFunc)); + // Use batched shuffle when the URNG produces full 64-bit range values. + if constexpr (_Batched_rng_from_urng<_Diff, _Urng0>::_Has_full_64bit_range) { + return _RANGES _Rewrap_iterator( + _Range, _Shuffle_unchecked_batched(_Ubegin(_Range), _Uend(_Range), _Func)); + } else { + _Rng_from_urng_v2<_Diff, _Urng0> _RngFunc(_Func); + return _RANGES _Rewrap_iterator(_Range, _Shuffle_unchecked(_Ubegin(_Range), _Uend(_Range), _RngFunc)); + } } private: @@ -6496,6 +6679,74 @@ namespace ranges { } return _Target; } + + // Batched shuffle implementation for ranges. 
+ template + _NODISCARD static _It _Shuffle_unchecked_batched(_It _First, const _Se _Last, _Urng& _Func) { + // shuffle [_First, _Last) using batched random generation + _STL_INTERNAL_STATIC_ASSERT(random_access_iterator<_It>); + _STL_INTERNAL_STATIC_ASSERT(sentinel_for<_Se, _It>); + _STL_INTERNAL_STATIC_ASSERT(permutable<_It>); + + if (_First == _Last) { + return _First; + } + + using _Diff = iter_difference_t<_It>; + _Batched_rng_from_urng<_Diff, _Urng> _BatchedRng(_Func); + + auto _Target = _First; + _Diff _Target_index = 1; + + // Process pairs using batched generation when beneficial. + while (_Target != _Last) { + ++_Target; + if (_Target == _Last) { + break; + } + + const _Diff _Bound1 = _Target_index + 1; + const _Diff _Bound2 = _Target_index + 2; + + if (static_cast(_Bound2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { + auto _Target_next = _Target; + ++_Target_next; + + if (_Target_next != _Last) { + _Diff _Offsets[2]; + _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); + + _STL_ASSERT(0 <= _Offsets[0] && _Offsets[0] <= _Target_index, "random value out of range"); + _STL_ASSERT(0 <= _Offsets[1] && _Offsets[1] <= _Target_index + 1, "random value out of range"); + + if (_Offsets[0] != _Target_index) { + _RANGES iter_swap(_Target, _First + _Offsets[0]); + } + + ++_Target; + ++_Target_index; + + if (_Offsets[1] != _Target_index) { + _RANGES iter_swap(_Target, _First + _Offsets[1]); + } + + ++_Target; + ++_Target_index; + continue; + } + } + + const _Diff _Off = _BatchedRng._Single_bounded(_Bound1); + _STL_ASSERT(0 <= _Off && _Off <= _Target_index, "random value out of range"); + if (_Off != _Target_index) { + _RANGES iter_swap(_Target, _First + _Off); + } + + ++_Target; + ++_Target_index; + } + return _Target; + } }; _EXPORT_STD inline constexpr _Shuffle_fn shuffle; diff --git a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp index 6c2bb00b8a3..18fd2f7fd45 100644 --- 
a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp +++ b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp @@ -6,15 +6,18 @@ #include #include #include +#include #include #include #include +#include #include using namespace std; const unsigned int seed = random_device{}(); mt19937 gen{seed}; +mt19937_64 gen64{seed}; // 64-bit generator for batched random path // Validate dangling story static_assert(same_as{}, gen)), ranges::dangling>); @@ -72,8 +75,125 @@ void test_urbg() { // COMPILE-ONLY ranges::shuffle(arr, RandGen{}); } +// Test that shuffle produces a valid permutation for various sizes. +// This exercises both the batched path (for 64-bit RNGs) and the fallback path. +void test_shuffle_permutation() { + // Test with 64-bit generator (batched random path) + { + vector v(100); + iota(v.begin(), v.end(), 0); + vector original = v; + + shuffle(v.begin(), v.end(), gen64); + + // Verify it's still a permutation + vector sorted_v = v; + sort(sorted_v.begin(), sorted_v.end()); + assert(sorted_v == original); + } + + // Test with ranges::shuffle and 64-bit generator + { + vector v(100); + iota(v.begin(), v.end(), 0); + vector original = v; + + ranges::shuffle(v, gen64); + + // Verify it's still a permutation + vector sorted_v = v; + sort(sorted_v.begin(), sorted_v.end()); + assert(sorted_v == original); + } + + // Test with 32-bit generator (non-batched path) + { + vector v(100); + iota(v.begin(), v.end(), 0); + vector original = v; + + shuffle(v.begin(), v.end(), gen); + + // Verify it's still a permutation + vector sorted_v = v; + sort(sorted_v.begin(), sorted_v.end()); + assert(sorted_v == original); + } + + // Test with ranges::shuffle and 32-bit generator + { + vector v(100); + iota(v.begin(), v.end(), 0); + vector original = v; + + ranges::shuffle(v, gen); + + // Verify it's still a permutation + vector sorted_v = v; + sort(sorted_v.begin(), sorted_v.end()); + assert(sorted_v == original); + } +} + +// Test edge cases for shuffle +void 
test_shuffle_edge_cases() { + // Empty range + { + vector v; + shuffle(v.begin(), v.end(), gen64); + assert(v.empty()); + } + + // Single element + { + vector v = {42}; + shuffle(v.begin(), v.end(), gen64); + assert(v.size() == 1); + assert(v[0] == 42); + } + + // Two elements + { + vector v = {1, 2}; + vector original = v; + shuffle(v.begin(), v.end(), gen64); + sort(v.begin(), v.end()); + assert(v == original); + } + + // Three elements (odd count, tests batching boundary) + { + vector v = {1, 2, 3}; + vector original = v; + shuffle(v.begin(), v.end(), gen64); + sort(v.begin(), v.end()); + assert(v == original); + } + + // Four elements (even count) + { + vector v = {1, 2, 3, 4}; + vector original = v; + shuffle(v.begin(), v.end(), gen64); + sort(v.begin(), v.end()); + assert(v == original); + } + + // Large array to ensure batching is effective + { + vector v(10000); + iota(v.begin(), v.end(), 0); + vector original = v; + shuffle(v.begin(), v.end(), gen64); + sort(v.begin(), v.end()); + assert(v == original); + } +} + int main() { printf("Using seed: %u\n", seed); test_random(); + test_shuffle_permutation(); + test_shuffle_edge_cases(); } From 3df88561ae112ac3f4a64d699d600e7bbfacde0d Mon Sep 17 00:00:00 2001 From: Francisco Geiman Thiesen Date: Tue, 2 Dec 2025 19:19:17 -0800 Subject: [PATCH 2/5] Address review feedback: extract URNG range check to variable template Move _Has_full_64bit_range check out of _Batched_rng_from_urng class to reduce template instantiation overhead. Use is_same_v and _Max_limit as suggested by reviewer. --- stl/inc/algorithm | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 260a2a95f02..c5b92f1d2cf 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -6103,15 +6103,14 @@ namespace ranges { // This algorithm extracts multiple bounded random integers from a single 64-bit random word, // using only multiplication (no division) in the common case. 
+// Check if a URNG has full 64-bit range [0, 2^64 - 1]. +// Batched random generation is only beneficial for such RNGs. +template +constexpr bool _Urng_has_full_64bit_range = + is_same_v<_Invoke_result_t<_Urng&>, uint64_t> && (_Urng::min)() == 0 && (_Urng::max)() == _Max_limit(); + template struct _Batched_rng_from_urng { - // Batched random generation is only beneficial for 64-bit RNGs with full range. - // It requires the RNG to produce values in [0, 2^64 - 1]. - using _Urng_result = _Invoke_result_t<_Urng&>; - - static constexpr bool _Has_full_64bit_range = sizeof(_Urng_result) >= sizeof(uint64_t) - && is_unsigned_v<_Urng_result> && (_Urng::min) () == 0 - && (_Urng::max) () == (numeric_limits::max) (); // Threshold bounds for batch sizes based on array size. // These are derived from the paper to minimize expected cost per random value. @@ -6604,7 +6603,7 @@ void shuffle(_RanIt _First, _RanIt _Last, _Urng&& _Func) { // shuffle [_First, _ using _Urng0 = remove_reference_t<_Urng>; // Use batched shuffle when the URNG produces full 64-bit range values. - if constexpr (_Batched_rng_from_urng<_Iter_diff_t<_RanIt>, _Urng0>::_Has_full_64bit_range) { + if constexpr (_Urng_has_full_64bit_range<_Urng0>) { _STD _Random_shuffle_batched(_First, _Last, _Func); } else { _Rng_from_urng_v2<_Iter_diff_t<_RanIt>, _Urng0> _RngFunc(_Func); @@ -6625,7 +6624,7 @@ namespace ranges { using _Diff = iter_difference_t<_It>; // Use batched shuffle when the URNG produces full 64-bit range values. - if constexpr (_Batched_rng_from_urng<_Diff, _Urng0>::_Has_full_64bit_range) { + if constexpr (_Urng_has_full_64bit_range<_Urng0>) { auto _UResult = _Shuffle_unchecked_batched( _RANGES _Unwrap_iter<_Se>(_STD move(_First)), _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _Func); _STD _Seek_wrapped(_First, _STD move(_UResult)); @@ -6645,7 +6644,7 @@ namespace ranges { using _Diff = range_difference_t<_Rng>; // Use batched shuffle when the URNG produces full 64-bit range values. 
- if constexpr (_Batched_rng_from_urng<_Diff, _Urng0>::_Has_full_64bit_range) { + if constexpr (_Urng_has_full_64bit_range<_Urng0>) { return _RANGES _Rewrap_iterator( _Range, _Shuffle_unchecked_batched(_Ubegin(_Range), _Uend(_Range), _Func)); } else { From 8aa184ab394fc54f3f15bdb05bcb79200de9d2dd Mon Sep 17 00:00:00 2001 From: Francisco Geiman Thiesen Date: Tue, 2 Dec 2025 19:49:01 -0800 Subject: [PATCH 3/5] Apply clang-format --- stl/inc/algorithm | 16 ++++++++-------- .../tests/P0896R4_ranges_alg_shuffle/test.cpp | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index c5b92f1d2cf..a19e4332658 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -6107,7 +6107,7 @@ namespace ranges { // Batched random generation is only beneficial for such RNGs. template constexpr bool _Urng_has_full_64bit_range = - is_same_v<_Invoke_result_t<_Urng&>, uint64_t> && (_Urng::min)() == 0 && (_Urng::max)() == _Max_limit(); + is_same_v<_Invoke_result_t<_Urng&>, uint64_t> && (_Urng::min) () == 0 && (_Urng::max) () == _Max_limit(); template struct _Batched_rng_from_urng { @@ -6127,14 +6127,14 @@ struct _Batched_rng_from_urng { // Generate a single bounded random value in [0, _Bound) using Lemire's method. 
_NODISCARD _Diff _Single_bounded(_Diff _Bound) { - _Unsigned128 _Product{_Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), - _Product._Word[1])}; + _Unsigned128 _Product{ + _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _Product._Word[1])}; auto _Leftover = _Product._Word[0]; if (_Leftover < static_cast(_Bound)) { const uint64_t _Threshold = (0 - static_cast(_Bound)) % static_cast(_Bound); while (_Leftover < _Threshold) { - _Product = _Unsigned128{_Base128::_UMul128( + _Product = _Unsigned128{_Base128::_UMul128( static_cast(_Ref()), static_cast(_Bound), _Product._Word[1])}; _Leftover = _Product._Word[0]; } @@ -6146,8 +6146,8 @@ struct _Batched_rng_from_urng { // Generate two bounded random values from a single 64-bit random word. // The bounds are (n+1) and n for Fisher-Yates shuffle positions _Target_index and _Target_index-1. void _Batch_2(_Diff* _Results, _Diff _Bound1, _Diff _Bound2) { - const uint64_t _B1 = static_cast(_Bound1); - const uint64_t _B2 = static_cast(_Bound2); + const uint64_t _B1 = static_cast(_Bound1); + const uint64_t _B2 = static_cast(_Bound2); const uint64_t _Product_bound = _B1 * _B2; uint64_t _Random_word = static_cast(_Ref()); @@ -6630,8 +6630,8 @@ namespace ranges { _STD _Seek_wrapped(_First, _STD move(_UResult)); } else { _Rng_from_urng_v2<_Diff, _Urng0> _RngFunc(_Func); - auto _UResult = _Shuffle_unchecked( - _RANGES _Unwrap_iter<_Se>(_STD move(_First)), _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _RngFunc); + auto _UResult = _Shuffle_unchecked(_RANGES _Unwrap_iter<_Se>(_STD move(_First)), + _RANGES _Unwrap_sent<_It>(_STD move(_Last)), _RngFunc); _STD _Seek_wrapped(_First, _STD move(_UResult)); } return _First; diff --git a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp index 18fd2f7fd45..0d8082be24f 100644 --- a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp +++ b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp @@ -154,7 +154,7 @@ void 
test_shuffle_edge_cases() { // Two elements { - vector v = {1, 2}; + vector v = {1, 2}; vector original = v; shuffle(v.begin(), v.end(), gen64); sort(v.begin(), v.end()); @@ -163,7 +163,7 @@ void test_shuffle_edge_cases() { // Three elements (odd count, tests batching boundary) { - vector v = {1, 2, 3}; + vector v = {1, 2, 3}; vector original = v; shuffle(v.begin(), v.end(), gen64); sort(v.begin(), v.end()); @@ -172,7 +172,7 @@ void test_shuffle_edge_cases() { // Four elements (even count) { - vector v = {1, 2, 3, 4}; + vector v = {1, 2, 3, 4}; vector original = v; shuffle(v.begin(), v.end(), gen64); sort(v.begin(), v.end()); From 86c0c0a3ab014b9d86cd13077b69dea0b2c3035d Mon Sep 17 00:00:00 2001 From: Francisco Geiman Thiesen Date: Fri, 27 Feb 2026 14:35:14 -0800 Subject: [PATCH 4/5] `<algorithm>`: Fix batched random integer generation for `shuffle()` Fix two bugs in the batched shuffle implementation from PR #5932: 1. `_Unsigned128` brace-initialization zeroed the high word of the multiplication result (the random index), making all indices 0 and shuffle deterministic. Replace with separate `uint64_t` variables for high and low words in both `_Single_bounded` and `_Batch_2`. 2. Loop advancement in `_Random_shuffle_batched` and `_Shuffle_unchecked_batched` had extra `++_UTarget` increments causing elements to be skipped. Restructure to `for` loops matching the original `_Random_shuffle1` pattern. Add regression test for GH-6112 and shuffle quality tests adapted from Lemire's cpp_batched_random test suite (uniformity, coverage, pair distribution at start/end positions). 
Co-Authored-By: Claude --- stl/inc/algorithm | 73 +++---- .../tests/P0896R4_ranges_alg_shuffle/test.cpp | 182 ++++++++++++++++++ 2 files changed, 208 insertions(+), 47 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 200e23cd5fc..2951808f96a 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -6209,20 +6209,17 @@ struct _Batched_rng_from_urng { // Generate a single bounded random value in [0, _Bound) using Lemire's method. _NODISCARD _Diff _Single_bounded(_Diff _Bound) { - _Unsigned128 _Product{ - _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _Product._Word[1])}; - auto _Leftover = _Product._Word[0]; + uint64_t _High; + uint64_t _Leftover = _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _High); if (_Leftover < static_cast(_Bound)) { const uint64_t _Threshold = (0 - static_cast(_Bound)) % static_cast(_Bound); while (_Leftover < _Threshold) { - _Product = _Unsigned128{_Base128::_UMul128( - static_cast(_Ref()), static_cast(_Bound), _Product._Word[1])}; - _Leftover = _Product._Word[0]; + _Leftover = _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _High); } } - return static_cast<_Diff>(_Product._Word[1]); + return static_cast<_Diff>(_High); } // Generate two bounded random values from a single 64-bit random word. 
@@ -6234,13 +6231,13 @@ struct _Batched_rng_from_urng { uint64_t _Random_word = static_cast(_Ref()); - _Unsigned128 _Prod1{_Base128::_UMul128(_Random_word, _B1, _Prod1._Word[1])}; - _Results[0] = static_cast<_Diff>(_Prod1._Word[1]); - uint64_t _Leftover1 = _Prod1._Word[0]; + uint64_t _High1; + uint64_t _Leftover1 = _Base128::_UMul128(_Random_word, _B1, _High1); + _Results[0] = static_cast<_Diff>(_High1); - _Unsigned128 _Prod2{_Base128::_UMul128(_Leftover1, _B2, _Prod2._Word[1])}; - _Results[1] = static_cast<_Diff>(_Prod2._Word[1]); - uint64_t _Leftover = _Prod2._Word[0]; + uint64_t _High2; + uint64_t _Leftover = _Base128::_UMul128(_Leftover1, _B2, _High2); + _Results[1] = static_cast<_Diff>(_High2); // Rejection sampling: check if leftover is below threshold. if (_Leftover < _Product_bound) { @@ -6248,13 +6245,11 @@ struct _Batched_rng_from_urng { while (_Leftover < _Threshold) { _Random_word = static_cast(_Ref()); - _Prod1 = _Unsigned128{_Base128::_UMul128(_Random_word, _B1, _Prod1._Word[1])}; - _Results[0] = static_cast<_Diff>(_Prod1._Word[1]); - _Leftover1 = _Prod1._Word[0]; + _Leftover1 = _Base128::_UMul128(_Random_word, _B1, _High1); + _Results[0] = static_cast<_Diff>(_High1); - _Prod2 = _Unsigned128{_Base128::_UMul128(_Leftover1, _B2, _Prod2._Word[1])}; - _Results[1] = static_cast<_Diff>(_Prod2._Word[1]); - _Leftover = _Prod2._Word[0]; + _Leftover = _Base128::_UMul128(_Leftover1, _B2, _High2); + _Results[1] = static_cast<_Diff>(_High2); } } } @@ -6626,22 +6621,18 @@ void _Random_shuffle_batched(_RanIt _First, _RanIt _Last, _Urng& _Func) { // Process pairs using batched generation when beneficial. // Batch of 2 is beneficial when bounds fit in 32 bits (product fits in 64 bits). 
- while (_UTarget != _ULast) { - ++_UTarget; - if (_UTarget == _ULast) { - break; - } - + for (; ++_UTarget != _ULast; ++_Target_index) { // randomly place an element from [_First, _Target] at _Target const _Diff _Bound1 = _Target_index + 1; // bound for current position - const _Diff _Bound2 = _Target_index + 2; // bound for next position // Check if we can batch: both bounds and their product must fit safely. // Use batch of 2 when the larger bound is <= 2^32 (product fits in 64 bits). - if (static_cast(_Bound2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { + if (static_cast(_Target_index + 2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { auto _UTarget_next = _UTarget; ++_UTarget_next; if (_UTarget_next != _ULast) { + const _Diff _Bound2 = _Target_index + 2; // bound for next position + // Generate two random indices in one batch. _Diff _Offsets[2]; _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); @@ -6662,9 +6653,7 @@ void _Random_shuffle_batched(_RanIt _First, _RanIt _Last, _Urng& _Func) { swap(*_UTarget, *(_UFirst + _Offsets[1])); // intentional ADL } - ++_UTarget; - ++_Target_index; - continue; + continue; // The for-loop's ++_UTarget and ++_Target_index handle the next advance. } } @@ -6674,9 +6663,6 @@ void _Random_shuffle_batched(_RanIt _First, _RanIt _Last, _Urng& _Func) { if (_Off != _Target_index) { swap(*_UTarget, *(_UFirst + _Off)); // intentional ADL } - - ++_UTarget; - ++_Target_index; } } @@ -6780,20 +6766,18 @@ namespace ranges { _Diff _Target_index = 1; // Process pairs using batched generation when beneficial. 
- while (_Target != _Last) { - ++_Target; - if (_Target == _Last) { - break; - } - + for (; ++_Target != _Last; ++_Target_index) { + // randomly place an element from [_First, _Target] at _Target const _Diff _Bound1 = _Target_index + 1; - const _Diff _Bound2 = _Target_index + 2; - if (static_cast(_Bound2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { + if (static_cast(_Target_index + 2) + <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { auto _Target_next = _Target; ++_Target_next; if (_Target_next != _Last) { + const _Diff _Bound2 = _Target_index + 2; + _Diff _Offsets[2]; _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); @@ -6811,9 +6795,7 @@ namespace ranges { _RANGES iter_swap(_Target, _First + _Offsets[1]); } - ++_Target; - ++_Target_index; - continue; + continue; // The for-loop's ++_Target and ++_Target_index handle the next advance. } } @@ -6822,9 +6804,6 @@ namespace ranges { if (_Off != _Target_index) { _RANGES iter_swap(_Target, _First + _Off); } - - ++_Target; - ++_Target_index; } return _Target; } diff --git a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp index 0d8082be24f..f0d3a5257fc 100644 --- a/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp +++ b/tests/std/tests/P0896R4_ranges_alg_shuffle/test.cpp @@ -190,10 +190,192 @@ void test_shuffle_edge_cases() { } } +// Regression test: consecutive shuffles with the same URNG must produce different results. +// The batched random implementation had a bug where _Unsigned128 brace-initialization zeroed the high word +// of the multiplication result, making all random indices 0 and the shuffle deterministic. 
+void test_gh_6112() { + // Test std::shuffle with mt19937_64 (exercises the batched random path) + { + mt19937_64 urng(6); + vector v(100); + + iota(v.begin(), v.end(), 0); + shuffle(v.begin(), v.end(), urng); + const auto first_shuffle = v; + + iota(v.begin(), v.end(), 0); + shuffle(v.begin(), v.end(), urng); + + assert(v != first_shuffle); // "should be vanishingly impossible" for these to be equal + } + + // Test ranges::shuffle with mt19937_64 + { + mt19937_64 urng(6); + vector v(100); + + iota(v.begin(), v.end(), 0); + ranges::shuffle(v, urng); + const auto first_shuffle = v; + + iota(v.begin(), v.end(), 0); + ranges::shuffle(v, urng); + + assert(v != first_shuffle); + } +} + +// Shuffle quality tests adapted from Lemire's cpp_batched_random test suite. +// These verify that the shuffle produces a proper uniform random permutation, +// not just a valid permutation. + +// Every element must be able to reach every position over many trials. +void test_everyone_can_move_everywhere() { + constexpr size_t size = 64; + constexpr size_t trials = size * size; // 4096 trials; probability of missing any (pos,val): ~e^-64 + + // Test std::shuffle with mt19937_64 (batched path) + { + mt19937_64 urng(42); + vector input(size); + vector seen(size * size, 0); // seen[position * size + value] + + for (size_t trial = 0; trial < trials; ++trial) { + iota(input.begin(), input.end(), 0); + shuffle(input.begin(), input.end(), urng); + for (size_t i = 0; i < size; ++i) { + seen[i * size + static_cast(input[i])] = 1; + } + } + + for (size_t pos = 0; pos < size; ++pos) { + for (size_t val = 0; val < size; ++val) { + assert(seen[pos * size + val]); + } + } + } + + // Test ranges::shuffle with mt19937_64 (batched path) + { + mt19937_64 urng(42); + vector input(size); + vector seen(size * size, 0); + + for (size_t trial = 0; trial < trials; ++trial) { + iota(input.begin(), input.end(), 0); + ranges::shuffle(input, urng); + for (size_t i = 0; i < size; ++i) { + seen[i * size + 
static_cast(input[i])] = 1; + } + } + + for (size_t pos = 0; pos < size; ++pos) { + for (size_t val = 0; val < size; ++val) { + assert(seen[pos * size + val]); + } + } + } +} + +// Check that the distribution of values across positions is roughly uniform. +// Uses the "relative gap" metric: (max_count - min_count) / mean_count < 0.6. +void test_uniformity() { + constexpr size_t size = 32; + constexpr size_t trials = size * size * 16; // 16384 trials; expected count per cell = 512 + + mt19937_64 urng(42); + vector input(size); + vector count(size * size, 0); // count[position * size + value] + + for (size_t trial = 0; trial < trials; ++trial) { + iota(input.begin(), input.end(), 0); + shuffle(input.begin(), input.end(), urng); + for (size_t i = 0; i < size; ++i) { + ++count[i * size + static_cast(input[i])]; + } + } + + size_t overall_min = SIZE_MAX; + size_t overall_max = 0; + size_t total = 0; + + for (size_t cell = 0; cell < size * size; ++cell) { + total += count[cell]; + if (count[cell] > overall_max) { + overall_max = count[cell]; + } + if (count[cell] < overall_min) { + overall_min = count[cell]; + } + } + + const double mean = static_cast(total) / static_cast(size * size); + const double relative_gap = static_cast(overall_max - overall_min) / mean; + + assert(relative_gap < 0.6); +} + +// Every distinct pair of values must be able to appear at the first two positions. 
+void test_any_possible_pair_at_start() { + constexpr size_t size = 32; + constexpr size_t trials = size * size * size; // 32768 trials; expected count per pair ~33 + + mt19937_64 urng(42); + vector input(size); + vector seen(size * size, 0); // seen[first * size + second] + + for (size_t trial = 0; trial < trials; ++trial) { + iota(input.begin(), input.end(), 0); + shuffle(input.begin(), input.end(), urng); + seen[static_cast(input[0]) * size + static_cast(input[1])] = 1; + } + + for (size_t i = 0; i < size; ++i) { + for (size_t j = 0; j < size; ++j) { + if (i == j) { + assert(!seen[i * size + j]); // same value can't occupy both positions + } else { + assert(seen[i * size + j]); // every distinct pair must appear + } + } + } +} + +// Every distinct pair of values must be able to appear at the last two positions. +void test_any_possible_pair_at_end() { + constexpr size_t size = 32; + constexpr size_t trials = size * size * size; + + mt19937_64 urng(42); + vector input(size); + vector seen(size * size, 0); + + for (size_t trial = 0; trial < trials; ++trial) { + iota(input.begin(), input.end(), 0); + shuffle(input.begin(), input.end(), urng); + seen[static_cast(input[size - 2]) * size + static_cast(input[size - 1])] = 1; + } + + for (size_t i = 0; i < size; ++i) { + for (size_t j = 0; j < size; ++j) { + if (i == j) { + assert(!seen[i * size + j]); + } else { + assert(seen[i * size + j]); + } + } + } +} + int main() { printf("Using seed: %u\n", seed); test_random(); test_shuffle_permutation(); test_shuffle_edge_cases(); + test_gh_6112(); + test_everyone_can_move_everywhere(); + test_uniformity(); + test_any_possible_pair_at_start(); + test_any_possible_pair_at_end(); } From 4f4a627ff183986772b5bab833414be7f3c2ec63 Mon Sep 17 00:00:00 2001 From: Francisco Geiman Thiesen Date: Mon, 2 Mar 2026 09:12:38 -0800 Subject: [PATCH 5/5] `<algorithm>`: Match Lemire's batched shuffle implementation Align the batched random shuffle with Daniel Lemire's reference implementation
(github.com/lemire/cpp_batched_random). The previous version only used batch-of-2 and did not use the threshold constants in loop conditions. This adopts the full cascade (batch 1-6) with backward Fisher-Yates iteration, matching Lemire's design exactly. --- stl/inc/algorithm | 301 ++++++++++++++++++++++++++-------------------- 1 file changed, 168 insertions(+), 133 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 2951808f96a..d41b0095ae6 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -6193,65 +6193,39 @@ constexpr bool _Urng_has_full_64bit_range = template struct _Batched_rng_from_urng { - - // Threshold bounds for batch sizes based on array size. - // These are derived from the paper to minimize expected cost per random value. - // Batch size k requires product of k consecutive bounds <= 2^64. - static constexpr uint64_t _Bound_for_batch_6 = 21; // (21)^6 = 85,766,121 < 2^64 / (very conservative) - static constexpr uint64_t _Bound_for_batch_5 = 73; // (73)^5 = 2,073,071,593 < 2^64 - static constexpr uint64_t _Bound_for_batch_4 = 302; // (302)^4 = 8,319,430,096 < 2^64 - static constexpr uint64_t _Bound_for_batch_3 = 2642; // (2642)^3 = 18,454,249,288 < 2^64 - static constexpr uint64_t _Bound_for_batch_2 = 4294967296; // 2^32, for batch of 2 - _Urng& _Ref; explicit _Batched_rng_from_urng(_Urng& _Func) noexcept : _Ref(_Func) {} - // Generate a single bounded random value in [0, _Bound) using Lemire's method. - _NODISCARD _Diff _Single_bounded(_Diff _Bound) { + // Generate _K bounded random indices from a single 64-bit random word. + // Uses backward Fisher-Yates bounds: _N, _N-1, ..., _N-_K+1. + // _Results[j] is uniform in [0, _N-j). + // + // _Bound is a conservative upper bound on the product of the _K bounds, + // used to skip the threshold computation on the fast path. Returns the + // (possibly tightened) bound for the caller to pass to the next call. 
+ uint64_t _Batch(_Diff* _Results, uint64_t _N, uint64_t _K, uint64_t _Bound) { + uint64_t _Rand = static_cast(_Ref()); uint64_t _High; - uint64_t _Leftover = _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _High); - - if (_Leftover < static_cast(_Bound)) { - const uint64_t _Threshold = (0 - static_cast(_Bound)) % static_cast(_Bound); - while (_Leftover < _Threshold) { - _Leftover = _Base128::_UMul128(static_cast(_Ref()), static_cast(_Bound), _High); - } + for (uint64_t _J = 0; _J < _K; ++_J) { + _Rand = _Base128::_UMul128(_Rand, _N - _J, _High); + _Results[_J] = static_cast<_Diff>(_High); } - - return static_cast<_Diff>(_High); - } - - // Generate two bounded random values from a single 64-bit random word. - // The bounds are (n+1) and n for Fisher-Yates shuffle positions _Target_index and _Target_index-1. - void _Batch_2(_Diff* _Results, _Diff _Bound1, _Diff _Bound2) { - const uint64_t _B1 = static_cast(_Bound1); - const uint64_t _B2 = static_cast(_Bound2); - const uint64_t _Product_bound = _B1 * _B2; - - uint64_t _Random_word = static_cast(_Ref()); - - uint64_t _High1; - uint64_t _Leftover1 = _Base128::_UMul128(_Random_word, _B1, _High1); - _Results[0] = static_cast<_Diff>(_High1); - - uint64_t _High2; - uint64_t _Leftover = _Base128::_UMul128(_Leftover1, _B2, _High2); - _Results[1] = static_cast<_Diff>(_High2); - - // Rejection sampling: check if leftover is below threshold. 
- if (_Leftover < _Product_bound) { - const uint64_t _Threshold = (0 - _Product_bound) % _Product_bound; - while (_Leftover < _Threshold) { - _Random_word = static_cast(_Ref()); - - _Leftover1 = _Base128::_UMul128(_Random_word, _B1, _High1); - _Results[0] = static_cast<_Diff>(_High1); - - _Leftover = _Base128::_UMul128(_Leftover1, _B2, _High2); - _Results[1] = static_cast<_Diff>(_High2); + if (_Rand < _Bound) { + _Bound = _N; + for (uint64_t _J = 1; _J < _K; ++_J) { + _Bound *= _N - _J; + } + const uint64_t _Threshold = (0 - _Bound) % _Bound; + while (_Rand < _Threshold) { + _Rand = static_cast(_Ref()); + for (uint64_t _J = 0; _J < _K; ++_J) { + _Rand = _Base128::_UMul128(_Rand, _N - _J, _High); + _Results[_J] = static_cast<_Diff>(_High); + } } } + return _Bound; } _Batched_rng_from_urng(const _Batched_rng_from_urng&) = delete; @@ -6602,66 +6576,90 @@ void _Random_shuffle1(_RanIt _First, _RanIt _Last, _RngFn& _RngFunc) { } // Batched shuffle implementation for 64-bit URNGs with full range. -// Uses batched random generation to reduce RNG calls. +// Uses backward Fisher-Yates (Durstenfeld) with batched random generation +// to reduce URNG calls. Adapted from Lemire's cpp_batched_random. template void _Random_shuffle_batched(_RanIt _First, _RanIt _Last, _Urng& _Func) { - // shuffle [_First, _Last) using batched random generation _STD _Adl_verify_range(_First, _Last); auto _UFirst = _STD _Get_unwrapped(_First); const auto _ULast = _STD _Get_unwrapped(_Last); - if (_UFirst == _ULast) { + + using _Diff = _Iter_diff_t<_RanIt>; + auto _Remaining = static_cast(_ULast - _UFirst); + if (_Remaining <= 1) { return; } - using _Diff = _Iter_diff_t<_RanIt>; _Batched_rng_from_urng<_Diff, _Urng> _BatchedRng(_Func); + _Diff _Offsets[7]; - auto _UTarget = _UFirst; - _Diff _Target_index = 1; - - // Process pairs using batched generation when beneficial. - // Batch of 2 is beneficial when bounds fit in 32 bits (product fits in 64 bits). 
- for (; ++_UTarget != _ULast; ++_Target_index) { // randomly place an element from [_First, _Target] at _Target - const _Diff _Bound1 = _Target_index + 1; // bound for current position - - // Check if we can batch: both bounds and their product must fit safely. - // Use batch of 2 when the larger bound is <= 2^32 (product fits in 64 bits). - if (static_cast(_Target_index + 2) <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { - auto _UTarget_next = _UTarget; - ++_UTarget_next; - - if (_UTarget_next != _ULast) { - const _Diff _Bound2 = _Target_index + 2; // bound for next position - - // Generate two random indices in one batch. - _Diff _Offsets[2]; - _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); + // Process one element at a time while bounds are too large for batching. + for (; _Remaining > (uint64_t{1} << 30); --_Remaining) { + _BatchedRng._Batch(_Offsets, _Remaining, 1, _Remaining); + swap(*(_UFirst + _Remaining - 1), *(_UFirst + _Offsets[0])); // intentional ADL + } - _STL_ASSERT(0 <= _Offsets[0] && _Offsets[0] <= _Target_index, "random value out of range"); - _STL_ASSERT(0 <= _Offsets[1] && _Offsets[1] <= _Target_index + 1, "random value out of range"); + // Batch of 2: product of 2 consecutive bounds fits in 64 bits. + { + uint64_t _Bound = uint64_t{1} << 60; + for (; _Remaining > (uint64_t{1} << 19); _Remaining -= 2) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 2, _Bound); + for (uint64_t _J = 0; _J < 2; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL + } + } + } - // Perform first swap. - if (_Offsets[0] != _Target_index) { - swap(*_UTarget, *(_UFirst + _Offsets[0])); // intentional ADL - } + // Batch of 3. 
+ { + uint64_t _Bound = uint64_t{1} << 57; + for (; _Remaining > (uint64_t{1} << 14); _Remaining -= 3) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 3, _Bound); + for (uint64_t _J = 0; _J < 3; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL + } + } + } - // Advance to next position and perform second swap. - ++_UTarget; - ++_Target_index; + // Batch of 4. + { + uint64_t _Bound = uint64_t{1} << 56; + for (; _Remaining > (uint64_t{1} << 11); _Remaining -= 4) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 4, _Bound); + for (uint64_t _J = 0; _J < 4; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL + } + } + } - if (_Offsets[1] != _Target_index) { - swap(*_UTarget, *(_UFirst + _Offsets[1])); // intentional ADL - } + // Batch of 5. + { + uint64_t _Bound = uint64_t{1} << 55; + for (; _Remaining > (uint64_t{1} << 9); _Remaining -= 5) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 5, _Bound); + for (uint64_t _J = 0; _J < 5; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL + } + } + } - continue; // The for-loop's ++_UTarget and ++_Target_index handle the next advance. + // Batch of 6. + { + uint64_t _Bound = uint64_t{1} << 54; + for (; _Remaining > 6; _Remaining -= 6) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 6, _Bound); + for (uint64_t _J = 0; _J < 6; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL } } + } - // Fall back to single generation for this position. - const _Diff _Off = _BatchedRng._Single_bounded(_Bound1); - _STL_ASSERT(0 <= _Off && _Off <= _Target_index, "random value out of range"); - if (_Off != _Target_index) { - swap(*_UTarget, *(_UFirst + _Off)); // intentional ADL + // Final: remaining <= 6 elements, handle in one batch. 
+ if (_Remaining > 1) { + const auto _K = _Remaining - 1; + _BatchedRng._Batch(_Offsets, _Remaining, _K, 720); + for (uint64_t _J = 0; _J < _K; ++_J) { + swap(*(_UFirst + _Remaining - _J - 1), *(_UFirst + _Offsets[_J])); // intentional ADL } } } @@ -6748,64 +6746,101 @@ namespace ranges { } // Batched shuffle implementation for ranges. + // Uses backward Fisher-Yates (Durstenfeld) with batched random generation. template _NODISCARD static _It _Shuffle_unchecked_batched(_It _First, const _Se _Last, _Urng& _Func) { - // shuffle [_First, _Last) using batched random generation _STL_INTERNAL_STATIC_ASSERT(random_access_iterator<_It>); _STL_INTERNAL_STATIC_ASSERT(sentinel_for<_Se, _It>); _STL_INTERNAL_STATIC_ASSERT(permutable<_It>); - if (_First == _Last) { - return _First; + using _Diff = iter_difference_t<_It>; + const auto _Count = static_cast<_Diff>(_Last - _First); + auto _Remaining = static_cast(_Count); + if (_Remaining <= 1) { + return _First + _Count; } - using _Diff = iter_difference_t<_It>; _Batched_rng_from_urng<_Diff, _Urng> _BatchedRng(_Func); + _Diff _Offsets[7]; + + // Process one element at a time while bounds are too large for batching. + for (; _Remaining > (uint64_t{1} << 30); --_Remaining) { + _BatchedRng._Batch(_Offsets, _Remaining, 1, _Remaining); + _RANGES iter_swap( + _First + static_cast<_Diff>(_Remaining - 1), _First + static_cast<_Diff>(_Offsets[0])); + } + + // Batch of 2. + { + uint64_t _Bound = uint64_t{1} << 60; + for (; _Remaining > (uint64_t{1} << 19); _Remaining -= 2) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 2, _Bound); + for (uint64_t _J = 0; _J < 2; ++_J) { + _RANGES iter_swap(_First + static_cast<_Diff>(_Remaining - _J - 1), + _First + static_cast<_Diff>(_Offsets[_J])); + } + } + } - auto _Target = _First; - _Diff _Target_index = 1; - - // Process pairs using batched generation when beneficial. 
- for (; ++_Target != _Last; ++_Target_index) { - // randomly place an element from [_First, _Target] at _Target - const _Diff _Bound1 = _Target_index + 1; - - if (static_cast(_Target_index + 2) - <= _Batched_rng_from_urng<_Diff, _Urng>::_Bound_for_batch_2) { - auto _Target_next = _Target; - ++_Target_next; - - if (_Target_next != _Last) { - const _Diff _Bound2 = _Target_index + 2; - - _Diff _Offsets[2]; - _BatchedRng._Batch_2(_Offsets, _Bound1, _Bound2); - - _STL_ASSERT(0 <= _Offsets[0] && _Offsets[0] <= _Target_index, "random value out of range"); - _STL_ASSERT(0 <= _Offsets[1] && _Offsets[1] <= _Target_index + 1, "random value out of range"); - - if (_Offsets[0] != _Target_index) { - _RANGES iter_swap(_Target, _First + _Offsets[0]); - } + // Batch of 3. + { + uint64_t _Bound = uint64_t{1} << 57; + for (; _Remaining > (uint64_t{1} << 14); _Remaining -= 3) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 3, _Bound); + for (uint64_t _J = 0; _J < 3; ++_J) { + _RANGES iter_swap(_First + static_cast<_Diff>(_Remaining - _J - 1), + _First + static_cast<_Diff>(_Offsets[_J])); + } + } + } - ++_Target; - ++_Target_index; + // Batch of 4. + { + uint64_t _Bound = uint64_t{1} << 56; + for (; _Remaining > (uint64_t{1} << 11); _Remaining -= 4) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 4, _Bound); + for (uint64_t _J = 0; _J < 4; ++_J) { + _RANGES iter_swap(_First + static_cast<_Diff>(_Remaining - _J - 1), + _First + static_cast<_Diff>(_Offsets[_J])); + } + } + } - if (_Offsets[1] != _Target_index) { - _RANGES iter_swap(_Target, _First + _Offsets[1]); - } + // Batch of 5. 
+ { + uint64_t _Bound = uint64_t{1} << 55; + for (; _Remaining > (uint64_t{1} << 9); _Remaining -= 5) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 5, _Bound); + for (uint64_t _J = 0; _J < 5; ++_J) { + _RANGES iter_swap(_First + static_cast<_Diff>(_Remaining - _J - 1), + _First + static_cast<_Diff>(_Offsets[_J])); + } + } + } - continue; // The for-loop's ++_Target and ++_Target_index handle the next advance. + // Batch of 6. + { + uint64_t _Bound = uint64_t{1} << 54; + for (; _Remaining > 6; _Remaining -= 6) { + _Bound = _BatchedRng._Batch(_Offsets, _Remaining, 6, _Bound); + for (uint64_t _J = 0; _J < 6; ++_J) { + _RANGES iter_swap(_First + static_cast<_Diff>(_Remaining - _J - 1), + _First + static_cast<_Diff>(_Offsets[_J])); } } + } - const _Diff _Off = _BatchedRng._Single_bounded(_Bound1); - _STL_ASSERT(0 <= _Off && _Off <= _Target_index, "random value out of range"); - if (_Off != _Target_index) { - _RANGES iter_swap(_Target, _First + _Off); + // Final: remaining <= 6 elements, handle in one batch. + if (_Remaining > 1) { + const auto _K = _Remaining - 1; + _BatchedRng._Batch(_Offsets, _Remaining, _K, 720); + for (uint64_t _J = 0; _J < _K; ++_J) { + _RANGES iter_swap( + _First + static_cast<_Diff>(_Remaining - _J - 1), _First + static_cast<_Diff>(_Offsets[_J])); } } - return _Target; + + return _First + _Count; } };