From c4518165c0c079796a94a3dd795062596d0b600a Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Sat, 24 Jan 2026 22:01:19 -0800 Subject: [PATCH 01/13] fix: remove changelog option from TagBot configuration --- .github/workflows/TagBot.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index 639dc50..baa39d2 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -19,4 +19,3 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} ssh: ${{ secrets.DOCUMENTER_KEY }} dispatch: true - changelog: false From 46d1a4e66a2081e5fa690f000db2574f65355f5e Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 14:30:55 -0800 Subject: [PATCH 02/13] feat: implement unsafe_acquire! for Bit type with SIMD performance Add unsafe_acquire!(pool, Bit, n) that returns a real BitVector with shared chunks, providing ~140x performance improvement for native BitVector operations like count(), sum(), and bitwise ops. Changes: - Add get_bitvector_wrapper! with N-way cache for wrapper reuse - Replace _throw_bit_unsafe_error with actual implementation - Support N-D via reshape(BitVector, dims) returning BitArray{N} - Add pool_stats and show methods for BitTypedPool - Fix pool display when BitTypedPool has content The wrapper BitVector shares the pooled BitVector's chunks field, preserving SIMD optimizations while reusing pool memory. --- src/acquire.jl | 139 ++++++++++++++++++++++++++++++++++++++---- src/types.jl | 41 ++++++++----- src/utils.jl | 70 ++++++++++++++++++++- test/test_bitarray.jl | 64 ++++++++++++++----- 4 files changed, 271 insertions(+), 43 deletions(-) diff --git a/src/acquire.jl b/src/acquire.jl index b8ddcf6..286a632 100644 --- a/src/acquire.jl +++ b/src/acquire.jl @@ -19,14 +19,113 @@ unsafe_wrap(Array{T,N}, pointer(flat_view), dims) end -# BitTypedPool cannot use unsafe_wrap - throw clear error -# Called from _unsafe_acquire_impl! dispatches for Bit type -@noinline function _throw_bit_unsafe_error() - throw(ArgumentError( - "unsafe_acquire!(pool, Bit, ...) is not supported. " * - "BitArray stores data in immutable chunks::Vector{UInt64} that cannot be wrapped with unsafe_wrap. " * - "Use acquire!(pool, Bit, ...) instead, which returns a view." - )) +# ============================================================================== +# BitVector Wrapper (chunks sharing for SIMD performance) +# ============================================================================== + +""" + get_bitvector_wrapper!(tp::BitTypedPool, n::Int) -> BitVector + +Get a BitVector that shares `chunks` with the pooled BitVector. + +Unlike `get_view!` which returns a `SubArray` (loses SIMD optimizations), +this returns a real `BitVector` with shared chunks, preserving native +BitVector performance (~140x faster for `count()`, `sum()`, etc.). + +## Implementation +Creates a new BitVector shell and replaces its `chunks` field with the +pooled BitVector's chunks. Uses N-way cache for wrapper reuse. + +## Safety +The returned BitVector is only valid within the `@with_pool` scope. +Do NOT use after the scope ends (use-after-free risk). +""" +function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) + tp.n_active += 1 + idx = tp.n_active + + # 1. 
Pool expansion needed (new slot) + if idx > length(tp.vectors) + pool_bv = BitVector(undef, n) + push!(tp.vectors, pool_bv) + push!(tp.views, view(pool_bv, 1:n)) + push!(tp.view_lengths, n) + + # Create wrapper sharing chunks + wrapper = BitVector(undef, n) + wrapper.chunks = pool_bv.chunks + + # Expand N-way cache (CACHE_WAYS entries per slot) + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + + # Cache in first way + base = (idx - 1) * CACHE_WAYS + 1 + @inbounds tp.nd_arrays[base] = wrapper + @inbounds tp.nd_dims[base] = n + @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) + + # Warn at powers of 2 (possible missing rewind!) + if idx >= 512 && (idx & (idx - 1)) == 0 + total_bits = sum(length, tp.vectors) + @warn "BitTypedPool growing large ($idx arrays, ~$(total_bits ÷ 8) bytes). Missing rewind!()?" + end + + return wrapper + end + + # 2. Check N-way cache for hit + @inbounds pool_bv = tp.vectors[idx] + current_ptr = UInt(pointer(pool_bv.chunks)) + + # Ensure cache slots exist for this index + n_slots_cached = length(tp.nd_next_way) + while idx > n_slots_cached + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + n_slots_cached += 1 + end + + base = (idx - 1) * CACHE_WAYS + + # Linear search across all ways + for k in 1:CACHE_WAYS + cache_idx = base + k + @inbounds cached_n = tp.nd_dims[cache_idx] + @inbounds cached_ptr = tp.nd_ptrs[cache_idx] + + if cached_n == n && cached_ptr == current_ptr + return @inbounds tp.nd_arrays[cache_idx]::BitVector + end + end + + # 3. Cache miss - resize pool_bv if needed and create new wrapper + if length(pool_bv) < n + resize!(pool_bv, n) + @inbounds tp.views[idx] = view(pool_bv, 1:n) + @inbounds tp.view_lengths[idx] = n + end + + wrapper = BitVector(undef, n) + wrapper.chunks = pool_bv.chunks + + # Round-robin replacement + @inbounds way_offset = tp.nd_next_way[idx] + target_idx = base + way_offset + 1 + @inbounds tp.nd_arrays[target_idx] = wrapper + @inbounds tp.nd_dims[target_idx] = n + @inbounds tp.nd_ptrs[target_idx] = UInt(pointer(pool_bv.chunks)) + @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS + + return wrapper end # ============================================================================== @@ -245,10 +344,21 @@ end # Similar-style @inline _unsafe_acquire_impl!(pool::AbstractArrayPool, x::AbstractArray) = _unsafe_acquire_impl!(pool, eltype(x), size(x)) -# Bit type: unsafe_acquire! 
not supported (throw clear error early) -@inline _unsafe_acquire_impl!(::AbstractArrayPool, ::Type{Bit}, ::Int) = _throw_bit_unsafe_error() -@inline _unsafe_acquire_impl!(::AbstractArrayPool, ::Type{Bit}, ::Vararg{Int,N}) where {N} = _throw_bit_unsafe_error() -@inline _unsafe_acquire_impl!(::AbstractArrayPool, ::Type{Bit}, ::NTuple{N,Int}) where {N} = _throw_bit_unsafe_error() +# Bit type: returns BitVector with shared chunks (SIMD optimized) +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitvector_wrapper!(tp, n) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + total = prod(dims) + bv = _unsafe_acquire_impl!(pool, Bit, total) + return reshape(bv, dims) # ReshapedArray{Bool,N,BitVector,...} +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + _unsafe_acquire_impl!(pool, Bit, dims...) +end # ============================================================================== # Acquisition API (User-facing with untracked marking) @@ -455,6 +565,11 @@ const _acquire_array_impl! = _unsafe_acquire_impl! @inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) @inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) +# --- unsafe_acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) + # --- Generic DisabledPool fallbacks (unknown backend → error) --- @inline acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) @inline unsafe_acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) diff --git a/src/types.jl b/src/types.jl index 2b1a070..89f2027 100644 --- a/src/types.jl +++ b/src/types.jl @@ -241,10 +241,10 @@ end - **1D**: `SubArray{Bool,1,BitVector,...}` - **N-D**: `ReshapedArray{Bool,N,...}` (reshaped view of 1D BitVector) -## Limitation -`unsafe_acquire!(pool, Bit, ...)` is **not supported** because Julia's -`BitArray` stores data in immutable `chunks::Vector{UInt64}` that cannot -be wrapped with `unsafe_wrap`. +## Performance Note +`unsafe_acquire!(pool, Bit, n)` returns a real `BitVector` with shared chunks, +preserving SIMD-optimized operations like `count()` (~140x faster than SubArray). +Use this when you need native BitVector performance. See also: [`acquire!`](@ref), [`BitTypedPool`](@ref) """ @@ -262,30 +262,41 @@ Specialized pool for `BitVector` arrays with memory reuse. Unlike `TypedPool{Bool}` which stores `Vector{Bool}` (1 byte per element), this pool stores `BitVector` (1 bit per element, ~8x memory efficiency). -## Important Limitation -**`unsafe_acquire!` is NOT supported for BitArray** because Julia's `BitArray` -stores data in a `chunks::Vector{UInt64}` field that cannot be wrapped with -`unsafe_wrap`. Only view-based acquisition via `acquire!(pool, Bit, ...)` is available. 
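To make the new path concrete, here is a minimal sketch of `unsafe_acquire!(pool, Bit, ...)` as added in this patch. It assumes only names exported by this package (`AdaptiveArrayPool`, `@with_pool`, `unsafe_acquire!`, `Bit`); the sizes are illustrative, and the chunk-identity check mirrors the test added below.

```julia
using AdaptiveArrayPools

pool = AdaptiveArrayPool()

@with_pool pool begin
    # 1D: a real BitVector whose packed UInt64 `chunks` belong to the pool slot
    mask = unsafe_acquire!(pool, Bit, 10_000)
    fill!(mask, false)
    mask[1:100] .= true
    @assert count(mask) == 100                      # native chunked count path

    # The wrapper and the pooled BitVector share the same storage
    @assert mask.chunks === pool.bits.vectors[1].chunks

    # N-D: reshape of the pooled bits, i.e. a BitMatrix
    grid = unsafe_acquire!(pool, Bit, 100, 100)
    @assert grid isa BitMatrix
end
# `mask` and `grid` must not be used past this point: their storage is pool-owned.
```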
+## Acquisition Methods +- `acquire!(pool, Bit, n)` → `SubArray{Bool,1,BitVector,...}` (view-based) +- `unsafe_acquire!(pool, Bit, n)` → `BitVector` (chunks-sharing, SIMD optimized) + +Use `unsafe_acquire!` when you need native BitVector operations like `count()`, +`sum()`, or bitwise operations - these are ~140x faster than SubArray equivalents. ## Fields - `vectors`: Backing `BitVector` storage -- `views`: Cached `SubArray` views for zero-allocation 1D access +- `views`: Cached `SubArray` views for `acquire!` - `view_lengths`: Cached lengths for fast comparison -- `nd_*`: Empty N-D cache fields (for `empty!` compatibility, unused) +- `nd_arrays`: Cached wrapper BitVectors for `unsafe_acquire!` (chunks sharing) +- `nd_dims`: Cached lengths for wrapper cache validation +- `nd_ptrs`: Cached chunk pointers for invalidation detection +- `nd_next_way`: Round-robin counter for N-way cache - `n_active`: Count of currently active arrays - `_checkpoint_*`: State management stacks (1-based sentinel pattern) ## Usage ```julia @with_pool pool begin - bv = acquire!(pool, Bit, 100) # SubArray{Bool,1,BitVector,...} - ba = acquire!(pool, Bit, 10, 10) # ReshapedArray{Bool,2,...} - t = trues!(pool, 50) # Filled with true - f = falses!(pool, 50) # Filled with false + # View-based (standard) + bv = acquire!(pool, Bit, 100) # SubArray{Bool,1,BitVector,...} + + # SIMD-optimized (for performance-critical code) + bv_fast = unsafe_acquire!(pool, Bit, 100) # BitVector (real) + count(bv_fast) # ~140x faster than count(bv) + + # Convenience functions + t = trues!(pool, 50) # Filled with true + f = falses!(pool, 50) # Filled with false end ``` -See also: [`trues!`](@ref), [`falses!`](@ref) +See also: [`trues!`](@ref), [`falses!`](@ref), [`Bit`](@ref) """ mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} # --- Storage --- diff --git a/src/utils.jl b/src/utils.jl index 5950744..cf12ea0 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -116,6 +116,43 @@ function pool_stats(tp::TypedPool{T}; io::IO=stdout, indent::Int=0, name::String return nothing end +""" + pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String="") + +Print statistics for a BitTypedPool. +""" +function pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String="") + prefix = " "^indent + type_name = isempty(name) ? 
"Bit" : name + + n_arrays = length(tp.vectors) + if n_arrays == 0 + printstyled(io, prefix, type_name, color=:cyan) + printstyled(io, " (empty)\n", color=:dark_gray) + return + end + + total_bits = sum(length(v) for v in tp.vectors) + total_bytes = sum(sizeof(v.chunks) for v in tp.vectors) + bytes_str = Base.format_bytes(total_bytes) + + # Header + printstyled(io, prefix, type_name, color=:cyan) + println(io) + + # Stats + printstyled(io, prefix, " slots: ", color=:dark_gray) + printstyled(io, n_arrays, color=:blue) + printstyled(io, " (active: ", color=:dark_gray) + printstyled(io, tp.n_active, color=:blue) + printstyled(io, ")\n", color=:dark_gray) + + printstyled(io, prefix, " bits: ", color=:dark_gray) + printstyled(io, total_bits, color=:blue) + printstyled(io, " ($bytes_str)\n", color=:dark_gray) + return nothing +end + """ pool_stats(pool::AdaptiveArrayPool; io::IO=stdout) @@ -141,8 +178,13 @@ function pool_stats(pool::AdaptiveArrayPool; io::IO=stdout) foreach_fixed_slot(pool) do tp if !isempty(tp.vectors) has_content = true - T = typeof(tp).parameters[1] # Extract T from TypedPool{T} - pool_stats(tp; io, indent=2, name="$T (fixed)") + name = if tp isa BitTypedPool + "Bit (fixed)" + else + T = typeof(tp).parameters[1] # Extract T from TypedPool{T} + "$T (fixed)" + end + pool_stats(tp; io, indent=2, name) end end @@ -228,6 +270,30 @@ function Base.show(io::IO, ::MIME"text/plain", tp::TypedPool{T}) where {T} pool_stats(tp; io, name="TypedPool{$T}") end +# Compact one-line show for BitTypedPool +function Base.show(io::IO, tp::BitTypedPool) + n_vectors = length(tp.vectors) + if n_vectors == 0 + print(io, "BitTypedPool(empty)") + else + total_bits = sum(length(v) for v in tp.vectors) + print(io, "BitTypedPool(slots=$n_vectors, active=$(tp.n_active), bits=$total_bits)") + end +end + +# Multi-line show for BitTypedPool +function Base.show(io::IO, ::MIME"text/plain", tp::BitTypedPool) + n_vectors = length(tp.vectors) + println(io, "BitTypedPool:") + println(io, " slots: $n_vectors") + println(io, " active: $(tp.n_active)") + if n_vectors > 0 + total_bits = sum(length(v) for v in tp.vectors) + total_bytes = sum(sizeof(v.chunks) for v in tp.vectors) + println(io, " bits: $total_bits ($(Base.format_bytes(total_bytes)))") + end +end + # Compact one-line show for AdaptiveArrayPool function Base.show(io::IO, pool::AdaptiveArrayPool) n_types = Ref(0) diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index a5dbeca..e5394de 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -445,24 +445,60 @@ @test outer_result == (100, 0) end - @testset "unsafe_acquire! not supported" begin + @testset "unsafe_acquire! returns BitVector with shared chunks" begin pool = AdaptiveArrayPool() - # unsafe_acquire! with Bit should throw a clear error - @test_throws ArgumentError unsafe_acquire!(pool, Bit, 100) - @test_throws ArgumentError unsafe_acquire!(pool, Bit, 10, 10) + # unsafe_acquire! 
with Bit returns a real BitVector (not SubArray) + bv = unsafe_acquire!(pool, Bit, 100) + @test bv isa BitVector + @test length(bv) == 100 - # Tuple form (covers acquire.jl:251) - @test_throws ArgumentError unsafe_acquire!(pool, Bit, (10, 10)) + # N-D returns BitArray (reshape of BitVector becomes BitArray in Julia) + ba = unsafe_acquire!(pool, Bit, 10, 10) + @test ba isa BitMatrix # reshape(BitVector, dims) → BitArray + @test size(ba) == (10, 10) - # Verify the error message is helpful - try - unsafe_acquire!(pool, Bit, 100) - catch e - @test e isa ArgumentError - @test occursin("unsafe_acquire!", e.msg) - @test occursin("Bit", e.msg) - @test occursin("acquire!", e.msg) # Suggests alternative + # Tuple form + ba_tuple = unsafe_acquire!(pool, Bit, (10, 10)) + @test ba_tuple isa BitMatrix + @test size(ba_tuple) == (10, 10) + + # Verify chunks sharing (key feature!) + @with_pool pool2 begin + bv2 = unsafe_acquire!(pool2, Bit, 100) + pool_bv = pool2.bits.vectors[1] + @test bv2.chunks === pool_bv.chunks # Same chunks object! + + # Verify data is shared + bv2[1] = true + @test pool_bv[1] == true + bv2[1] = false + @test pool_bv[1] == false + end + end + + @testset "unsafe_acquire! SIMD performance" begin + # Verify that unsafe_acquire! preserves SIMD-optimized operations + pool = AdaptiveArrayPool() + + @with_pool pool begin + n = 10000 + + # Setup: fill with known pattern + bv_unsafe = unsafe_acquire!(pool, Bit, n) + fill!(bv_unsafe, true) + + # count() should work correctly + @test count(bv_unsafe) == n + + # Verify it's using the fast path (type check) + @test bv_unsafe isa BitVector + + # Compare with acquire! (SubArray) + bv_view = acquire!(pool, Bit, n) + fill!(bv_view, true) + @test count(bv_view) == n + @test bv_view isa SubArray end end From eeeb815ef127e2e8095031cae93a1ab4595b90cc Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 14:47:50 -0800 Subject: [PATCH 03/13] refactor: extract BitArray acquisition logic to dedicated file Separate BitArray-specific code from acquire.jl into bitarray.jl for improved maintainability and code organization. Moved to src/bitarray.jl: - allocate_vector(::BitTypedPool, n) dispatch - Base.zero/one(::Type{Bit}) overloads - get_bitvector_wrapper! (SIMD-optimized chunks sharing) - _unsafe_acquire_impl! for Bit type - DisabledPool fallbacks for Bit type No functional changes - all tests pass with same coverage. --- src/AdaptiveArrayPools.jl | 3 + src/acquire.jl | 142 ------------------------------- src/bitarray.jl | 174 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 177 insertions(+), 142 deletions(-) create mode 100644 src/bitarray.jl diff --git a/src/AdaptiveArrayPools.jl b/src/AdaptiveArrayPools.jl index 61f691a..7092822 100644 --- a/src/AdaptiveArrayPools.jl +++ b/src/AdaptiveArrayPools.jl @@ -28,6 +28,9 @@ include("utils.jl") # Acquisition operations: get_view!, acquire!, unsafe_acquire!, aliases include("acquire.jl") +# BitArray-specific acquisition (SIMD-optimized BitVector operations) +include("bitarray.jl") + # Convenience functions: zeros!, ones!, similar! 
include("convenience.jl") diff --git a/src/acquire.jl b/src/acquire.jl index 286a632..428738b 100644 --- a/src/acquire.jl +++ b/src/acquire.jl @@ -6,128 +6,12 @@ @inline allocate_vector(::AbstractTypedPool{T,Vector{T}}, n::Int) where {T} = Vector{T}(undef, n) -# BitTypedPool allocates BitVector (used when acquiring with Bit type) -@inline allocate_vector(::BitTypedPool, n::Int) = BitVector(undef, n) - -# Bit type returns Bool element type for fill operations (zero/one) -@inline Base.zero(::Type{Bit}) = false -@inline Base.one(::Type{Bit}) = true - # Wrap flat view into N-D array (dispatch point for extensions) @inline function wrap_array(::AbstractTypedPool{T,Vector{T}}, flat_view, dims::NTuple{N,Int}) where {T,N} unsafe_wrap(Array{T,N}, pointer(flat_view), dims) end -# ============================================================================== -# BitVector Wrapper (chunks sharing for SIMD performance) -# ============================================================================== - -""" - get_bitvector_wrapper!(tp::BitTypedPool, n::Int) -> BitVector - -Get a BitVector that shares `chunks` with the pooled BitVector. - -Unlike `get_view!` which returns a `SubArray` (loses SIMD optimizations), -this returns a real `BitVector` with shared chunks, preserving native -BitVector performance (~140x faster for `count()`, `sum()`, etc.). - -## Implementation -Creates a new BitVector shell and replaces its `chunks` field with the -pooled BitVector's chunks. Uses N-way cache for wrapper reuse. - -## Safety -The returned BitVector is only valid within the `@with_pool` scope. -Do NOT use after the scope ends (use-after-free risk). -""" -function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) - tp.n_active += 1 - idx = tp.n_active - - # 1. Pool expansion needed (new slot) - if idx > length(tp.vectors) - pool_bv = BitVector(undef, n) - push!(tp.vectors, pool_bv) - push!(tp.views, view(pool_bv, 1:n)) - push!(tp.view_lengths, n) - - # Create wrapper sharing chunks - wrapper = BitVector(undef, n) - wrapper.chunks = pool_bv.chunks - - # Expand N-way cache (CACHE_WAYS entries per slot) - for _ in 1:CACHE_WAYS - push!(tp.nd_arrays, nothing) - push!(tp.nd_dims, nothing) - push!(tp.nd_ptrs, UInt(0)) - end - push!(tp.nd_next_way, 0) - - # Cache in first way - base = (idx - 1) * CACHE_WAYS + 1 - @inbounds tp.nd_arrays[base] = wrapper - @inbounds tp.nd_dims[base] = n - @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) - - # Warn at powers of 2 (possible missing rewind!) - if idx >= 512 && (idx & (idx - 1)) == 0 - total_bits = sum(length, tp.vectors) - @warn "BitTypedPool growing large ($idx arrays, ~$(total_bits ÷ 8) bytes). Missing rewind!()?" - end - - return wrapper - end - - # 2. Check N-way cache for hit - @inbounds pool_bv = tp.vectors[idx] - current_ptr = UInt(pointer(pool_bv.chunks)) - - # Ensure cache slots exist for this index - n_slots_cached = length(tp.nd_next_way) - while idx > n_slots_cached - for _ in 1:CACHE_WAYS - push!(tp.nd_arrays, nothing) - push!(tp.nd_dims, nothing) - push!(tp.nd_ptrs, UInt(0)) - end - push!(tp.nd_next_way, 0) - n_slots_cached += 1 - end - - base = (idx - 1) * CACHE_WAYS - - # Linear search across all ways - for k in 1:CACHE_WAYS - cache_idx = base + k - @inbounds cached_n = tp.nd_dims[cache_idx] - @inbounds cached_ptr = tp.nd_ptrs[cache_idx] - - if cached_n == n && cached_ptr == current_ptr - return @inbounds tp.nd_arrays[cache_idx]::BitVector - end - end - - # 3. 
Cache miss - resize pool_bv if needed and create new wrapper - if length(pool_bv) < n - resize!(pool_bv, n) - @inbounds tp.views[idx] = view(pool_bv, 1:n) - @inbounds tp.view_lengths[idx] = n - end - - wrapper = BitVector(undef, n) - wrapper.chunks = pool_bv.chunks - - # Round-robin replacement - @inbounds way_offset = tp.nd_next_way[idx] - target_idx = base + way_offset + 1 - @inbounds tp.nd_arrays[target_idx] = wrapper - @inbounds tp.nd_dims[target_idx] = n - @inbounds tp.nd_ptrs[target_idx] = UInt(pointer(pool_bv.chunks)) - @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS - - return wrapper -end - # ============================================================================== # Helper: Overflow-Safe Product # ============================================================================== @@ -344,22 +228,6 @@ end # Similar-style @inline _unsafe_acquire_impl!(pool::AbstractArrayPool, x::AbstractArray) = _unsafe_acquire_impl!(pool, eltype(x), size(x)) -# Bit type: returns BitVector with shared chunks (SIMD optimized) -@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) - tp = get_typed_pool!(pool, Bit)::BitTypedPool - return get_bitvector_wrapper!(tp, n) -end - -@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} - total = prod(dims) - bv = _unsafe_acquire_impl!(pool, Bit, total) - return reshape(bv, dims) # ReshapedArray{Bool,N,BitVector,...} -end - -@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} - _unsafe_acquire_impl!(pool, Bit, dims...) -end - # ============================================================================== # Acquisition API (User-facing with untracked marking) # ============================================================================== @@ -560,16 +428,6 @@ const _acquire_array_impl! = _unsafe_acquire_impl! @inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = Array{T,N}(undef, dims) @inline unsafe_acquire!(::DisabledPool{:cpu}, x::AbstractArray) = similar(x) -# --- acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- -@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) -@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) -@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) - -# --- unsafe_acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- -@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) -@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) -@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) - # --- Generic DisabledPool fallbacks (unknown backend → error) --- @inline acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) @inline unsafe_acquire!(::DisabledPool{B}, _args...) 
where {B} = _throw_backend_not_loaded(B) diff --git a/src/bitarray.jl b/src/bitarray.jl new file mode 100644 index 0000000..144389d --- /dev/null +++ b/src/bitarray.jl @@ -0,0 +1,174 @@ +# ============================================================================== +# BitArray Acquisition (SIMD-Optimized BitVector Operations) +# ============================================================================== +# +# This file contains BitArray-specific pool operations, separated from the +# generic Array acquisition code in acquire.jl for maintainability. +# +# Key components: +# - allocate_vector(::BitTypedPool, n) - BitVector allocation dispatch +# - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type +# - get_bitvector_wrapper! - SIMD-optimized BitVector with shared chunks +# - _unsafe_acquire_impl! for Bit - Raw BitVector/BitArray acquisition +# - DisabledPool fallbacks for Bit type +# +# Design rationale: +# - BitVector cannot use unsafe_wrap like Array, so it needs a different +# strategy for returning native BitVector instances. +# - The "chunks sharing" approach creates a new BitVector shell and replaces +# its internal chunks field, preserving ~140x faster SIMD operations. +# ============================================================================== + +# ============================================================================== +# Allocation Dispatch Points (BitArray-specific) +# ============================================================================== + +# BitTypedPool allocates BitVector (used when acquiring with Bit type) +@inline allocate_vector(::BitTypedPool, n::Int) = BitVector(undef, n) + +# Bit type returns Bool element type for fill operations (zero/one) +@inline Base.zero(::Type{Bit}) = false +@inline Base.one(::Type{Bit}) = true + +# ============================================================================== +# BitVector Wrapper (chunks sharing for SIMD performance) +# ============================================================================== + +""" + get_bitvector_wrapper!(tp::BitTypedPool, n::Int) -> BitVector + +Get a BitVector that shares `chunks` with the pooled BitVector. + +Unlike `get_view!` which returns a `SubArray` (loses SIMD optimizations), +this returns a real `BitVector` with shared chunks, preserving native +BitVector performance (~140x faster for `count()`, `sum()`, etc.). + +## Implementation +Creates a new BitVector shell and replaces its `chunks` field with the +pooled BitVector's chunks. Uses N-way cache for wrapper reuse. + +## Safety +The returned BitVector is only valid within the `@with_pool` scope. +Do NOT use after the scope ends (use-after-free risk). +""" +function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) + tp.n_active += 1 + idx = tp.n_active + + # 1. Pool expansion needed (new slot) + if idx > length(tp.vectors) + pool_bv = BitVector(undef, n) + push!(tp.vectors, pool_bv) + push!(tp.views, view(pool_bv, 1:n)) + push!(tp.view_lengths, n) + + # Create wrapper sharing chunks + wrapper = BitVector(undef, n) + wrapper.chunks = pool_bv.chunks + + # Expand N-way cache (CACHE_WAYS entries per slot) + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + + # Cache in first way + base = (idx - 1) * CACHE_WAYS + 1 + @inbounds tp.nd_arrays[base] = wrapper + @inbounds tp.nd_dims[base] = n + @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) + + # Warn at powers of 2 (possible missing rewind!) 
+ if idx >= 512 && (idx & (idx - 1)) == 0 + total_bits = sum(length, tp.vectors) + @warn "BitTypedPool growing large ($idx arrays, ~$(total_bits ÷ 8) bytes). Missing rewind!()?" + end + + return wrapper + end + + # 2. Check N-way cache for hit + @inbounds pool_bv = tp.vectors[idx] + current_ptr = UInt(pointer(pool_bv.chunks)) + + # Ensure cache slots exist for this index + n_slots_cached = length(tp.nd_next_way) + while idx > n_slots_cached + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + n_slots_cached += 1 + end + + base = (idx - 1) * CACHE_WAYS + + # Linear search across all ways + for k in 1:CACHE_WAYS + cache_idx = base + k + @inbounds cached_n = tp.nd_dims[cache_idx] + @inbounds cached_ptr = tp.nd_ptrs[cache_idx] + + if cached_n == n && cached_ptr == current_ptr + return @inbounds tp.nd_arrays[cache_idx]::BitVector + end + end + + # 3. Cache miss - resize pool_bv if needed and create new wrapper + if length(pool_bv) < n + resize!(pool_bv, n) + @inbounds tp.views[idx] = view(pool_bv, 1:n) + @inbounds tp.view_lengths[idx] = n + end + + wrapper = BitVector(undef, n) + wrapper.chunks = pool_bv.chunks + + # Round-robin replacement + @inbounds way_offset = tp.nd_next_way[idx] + target_idx = base + way_offset + 1 + @inbounds tp.nd_arrays[target_idx] = wrapper + @inbounds tp.nd_dims[target_idx] = n + @inbounds tp.nd_ptrs[target_idx] = UInt(pointer(pool_bv.chunks)) + @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS + + return wrapper +end + +# ============================================================================== +# Unsafe Acquire Implementation (Bit type) +# ============================================================================== + +# Bit type: returns BitVector with shared chunks (SIMD optimized) +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitvector_wrapper!(tp, n) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + total = prod(dims) + bv = _unsafe_acquire_impl!(pool, Bit, total) + return reshape(bv, dims) # ReshapedArray{Bool,N,BitVector,...} +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + _unsafe_acquire_impl!(pool, Bit, dims...) +end + +# ============================================================================== +# DisabledPool Fallbacks (Bit type) +# ============================================================================== + +# --- acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) + +# --- unsafe_acquire! 
for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) From 322ebe4a3eefe16eb8479480c061d74ab447a824 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 16:09:24 -0800 Subject: [PATCH 04/13] feat: unify Bit type API to always return BitVector for SIMD performance Both acquire! and unsafe_acquire! now return BitVector for Bit type, eliminating the need for users to choose between APIs to get optimal performance. The _acquire_impl! for Bit now delegates to _unsafe_acquire_impl!, ensuring ~140x faster SIMD operations (count, sum, bitwise) are always used. Also fixes BitVector wrapper sizing to use exact length (!=) instead of minimum length (<), ensuring fill!/count! iterate only over relevant chunks. --- src/bitarray.jl | 66 ++++++++++++++++++++++---- src/types.jl | 74 +++++++++++++++++++---------- test/test_bitarray.jl | 108 +++++++++++++++++++++++++++++------------- 3 files changed, 180 insertions(+), 68 deletions(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index 144389d..fd64bff 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -1,5 +1,5 @@ # ============================================================================== -# BitArray Acquisition (SIMD-Optimized BitVector Operations) +# BitArray Acquisition (Unified BitVector API) # ============================================================================== # # This file contains BitArray-specific pool operations, separated from the @@ -9,14 +9,34 @@ # - allocate_vector(::BitTypedPool, n) - BitVector allocation dispatch # - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type # - get_bitvector_wrapper! - SIMD-optimized BitVector with shared chunks +# - _acquire_impl! for Bit - Delegates to _unsafe_acquire_impl! for performance # - _unsafe_acquire_impl! for Bit - Raw BitVector/BitArray acquisition # - DisabledPool fallbacks for Bit type # -# Design rationale: -# - BitVector cannot use unsafe_wrap like Array, so it needs a different -# strategy for returning native BitVector instances. -# - The "chunks sharing" approach creates a new BitVector shell and replaces -# its internal chunks field, preserving ~140x faster SIMD operations. +# Design Decision: Unified BitVector Return Type +# ============================================= +# Unlike regular types where acquire! returns SubArray and unsafe_acquire! +# returns Array, for Bit type BOTH return BitVector. This design choice is +# intentional for several reasons: +# +# 1. **SIMD Performance**: BitVector operations like `count()`, `sum()`, and +# bitwise operations are ~140x faster than their SubArray equivalents +# because they use SIMD-optimized chunked algorithms. +# +# 2. **API Simplicity**: Users always get BitVector regardless of which API +# they call. No need to remember "use unsafe_acquire! for performance". +# +# 3. **Semantic Clarity**: The "unsafe" in unsafe_acquire! refers to memory +# safety concerns (use-after-free risk). BitVector already handles memory +# efficiently (1 bit per element), so the naming would be misleading. +# +# 4. **Backwards Compatibility**: Code using trues!/falses! just works with +# optimal performance - these convenience functions now return BitVector. 
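In practice the unification means every `Bit`-typed request comes back as a native bit array. A short sketch under the same assumptions as the tests in this series (exported `acquire!`/`unsafe_acquire!`/`trues!` names, illustrative sizes):

```julia
using AdaptiveArrayPools

pool = AdaptiveArrayPool()

@with_pool pool begin
    a = acquire!(pool, Bit, 256)          # BitVector (delegates to the unsafe path)
    b = unsafe_acquire!(pool, Bit, 256)   # BitVector as well - same return type
    t = trues!(pool, 256)                 # BitVector filled with true
    m = acquire!(pool, Bit, 16, 16)       # BitMatrix via reshape of pooled bits

    @assert a isa BitVector && b isa BitVector && t isa BitVector
    @assert m isa BitMatrix
    @assert count(t) == 256               # chunked SIMD count, no per-bit loop
end
```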
+# +# Implementation: +# - _acquire_impl!(pool, Bit, ...) delegates to _unsafe_acquire_impl! +# - get_bitvector_wrapper! creates BitVector shells sharing pool's chunks +# - N-D requests return reshaped BitArrays (reshape preserves chunk sharing) # ============================================================================== # ============================================================================== @@ -118,8 +138,11 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) end end - # 3. Cache miss - resize pool_bv if needed and create new wrapper - if length(pool_bv) < n + # 3. Cache miss - resize pool_bv to EXACTLY n elements and create new wrapper + # Unlike regular arrays where we only grow, BitVector wrappers MUST have exactly + # the right number of chunks. Otherwise fill!()/count() iterate over all chunks, + # not just the bits within wrapper.len, causing incorrect behavior. + if length(pool_bv) != n resize!(pool_bv, n) @inbounds tp.views[idx] = view(pool_bv, 1:n) @inbounds tp.view_lengths[idx] = n @@ -139,6 +162,31 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) return wrapper end +# ============================================================================== +# Acquire Implementation (Bit type → delegates to unsafe_acquire for performance) +# ============================================================================== +# +# Unlike other types where acquire! returns SubArray (view-based) and +# unsafe_acquire! returns Array (raw), Bit type always returns BitVector. +# This is because BitVector's SIMD-optimized operations (count, sum, etc.) +# are ~140x faster than SubArray equivalents. +# +# The delegation is transparent: users calling acquire!(pool, Bit, n) get +# BitVector without needing to know about unsafe_acquire!. + +# Bit type: delegates to _unsafe_acquire_impl! for SIMD performance +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + return _unsafe_acquire_impl!(pool, Bit, n) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + return _unsafe_acquire_impl!(pool, Bit, dims...) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + return _unsafe_acquire_impl!(pool, Bit, dims...) +end + # ============================================================================== # Unsafe Acquire Implementation (Bit type) # ============================================================================== @@ -152,7 +200,7 @@ end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} total = prod(dims) bv = _unsafe_acquire_impl!(pool, Bit, total) - return reshape(bv, dims) # ReshapedArray{Bool,N,BitVector,...} + return reshape(bv, dims) # BitArray{N} (Julia's reshape on BitVector returns BitArray) end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} diff --git a/src/types.jl b/src/types.jl index 89f2027..51bab0f 100644 --- a/src/types.jl +++ b/src/types.jl @@ -225,28 +225,45 @@ bit-packed arrays (1 bit per element vs 1 byte for `Vector{Bool}`). 
## Usage ```julia @with_pool pool begin - # BitVector view (1 bit per element, ~8x memory savings) + # BitVector (1 bit per element, ~8x memory savings) bv = acquire!(pool, Bit, 1000) # vs Vector{Bool} (1 byte per element) vb = acquire!(pool, Bool, 1000) # Convenience functions work too - mask = zeros!(pool, Bit, 100) # BitVector filled with false - flags = ones!(pool, Bit, 100) # BitVector filled with true + mask = falses!(pool, 100) # BitVector filled with false + flags = trues!(pool, 100) # BitVector filled with true end ``` -## Return Types -- **1D**: `SubArray{Bool,1,BitVector,...}` -- **N-D**: `ReshapedArray{Bool,N,...}` (reshaped view of 1D BitVector) +## Return Types (Unified for Performance) +Unlike other types, `Bit` always returns native `BitVector`/`BitArray`: +- **1D**: `BitVector` (both `acquire!` and `unsafe_acquire!`) +- **N-D**: `BitArray{N}` (reshaped, preserves SIMD optimization) -## Performance Note -`unsafe_acquire!(pool, Bit, n)` returns a real `BitVector` with shared chunks, -preserving SIMD-optimized operations like `count()` (~140x faster than SubArray). -Use this when you need native BitVector performance. +This design ensures users always get SIMD-optimized performance without +needing to remember which API to use. -See also: [`acquire!`](@ref), [`BitTypedPool`](@ref) +## Performance +`BitVector` operations like `count()`, `sum()`, and bitwise operations are +~140x faster than equivalent operations on `SubArray{Bool}` because they +use SIMD-optimized algorithms on packed 64-bit chunks. + +```julia +@with_pool pool begin + bv = acquire!(pool, Bit, 10000) + fill!(bv, true) + count(bv) # Uses fast SIMD path automatically +end +``` + +## Memory Safety +The returned `BitVector` shares its internal `chunks` array with the pool. +It is only valid within the `@with_pool` scope - using it after the scope +ends leads to undefined behavior (use-after-free risk). + +See also: [`trues!`](@ref), [`falses!`](@ref), [`BitTypedPool`](@ref) """ struct Bit end @@ -262,18 +279,21 @@ Specialized pool for `BitVector` arrays with memory reuse. Unlike `TypedPool{Bool}` which stores `Vector{Bool}` (1 byte per element), this pool stores `BitVector` (1 bit per element, ~8x memory efficiency). -## Acquisition Methods -- `acquire!(pool, Bit, n)` → `SubArray{Bool,1,BitVector,...}` (view-based) -- `unsafe_acquire!(pool, Bit, n)` → `BitVector` (chunks-sharing, SIMD optimized) +## Unified API (Always Returns BitVector) +Unlike other types, both `acquire!` and `unsafe_acquire!` return `BitVector` +for the `Bit` type. This design ensures users always get SIMD-optimized +performance without needing to choose between APIs. -Use `unsafe_acquire!` when you need native BitVector operations like `count()`, -`sum()`, or bitwise operations - these are ~140x faster than SubArray equivalents. 
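The exact-length resize introduced in this patch (`length(pool_bv) != n` instead of `<`) exists because Julia's `BitArray` routines work chunk-wise and assume the chunk count matches the stated bit length. A hypothetical sketch of the failure mode it guards against; the direct field access is for illustration only:

```julia
# Backing vector left at an older, larger size: 1000 bits -> 16 UInt64 chunks
pool_bv = BitVector(undef, 1000)
fill!(pool_bv, true)

# A 10-bit wrapper should carry exactly 1 chunk ...
wrapper = BitVector(undef, 10)
wrapper.chunks = pool_bv.chunks     # ... but now drags along all 16 chunks

# Chunk-wise operations scan every chunk regardless of `length(wrapper)`,
# so the count includes bits far past index 10:
count(wrapper)                      # 1000 here, even though length(wrapper) == 10
```

Resizing the pooled vector to exactly `n` bits before sharing its chunks keeps the wrapper's chunk count consistent with its length, so `fill!` and `count` only touch the relevant bits.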
+- `acquire!(pool, Bit, n)` → `BitVector` (SIMD optimized) +- `unsafe_acquire!(pool, Bit, n)` → `BitVector` (same behavior) +- `trues!(pool, n)` → `BitVector` filled with `true` +- `falses!(pool, n)` → `BitVector` filled with `false` ## Fields - `vectors`: Backing `BitVector` storage -- `views`: Cached `SubArray` views for `acquire!` +- `views`: Cached `SubArray` views (legacy, maintained for compatibility) - `view_lengths`: Cached lengths for fast comparison -- `nd_arrays`: Cached wrapper BitVectors for `unsafe_acquire!` (chunks sharing) +- `nd_arrays`: Cached wrapper BitVectors (chunks sharing) - `nd_dims`: Cached lengths for wrapper cache validation - `nd_ptrs`: Cached chunk pointers for invalidation detection - `nd_next_way`: Round-robin counter for N-way cache @@ -283,19 +303,21 @@ Use `unsafe_acquire!` when you need native BitVector operations like `count()`, ## Usage ```julia @with_pool pool begin - # View-based (standard) - bv = acquire!(pool, Bit, 100) # SubArray{Bool,1,BitVector,...} - - # SIMD-optimized (for performance-critical code) - bv_fast = unsafe_acquire!(pool, Bit, 100) # BitVector (real) - count(bv_fast) # ~140x faster than count(bv) + # All return BitVector with SIMD performance + bv = acquire!(pool, Bit, 100) # BitVector + count(bv) # Fast SIMD path # Convenience functions - t = trues!(pool, 50) # Filled with true - f = falses!(pool, 50) # Filled with false + t = trues!(pool, 50) # BitVector filled with true + f = falses!(pool, 50) # BitVector filled with false end ``` +## Performance +Operations like `count()`, `sum()`, and bitwise operations are ~140x faster +than equivalent operations on `SubArray{Bool}` because `BitVector` uses +SIMD-optimized algorithms on packed 64-bit chunks. + See also: [`trues!`](@ref), [`falses!`](@ref), [`Bit`](@ref) """ mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index e5394de..3e0a267 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -26,13 +26,14 @@ @test isempty(pool.bits.vectors) end - @testset "acquire!(pool, Bit, n) - 1D" begin + @testset "acquire!(pool, Bit, n) - 1D (returns BitVector for SIMD performance)" begin pool = AdaptiveArrayPool() bv = acquire!(pool, Bit, 100) @test length(bv) == 100 @test eltype(bv) == Bool - @test bv isa SubArray{Bool, 1, BitVector} + # Returns BitVector (not SubArray) for SIMD-optimized operations + @test bv isa BitVector @test pool.bits.n_active == 1 # Write and read back @@ -45,6 +46,7 @@ # Second acquire bv2 = acquire!(pool, Bit, 50) @test length(bv2) == 50 + @test bv2 isa BitVector @test pool.bits.n_active == 2 # Independent values @@ -53,14 +55,15 @@ @test count(bv) == 99 # bv unchanged end - @testset "acquire!(pool, Bit, dims...) - N-D" begin + @testset "acquire!(pool, Bit, dims...) 
- N-D (returns BitArray for SIMD performance)" begin pool = AdaptiveArrayPool() - # 2D + # 2D - returns BitMatrix (Julia's reshape(BitVector, dims) returns BitArray) ba2 = acquire!(pool, Bit, 10, 10) @test size(ba2) == (10, 10) @test eltype(ba2) == Bool - @test ba2 isa Base.ReshapedArray + # Note: reshape(BitVector, dims) returns BitArray{N}, not ReshapedArray + @test ba2 isa BitMatrix @test pool.bits.n_active == 1 # Test indexing @@ -75,108 +78,126 @@ # 3D ba3 = acquire!(pool, Bit, 4, 5, 3) @test size(ba3) == (4, 5, 3) + @test ba3 isa BitArray{3} @test pool.bits.n_active == 2 # Tuple form ba_tuple = acquire!(pool, Bit, (3, 4, 2)) @test size(ba_tuple) == (3, 4, 2) + @test ba_tuple isa BitArray{3} @test pool.bits.n_active == 3 end - @testset "ones!(pool, Bit, dims...) - filled with true" begin + @testset "ones!(pool, Bit, dims...) - BitVector filled with true" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector t1 = ones!(pool, Bit, 100) @test length(t1) == 100 @test all(t1) + @test t1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) t2 = ones!(pool, Bit, 10, 10) @test size(t2) == (10, 10) @test all(t2) @test count(t2) == 100 + @test t2 isa BitMatrix # Tuple form t3 = ones!(pool, Bit, (5, 5, 4)) @test size(t3) == (5, 5, 4) @test all(t3) + @test t3 isa BitArray{3} end - @testset "zeros!(pool, Bit, dims...) - filled with false" begin + @testset "zeros!(pool, Bit, dims...) - BitVector filled with false" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector f1 = zeros!(pool, Bit, 100) @test length(f1) == 100 @test !any(f1) + @test f1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) f2 = zeros!(pool, Bit, 10, 10) @test size(f2) == (10, 10) @test !any(f2) @test count(f2) == 0 + @test f2 isa BitMatrix # Tuple form f3 = zeros!(pool, Bit, (5, 5, 4)) @test size(f3) == (5, 5, 4) @test !any(f3) + @test f3 isa BitArray{3} end - @testset "trues!(pool, dims...) - convenience for BitArray filled with true" begin + @testset "trues!(pool, dims...) - BitVector filled with true (SIMD optimized)" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector t1 = trues!(pool, 100) @test length(t1) == 100 @test all(t1) @test eltype(t1) == Bool + @test t1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) t2 = trues!(pool, 10, 10) @test size(t2) == (10, 10) @test all(t2) @test count(t2) == 100 + @test t2 isa BitMatrix # Tuple form t3 = trues!(pool, (5, 5, 4)) @test size(t3) == (5, 5, 4) @test all(t3) + @test t3 isa BitArray{3} # Equivalent to ones!(pool, Bit, ...) t4 = trues!(pool, 50) t5 = ones!(pool, Bit, 50) @test all(t4 .== t5) + @test t4 isa BitVector + @test t5 isa BitVector end - @testset "falses!(pool, dims...) - convenience for BitArray filled with false" begin + @testset "falses!(pool, dims...) - BitVector filled with false (SIMD optimized)" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector f1 = falses!(pool, 100) @test length(f1) == 100 @test !any(f1) @test eltype(f1) == Bool + @test f1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) f2 = falses!(pool, 10, 10) @test size(f2) == (10, 10) @test !any(f2) @test count(f2) == 0 + @test f2 isa BitMatrix # Tuple form f3 = falses!(pool, (5, 5, 4)) @test size(f3) == (5, 5, 4) @test !any(f3) + @test f3 isa BitArray{3} # Equivalent to zeros!(pool, Bit, ...) 
f4 = falses!(pool, 50) f5 = zeros!(pool, Bit, 50) @test all(f4 .== f5) + @test f4 isa BitVector + @test f5 isa BitVector end @testset "State management" begin @@ -405,14 +426,14 @@ @testset "Mixed Bool types" begin pool = AdaptiveArrayPool() - # Vector{Bool} via acquire! with Bool + # Vector{Bool} via acquire! with Bool - returns SubArray (view) vb = acquire!(pool, Bool, 100) @test vb isa SubArray{Bool, 1, Vector{Bool}} @test pool.bool.n_active == 1 - # BitVector via acquire! with Bit + # BitVector via acquire! with Bit - returns BitVector (for SIMD) bv = acquire!(pool, Bit, 100) - @test bv isa SubArray{Bool, 1, BitVector} + @test bv isa BitVector # Note: Bit returns BitVector, not SubArray @test pool.bits.n_active == 1 # Both should work independently @@ -477,28 +498,28 @@ end end - @testset "unsafe_acquire! SIMD performance" begin - # Verify that unsafe_acquire! preserves SIMD-optimized operations + @testset "Unified BitVector API - both acquire! and unsafe_acquire! return BitVector" begin + # Both acquire! and unsafe_acquire! return BitVector for Bit type + # This is a deliberate design choice for SIMD performance pool = AdaptiveArrayPool() @with_pool pool begin n = 10000 - # Setup: fill with known pattern + # unsafe_acquire! returns BitVector bv_unsafe = unsafe_acquire!(pool, Bit, n) fill!(bv_unsafe, true) - - # count() should work correctly @test count(bv_unsafe) == n - - # Verify it's using the fast path (type check) @test bv_unsafe isa BitVector - # Compare with acquire! (SubArray) - bv_view = acquire!(pool, Bit, n) - fill!(bv_view, true) - @test count(bv_view) == n - @test bv_view isa SubArray + # acquire! ALSO returns BitVector (not SubArray) + bv_acquire = acquire!(pool, Bit, n) + fill!(bv_acquire, true) + @test count(bv_acquire) == n + @test bv_acquire isa BitVector # Same type as unsafe_acquire! + + # Both benefit from SIMD-optimized count() + # (No performance difference since both return BitVector) end end @@ -517,6 +538,12 @@ @test eltype(v_bool) == Bool @test eltype(v_bit) == Bool + # Note: acquire! returns SubArray for most types, but BitVector for Bit + @test v_f64 isa SubArray + @test v_i32 isa SubArray + @test v_bool isa SubArray + @test v_bit isa BitVector # Special case for SIMD performance + # zeros!/ones! work consistently z_f64 = zeros!(pool, Float64, 10) z_bit = zeros!(pool, Bit, 10) @@ -527,29 +554,37 @@ @test !any(z_bit) @test all(o_f64 .== 1.0) @test all(o_bit) + + # Type consistency for convenience functions + @test z_bit isa BitVector + @test o_bit isa BitVector end - @testset "NTuple form coverage" begin + @testset "NTuple form coverage (all return BitArray types)" begin pool = AdaptiveArrayPool() # Test NTuple forms for trues!/falses! (covers _trues_impl! and _falses_impl! NTuple overloads) t_tuple = trues!(pool, (5, 5)) @test size(t_tuple) == (5, 5) @test all(t_tuple) + @test t_tuple isa BitMatrix f_tuple = falses!(pool, (5, 5)) @test size(f_tuple) == (5, 5) @test !any(f_tuple) + @test f_tuple isa BitMatrix # Test NTuple forms for zeros!/ones! with Bit type # (covers _zeros_impl! and _ones_impl! 
with Bit NTuple overloads) z_bit_tuple = zeros!(pool, Bit, (4, 4)) @test size(z_bit_tuple) == (4, 4) @test !any(z_bit_tuple) + @test z_bit_tuple isa BitMatrix o_bit_tuple = ones!(pool, Bit, (4, 4)) @test size(o_bit_tuple) == (4, 4) @test all(o_bit_tuple) + @test o_bit_tuple isa BitMatrix end @testset "Generic DisabledPool fallback for unknown backend" begin @@ -600,10 +635,17 @@ z = AdaptiveArrayPools._zeros_impl!(pool, Bit, (3, 3)) @test size(z) == (3, 3) @test !any(z) + @test z isa BitMatrix o = AdaptiveArrayPools._ones_impl!(pool, Bit, (3, 3)) @test size(o) == (3, 3) @test all(o) + @test o isa BitMatrix + + # Test _acquire_impl! returns BitVector (not SubArray) + bv = AdaptiveArrayPools._acquire_impl!(pool, Bit, 100) + @test bv isa BitVector + @test length(bv) == 100 end end # BitArray Support From 2434629008daa8b6f815daea10e55b0b3984cebc Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 16:21:36 -0800 Subject: [PATCH 05/13] fix: replace prod with safe_prod in _unsafe_acquire_impl! for improved safety --- src/bitarray.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index fd64bff..353c9e9 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -198,7 +198,7 @@ end end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} - total = prod(dims) + total = safe_prod(dims) bv = _unsafe_acquire_impl!(pool, Bit, total) return reshape(bv, dims) # BitArray{N} (Julia's reshape on BitVector returns BitArray) end From c63a84363185a9199238d35013e9c1412d208f45 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 16:37:16 -0800 Subject: [PATCH 06/13] fix: update performance metrics for BitVector operations in documentation --- src/bitarray.jl | 6 +++--- src/types.jl | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index 353c9e9..2a061a7 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -20,7 +20,7 @@ # intentional for several reasons: # # 1. **SIMD Performance**: BitVector operations like `count()`, `sum()`, and -# bitwise operations are ~140x faster than their SubArray equivalents +# bitwise operations are ~(10x ~ 100x) faster than their SubArray equivalents # because they use SIMD-optimized chunked algorithms. # # 2. **API Simplicity**: Users always get BitVector regardless of which API @@ -61,7 +61,7 @@ Get a BitVector that shares `chunks` with the pooled BitVector. Unlike `get_view!` which returns a `SubArray` (loses SIMD optimizations), this returns a real `BitVector` with shared chunks, preserving native -BitVector performance (~140x faster for `count()`, `sum()`, etc.). +BitVector performance (~(10x ~ 100x) faster for `count()`, `sum()`, etc.). ## Implementation Creates a new BitVector shell and replaces its `chunks` field with the @@ -169,7 +169,7 @@ end # Unlike other types where acquire! returns SubArray (view-based) and # unsafe_acquire! returns Array (raw), Bit type always returns BitVector. # This is because BitVector's SIMD-optimized operations (count, sum, etc.) -# are ~140x faster than SubArray equivalents. +# are ~(10x ~ 100x) faster than SubArray equivalents. # # The delegation is transparent: users calling acquire!(pool, Bit, n) get # BitVector without needing to know about unsafe_acquire!. diff --git a/src/types.jl b/src/types.jl index 51bab0f..0f2e8b5 100644 --- a/src/types.jl +++ b/src/types.jl @@ -247,7 +247,7 @@ needing to remember which API to use. 
## Performance `BitVector` operations like `count()`, `sum()`, and bitwise operations are -~140x faster than equivalent operations on `SubArray{Bool}` because they +~(10x ~ 100x) faster than equivalent operations on `SubArray{Bool}` because they use SIMD-optimized algorithms on packed 64-bit chunks. ```julia @@ -314,7 +314,7 @@ end ``` ## Performance -Operations like `count()`, `sum()`, and bitwise operations are ~140x faster +Operations like `count()`, `sum()`, and bitwise operations are ~(10x ~ 100x) faster than equivalent operations on `SubArray{Bool}` because `BitVector` uses SIMD-optimized algorithms on packed 64-bit chunks. From 8ef74e5796893e1d140155d5c9b14a3715249603 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 16:43:58 -0800 Subject: [PATCH 07/13] docs: clarify BitVector and BitArray usage in documentation for performance optimization --- docs/src/features/bit-arrays.md | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/docs/src/features/bit-arrays.md b/docs/src/features/bit-arrays.md index ec21d39..25b1823 100644 --- a/docs/src/features/bit-arrays.md +++ b/docs/src/features/bit-arrays.md @@ -14,7 +14,7 @@ To distinguish between standard boolean arrays (`Vector{Bool}`, 1 byte/element) ## Usage ### 1D Arrays (BitVector) -For 1D arrays, `acquire!` returns a view into a pooled `BitVector`. +For 1D arrays, `acquire!` returns a native `BitVector`. This design choice enables full SIMD optimization, making operations significantly faster (10x~100x) than using views. ```julia @with_pool pool begin @@ -25,17 +25,17 @@ For 1D arrays, `acquire!` returns a view into a pooled `BitVector`. bv .= true bv[1] = false - # Supports standard operations + # Supports standard operations with full SIMD acceleration count(bv) end ``` -### N-D Arrays (BitArray / Reshaped) -For multi-dimensional arrays, `acquire!` returns a `ReshapedArray` wrapper around the linear `BitVector`. This maintains zero-allocation efficiency while providing N-D indexing. +### N-D Arrays (BitArray) +For multi-dimensional arrays, `acquire!` returns a `BitArray{N}` (specifically `BitMatrix` for 2D). This preserves the packed memory layout and SIMD benefits while providing N-D indexing. ```julia @with_pool pool begin - # 100x100 bit matrix + # 100x100 bit matrix (returns BitMatrix) mask = zeros!(pool, Bit, 100, 100) mask[5, 5] = true @@ -68,11 +68,17 @@ end Note: `zeros!(pool, Bit, ...)` and `ones!(pool, Bit, ...)` are also supported (aliased to `falses!` and `trues!`). ``` -## How It Works +## Performance & Safety -The pool maintains a separate `BitTypedPool` specifically for `BitVector` storage. -- **Sentinel**: `acquire!(..., Bit, ...)` dispatches to this special pool. -- **Views**: 1D returns `SubArray{Bool, 1, BitVector, ...}`. -- **Reshaping**: N-D returns `ReshapedArray{Bool, N, SubArray{...}}`. +### Why Native BitVector? +The pool returns native `BitVector`/`BitArray` types instead of `SubArray` views for **performance**. +Operations like `count()`, `sum()`, and bitwise broadcasting are **10x~100x faster** on native bit arrays because they utilize SIMD instructions on packed 64-bit chunks. -This ensures that even for complex shapes, the underlying storage is always a compact `BitVector` reused from the pool. +### ⚠️ Important: Do Not Resize + +While the returned arrays are standard `BitVector` types, they share their underlying memory chunks with the pool. + +!!! 
warning "Do Not Resize" + **NEVER** resize (`push!`, `pop!`, `resize!`) a pooled `BitVector` or `BitArray`. + + The underlying memory is owned and managed by the pool. Resizing it will detach it from the pool or potentially corrupt the shared state. Treat these arrays as **fixed-size** scratch buffers only. From ade8f99683428d86660bc3d9b9b6c657e3afd899 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 21:45:47 -0800 Subject: [PATCH 08/13] feat: enhance BitArray and BitTypedPool functionality with improved statistics and testing --- src/bitarray.jl | 23 ++------- src/utils.jl | 113 +++++++++++------------------------------- test/test_bitarray.jl | 17 ++++++- test/test_utils.jl | 48 ++++++++++++++++++ 4 files changed, 97 insertions(+), 104 deletions(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index 2a061a7..5a8488f 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -6,7 +6,6 @@ # generic Array acquisition code in acquire.jl for maintainability. # # Key components: -# - allocate_vector(::BitTypedPool, n) - BitVector allocation dispatch # - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type # - get_bitvector_wrapper! - SIMD-optimized BitVector with shared chunks # - _acquire_impl! for Bit - Delegates to _unsafe_acquire_impl! for performance @@ -40,12 +39,9 @@ # ============================================================================== # ============================================================================== -# Allocation Dispatch Points (BitArray-specific) +# Fill Value Dispatch (BitArray-specific) # ============================================================================== -# BitTypedPool allocates BitVector (used when acquiring with Bit type) -@inline allocate_vector(::BitTypedPool, n::Int) = BitVector(undef, n) - # Bit type returns Bool element type for fill operations (zero/one) @inline Base.zero(::Type{Bit}) = false @inline Base.one(::Type{Bit}) = true @@ -109,22 +105,9 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) return wrapper end - # 2. Check N-way cache for hit + # 2. Check N-way cache for hit (cache slots always exist - created with vector slot above) @inbounds pool_bv = tp.vectors[idx] current_ptr = UInt(pointer(pool_bv.chunks)) - - # Ensure cache slots exist for this index - n_slots_cached = length(tp.nd_next_way) - while idx > n_slots_cached - for _ in 1:CACHE_WAYS - push!(tp.nd_arrays, nothing) - push!(tp.nd_dims, nothing) - push!(tp.nd_ptrs, UInt(0)) - end - push!(tp.nd_next_way, 0) - n_slots_cached += 1 - end - base = (idx - 1) * CACHE_WAYS # Linear search across all ways @@ -204,7 +187,7 @@ end end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} - _unsafe_acquire_impl!(pool, Bit, dims...) + return _unsafe_acquire_impl!(pool, Bit, dims...) end # ============================================================================== diff --git a/src/utils.jl b/src/utils.jl index cf12ea0..f252aaa 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -79,51 +79,24 @@ _validate_pool_return(val, ::DisabledPool) = nothing # Statistics & Pretty Printing # ============================================================================== -""" - pool_stats(tp::TypedPool{T}; io::IO=stdout, indent::Int=0, name::String="") - -Print statistics for a single TypedPool. -""" -function pool_stats(tp::TypedPool{T}; io::IO=stdout, indent::Int=0, name::String="") where {T} - prefix = " "^indent - type_name = isempty(name) ? 
string(T) : name - - n_arrays = length(tp.vectors) - if n_arrays == 0 - printstyled(io, prefix, type_name, color=:cyan) - printstyled(io, " (empty)\n", color=:dark_gray) - return - end +# --- Helper functions for pool_stats (type-specific behavior) --- +_default_type_name(::TypedPool{T}) where {T} = string(T) +_default_type_name(::BitTypedPool) = "Bit" - total_elements = sum(length(v) for v in tp.vectors) - total_bytes = sum(Base.summarysize(v) for v in tp.vectors) - bytes_str = Base.format_bytes(total_bytes) +_vector_bytes(v::Vector) = Base.summarysize(v) +_vector_bytes(v::BitVector) = sizeof(v.chunks) - # Header - printstyled(io, prefix, type_name, color=:cyan) - println(io) - - # Stats - printstyled(io, prefix, " slots: ", color=:dark_gray) - printstyled(io, n_arrays, color=:blue) - printstyled(io, " (active: ", color=:dark_gray) - printstyled(io, tp.n_active, color=:blue) - printstyled(io, ")\n", color=:dark_gray) - - printstyled(io, prefix, " elements: ", color=:dark_gray) - printstyled(io, total_elements, color=:blue) - printstyled(io, " ($bytes_str)\n", color=:dark_gray) - return nothing -end +_count_label(::TypedPool) = "elements" +_count_label(::BitTypedPool) = "bits" """ - pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String="") + pool_stats(tp::AbstractTypedPool; io::IO=stdout, indent::Int=0, name::String="") -Print statistics for a BitTypedPool. +Print statistics for a TypedPool or BitTypedPool. """ -function pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String="") +function pool_stats(tp::AbstractTypedPool; io::IO=stdout, indent::Int=0, name::String="") prefix = " "^indent - type_name = isempty(name) ? "Bit" : name + type_name = isempty(name) ? _default_type_name(tp) : name n_arrays = length(tp.vectors) if n_arrays == 0 @@ -132,8 +105,8 @@ function pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String return end - total_bits = sum(length(v) for v in tp.vectors) - total_bytes = sum(sizeof(v.chunks) for v in tp.vectors) + total_count = sum(length(v) for v in tp.vectors) + total_bytes = sum(_vector_bytes(v) for v in tp.vectors) bytes_str = Base.format_bytes(total_bytes) # Header @@ -147,8 +120,8 @@ function pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String printstyled(io, tp.n_active, color=:blue) printstyled(io, ")\n", color=:dark_gray) - printstyled(io, prefix, " bits: ", color=:dark_gray) - printstyled(io, total_bits, color=:blue) + printstyled(io, prefix, " ", _count_label(tp), ": ", color=:dark_gray) + printstyled(io, total_count, color=:blue) printstyled(io, " ($bytes_str)\n", color=:dark_gray) return nothing end @@ -178,12 +151,7 @@ function pool_stats(pool::AdaptiveArrayPool; io::IO=stdout) foreach_fixed_slot(pool) do tp if !isempty(tp.vectors) has_content = true - name = if tp isa BitTypedPool - "Bit (fixed)" - else - T = typeof(tp).parameters[1] # Extract T from TypedPool{T} - "$T (fixed)" - end + name = _default_type_name(tp) * " (fixed)" pool_stats(tp; io, indent=2, name) end end @@ -217,10 +185,7 @@ function pool_stats(; io::IO=stdout) pool_stats(:cpu; io) # Show CUDA pools if extension is loaded and pools exist try - pools = get_task_local_cuda_pools() - for pool in values(pools) - pool_stats(pool; io) - end + pool_stats(Val(:cuda); io) catch e e isa MethodError || rethrow() # CUDA extension not loaded - silently skip @@ -254,44 +219,26 @@ end # Base.show (delegates to pool_stats) # ============================================================================== -# Compact one-line show for 
TypedPool -function Base.show(io::IO, tp::TypedPool{T}) where {T} - n_vectors = length(tp.vectors) - if n_vectors == 0 - print(io, "TypedPool{$T}(empty)") - else - total = sum(length(v) for v in tp.vectors) - print(io, "TypedPool{$T}(slots=$n_vectors, active=$(tp.n_active), elements=$total)") - end -end +# --- Helper for Base.show (full type name for display) --- +_show_type_name(::TypedPool{T}) where {T} = "TypedPool{$T}" +_show_type_name(::BitTypedPool) = "BitTypedPool" -# Multi-line show for TypedPool -function Base.show(io::IO, ::MIME"text/plain", tp::TypedPool{T}) where {T} - pool_stats(tp; io, name="TypedPool{$T}") -end - -# Compact one-line show for BitTypedPool -function Base.show(io::IO, tp::BitTypedPool) +# Compact one-line show for all AbstractTypedPool +function Base.show(io::IO, tp::AbstractTypedPool) + name = _show_type_name(tp) n_vectors = length(tp.vectors) if n_vectors == 0 - print(io, "BitTypedPool(empty)") + print(io, "$name(empty)") else - total_bits = sum(length(v) for v in tp.vectors) - print(io, "BitTypedPool(slots=$n_vectors, active=$(tp.n_active), bits=$total_bits)") + total = sum(length(v) for v in tp.vectors) + label = _count_label(tp) + print(io, "$name(slots=$n_vectors, active=$(tp.n_active), $label=$total)") end end -# Multi-line show for BitTypedPool -function Base.show(io::IO, ::MIME"text/plain", tp::BitTypedPool) - n_vectors = length(tp.vectors) - println(io, "BitTypedPool:") - println(io, " slots: $n_vectors") - println(io, " active: $(tp.n_active)") - if n_vectors > 0 - total_bits = sum(length(v) for v in tp.vectors) - total_bytes = sum(sizeof(v.chunks) for v in tp.vectors) - println(io, " bits: $total_bits ($(Base.format_bytes(total_bytes)))") - end +# Multi-line show for all AbstractTypedPool +function Base.show(io::IO, ::MIME"text/plain", tp::AbstractTypedPool) + pool_stats(tp; io, name=_show_type_name(tp)) end # Compact one-line show for AdaptiveArrayPool diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index 3e0a267..d6cb3d3 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -258,7 +258,7 @@ end @testset "DisabledPool fallback" begin - # acquire! with Bit + # --- acquire! with Bit --- bv = acquire!(DISABLED_CPU, Bit, 100) @test bv isa BitVector @test length(bv) == 100 @@ -273,6 +273,21 @@ @test ba_tuple isa BitArray{2} @test size(ba_tuple) == (5, 5) + # --- unsafe_acquire! with Bit (covers bitarray.jl:206-208) --- + ubv = unsafe_acquire!(DISABLED_CPU, Bit, 100) + @test ubv isa BitVector + @test length(ubv) == 100 + + # N-D + uba = unsafe_acquire!(DISABLED_CPU, Bit, 10, 10) + @test uba isa BitArray{2} + @test size(uba) == (10, 10) + + # Tuple form + uba_tuple = unsafe_acquire!(DISABLED_CPU, Bit, (5, 5)) + @test uba_tuple isa BitArray{2} + @test size(uba_tuple) == (5, 5) + # ones! 
with Bit (like trues) t = ones!(DISABLED_CPU, Bit, 50) @test t isa BitVector diff --git a/test/test_utils.jl b/test/test_utils.jl index ddbde0c..d668761 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -266,6 +266,54 @@ end @test occursin("empty", output) end + @testset "pool_stats for BitTypedPool" begin + import AdaptiveArrayPools: BitTypedPool + + # Empty BitTypedPool + btp = BitTypedPool() + output = @capture_out pool_stats(btp) + @test occursin("Bit", output) + @test occursin("empty", output) + + # BitTypedPool with content (via AdaptiveArrayPool) + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # Acquire some BitVectors + bv1 = acquire!(pool, Bit, 100) + bv2 = acquire!(pool, Bit, 200) + + output = @capture_out pool_stats(pool) + @test occursin("Bit (fixed)", output) + @test occursin("slots: 2", output) + @test occursin("active: 2", output) + @test occursin("bits:", output) # BitTypedPool uses "bits" label, not "elements" + @test occursin("300", output) # Total bits: 100 + 200 + + rewind!(pool) + + # Test direct BitTypedPool stats + btp2 = BitTypedPool() + # Manually add vectors for testing + push!(btp2.vectors, BitVector(undef, 64)) + btp2.n_active = 1 + + output = @capture_out pool_stats(btp2) + @test occursin("Bit", output) + @test occursin("slots: 1", output) + @test occursin("bits: 64", output) + end + + @testset "direct call of internal helpers" begin + import AdaptiveArrayPools: _default_type_name, _vector_bytes, _count_label, TypedPool, BitTypedPool + @test _default_type_name(TypedPool{Float64}()) == "Float64" + @test _default_type_name(BitTypedPool()) == "Bit" + @test _vector_bytes([1, 2, 3]) == Base.summarysize([1, 2, 3]) + @test _vector_bytes(BitVector(undef, 100)) == sizeof(BitVector(undef, 100).chunks) + @test _count_label(TypedPool{Int}()) == "elements" + @test _count_label(BitTypedPool()) == "bits" + end + @testset "_validate_pool_return with N-D arrays" begin pool = AdaptiveArrayPool() checkpoint!(pool) From 08491acf8d003095be2dee01ad231b181503109e Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 22:11:40 -0800 Subject: [PATCH 09/13] feat: add warning for BitTypedPool growth and enhance tests for Bit and BitMatrix acquisition --- src/bitarray.jl | 4 ++-- test/test_bitarray.jl | 25 +++++++++++++++++++++++++ test/test_utils.jl | 16 ++++++++++++++-- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index 5a8488f..7f0ccc6 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -98,8 +98,8 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) # Warn at powers of 2 (possible missing rewind!) if idx >= 512 && (idx & (idx - 1)) == 0 - total_bits = sum(length, tp.vectors) - @warn "BitTypedPool growing large ($idx arrays, ~$(total_bits ÷ 8) bytes). Missing rewind!()?" + total_bytes = sum(_vector_bytes, tp.vectors) + @warn "BitTypedPool growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" 
end return wrapper diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index d6cb3d3..12a2e5b 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -661,6 +661,31 @@ bv = AdaptiveArrayPools._acquire_impl!(pool, Bit, 100) @test bv isa BitVector @test length(bv) == 100 + + bv = AdaptiveArrayPools._acquire_impl!(pool, Bit, (10, 10)) + @test bv isa BitMatrix + @test size(bv) == (10, 10) + end + @testset "BitTypedPool growth warning at 512 arrays" begin + # Use a fresh pool to ensure we start from 0 + pool = AdaptiveArrayPool() + + @test pooling_enabled(pool) == true + + # Acquire 511 arrays without rewind - no warning yet + for i in 1:511 + acquire!(pool, Bit, 10) + end + @test pool.bits.n_active == 511 + + # The 512th acquire should trigger a warning + @test_logs (:warn, r"BitTypedPool growing large \(512 arrays") begin + acquire!(pool, Bit, 10) + end + @test pool.bits.n_active == 512 + + # Clean up + empty!(pool) end end # BitArray Support diff --git a/test/test_utils.jl b/test/test_utils.jl index d668761..4efd0d2 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -196,8 +196,8 @@ end rewind!(pool) end - @testset "Base.show for TypedPool" begin - import AdaptiveArrayPools: TypedPool + @testset "Base.show for TypedPool & BitTypedPool" begin + import AdaptiveArrayPools: TypedPool, BitTypedPool # Empty TypedPool - compact show tp_empty = TypedPool{Float64}() @@ -210,6 +210,8 @@ end acquire!(pool, Float64, 100) acquire!(pool, Float64, 50) + acquire!(pool, Bit, 10) + output = sprint(show, pool.float64) @test occursin("TypedPool{Float64}", output) @test occursin("slots=2", output) @@ -222,6 +224,16 @@ end @test occursin("slots:", output) @test occursin("active:", output) + # BitTypedPool - compact show + output = sprint(show, pool.bits) + @test output == "BitTypedPool(slots=1, active=1, bits=10)" + # Multi-line show (MIME"text/plain") + output = sprint(show, MIME("text/plain"), pool.bits) + @test occursin("BitTypedPool", output) + @test occursin("slots:", output) + @test occursin("active:", output) + @test occursin("bits:", output) + rewind!(pool) end From 4ab1809b3d037a9af99b6ab2535da324c4e90dcc Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 23:28:06 -0800 Subject: [PATCH 10/13] feat: update documentation for BitArray support, enhancing clarity on usage and performance optimizations --- docs/src/features/bit-arrays.md | 60 ++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/docs/src/features/bit-arrays.md b/docs/src/features/bit-arrays.md index 25b1823..61519b9 100644 --- a/docs/src/features/bit-arrays.md +++ b/docs/src/features/bit-arrays.md @@ -1,6 +1,6 @@ -# BitVector Support +# BitArray Support -AdaptiveArrayPools.jl includes specialized support for `BitArray` (specifically `BitVector`), enabling **~8x memory savings** for boolean arrays compared to standard `Vector{Bool}`. +AdaptiveArrayPools.jl includes specialized support for `BitArray` (including `BitVector` and N-dimensional `BitArray{N}`), enabling **~8x memory savings** for boolean arrays compared to standard `Vector{Bool}`. ## The `Bit` Sentinel Type @@ -20,11 +20,11 @@ For 1D arrays, `acquire!` returns a native `BitVector`. 
This design choice enabl @with_pool pool begin # Acquire a BitVector of length 1000 bv = acquire!(pool, Bit, 1000) - + # Use like normal bv .= true bv[1] = false - + # Supports standard operations with full SIMD acceleration count(bv) end @@ -37,8 +37,11 @@ For multi-dimensional arrays, `acquire!` returns a `BitArray{N}` (specifically ` @with_pool pool begin # 100x100 bit matrix (returns BitMatrix) mask = zeros!(pool, Bit, 100, 100) - + mask[5, 5] = true + + # 3D BitArray + volume = acquire!(pool, Bit, 10, 10, 10) end ``` @@ -50,35 +53,66 @@ For specific `BitVector` operations, prefer `trues!` and `falses!` which mirror @with_pool pool begin # Filled with false (equivalent to `falses(256)`) mask = falses!(pool, 256) - + # Filled with true (equivalent to `trues(256)`) flags = trues!(pool, 256) - + # Multidimensional grid = trues!(pool, 100, 100) - + # Similar to existing BitArray A = BitVector(undef, 50) B = similar!(pool, A) # Reuses eltype(A) -> Bool - + # To explicit get Bit-packed from pool irrespective of source - C = similar!(pool, A, Bit) + C = similar!(pool, A, Bit) end +``` Note: `zeros!(pool, Bit, ...)` and `ones!(pool, Bit, ...)` are also supported (aliased to `falses!` and `trues!`). -``` ## Performance & Safety -### Why Native BitVector? +### Why Native BitArray? The pool returns native `BitVector`/`BitArray` types instead of `SubArray` views for **performance**. Operations like `count()`, `sum()`, and bitwise broadcasting are **10x~100x faster** on native bit arrays because they utilize SIMD instructions on packed 64-bit chunks. +### N-D Caching & Zero Allocation + +The pool uses an N-way associative cache to efficiently reuse `BitArray{N}` instances: + +| Scenario | Allocation | +|----------|------------| +| First call with new dims | ~944 bytes (new `BitArray{N}` created) | +| Subsequent call with same dims | **0 bytes** (cached instance reused) | +| Same ndims, different dims | **0 bytes** (dims/len fields modified in-place) | +| Different ndims | ~944 bytes (new `BitArray{N}` created and cached) | + +Unlike regular `Array` where dimensions are immutable, `BitArray` allows in-place modification of its `dims` and `len` fields. The pool exploits this to achieve **zero allocation** on repeated calls with matching dimensionality. + +```julia +@with_pool pool begin + # First call: allocates BitMatrix wrapper (~944 bytes) + m1 = acquire!(pool, Bit, 100, 100) + + # Rewind to reuse the same slot + rewind!(pool) + + # Same dims: 0 allocation (exact cache hit) + m2 = acquire!(pool, Bit, 100, 100) + + rewind!(pool) + + # Different dims but same ndims: 0 allocation (dims modified in-place) + m3 = acquire!(pool, Bit, 50, 200) +end +``` + ### ⚠️ Important: Do Not Resize While the returned arrays are standard `BitVector` types, they share their underlying memory chunks with the pool. !!! warning "Do Not Resize" **NEVER** resize (`push!`, `pop!`, `resize!`) a pooled `BitVector` or `BitArray`. - + The underlying memory is owned and managed by the pool. Resizing it will detach it from the pool or potentially corrupt the shared state. Treat these arrays as **fixed-size** scratch buffers only. From fa1513ad93da9d646fbc291c6d0b6bf52cc3504f Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 23:29:09 -0800 Subject: [PATCH 11/13] refactor(bitarray): implement N-D caching for zero-allocation BitArray reuse - Replace get_bitvector_wrapper! with get_bitarray! 
supporting proper N-D caching - Exploit BitArray.dims mutability for same-ndims reuse (0 allocation) - Add isa type check before equality to prevent Vector{Any} boxing - Remove unused views/view_lengths fields from BitTypedPool - Split empty! into separate BitTypedPool and TypedPool methods - Add test verifying zero-allocation on cached BitArray retrieval --- src/bitarray.jl | 135 +++++++++++++++++++++++------------------- src/state.jl | 28 ++++++++- src/types.jl | 26 +++----- test/test_bitarray.jl | 25 ++++++++ 4 files changed, 134 insertions(+), 80 deletions(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index 7f0ccc6..f2263ac 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -1,5 +1,5 @@ # ============================================================================== -# BitArray Acquisition (Unified BitVector API) +# BitArray Acquisition (N-D Cached BitArray API) # ============================================================================== # # This file contains BitArray-specific pool operations, separated from the @@ -7,35 +7,35 @@ # # Key components: # - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type -# - get_bitvector_wrapper! - SIMD-optimized BitVector with shared chunks +# - get_bitarray! - N-D BitArray with shared chunks and N-way caching # - _acquire_impl! for Bit - Delegates to _unsafe_acquire_impl! for performance -# - _unsafe_acquire_impl! for Bit - Raw BitVector/BitArray acquisition +# - _unsafe_acquire_impl! for Bit - Raw BitArray acquisition with caching # - DisabledPool fallbacks for Bit type # -# Design Decision: Unified BitVector Return Type +# Design Decision: Unified BitArray Return Type # ============================================= # Unlike regular types where acquire! returns SubArray and unsafe_acquire! -# returns Array, for Bit type BOTH return BitVector. This design choice is +# returns Array, for Bit type BOTH return BitArray{N}. This design choice is # intentional for several reasons: # -# 1. **SIMD Performance**: BitVector operations like `count()`, `sum()`, and +# 1. **SIMD Performance**: BitArray operations like `count()`, `sum()`, and # bitwise operations are ~(10x ~ 100x) faster than their SubArray equivalents # because they use SIMD-optimized chunked algorithms. # -# 2. **API Simplicity**: Users always get BitVector regardless of which API +# 2. **API Simplicity**: Users always get BitArray regardless of which API # they call. No need to remember "use unsafe_acquire! for performance". # -# 3. **Semantic Clarity**: The "unsafe" in unsafe_acquire! refers to memory -# safety concerns (use-after-free risk). BitVector already handles memory -# efficiently (1 bit per element), so the naming would be misleading. +# 3. **N-D Caching**: BitArray{N} can be reused by modifying dims/len fields +# when ndims matches, achieving 0 allocation on repeated calls. This is +# unique to BitArray - regular Array cannot modify dims in place. # # 4. **Backwards Compatibility**: Code using trues!/falses! just works with -# optimal performance - these convenience functions now return BitVector. +# optimal performance - these convenience functions return BitVector. # # Implementation: # - _acquire_impl!(pool, Bit, ...) delegates to _unsafe_acquire_impl! -# - get_bitvector_wrapper! creates BitVector shells sharing pool's chunks -# - N-D requests return reshaped BitArrays (reshape preserves chunk sharing) +# - get_bitarray! 
creates BitArray shells sharing pool's chunks +# - N-way cache stores BitArray{N} entries, reused via dims modification # ============================================================================== # ============================================================================== @@ -47,40 +47,45 @@ @inline Base.one(::Type{Bit}) = true # ============================================================================== -# BitVector Wrapper (chunks sharing for SIMD performance) +# BitArray Acquisition (N-D caching with chunks sharing) # ============================================================================== """ - get_bitvector_wrapper!(tp::BitTypedPool, n::Int) -> BitVector + get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) -> BitArray{N} -Get a BitVector that shares `chunks` with the pooled BitVector. +Get a BitArray{N} that shares `chunks` with the pooled BitVector. -Unlike `get_view!` which returns a `SubArray` (loses SIMD optimizations), -this returns a real `BitVector` with shared chunks, preserving native -BitVector performance (~(10x ~ 100x) faster for `count()`, `sum()`, etc.). +Uses N-way cache for BitArray reuse. Unlike Array which requires unsafe_wrap +for each shape, BitArray can reuse cached entries by modifying `dims`/`len` +fields when ndims matches (0 bytes allocation). -## Implementation -Creates a new BitVector shell and replaces its `chunks` field with the -pooled BitVector's chunks. Uses N-way cache for wrapper reuse. +## Cache Strategy +- **Exact match**: Return cached BitArray directly (0 bytes) +- **Same ndims**: Modify dims/len/chunks of cached entry (0 bytes) +- **Different ndims**: Create new BitArray{N} and cache it (~944 bytes) + +## Implementation Notes +- BitVector (N=1): `size()` uses `len` field, `dims` is ignored +- BitArray{N>1}: `size()` uses `dims` field +- All BitArrays share `chunks` with the pool's backing BitVector ## Safety -The returned BitVector is only valid within the `@with_pool` scope. +The returned BitArray is only valid within the `@with_pool` scope. Do NOT use after the scope ends (use-after-free risk). """ -function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) +function get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) where {N} + total_len = safe_prod(dims) tp.n_active += 1 idx = tp.n_active # 1. Pool expansion needed (new slot) if idx > length(tp.vectors) - pool_bv = BitVector(undef, n) + pool_bv = BitVector(undef, total_len) push!(tp.vectors, pool_bv) - push!(tp.views, view(pool_bv, 1:n)) - push!(tp.view_lengths, n) - # Create wrapper sharing chunks - wrapper = BitVector(undef, n) - wrapper.chunks = pool_bv.chunks + # Create BitArray sharing chunks + ba = BitArray{N}(undef, dims) + ba.chunks = pool_bv.chunks # Expand N-way cache (CACHE_WAYS entries per slot) for _ in 1:CACHE_WAYS @@ -92,8 +97,8 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) # Cache in first way base = (idx - 1) * CACHE_WAYS + 1 - @inbounds tp.nd_arrays[base] = wrapper - @inbounds tp.nd_dims[base] = n + @inbounds tp.nd_arrays[base] = ba + @inbounds tp.nd_dims[base] = dims @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) # Warn at powers of 2 (possible missing rewind!) @@ -102,49 +107,59 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) @warn "BitTypedPool growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" end - return wrapper + return ba end - # 2. Check N-way cache for hit (cache slots always exist - created with vector slot above) + # 2. 
Ensure pool_bv has correct size @inbounds pool_bv = tp.vectors[idx] + if length(pool_bv) != total_len + resize!(pool_bv, total_len) + end current_ptr = UInt(pointer(pool_bv.chunks)) base = (idx - 1) * CACHE_WAYS - # Linear search across all ways + # 3. Check N-way cache for hit for k in 1:CACHE_WAYS cache_idx = base + k - @inbounds cached_n = tp.nd_dims[cache_idx] + @inbounds cached_dims = tp.nd_dims[cache_idx] @inbounds cached_ptr = tp.nd_ptrs[cache_idx] - if cached_n == n && cached_ptr == current_ptr - return @inbounds tp.nd_arrays[cache_idx]::BitVector + # Must check isa FIRST for type stability (avoids boxing in == comparison) + if cached_dims isa NTuple{N,Int} && cached_ptr == current_ptr + if cached_dims == dims + # Exact match - return cached BitArray directly (0 alloc) + return @inbounds tp.nd_arrays[cache_idx]::BitArray{N} + else + # Same ndims but different dims - reuse by modifying fields (0 alloc!) + ba = @inbounds tp.nd_arrays[cache_idx]::BitArray{N} + ba.len = total_len + ba.dims = dims + ba.chunks = pool_bv.chunks + # Update cache metadata + @inbounds tp.nd_dims[cache_idx] = dims + return ba + end end end - # 3. Cache miss - resize pool_bv to EXACTLY n elements and create new wrapper - # Unlike regular arrays where we only grow, BitVector wrappers MUST have exactly - # the right number of chunks. Otherwise fill!()/count() iterate over all chunks, - # not just the bits within wrapper.len, causing incorrect behavior. - if length(pool_bv) != n - resize!(pool_bv, n) - @inbounds tp.views[idx] = view(pool_bv, 1:n) - @inbounds tp.view_lengths[idx] = n - end - - wrapper = BitVector(undef, n) - wrapper.chunks = pool_bv.chunks + # 4. Cache miss - create new BitArray{N} + ba = BitArray{N}(undef, dims) + ba.chunks = pool_bv.chunks # Round-robin replacement @inbounds way_offset = tp.nd_next_way[idx] target_idx = base + way_offset + 1 - @inbounds tp.nd_arrays[target_idx] = wrapper - @inbounds tp.nd_dims[target_idx] = n - @inbounds tp.nd_ptrs[target_idx] = UInt(pointer(pool_bv.chunks)) + @inbounds tp.nd_arrays[target_idx] = ba + @inbounds tp.nd_dims[target_idx] = dims + @inbounds tp.nd_ptrs[target_idx] = current_ptr @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS - return wrapper + return ba end +# Convenience: 1D case wraps to tuple +@inline get_bitarray!(tp::BitTypedPool, n::Int) = get_bitarray!(tp, (n,)) + # ============================================================================== # Acquire Implementation (Bit type → delegates to unsafe_acquire for performance) # ============================================================================== @@ -174,20 +189,20 @@ end # Unsafe Acquire Implementation (Bit type) # ============================================================================== -# Bit type: returns BitVector with shared chunks (SIMD optimized) +# Bit type: returns BitArray{N} with shared chunks (SIMD optimized, N-D cached) @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) tp = get_typed_pool!(pool, Bit)::BitTypedPool - return get_bitvector_wrapper!(tp, n) + return get_bitarray!(tp, n) end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} - total = safe_prod(dims) - bv = _unsafe_acquire_impl!(pool, Bit, total) - return reshape(bv, dims) # BitArray{N} (Julia's reshape on BitVector returns BitArray) + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, dims) end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, 
dims::NTuple{N,Int}) where {N} - return _unsafe_acquire_impl!(pool, Bit, dims...) + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, dims) end # ============================================================================== diff --git a/src/state.jl b/src/state.jl index 9cb09ab..c9b2a66 100644 --- a/src/state.jl +++ b/src/state.jl @@ -206,12 +206,34 @@ end # ============================================================================== """ - empty!(tp::AbstractTypedPool) + empty!(tp::BitTypedPool) -Clear all internal storage, releasing all memory. +Clear all internal storage for BitTypedPool, releasing all memory. Restores sentinel values for 1-based sentinel pattern. """ -function Base.empty!(tp::AbstractTypedPool) +function Base.empty!(tp::BitTypedPool) + empty!(tp.vectors) + # Clear N-way wrapper cache + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + tp.n_active = 0 + # Restore sentinel values (1-based sentinel pattern) + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) # Sentinel: n_active=0 at depth=0 + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) # Sentinel: depth=0 = no checkpoint + return tp +end + +""" + empty!(tp::TypedPool) + +Clear all internal storage for TypedPool, releasing all memory. +Restores sentinel values for 1-based sentinel pattern. +""" +function Base.empty!(tp::TypedPool) empty!(tp.vectors) empty!(tp.views) empty!(tp.view_lengths) diff --git a/src/types.jl b/src/types.jl index 0f2e8b5..957eb92 100644 --- a/src/types.jl +++ b/src/types.jl @@ -291,8 +291,6 @@ performance without needing to choose between APIs. ## Fields - `vectors`: Backing `BitVector` storage -- `views`: Cached `SubArray` views (legacy, maintained for compatibility) -- `view_lengths`: Cached lengths for fast comparison - `nd_arrays`: Cached wrapper BitVectors (chunks sharing) - `nd_dims`: Cached lengths for wrapper cache validation - `nd_ptrs`: Cached chunk pointers for invalidation detection @@ -324,17 +322,14 @@ mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} # --- Storage --- vectors::Vector{BitVector} - # --- 1D Cache (1:1 mapping) --- - views::Vector{SubArray{Bool, 1, BitVector, Tuple{UnitRange{Int64}}, true}} - view_lengths::Vector{Int} - - # --- N-D Array Cache (empty, for empty! compatibility) --- - # BitArray cannot use unsafe_wrap, so no N-D caching is possible. - # These fields exist only for compatibility with empty!(::AbstractTypedPool). - nd_arrays::Vector{Any} - nd_dims::Vector{Any} - nd_ptrs::Vector{UInt} - nd_next_way::Vector{Int} + # --- 1D BitVector Wrapper Cache (N-way set associative) --- + # Unlike TypedPool which uses views for 1D and nd_* for N-D, + # BitTypedPool uses nd_* for 1D wrapper caching (BitVector with shared chunks). + # No views needed since we always return BitVector, not SubArray. 
+ nd_arrays::Vector{Any} # BitVector wrappers + nd_dims::Vector{Any} # requested lengths (Int, not tuple) + nd_ptrs::Vector{UInt} # pointer validation + nd_next_way::Vector{Int} # round-robin counter per slot # --- State Management (1-based sentinel pattern) --- n_active::Int @@ -345,10 +340,7 @@ end BitTypedPool() = BitTypedPool( # Storage BitVector[], - # 1D Cache - SubArray{Bool, 1, BitVector, Tuple{UnitRange{Int64}}, true}[], - Int[], - # N-D Array Cache (empty, for compatibility) + # 1D BitVector Wrapper Cache (N-way) Any[], Any[], UInt[], diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index 12a2e5b..d848dcb 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -688,4 +688,29 @@ empty!(pool) end + @testset "N-D BitArray caching - zero allocation on reuse" begin + # Test that N-D caching works: first call may allocate, subsequent calls should not + # This verifies the optimization where BitArray{N}.dims can be modified in-place + + pool = get_task_local_pool() + empty!(pool) # Start fresh + @with_pool pool function foo() + # Warmup to populate cache + bv = acquire!(pool, Bit, 100) + ba2 = acquire!(pool, Bit, 10, 10) + ba3 = acquire!(pool, Bit, 5, 5, 4) + + tt1 = trues!(pool, 256) + tt2 = ones!(pool, 10, 20) + ff1 = falses!(pool, 100, 5) + ff2 = zeros!(pool, 100) + + C = similar!(pool, tt1) + end + + @test (@allocated foo()) > 0 # First call allocates + @test (@allocated foo()) == 0 # Subsequent calls reuse cached arrays + @test (@allocated foo()) == 0 # Further calls also zero allocation + end + end # BitArray Support From a9f9f5c3c93539a6d120a0edd65cb2be0bdfd745 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Sat, 31 Jan 2026 09:16:26 -0800 Subject: [PATCH 12/13] feat(bitarray): enhance N-D BitArray caching and add allocation tests --- src/bitarray.jl | 8 ++--- src/types.jl | 11 +++--- test/runtests.jl | 1 + test/test_allocation.jl | 30 ++++++++++++++++ test/test_bitarray.jl | 76 ++++++++++++++++++++++++++++++++--------- 5 files changed, 100 insertions(+), 26 deletions(-) create mode 100644 test/test_allocation.jl diff --git a/src/bitarray.jl b/src/bitarray.jl index f2263ac..eb9addf 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -165,12 +165,12 @@ end # ============================================================================== # # Unlike other types where acquire! returns SubArray (view-based) and -# unsafe_acquire! returns Array (raw), Bit type always returns BitVector. -# This is because BitVector's SIMD-optimized operations (count, sum, etc.) +# unsafe_acquire! returns Array (raw), Bit type always returns BitArray{N}. +# This is because BitArray's SIMD-optimized operations (count, sum, etc.) # are ~(10x ~ 100x) faster than SubArray equivalents. # -# The delegation is transparent: users calling acquire!(pool, Bit, n) get -# BitVector without needing to know about unsafe_acquire!. +# The delegation is transparent: users calling acquire!(pool, Bit, dims...) get +# BitArray{N} without needing to know about unsafe_acquire!. # Bit type: delegates to _unsafe_acquire_impl! 
for SIMD performance @inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) diff --git a/src/types.jl b/src/types.jl index 957eb92..0b6f62f 100644 --- a/src/types.jl +++ b/src/types.jl @@ -322,12 +322,13 @@ mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} # --- Storage --- vectors::Vector{BitVector} - # --- 1D BitVector Wrapper Cache (N-way set associative) --- + # --- N-D BitArray Cache (N-way set associative) --- # Unlike TypedPool which uses views for 1D and nd_* for N-D, - # BitTypedPool uses nd_* for 1D wrapper caching (BitVector with shared chunks). - # No views needed since we always return BitVector, not SubArray. - nd_arrays::Vector{Any} # BitVector wrappers - nd_dims::Vector{Any} # requested lengths (Int, not tuple) + # BitTypedPool uses nd_* for ALL dimensions (1D, 2D, 3D, etc.). + # No views needed since we always return BitArray{N}, not SubArray. + # BitArray.dims is mutable, enabling 0-alloc reuse for same-ndims requests. + nd_arrays::Vector{Any} # Cached BitArray{N} instances + nd_dims::Vector{Any} # Cached dims (NTuple{N,Int}) nd_ptrs::Vector{UInt} # pointer validation nd_next_way::Vector{Int} # round-robin counter per slot diff --git a/test/runtests.jl b/test/runtests.jl index c4417de..2525187 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -27,6 +27,7 @@ else include("test_convenience.jl") include("test_bitarray.jl") include("test_coverage.jl") + include("test_allocation.jl") # CUDA extension tests (auto-detect, skip with TEST_CUDA=false) if get(ENV, "TEST_CUDA", "true") != "false" diff --git a/test/test_allocation.jl b/test/test_allocation.jl new file mode 100644 index 0000000..270fb8f --- /dev/null +++ b/test/test_allocation.jl @@ -0,0 +1,30 @@ +@with_pool pool function foo() + float64_vec = acquire!(pool, Float64, 10) + float32_vec = acquire!(pool, Float32, 10) + + float64_mat = acquire!(pool, Float64, 10, 10) + float32_mat = acquire!(pool, Float32, 10, 10) + + bv = acquire!(pool, Bit, 100) + ba2 = acquire!(pool, Bit, 10, 10) + ba3 = acquire!(pool, Bit, 5, 5, 4) + + tt1 = trues!(pool, 256) + tt2 = ones!(pool, Bit, 10, 20) + ff1 = falses!(pool, 100, 5) + ff2 = zeros!(pool, Bit, 100) + + C = similar!(pool, tt1) +end + + +@testset "zero allocation on reuse" begin + + alloc1 = @allocated foo() + alloc2 = @allocated foo() + alloc3 = @allocated foo() + + @test alloc1 > 0 # First call allocates + @test alloc2 == 0 # Subsequent calls reuse cached arrays + @test alloc3 == 0 # Further calls also zero allocation +end \ No newline at end of file diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index d848dcb..08e9fca 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -688,29 +688,71 @@ empty!(pool) end - @testset "N-D BitArray caching - zero allocation on reuse" begin - # Test that N-D caching works: first call may allocate, subsequent calls should not - # This verifies the optimization where BitArray{N}.dims can be modified in-place + @testset "N-D BitArray caching - same ndims different dims" begin + # Test the optimization where BitArray{N}.dims can be modified in-place + # when ndims matches but dims differ (e.g., (10,10) → (5,20)) pool = get_task_local_pool() - empty!(pool) # Start fresh - @with_pool pool function foo() - # Warmup to populate cache - bv = acquire!(pool, Bit, 100) - ba2 = acquire!(pool, Bit, 10, 10) - ba3 = acquire!(pool, Bit, 5, 5, 4) + empty!(pool) + + # Test correctness: verify dims are updated correctly + @with_pool pool begin + m1 = acquire!(pool, Bit, 10, 10) + @test size(m1) == (10, 
10) + rewind!(pool) + + m2 = acquire!(pool, Bit, 5, 20) # Same ndims, different dims + @test size(m2) == (5, 20) + rewind!(pool) + + m3 = acquire!(pool, Bit, 25, 4) + @test size(m3) == (25, 4) + end + + # Test zero-allocation: separate function without assertions/returns + @with_pool pool function bar_alloc() + acquire!(pool, Bit, 10, 10) + rewind!(pool) + acquire!(pool, Bit, 5, 20) + rewind!(pool) + acquire!(pool, Bit, 25, 4) + nothing + end - tt1 = trues!(pool, 256) - tt2 = ones!(pool, 10, 20) - ff1 = falses!(pool, 100, 5) - ff2 = zeros!(pool, 100) + bar_alloc() # Warmup + @test (@allocated bar_alloc()) == 0 + end + + @testset "N-D BitArray caching - cache eviction (round-robin)" begin + # Test that round-robin replacement works correctly when cache is full + # CACHE_WAYS determines how many different ndims can be cached per slot + + pool = get_task_local_pool() + empty!(pool) - C = similar!(pool, tt1) + # Test that different ndims each allocate initially, but reuse on repeat + @with_pool pool function test_ndims_caching() + # These all use slot 1, each with different ndims + acquire!(pool, Bit, 100) # 1D + acquire!(pool, Bit, 10, 10) # 2D + acquire!(pool, Bit, 5, 5, 4) # 3D + acquire!(pool, Bit, 5, 2, 2, 5) # 4D + nothing end - @test (@allocated foo()) > 0 # First call allocates - @test (@allocated foo()) == 0 # Subsequent calls reuse cached arrays - @test (@allocated foo()) == 0 # Further calls also zero allocation + test_ndims_caching() # Warmup + @test (@allocated test_ndims_caching()) == 0 + + # Test that cache eviction doesn't break functionality + @with_pool pool begin + # Exceed CACHE_WAYS with different ndims to force eviction + for n in 1:6 + dims = ntuple(_ -> 2, n) + ba = acquire!(pool, Bit, dims...) + @test ndims(ba) == n + @test size(ba) == dims + end + end end end # BitArray Support From 8e0ff199a93fa4e9724c517dd6f347c2e53b5e7e Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Sat, 31 Jan 2026 09:17:58 -0800 Subject: [PATCH 13/13] feat(bitarray): remove N-D BitArray caching tests for optimization verification --- test/test_bitarray.jl | 67 ------------------------------------------- 1 file changed, 67 deletions(-) diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index 08e9fca..12a2e5b 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -688,71 +688,4 @@ empty!(pool) end - @testset "N-D BitArray caching - same ndims different dims" begin - # Test the optimization where BitArray{N}.dims can be modified in-place - # when ndims matches but dims differ (e.g., (10,10) → (5,20)) - - pool = get_task_local_pool() - empty!(pool) - - # Test correctness: verify dims are updated correctly - @with_pool pool begin - m1 = acquire!(pool, Bit, 10, 10) - @test size(m1) == (10, 10) - rewind!(pool) - - m2 = acquire!(pool, Bit, 5, 20) # Same ndims, different dims - @test size(m2) == (5, 20) - rewind!(pool) - - m3 = acquire!(pool, Bit, 25, 4) - @test size(m3) == (25, 4) - end - - # Test zero-allocation: separate function without assertions/returns - @with_pool pool function bar_alloc() - acquire!(pool, Bit, 10, 10) - rewind!(pool) - acquire!(pool, Bit, 5, 20) - rewind!(pool) - acquire!(pool, Bit, 25, 4) - nothing - end - - bar_alloc() # Warmup - @test (@allocated bar_alloc()) == 0 - end - - @testset "N-D BitArray caching - cache eviction (round-robin)" begin - # Test that round-robin replacement works correctly when cache is full - # CACHE_WAYS determines how many different ndims can be cached per slot - - pool = get_task_local_pool() - empty!(pool) - - # Test that 
different ndims each allocate initially, but reuse on repeat - @with_pool pool function test_ndims_caching() - # These all use slot 1, each with different ndims - acquire!(pool, Bit, 100) # 1D - acquire!(pool, Bit, 10, 10) # 2D - acquire!(pool, Bit, 5, 5, 4) # 3D - acquire!(pool, Bit, 5, 2, 2, 5) # 4D - nothing - end - - test_ndims_caching() # Warmup - @test (@allocated test_ndims_caching()) == 0 - - # Test that cache eviction doesn't break functionality - @with_pool pool begin - # Exceed CACHE_WAYS with different ndims to force eviction - for n in 1:6 - dims = ntuple(_ -> 2, n) - ba = acquire!(pool, Bit, dims...) - @test ndims(ba) == n - @test size(ba) == dims - end - end - end - end # BitArray Support
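
For quick verification after applying the series, a minimal usage sketch of the `Bit` fast path (illustrative only, not part of any patch above): it assumes the exported API exercised in the docs and tests in this series — `@with_pool`, `acquire!`, `Bit`, `falses!` — and `bit_masks_demo` is a placeholder name; the allocation figures are expectations taken from `test/test_allocation.jl`, not guarantees.

```julia
using AdaptiveArrayPools

# Pattern mirrors test/test_allocation.jl: @with_pool binds `pool` for the function body.
@with_pool pool function bit_masks_demo()
    # 1D: a native BitVector whose chunks live in the pool (SIMD-friendly count/sum)
    bv = acquire!(pool, Bit, 1_000)
    bv .= true

    # 2D: a BitMatrix sharing pooled storage (falses! is the zeros!(pool, Bit, ...) alias)
    mask = falses!(pool, 100, 100)
    mask[5, 5] = true

    return count(bv) + count(mask)
end

bit_masks_demo()             # first call allocates: pool slots plus N-way cache entries
@allocated bit_masks_demo()  # expected to be 0 on reuse, per the tests added in PATCH 12
```

As the documentation changes in PATCH 07 and PATCH 10 stress, the returned `BitVector`/`BitMatrix` are fixed-size scratch buffers: never resize them and never let them escape the `@with_pool` scope.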