diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index 639dc50..baa39d2 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -19,4 +19,3 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} ssh: ${{ secrets.DOCUMENTER_KEY }} dispatch: true - changelog: false diff --git a/docs/src/features/bit-arrays.md b/docs/src/features/bit-arrays.md index ec21d39..61519b9 100644 --- a/docs/src/features/bit-arrays.md +++ b/docs/src/features/bit-arrays.md @@ -1,6 +1,6 @@ -# BitVector Support +# BitArray Support -AdaptiveArrayPools.jl includes specialized support for `BitArray` (specifically `BitVector`), enabling **~8x memory savings** for boolean arrays compared to standard `Vector{Bool}`. +AdaptiveArrayPools.jl includes specialized support for `BitArray` (including `BitVector` and N-dimensional `BitArray{N}`), enabling **~8x memory savings** for boolean arrays compared to standard `Vector{Bool}`. ## The `Bit` Sentinel Type @@ -14,31 +14,34 @@ To distinguish between standard boolean arrays (`Vector{Bool}`, 1 byte/element) ## Usage ### 1D Arrays (BitVector) -For 1D arrays, `acquire!` returns a view into a pooled `BitVector`. +For 1D arrays, `acquire!` returns a native `BitVector`. This design choice enables full SIMD optimization, making operations significantly faster (10x~100x) than using views. ```julia @with_pool pool begin # Acquire a BitVector of length 1000 bv = acquire!(pool, Bit, 1000) - + # Use like normal bv .= true bv[1] = false - - # Supports standard operations + + # Supports standard operations with full SIMD acceleration count(bv) end ``` -### N-D Arrays (BitArray / Reshaped) -For multi-dimensional arrays, `acquire!` returns a `ReshapedArray` wrapper around the linear `BitVector`. This maintains zero-allocation efficiency while providing N-D indexing. +### N-D Arrays (BitArray) +For multi-dimensional arrays, `acquire!` returns a `BitArray{N}` (specifically `BitMatrix` for 2D). This preserves the packed memory layout and SIMD benefits while providing N-D indexing. ```julia @with_pool pool begin - # 100x100 bit matrix + # 100x100 bit matrix (returns BitMatrix) mask = zeros!(pool, Bit, 100, 100) - + mask[5, 5] = true + + # 3D BitArray + volume = acquire!(pool, Bit, 10, 10, 10) end ``` @@ -50,29 +53,66 @@ For specific `BitVector` operations, prefer `trues!` and `falses!` which mirror @with_pool pool begin # Filled with false (equivalent to `falses(256)`) mask = falses!(pool, 256) - + # Filled with true (equivalent to `trues(256)`) flags = trues!(pool, 256) - + # Multidimensional grid = trues!(pool, 100, 100) - + # Similar to existing BitArray A = BitVector(undef, 50) B = similar!(pool, A) # Reuses eltype(A) -> Bool - + # To explicit get Bit-packed from pool irrespective of source - C = similar!(pool, A, Bit) + C = similar!(pool, A, Bit) end +``` Note: `zeros!(pool, Bit, ...)` and `ones!(pool, Bit, ...)` are also supported (aliased to `falses!` and `trues!`). + +## Performance & Safety + +### Why Native BitArray? +The pool returns native `BitVector`/`BitArray` types instead of `SubArray` views for **performance**. +Operations like `count()`, `sum()`, and bitwise broadcasting are **10x~100x faster** on native bit arrays because they utilize SIMD instructions on packed 64-bit chunks. 
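+
+The gap is easy to observe with a quick micro-benchmark. The snippet below is a minimal sketch (it is not part of the test suite and assumes BenchmarkTools.jl is installed); the exact speedup depends on the machine and array size, but it illustrates why the pool hands back a packed `BitVector` rather than a byte-per-element view:
+
+```julia
+using BenchmarkTools, AdaptiveArrayPools
+
+pool = AdaptiveArrayPool()
+
+@with_pool pool begin
+    bv = trues!(pool, 1_000_000)           # packed BitVector from the pool (1 bit/element)
+    vb = acquire!(pool, Bool, 1_000_000)   # Vector{Bool}-backed view (1 byte/element)
+    fill!(vb, true)
+
+    @btime count($bv)   # SIMD count over packed 64-bit chunks
+    @btime count($vb)   # byte-wise count, for comparison
+end
+```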
+ +### N-D Caching & Zero Allocation + +The pool uses an N-way associative cache to efficiently reuse `BitArray{N}` instances: + +| Scenario | Allocation | +|----------|------------| +| First call with new dims | ~944 bytes (new `BitArray{N}` created) | +| Subsequent call with same dims | **0 bytes** (cached instance reused) | +| Same ndims, different dims | **0 bytes** (dims/len fields modified in-place) | +| Different ndims | ~944 bytes (new `BitArray{N}` created and cached) | + +Unlike regular `Array` where dimensions are immutable, `BitArray` allows in-place modification of its `dims` and `len` fields. The pool exploits this to achieve **zero allocation** on repeated calls with matching dimensionality. + +```julia +@with_pool pool begin + # First call: allocates BitMatrix wrapper (~944 bytes) + m1 = acquire!(pool, Bit, 100, 100) + + # Rewind to reuse the same slot + rewind!(pool) + + # Same dims: 0 allocation (exact cache hit) + m2 = acquire!(pool, Bit, 100, 100) + + rewind!(pool) + + # Different dims but same ndims: 0 allocation (dims modified in-place) + m3 = acquire!(pool, Bit, 50, 200) +end ``` -## How It Works +### ⚠️ Important: Do Not Resize + +While the returned arrays are standard `BitVector` types, they share their underlying memory chunks with the pool. -The pool maintains a separate `BitTypedPool` specifically for `BitVector` storage. -- **Sentinel**: `acquire!(..., Bit, ...)` dispatches to this special pool. -- **Views**: 1D returns `SubArray{Bool, 1, BitVector, ...}`. -- **Reshaping**: N-D returns `ReshapedArray{Bool, N, SubArray{...}}`. +!!! warning "Do Not Resize" + **NEVER** resize (`push!`, `pop!`, `resize!`) a pooled `BitVector` or `BitArray`. -This ensures that even for complex shapes, the underlying storage is always a compact `BitVector` reused from the pool. + The underlying memory is owned and managed by the pool. Resizing it will detach it from the pool or potentially corrupt the shared state. Treat these arrays as **fixed-size** scratch buffers only. diff --git a/src/AdaptiveArrayPools.jl b/src/AdaptiveArrayPools.jl index 61f691a..7092822 100644 --- a/src/AdaptiveArrayPools.jl +++ b/src/AdaptiveArrayPools.jl @@ -28,6 +28,9 @@ include("utils.jl") # Acquisition operations: get_view!, acquire!, unsafe_acquire!, aliases include("acquire.jl") +# BitArray-specific acquisition (SIMD-optimized BitVector operations) +include("bitarray.jl") + # Convenience functions: zeros!, ones!, similar! include("convenience.jl") diff --git a/src/acquire.jl b/src/acquire.jl index b8ddcf6..428738b 100644 --- a/src/acquire.jl +++ b/src/acquire.jl @@ -6,29 +6,12 @@ @inline allocate_vector(::AbstractTypedPool{T,Vector{T}}, n::Int) where {T} = Vector{T}(undef, n) -# BitTypedPool allocates BitVector (used when acquiring with Bit type) -@inline allocate_vector(::BitTypedPool, n::Int) = BitVector(undef, n) - -# Bit type returns Bool element type for fill operations (zero/one) -@inline Base.zero(::Type{Bit}) = false -@inline Base.one(::Type{Bit}) = true - # Wrap flat view into N-D array (dispatch point for extensions) @inline function wrap_array(::AbstractTypedPool{T,Vector{T}}, flat_view, dims::NTuple{N,Int}) where {T,N} unsafe_wrap(Array{T,N}, pointer(flat_view), dims) end -# BitTypedPool cannot use unsafe_wrap - throw clear error -# Called from _unsafe_acquire_impl! dispatches for Bit type -@noinline function _throw_bit_unsafe_error() - throw(ArgumentError( - "unsafe_acquire!(pool, Bit, ...) is not supported. 
" * - "BitArray stores data in immutable chunks::Vector{UInt64} that cannot be wrapped with unsafe_wrap. " * - "Use acquire!(pool, Bit, ...) instead, which returns a view." - )) -end - # ============================================================================== # Helper: Overflow-Safe Product # ============================================================================== @@ -245,11 +228,6 @@ end # Similar-style @inline _unsafe_acquire_impl!(pool::AbstractArrayPool, x::AbstractArray) = _unsafe_acquire_impl!(pool, eltype(x), size(x)) -# Bit type: unsafe_acquire! not supported (throw clear error early) -@inline _unsafe_acquire_impl!(::AbstractArrayPool, ::Type{Bit}, ::Int) = _throw_bit_unsafe_error() -@inline _unsafe_acquire_impl!(::AbstractArrayPool, ::Type{Bit}, ::Vararg{Int,N}) where {N} = _throw_bit_unsafe_error() -@inline _unsafe_acquire_impl!(::AbstractArrayPool, ::Type{Bit}, ::NTuple{N,Int}) where {N} = _throw_bit_unsafe_error() - # ============================================================================== # Acquisition API (User-facing with untracked marking) # ============================================================================== @@ -450,11 +428,6 @@ const _acquire_array_impl! = _unsafe_acquire_impl! @inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = Array{T,N}(undef, dims) @inline unsafe_acquire!(::DisabledPool{:cpu}, x::AbstractArray) = similar(x) -# --- acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- -@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) -@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) -@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) - # --- Generic DisabledPool fallbacks (unknown backend → error) --- @inline acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) @inline unsafe_acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) diff --git a/src/bitarray.jl b/src/bitarray.jl new file mode 100644 index 0000000..eb9addf --- /dev/null +++ b/src/bitarray.jl @@ -0,0 +1,220 @@ +# ============================================================================== +# BitArray Acquisition (N-D Cached BitArray API) +# ============================================================================== +# +# This file contains BitArray-specific pool operations, separated from the +# generic Array acquisition code in acquire.jl for maintainability. +# +# Key components: +# - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type +# - get_bitarray! - N-D BitArray with shared chunks and N-way caching +# - _acquire_impl! for Bit - Delegates to _unsafe_acquire_impl! for performance +# - _unsafe_acquire_impl! for Bit - Raw BitArray acquisition with caching +# - DisabledPool fallbacks for Bit type +# +# Design Decision: Unified BitArray Return Type +# ============================================= +# Unlike regular types where acquire! returns SubArray and unsafe_acquire! +# returns Array, for Bit type BOTH return BitArray{N}. This design choice is +# intentional for several reasons: +# +# 1. **SIMD Performance**: BitArray operations like `count()`, `sum()`, and +# bitwise operations are ~(10x ~ 100x) faster than their SubArray equivalents +# because they use SIMD-optimized chunked algorithms. +# +# 2. **API Simplicity**: Users always get BitArray regardless of which API +# they call. 
No need to remember "use unsafe_acquire! for performance". +# +# 3. **N-D Caching**: BitArray{N} can be reused by modifying dims/len fields +# when ndims matches, achieving 0 allocation on repeated calls. This is +# unique to BitArray - regular Array cannot modify dims in place. +# +# 4. **Backwards Compatibility**: Code using trues!/falses! just works with +# optimal performance - these convenience functions return BitVector. +# +# Implementation: +# - _acquire_impl!(pool, Bit, ...) delegates to _unsafe_acquire_impl! +# - get_bitarray! creates BitArray shells sharing pool's chunks +# - N-way cache stores BitArray{N} entries, reused via dims modification +# ============================================================================== + +# ============================================================================== +# Fill Value Dispatch (BitArray-specific) +# ============================================================================== + +# Bit type returns Bool element type for fill operations (zero/one) +@inline Base.zero(::Type{Bit}) = false +@inline Base.one(::Type{Bit}) = true + +# ============================================================================== +# BitArray Acquisition (N-D caching with chunks sharing) +# ============================================================================== + +""" + get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) -> BitArray{N} + +Get a BitArray{N} that shares `chunks` with the pooled BitVector. + +Uses N-way cache for BitArray reuse. Unlike Array which requires unsafe_wrap +for each shape, BitArray can reuse cached entries by modifying `dims`/`len` +fields when ndims matches (0 bytes allocation). + +## Cache Strategy +- **Exact match**: Return cached BitArray directly (0 bytes) +- **Same ndims**: Modify dims/len/chunks of cached entry (0 bytes) +- **Different ndims**: Create new BitArray{N} and cache it (~944 bytes) + +## Implementation Notes +- BitVector (N=1): `size()` uses `len` field, `dims` is ignored +- BitArray{N>1}: `size()` uses `dims` field +- All BitArrays share `chunks` with the pool's backing BitVector + +## Safety +The returned BitArray is only valid within the `@with_pool` scope. +Do NOT use after the scope ends (use-after-free risk). +""" +function get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) where {N} + total_len = safe_prod(dims) + tp.n_active += 1 + idx = tp.n_active + + # 1. Pool expansion needed (new slot) + if idx > length(tp.vectors) + pool_bv = BitVector(undef, total_len) + push!(tp.vectors, pool_bv) + + # Create BitArray sharing chunks + ba = BitArray{N}(undef, dims) + ba.chunks = pool_bv.chunks + + # Expand N-way cache (CACHE_WAYS entries per slot) + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + + # Cache in first way + base = (idx - 1) * CACHE_WAYS + 1 + @inbounds tp.nd_arrays[base] = ba + @inbounds tp.nd_dims[base] = dims + @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) + + # Warn at powers of 2 (possible missing rewind!) + if idx >= 512 && (idx & (idx - 1)) == 0 + total_bytes = sum(_vector_bytes, tp.vectors) + @warn "BitTypedPool growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" + end + + return ba + end + + # 2. Ensure pool_bv has correct size + @inbounds pool_bv = tp.vectors[idx] + if length(pool_bv) != total_len + resize!(pool_bv, total_len) + end + current_ptr = UInt(pointer(pool_bv.chunks)) + base = (idx - 1) * CACHE_WAYS + + # 3. 
Check N-way cache for hit + for k in 1:CACHE_WAYS + cache_idx = base + k + @inbounds cached_dims = tp.nd_dims[cache_idx] + @inbounds cached_ptr = tp.nd_ptrs[cache_idx] + + # Must check isa FIRST for type stability (avoids boxing in == comparison) + if cached_dims isa NTuple{N,Int} && cached_ptr == current_ptr + if cached_dims == dims + # Exact match - return cached BitArray directly (0 alloc) + return @inbounds tp.nd_arrays[cache_idx]::BitArray{N} + else + # Same ndims but different dims - reuse by modifying fields (0 alloc!) + ba = @inbounds tp.nd_arrays[cache_idx]::BitArray{N} + ba.len = total_len + ba.dims = dims + ba.chunks = pool_bv.chunks + # Update cache metadata + @inbounds tp.nd_dims[cache_idx] = dims + return ba + end + end + end + + # 4. Cache miss - create new BitArray{N} + ba = BitArray{N}(undef, dims) + ba.chunks = pool_bv.chunks + + # Round-robin replacement + @inbounds way_offset = tp.nd_next_way[idx] + target_idx = base + way_offset + 1 + @inbounds tp.nd_arrays[target_idx] = ba + @inbounds tp.nd_dims[target_idx] = dims + @inbounds tp.nd_ptrs[target_idx] = current_ptr + @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS + + return ba +end + +# Convenience: 1D case wraps to tuple +@inline get_bitarray!(tp::BitTypedPool, n::Int) = get_bitarray!(tp, (n,)) + +# ============================================================================== +# Acquire Implementation (Bit type → delegates to unsafe_acquire for performance) +# ============================================================================== +# +# Unlike other types where acquire! returns SubArray (view-based) and +# unsafe_acquire! returns Array (raw), Bit type always returns BitArray{N}. +# This is because BitArray's SIMD-optimized operations (count, sum, etc.) +# are ~(10x ~ 100x) faster than SubArray equivalents. +# +# The delegation is transparent: users calling acquire!(pool, Bit, dims...) get +# BitArray{N} without needing to know about unsafe_acquire!. + +# Bit type: delegates to _unsafe_acquire_impl! for SIMD performance +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + return _unsafe_acquire_impl!(pool, Bit, n) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + return _unsafe_acquire_impl!(pool, Bit, dims...) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + return _unsafe_acquire_impl!(pool, Bit, dims...) 
+end + +# ============================================================================== +# Unsafe Acquire Implementation (Bit type) +# ============================================================================== + +# Bit type: returns BitArray{N} with shared chunks (SIMD optimized, N-D cached) +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, n) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, dims) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, dims) +end + +# ============================================================================== +# DisabledPool Fallbacks (Bit type) +# ============================================================================== + +# --- acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) + +# --- unsafe_acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) diff --git a/src/state.jl b/src/state.jl index 9cb09ab..c9b2a66 100644 --- a/src/state.jl +++ b/src/state.jl @@ -206,12 +206,34 @@ end # ============================================================================== """ - empty!(tp::AbstractTypedPool) + empty!(tp::BitTypedPool) -Clear all internal storage, releasing all memory. +Clear all internal storage for BitTypedPool, releasing all memory. Restores sentinel values for 1-based sentinel pattern. """ -function Base.empty!(tp::AbstractTypedPool) +function Base.empty!(tp::BitTypedPool) + empty!(tp.vectors) + # Clear N-way wrapper cache + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + tp.n_active = 0 + # Restore sentinel values (1-based sentinel pattern) + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) # Sentinel: n_active=0 at depth=0 + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) # Sentinel: depth=0 = no checkpoint + return tp +end + +""" + empty!(tp::TypedPool) + +Clear all internal storage for TypedPool, releasing all memory. +Restores sentinel values for 1-based sentinel pattern. +""" +function Base.empty!(tp::TypedPool) empty!(tp.vectors) empty!(tp.views) empty!(tp.view_lengths) diff --git a/src/types.jl b/src/types.jl index 2b1a070..0b6f62f 100644 --- a/src/types.jl +++ b/src/types.jl @@ -225,28 +225,45 @@ bit-packed arrays (1 bit per element vs 1 byte for `Vector{Bool}`). 
## Usage ```julia @with_pool pool begin - # BitVector view (1 bit per element, ~8x memory savings) + # BitVector (1 bit per element, ~8x memory savings) bv = acquire!(pool, Bit, 1000) # vs Vector{Bool} (1 byte per element) vb = acquire!(pool, Bool, 1000) # Convenience functions work too - mask = zeros!(pool, Bit, 100) # BitVector filled with false - flags = ones!(pool, Bit, 100) # BitVector filled with true + mask = falses!(pool, 100) # BitVector filled with false + flags = trues!(pool, 100) # BitVector filled with true end ``` -## Return Types -- **1D**: `SubArray{Bool,1,BitVector,...}` -- **N-D**: `ReshapedArray{Bool,N,...}` (reshaped view of 1D BitVector) +## Return Types (Unified for Performance) +Unlike other types, `Bit` always returns native `BitVector`/`BitArray`: +- **1D**: `BitVector` (both `acquire!` and `unsafe_acquire!`) +- **N-D**: `BitArray{N}` (reshaped, preserves SIMD optimization) -## Limitation -`unsafe_acquire!(pool, Bit, ...)` is **not supported** because Julia's -`BitArray` stores data in immutable `chunks::Vector{UInt64}` that cannot -be wrapped with `unsafe_wrap`. +This design ensures users always get SIMD-optimized performance without +needing to remember which API to use. -See also: [`acquire!`](@ref), [`BitTypedPool`](@ref) +## Performance +`BitVector` operations like `count()`, `sum()`, and bitwise operations are +~(10x ~ 100x) faster than equivalent operations on `SubArray{Bool}` because they +use SIMD-optimized algorithms on packed 64-bit chunks. + +```julia +@with_pool pool begin + bv = acquire!(pool, Bit, 10000) + fill!(bv, true) + count(bv) # Uses fast SIMD path automatically +end +``` + +## Memory Safety +The returned `BitVector` shares its internal `chunks` array with the pool. +It is only valid within the `@with_pool` scope - using it after the scope +ends leads to undefined behavior (use-after-free risk). + +See also: [`trues!`](@ref), [`falses!`](@ref), [`BitTypedPool`](@ref) """ struct Bit end @@ -262,46 +279,58 @@ Specialized pool for `BitVector` arrays with memory reuse. Unlike `TypedPool{Bool}` which stores `Vector{Bool}` (1 byte per element), this pool stores `BitVector` (1 bit per element, ~8x memory efficiency). -## Important Limitation -**`unsafe_acquire!` is NOT supported for BitArray** because Julia's `BitArray` -stores data in a `chunks::Vector{UInt64}` field that cannot be wrapped with -`unsafe_wrap`. Only view-based acquisition via `acquire!(pool, Bit, ...)` is available. +## Unified API (Always Returns BitVector) +Unlike other types, both `acquire!` and `unsafe_acquire!` return `BitVector` +for the `Bit` type. This design ensures users always get SIMD-optimized +performance without needing to choose between APIs. 
+ +- `acquire!(pool, Bit, n)` → `BitVector` (SIMD optimized) +- `unsafe_acquire!(pool, Bit, n)` → `BitVector` (same behavior) +- `trues!(pool, n)` → `BitVector` filled with `true` +- `falses!(pool, n)` → `BitVector` filled with `false` ## Fields - `vectors`: Backing `BitVector` storage -- `views`: Cached `SubArray` views for zero-allocation 1D access -- `view_lengths`: Cached lengths for fast comparison -- `nd_*`: Empty N-D cache fields (for `empty!` compatibility, unused) +- `nd_arrays`: Cached wrapper BitVectors (chunks sharing) +- `nd_dims`: Cached lengths for wrapper cache validation +- `nd_ptrs`: Cached chunk pointers for invalidation detection +- `nd_next_way`: Round-robin counter for N-way cache - `n_active`: Count of currently active arrays - `_checkpoint_*`: State management stacks (1-based sentinel pattern) ## Usage ```julia @with_pool pool begin - bv = acquire!(pool, Bit, 100) # SubArray{Bool,1,BitVector,...} - ba = acquire!(pool, Bit, 10, 10) # ReshapedArray{Bool,2,...} - t = trues!(pool, 50) # Filled with true - f = falses!(pool, 50) # Filled with false + # All return BitVector with SIMD performance + bv = acquire!(pool, Bit, 100) # BitVector + count(bv) # Fast SIMD path + + # Convenience functions + t = trues!(pool, 50) # BitVector filled with true + f = falses!(pool, 50) # BitVector filled with false end ``` -See also: [`trues!`](@ref), [`falses!`](@ref) +## Performance +Operations like `count()`, `sum()`, and bitwise operations are ~(10x ~ 100x) faster +than equivalent operations on `SubArray{Bool}` because `BitVector` uses +SIMD-optimized algorithms on packed 64-bit chunks. + +See also: [`trues!`](@ref), [`falses!`](@ref), [`Bit`](@ref) """ mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} # --- Storage --- vectors::Vector{BitVector} - # --- 1D Cache (1:1 mapping) --- - views::Vector{SubArray{Bool, 1, BitVector, Tuple{UnitRange{Int64}}, true}} - view_lengths::Vector{Int} - - # --- N-D Array Cache (empty, for empty! compatibility) --- - # BitArray cannot use unsafe_wrap, so no N-D caching is possible. - # These fields exist only for compatibility with empty!(::AbstractTypedPool). - nd_arrays::Vector{Any} - nd_dims::Vector{Any} - nd_ptrs::Vector{UInt} - nd_next_way::Vector{Int} + # --- N-D BitArray Cache (N-way set associative) --- + # Unlike TypedPool which uses views for 1D and nd_* for N-D, + # BitTypedPool uses nd_* for ALL dimensions (1D, 2D, 3D, etc.). + # No views needed since we always return BitArray{N}, not SubArray. + # BitArray.dims is mutable, enabling 0-alloc reuse for same-ndims requests. 
+ nd_arrays::Vector{Any} # Cached BitArray{N} instances + nd_dims::Vector{Any} # Cached dims (NTuple{N,Int}) + nd_ptrs::Vector{UInt} # pointer validation + nd_next_way::Vector{Int} # round-robin counter per slot # --- State Management (1-based sentinel pattern) --- n_active::Int @@ -312,10 +341,7 @@ end BitTypedPool() = BitTypedPool( # Storage BitVector[], - # 1D Cache - SubArray{Bool, 1, BitVector, Tuple{UnitRange{Int64}}, true}[], - Int[], - # N-D Array Cache (empty, for compatibility) + # 1D BitVector Wrapper Cache (N-way) Any[], Any[], UInt[], diff --git a/src/utils.jl b/src/utils.jl index 5950744..f252aaa 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -79,14 +79,24 @@ _validate_pool_return(val, ::DisabledPool) = nothing # Statistics & Pretty Printing # ============================================================================== +# --- Helper functions for pool_stats (type-specific behavior) --- +_default_type_name(::TypedPool{T}) where {T} = string(T) +_default_type_name(::BitTypedPool) = "Bit" + +_vector_bytes(v::Vector) = Base.summarysize(v) +_vector_bytes(v::BitVector) = sizeof(v.chunks) + +_count_label(::TypedPool) = "elements" +_count_label(::BitTypedPool) = "bits" + """ - pool_stats(tp::TypedPool{T}; io::IO=stdout, indent::Int=0, name::String="") + pool_stats(tp::AbstractTypedPool; io::IO=stdout, indent::Int=0, name::String="") -Print statistics for a single TypedPool. +Print statistics for a TypedPool or BitTypedPool. """ -function pool_stats(tp::TypedPool{T}; io::IO=stdout, indent::Int=0, name::String="") where {T} +function pool_stats(tp::AbstractTypedPool; io::IO=stdout, indent::Int=0, name::String="") prefix = " "^indent - type_name = isempty(name) ? string(T) : name + type_name = isempty(name) ? _default_type_name(tp) : name n_arrays = length(tp.vectors) if n_arrays == 0 @@ -95,8 +105,8 @@ function pool_stats(tp::TypedPool{T}; io::IO=stdout, indent::Int=0, name::String return end - total_elements = sum(length(v) for v in tp.vectors) - total_bytes = sum(Base.summarysize(v) for v in tp.vectors) + total_count = sum(length(v) for v in tp.vectors) + total_bytes = sum(_vector_bytes(v) for v in tp.vectors) bytes_str = Base.format_bytes(total_bytes) # Header @@ -110,8 +120,8 @@ function pool_stats(tp::TypedPool{T}; io::IO=stdout, indent::Int=0, name::String printstyled(io, tp.n_active, color=:blue) printstyled(io, ")\n", color=:dark_gray) - printstyled(io, prefix, " elements: ", color=:dark_gray) - printstyled(io, total_elements, color=:blue) + printstyled(io, prefix, " ", _count_label(tp), ": ", color=:dark_gray) + printstyled(io, total_count, color=:blue) printstyled(io, " ($bytes_str)\n", color=:dark_gray) return nothing end @@ -141,8 +151,8 @@ function pool_stats(pool::AdaptiveArrayPool; io::IO=stdout) foreach_fixed_slot(pool) do tp if !isempty(tp.vectors) has_content = true - T = typeof(tp).parameters[1] # Extract T from TypedPool{T} - pool_stats(tp; io, indent=2, name="$T (fixed)") + name = _default_type_name(tp) * " (fixed)" + pool_stats(tp; io, indent=2, name) end end @@ -175,10 +185,7 @@ function pool_stats(; io::IO=stdout) pool_stats(:cpu; io) # Show CUDA pools if extension is loaded and pools exist try - pools = get_task_local_cuda_pools() - for pool in values(pools) - pool_stats(pool; io) - end + pool_stats(Val(:cuda); io) catch e e isa MethodError || rethrow() # CUDA extension not loaded - silently skip @@ -212,20 +219,26 @@ end # Base.show (delegates to pool_stats) # ============================================================================== -# Compact one-line 
show for TypedPool -function Base.show(io::IO, tp::TypedPool{T}) where {T} +# --- Helper for Base.show (full type name for display) --- +_show_type_name(::TypedPool{T}) where {T} = "TypedPool{$T}" +_show_type_name(::BitTypedPool) = "BitTypedPool" + +# Compact one-line show for all AbstractTypedPool +function Base.show(io::IO, tp::AbstractTypedPool) + name = _show_type_name(tp) n_vectors = length(tp.vectors) if n_vectors == 0 - print(io, "TypedPool{$T}(empty)") + print(io, "$name(empty)") else total = sum(length(v) for v in tp.vectors) - print(io, "TypedPool{$T}(slots=$n_vectors, active=$(tp.n_active), elements=$total)") + label = _count_label(tp) + print(io, "$name(slots=$n_vectors, active=$(tp.n_active), $label=$total)") end end -# Multi-line show for TypedPool -function Base.show(io::IO, ::MIME"text/plain", tp::TypedPool{T}) where {T} - pool_stats(tp; io, name="TypedPool{$T}") +# Multi-line show for all AbstractTypedPool +function Base.show(io::IO, ::MIME"text/plain", tp::AbstractTypedPool) + pool_stats(tp; io, name=_show_type_name(tp)) end # Compact one-line show for AdaptiveArrayPool diff --git a/test/runtests.jl b/test/runtests.jl index c4417de..2525187 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -27,6 +27,7 @@ else include("test_convenience.jl") include("test_bitarray.jl") include("test_coverage.jl") + include("test_allocation.jl") # CUDA extension tests (auto-detect, skip with TEST_CUDA=false) if get(ENV, "TEST_CUDA", "true") != "false" diff --git a/test/test_allocation.jl b/test/test_allocation.jl new file mode 100644 index 0000000..270fb8f --- /dev/null +++ b/test/test_allocation.jl @@ -0,0 +1,30 @@ +@with_pool pool function foo() + float64_vec = acquire!(pool, Float64, 10) + float32_vec = acquire!(pool, Float32, 10) + + float64_mat = acquire!(pool, Float64, 10, 10) + float32_mat = acquire!(pool, Float32, 10, 10) + + bv = acquire!(pool, Bit, 100) + ba2 = acquire!(pool, Bit, 10, 10) + ba3 = acquire!(pool, Bit, 5, 5, 4) + + tt1 = trues!(pool, 256) + tt2 = ones!(pool, Bit, 10, 20) + ff1 = falses!(pool, 100, 5) + ff2 = zeros!(pool, Bit, 100) + + C = similar!(pool, tt1) +end + + +@testset "zero allocation on reuse" begin + + alloc1 = @allocated foo() + alloc2 = @allocated foo() + alloc3 = @allocated foo() + + @test alloc1 > 0 # First call allocates + @test alloc2 == 0 # Subsequent calls reuse cached arrays + @test alloc3 == 0 # Further calls also zero allocation +end \ No newline at end of file diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index a5dbeca..12a2e5b 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -26,13 +26,14 @@ @test isempty(pool.bits.vectors) end - @testset "acquire!(pool, Bit, n) - 1D" begin + @testset "acquire!(pool, Bit, n) - 1D (returns BitVector for SIMD performance)" begin pool = AdaptiveArrayPool() bv = acquire!(pool, Bit, 100) @test length(bv) == 100 @test eltype(bv) == Bool - @test bv isa SubArray{Bool, 1, BitVector} + # Returns BitVector (not SubArray) for SIMD-optimized operations + @test bv isa BitVector @test pool.bits.n_active == 1 # Write and read back @@ -45,6 +46,7 @@ # Second acquire bv2 = acquire!(pool, Bit, 50) @test length(bv2) == 50 + @test bv2 isa BitVector @test pool.bits.n_active == 2 # Independent values @@ -53,14 +55,15 @@ @test count(bv) == 99 # bv unchanged end - @testset "acquire!(pool, Bit, dims...) - N-D" begin + @testset "acquire!(pool, Bit, dims...) 
- N-D (returns BitArray for SIMD performance)" begin pool = AdaptiveArrayPool() - # 2D + # 2D - returns BitMatrix (Julia's reshape(BitVector, dims) returns BitArray) ba2 = acquire!(pool, Bit, 10, 10) @test size(ba2) == (10, 10) @test eltype(ba2) == Bool - @test ba2 isa Base.ReshapedArray + # Note: reshape(BitVector, dims) returns BitArray{N}, not ReshapedArray + @test ba2 isa BitMatrix @test pool.bits.n_active == 1 # Test indexing @@ -75,108 +78,126 @@ # 3D ba3 = acquire!(pool, Bit, 4, 5, 3) @test size(ba3) == (4, 5, 3) + @test ba3 isa BitArray{3} @test pool.bits.n_active == 2 # Tuple form ba_tuple = acquire!(pool, Bit, (3, 4, 2)) @test size(ba_tuple) == (3, 4, 2) + @test ba_tuple isa BitArray{3} @test pool.bits.n_active == 3 end - @testset "ones!(pool, Bit, dims...) - filled with true" begin + @testset "ones!(pool, Bit, dims...) - BitVector filled with true" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector t1 = ones!(pool, Bit, 100) @test length(t1) == 100 @test all(t1) + @test t1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) t2 = ones!(pool, Bit, 10, 10) @test size(t2) == (10, 10) @test all(t2) @test count(t2) == 100 + @test t2 isa BitMatrix # Tuple form t3 = ones!(pool, Bit, (5, 5, 4)) @test size(t3) == (5, 5, 4) @test all(t3) + @test t3 isa BitArray{3} end - @testset "zeros!(pool, Bit, dims...) - filled with false" begin + @testset "zeros!(pool, Bit, dims...) - BitVector filled with false" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector f1 = zeros!(pool, Bit, 100) @test length(f1) == 100 @test !any(f1) + @test f1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) f2 = zeros!(pool, Bit, 10, 10) @test size(f2) == (10, 10) @test !any(f2) @test count(f2) == 0 + @test f2 isa BitMatrix # Tuple form f3 = zeros!(pool, Bit, (5, 5, 4)) @test size(f3) == (5, 5, 4) @test !any(f3) + @test f3 isa BitArray{3} end - @testset "trues!(pool, dims...) - convenience for BitArray filled with true" begin + @testset "trues!(pool, dims...) - BitVector filled with true (SIMD optimized)" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector t1 = trues!(pool, 100) @test length(t1) == 100 @test all(t1) @test eltype(t1) == Bool + @test t1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) t2 = trues!(pool, 10, 10) @test size(t2) == (10, 10) @test all(t2) @test count(t2) == 100 + @test t2 isa BitMatrix # Tuple form t3 = trues!(pool, (5, 5, 4)) @test size(t3) == (5, 5, 4) @test all(t3) + @test t3 isa BitArray{3} # Equivalent to ones!(pool, Bit, ...) t4 = trues!(pool, 50) t5 = ones!(pool, Bit, 50) @test all(t4 .== t5) + @test t4 isa BitVector + @test t5 isa BitVector end - @testset "falses!(pool, dims...) - convenience for BitArray filled with false" begin + @testset "falses!(pool, dims...) - BitVector filled with false (SIMD optimized)" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector f1 = falses!(pool, 100) @test length(f1) == 100 @test !any(f1) @test eltype(f1) == Bool + @test f1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) f2 = falses!(pool, 10, 10) @test size(f2) == (10, 10) @test !any(f2) @test count(f2) == 0 + @test f2 isa BitMatrix # Tuple form f3 = falses!(pool, (5, 5, 4)) @test size(f3) == (5, 5, 4) @test !any(f3) + @test f3 isa BitArray{3} # Equivalent to zeros!(pool, Bit, ...) 
f4 = falses!(pool, 50) f5 = zeros!(pool, Bit, 50) @test all(f4 .== f5) + @test f4 isa BitVector + @test f5 isa BitVector end @testset "State management" begin @@ -237,7 +258,7 @@ end @testset "DisabledPool fallback" begin - # acquire! with Bit + # --- acquire! with Bit --- bv = acquire!(DISABLED_CPU, Bit, 100) @test bv isa BitVector @test length(bv) == 100 @@ -252,6 +273,21 @@ @test ba_tuple isa BitArray{2} @test size(ba_tuple) == (5, 5) + # --- unsafe_acquire! with Bit (covers bitarray.jl:206-208) --- + ubv = unsafe_acquire!(DISABLED_CPU, Bit, 100) + @test ubv isa BitVector + @test length(ubv) == 100 + + # N-D + uba = unsafe_acquire!(DISABLED_CPU, Bit, 10, 10) + @test uba isa BitArray{2} + @test size(uba) == (10, 10) + + # Tuple form + uba_tuple = unsafe_acquire!(DISABLED_CPU, Bit, (5, 5)) + @test uba_tuple isa BitArray{2} + @test size(uba_tuple) == (5, 5) + # ones! with Bit (like trues) t = ones!(DISABLED_CPU, Bit, 50) @test t isa BitVector @@ -405,14 +441,14 @@ @testset "Mixed Bool types" begin pool = AdaptiveArrayPool() - # Vector{Bool} via acquire! with Bool + # Vector{Bool} via acquire! with Bool - returns SubArray (view) vb = acquire!(pool, Bool, 100) @test vb isa SubArray{Bool, 1, Vector{Bool}} @test pool.bool.n_active == 1 - # BitVector via acquire! with Bit + # BitVector via acquire! with Bit - returns BitVector (for SIMD) bv = acquire!(pool, Bit, 100) - @test bv isa SubArray{Bool, 1, BitVector} + @test bv isa BitVector # Note: Bit returns BitVector, not SubArray @test pool.bits.n_active == 1 # Both should work independently @@ -445,24 +481,60 @@ @test outer_result == (100, 0) end - @testset "unsafe_acquire! not supported" begin + @testset "unsafe_acquire! returns BitVector with shared chunks" begin pool = AdaptiveArrayPool() - # unsafe_acquire! with Bit should throw a clear error - @test_throws ArgumentError unsafe_acquire!(pool, Bit, 100) - @test_throws ArgumentError unsafe_acquire!(pool, Bit, 10, 10) + # unsafe_acquire! with Bit returns a real BitVector (not SubArray) + bv = unsafe_acquire!(pool, Bit, 100) + @test bv isa BitVector + @test length(bv) == 100 - # Tuple form (covers acquire.jl:251) - @test_throws ArgumentError unsafe_acquire!(pool, Bit, (10, 10)) + # N-D returns BitArray (reshape of BitVector becomes BitArray in Julia) + ba = unsafe_acquire!(pool, Bit, 10, 10) + @test ba isa BitMatrix # reshape(BitVector, dims) → BitArray + @test size(ba) == (10, 10) - # Verify the error message is helpful - try - unsafe_acquire!(pool, Bit, 100) - catch e - @test e isa ArgumentError - @test occursin("unsafe_acquire!", e.msg) - @test occursin("Bit", e.msg) - @test occursin("acquire!", e.msg) # Suggests alternative + # Tuple form + ba_tuple = unsafe_acquire!(pool, Bit, (10, 10)) + @test ba_tuple isa BitMatrix + @test size(ba_tuple) == (10, 10) + + # Verify chunks sharing (key feature!) + @with_pool pool2 begin + bv2 = unsafe_acquire!(pool2, Bit, 100) + pool_bv = pool2.bits.vectors[1] + @test bv2.chunks === pool_bv.chunks # Same chunks object! + + # Verify data is shared + bv2[1] = true + @test pool_bv[1] == true + bv2[1] = false + @test pool_bv[1] == false + end + end + + @testset "Unified BitVector API - both acquire! and unsafe_acquire! return BitVector" begin + # Both acquire! and unsafe_acquire! return BitVector for Bit type + # This is a deliberate design choice for SIMD performance + pool = AdaptiveArrayPool() + + @with_pool pool begin + n = 10000 + + # unsafe_acquire! 
returns BitVector + bv_unsafe = unsafe_acquire!(pool, Bit, n) + fill!(bv_unsafe, true) + @test count(bv_unsafe) == n + @test bv_unsafe isa BitVector + + # acquire! ALSO returns BitVector (not SubArray) + bv_acquire = acquire!(pool, Bit, n) + fill!(bv_acquire, true) + @test count(bv_acquire) == n + @test bv_acquire isa BitVector # Same type as unsafe_acquire! + + # Both benefit from SIMD-optimized count() + # (No performance difference since both return BitVector) end end @@ -481,6 +553,12 @@ @test eltype(v_bool) == Bool @test eltype(v_bit) == Bool + # Note: acquire! returns SubArray for most types, but BitVector for Bit + @test v_f64 isa SubArray + @test v_i32 isa SubArray + @test v_bool isa SubArray + @test v_bit isa BitVector # Special case for SIMD performance + # zeros!/ones! work consistently z_f64 = zeros!(pool, Float64, 10) z_bit = zeros!(pool, Bit, 10) @@ -491,29 +569,37 @@ @test !any(z_bit) @test all(o_f64 .== 1.0) @test all(o_bit) + + # Type consistency for convenience functions + @test z_bit isa BitVector + @test o_bit isa BitVector end - @testset "NTuple form coverage" begin + @testset "NTuple form coverage (all return BitArray types)" begin pool = AdaptiveArrayPool() # Test NTuple forms for trues!/falses! (covers _trues_impl! and _falses_impl! NTuple overloads) t_tuple = trues!(pool, (5, 5)) @test size(t_tuple) == (5, 5) @test all(t_tuple) + @test t_tuple isa BitMatrix f_tuple = falses!(pool, (5, 5)) @test size(f_tuple) == (5, 5) @test !any(f_tuple) + @test f_tuple isa BitMatrix # Test NTuple forms for zeros!/ones! with Bit type # (covers _zeros_impl! and _ones_impl! with Bit NTuple overloads) z_bit_tuple = zeros!(pool, Bit, (4, 4)) @test size(z_bit_tuple) == (4, 4) @test !any(z_bit_tuple) + @test z_bit_tuple isa BitMatrix o_bit_tuple = ones!(pool, Bit, (4, 4)) @test size(o_bit_tuple) == (4, 4) @test all(o_bit_tuple) + @test o_bit_tuple isa BitMatrix end @testset "Generic DisabledPool fallback for unknown backend" begin @@ -564,10 +650,42 @@ z = AdaptiveArrayPools._zeros_impl!(pool, Bit, (3, 3)) @test size(z) == (3, 3) @test !any(z) + @test z isa BitMatrix o = AdaptiveArrayPools._ones_impl!(pool, Bit, (3, 3)) @test size(o) == (3, 3) @test all(o) + @test o isa BitMatrix + + # Test _acquire_impl! 
returns BitVector (not SubArray) + bv = AdaptiveArrayPools._acquire_impl!(pool, Bit, 100) + @test bv isa BitVector + @test length(bv) == 100 + + bv = AdaptiveArrayPools._acquire_impl!(pool, Bit, (10, 10)) + @test bv isa BitMatrix + @test size(bv) == (10, 10) + end + @testset "BitTypedPool growth warning at 512 arrays" begin + # Use a fresh pool to ensure we start from 0 + pool = AdaptiveArrayPool() + + @test pooling_enabled(pool) == true + + # Acquire 511 arrays without rewind - no warning yet + for i in 1:511 + acquire!(pool, Bit, 10) + end + @test pool.bits.n_active == 511 + + # The 512th acquire should trigger a warning + @test_logs (:warn, r"BitTypedPool growing large \(512 arrays") begin + acquire!(pool, Bit, 10) + end + @test pool.bits.n_active == 512 + + # Clean up + empty!(pool) end end # BitArray Support diff --git a/test/test_utils.jl b/test/test_utils.jl index ddbde0c..4efd0d2 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -196,8 +196,8 @@ end rewind!(pool) end - @testset "Base.show for TypedPool" begin - import AdaptiveArrayPools: TypedPool + @testset "Base.show for TypedPool & BitTypedPool" begin + import AdaptiveArrayPools: TypedPool, BitTypedPool # Empty TypedPool - compact show tp_empty = TypedPool{Float64}() @@ -210,6 +210,8 @@ end acquire!(pool, Float64, 100) acquire!(pool, Float64, 50) + acquire!(pool, Bit, 10) + output = sprint(show, pool.float64) @test occursin("TypedPool{Float64}", output) @test occursin("slots=2", output) @@ -222,6 +224,16 @@ end @test occursin("slots:", output) @test occursin("active:", output) + # BitTypedPool - compact show + output = sprint(show, pool.bits) + @test output == "BitTypedPool(slots=1, active=1, bits=10)" + # Multi-line show (MIME"text/plain") + output = sprint(show, MIME("text/plain"), pool.bits) + @test occursin("BitTypedPool", output) + @test occursin("slots:", output) + @test occursin("active:", output) + @test occursin("bits:", output) + rewind!(pool) end @@ -266,6 +278,54 @@ end @test occursin("empty", output) end + @testset "pool_stats for BitTypedPool" begin + import AdaptiveArrayPools: BitTypedPool + + # Empty BitTypedPool + btp = BitTypedPool() + output = @capture_out pool_stats(btp) + @test occursin("Bit", output) + @test occursin("empty", output) + + # BitTypedPool with content (via AdaptiveArrayPool) + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # Acquire some BitVectors + bv1 = acquire!(pool, Bit, 100) + bv2 = acquire!(pool, Bit, 200) + + output = @capture_out pool_stats(pool) + @test occursin("Bit (fixed)", output) + @test occursin("slots: 2", output) + @test occursin("active: 2", output) + @test occursin("bits:", output) # BitTypedPool uses "bits" label, not "elements" + @test occursin("300", output) # Total bits: 100 + 200 + + rewind!(pool) + + # Test direct BitTypedPool stats + btp2 = BitTypedPool() + # Manually add vectors for testing + push!(btp2.vectors, BitVector(undef, 64)) + btp2.n_active = 1 + + output = @capture_out pool_stats(btp2) + @test occursin("Bit", output) + @test occursin("slots: 1", output) + @test occursin("bits: 64", output) + end + + @testset "direct call of internal helpers" begin + import AdaptiveArrayPools: _default_type_name, _vector_bytes, _count_label, TypedPool, BitTypedPool + @test _default_type_name(TypedPool{Float64}()) == "Float64" + @test _default_type_name(BitTypedPool()) == "Bit" + @test _vector_bytes([1, 2, 3]) == Base.summarysize([1, 2, 3]) + @test _vector_bytes(BitVector(undef, 100)) == sizeof(BitVector(undef, 100).chunks) + @test 
_count_label(TypedPool{Int}()) == "elements" + @test _count_label(BitTypedPool()) == "bits" + end + @testset "_validate_pool_return with N-D arrays" begin pool = AdaptiveArrayPool() checkpoint!(pool)