From c4518165c0c079796a94a3dd795062596d0b600a Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Sat, 24 Jan 2026 22:01:19 -0800 Subject: [PATCH 01/13] fix: remove changelog option from TagBot configuration --- .github/workflows/TagBot.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index 639dc50..baa39d2 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -19,4 +19,3 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} ssh: ${{ secrets.DOCUMENTER_KEY }} dispatch: true - changelog: false From 46d1a4e66a2081e5fa690f000db2574f65355f5e Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 14:30:55 -0800 Subject: [PATCH 02/13] feat: implement unsafe_acquire! for Bit type with SIMD performance Add unsafe_acquire!(pool, Bit, n) that returns a real BitVector with shared chunks, providing ~140x performance improvement for native BitVector operations like count(), sum(), and bitwise ops. Changes: - Add get_bitvector_wrapper! with N-way cache for wrapper reuse - Replace _throw_bit_unsafe_error with actual implementation - Support N-D via reshape(BitVector, dims) returning BitArray{N} - Add pool_stats and show methods for BitTypedPool - Fix pool display when BitTypedPool has content The wrapper BitVector shares the pooled BitVector's chunks field, preserving SIMD optimizations while reusing pool memory. --- src/acquire.jl | 139 ++++++++++++++++++++++++++++++++++++++---- src/types.jl | 41 ++++++++----- src/utils.jl | 70 ++++++++++++++++++++- test/test_bitarray.jl | 64 ++++++++++++++----- 4 files changed, 271 insertions(+), 43 deletions(-) diff --git a/src/acquire.jl b/src/acquire.jl index b8ddcf6..286a632 100644 --- a/src/acquire.jl +++ b/src/acquire.jl @@ -19,14 +19,113 @@ unsafe_wrap(Array{T,N}, pointer(flat_view), dims) end -# BitTypedPool cannot use unsafe_wrap - throw clear error -# Called from _unsafe_acquire_impl! dispatches for Bit type -@noinline function _throw_bit_unsafe_error() - throw(ArgumentError( - "unsafe_acquire!(pool, Bit, ...) is not supported. " * - "BitArray stores data in immutable chunks::Vector{UInt64} that cannot be wrapped with unsafe_wrap. " * - "Use acquire!(pool, Bit, ...) instead, which returns a view." - )) +# ============================================================================== +# BitVector Wrapper (chunks sharing for SIMD performance) +# ============================================================================== + +""" + get_bitvector_wrapper!(tp::BitTypedPool, n::Int) -> BitVector + +Get a BitVector that shares `chunks` with the pooled BitVector. + +Unlike `get_view!` which returns a `SubArray` (loses SIMD optimizations), +this returns a real `BitVector` with shared chunks, preserving native +BitVector performance (~140x faster for `count()`, `sum()`, etc.). + +## Implementation +Creates a new BitVector shell and replaces its `chunks` field with the +pooled BitVector's chunks. Uses N-way cache for wrapper reuse. + +## Safety +The returned BitVector is only valid within the `@with_pool` scope. +Do NOT use after the scope ends (use-after-free risk). +""" +function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) + tp.n_active += 1 + idx = tp.n_active + + # 1. 
Pool expansion needed (new slot) + if idx > length(tp.vectors) + pool_bv = BitVector(undef, n) + push!(tp.vectors, pool_bv) + push!(tp.views, view(pool_bv, 1:n)) + push!(tp.view_lengths, n) + + # Create wrapper sharing chunks + wrapper = BitVector(undef, n) + wrapper.chunks = pool_bv.chunks + + # Expand N-way cache (CACHE_WAYS entries per slot) + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + + # Cache in first way + base = (idx - 1) * CACHE_WAYS + 1 + @inbounds tp.nd_arrays[base] = wrapper + @inbounds tp.nd_dims[base] = n + @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) + + # Warn at powers of 2 (possible missing rewind!) + if idx >= 512 && (idx & (idx - 1)) == 0 + total_bits = sum(length, tp.vectors) + @warn "BitTypedPool growing large ($idx arrays, ~$(total_bits ÷ 8) bytes). Missing rewind!()?" + end + + return wrapper + end + + # 2. Check N-way cache for hit + @inbounds pool_bv = tp.vectors[idx] + current_ptr = UInt(pointer(pool_bv.chunks)) + + # Ensure cache slots exist for this index + n_slots_cached = length(tp.nd_next_way) + while idx > n_slots_cached + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + n_slots_cached += 1 + end + + base = (idx - 1) * CACHE_WAYS + + # Linear search across all ways + for k in 1:CACHE_WAYS + cache_idx = base + k + @inbounds cached_n = tp.nd_dims[cache_idx] + @inbounds cached_ptr = tp.nd_ptrs[cache_idx] + + if cached_n == n && cached_ptr == current_ptr + return @inbounds tp.nd_arrays[cache_idx]::BitVector + end + end + + # 3. Cache miss - resize pool_bv if needed and create new wrapper + if length(pool_bv) < n + resize!(pool_bv, n) + @inbounds tp.views[idx] = view(pool_bv, 1:n) + @inbounds tp.view_lengths[idx] = n + end + + wrapper = BitVector(undef, n) + wrapper.chunks = pool_bv.chunks + + # Round-robin replacement + @inbounds way_offset = tp.nd_next_way[idx] + target_idx = base + way_offset + 1 + @inbounds tp.nd_arrays[target_idx] = wrapper + @inbounds tp.nd_dims[target_idx] = n + @inbounds tp.nd_ptrs[target_idx] = UInt(pointer(pool_bv.chunks)) + @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS + + return wrapper end # ============================================================================== @@ -245,10 +344,21 @@ end # Similar-style @inline _unsafe_acquire_impl!(pool::AbstractArrayPool, x::AbstractArray) = _unsafe_acquire_impl!(pool, eltype(x), size(x)) -# Bit type: unsafe_acquire! 
not supported (throw clear error early) -@inline _unsafe_acquire_impl!(::AbstractArrayPool, ::Type{Bit}, ::Int) = _throw_bit_unsafe_error() -@inline _unsafe_acquire_impl!(::AbstractArrayPool, ::Type{Bit}, ::Vararg{Int,N}) where {N} = _throw_bit_unsafe_error() -@inline _unsafe_acquire_impl!(::AbstractArrayPool, ::Type{Bit}, ::NTuple{N,Int}) where {N} = _throw_bit_unsafe_error() +# Bit type: returns BitVector with shared chunks (SIMD optimized) +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitvector_wrapper!(tp, n) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + total = prod(dims) + bv = _unsafe_acquire_impl!(pool, Bit, total) + return reshape(bv, dims) # ReshapedArray{Bool,N,BitVector,...} +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + _unsafe_acquire_impl!(pool, Bit, dims...) +end # ============================================================================== # Acquisition API (User-facing with untracked marking) @@ -455,6 +565,11 @@ const _acquire_array_impl! = _unsafe_acquire_impl! @inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) @inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) +# --- unsafe_acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) + # --- Generic DisabledPool fallbacks (unknown backend → error) --- @inline acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) @inline unsafe_acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) diff --git a/src/types.jl b/src/types.jl index 2b1a070..89f2027 100644 --- a/src/types.jl +++ b/src/types.jl @@ -241,10 +241,10 @@ end - **1D**: `SubArray{Bool,1,BitVector,...}` - **N-D**: `ReshapedArray{Bool,N,...}` (reshaped view of 1D BitVector) -## Limitation -`unsafe_acquire!(pool, Bit, ...)` is **not supported** because Julia's -`BitArray` stores data in immutable `chunks::Vector{UInt64}` that cannot -be wrapped with `unsafe_wrap`. +## Performance Note +`unsafe_acquire!(pool, Bit, n)` returns a real `BitVector` with shared chunks, +preserving SIMD-optimized operations like `count()` (~140x faster than SubArray). +Use this when you need native BitVector performance. See also: [`acquire!`](@ref), [`BitTypedPool`](@ref) """ @@ -262,30 +262,41 @@ Specialized pool for `BitVector` arrays with memory reuse. Unlike `TypedPool{Bool}` which stores `Vector{Bool}` (1 byte per element), this pool stores `BitVector` (1 bit per element, ~8x memory efficiency). -## Important Limitation -**`unsafe_acquire!` is NOT supported for BitArray** because Julia's `BitArray` -stores data in a `chunks::Vector{UInt64}` field that cannot be wrapped with -`unsafe_wrap`. Only view-based acquisition via `acquire!(pool, Bit, ...)` is available. 
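To make the new path concrete, here is a minimal sketch of `unsafe_acquire!(pool, Bit, ...)` as added in this patch. It assumes only names exported by this package (`AdaptiveArrayPool`, `@with_pool`, `unsafe_acquire!`, `Bit`); the sizes are illustrative, and the chunk-identity check mirrors the test added below.

```julia
using AdaptiveArrayPools

pool = AdaptiveArrayPool()

@with_pool pool begin
    # 1D: a real BitVector whose packed UInt64 `chunks` belong to the pool slot
    mask = unsafe_acquire!(pool, Bit, 10_000)
    fill!(mask, false)
    mask[1:100] .= true
    @assert count(mask) == 100                      # native chunked count path

    # The wrapper and the pooled BitVector share the same storage
    @assert mask.chunks === pool.bits.vectors[1].chunks

    # N-D: reshape of the pooled bits, i.e. a BitMatrix
    grid = unsafe_acquire!(pool, Bit, 100, 100)
    @assert grid isa BitMatrix
end
# `mask` and `grid` must not be used past this point: their storage is pool-owned.
```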
+## Acquisition Methods +- `acquire!(pool, Bit, n)` → `SubArray{Bool,1,BitVector,...}` (view-based) +- `unsafe_acquire!(pool, Bit, n)` → `BitVector` (chunks-sharing, SIMD optimized) + +Use `unsafe_acquire!` when you need native BitVector operations like `count()`, +`sum()`, or bitwise operations - these are ~140x faster than SubArray equivalents. ## Fields - `vectors`: Backing `BitVector` storage -- `views`: Cached `SubArray` views for zero-allocation 1D access +- `views`: Cached `SubArray` views for `acquire!` - `view_lengths`: Cached lengths for fast comparison -- `nd_*`: Empty N-D cache fields (for `empty!` compatibility, unused) +- `nd_arrays`: Cached wrapper BitVectors for `unsafe_acquire!` (chunks sharing) +- `nd_dims`: Cached lengths for wrapper cache validation +- `nd_ptrs`: Cached chunk pointers for invalidation detection +- `nd_next_way`: Round-robin counter for N-way cache - `n_active`: Count of currently active arrays - `_checkpoint_*`: State management stacks (1-based sentinel pattern) ## Usage ```julia @with_pool pool begin - bv = acquire!(pool, Bit, 100) # SubArray{Bool,1,BitVector,...} - ba = acquire!(pool, Bit, 10, 10) # ReshapedArray{Bool,2,...} - t = trues!(pool, 50) # Filled with true - f = falses!(pool, 50) # Filled with false + # View-based (standard) + bv = acquire!(pool, Bit, 100) # SubArray{Bool,1,BitVector,...} + + # SIMD-optimized (for performance-critical code) + bv_fast = unsafe_acquire!(pool, Bit, 100) # BitVector (real) + count(bv_fast) # ~140x faster than count(bv) + + # Convenience functions + t = trues!(pool, 50) # Filled with true + f = falses!(pool, 50) # Filled with false end ``` -See also: [`trues!`](@ref), [`falses!`](@ref) +See also: [`trues!`](@ref), [`falses!`](@ref), [`Bit`](@ref) """ mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} # --- Storage --- diff --git a/src/utils.jl b/src/utils.jl index 5950744..cf12ea0 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -116,6 +116,43 @@ function pool_stats(tp::TypedPool{T}; io::IO=stdout, indent::Int=0, name::String return nothing end +""" + pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String="") + +Print statistics for a BitTypedPool. +""" +function pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String="") + prefix = " "^indent + type_name = isempty(name) ? 
"Bit" : name + + n_arrays = length(tp.vectors) + if n_arrays == 0 + printstyled(io, prefix, type_name, color=:cyan) + printstyled(io, " (empty)\n", color=:dark_gray) + return + end + + total_bits = sum(length(v) for v in tp.vectors) + total_bytes = sum(sizeof(v.chunks) for v in tp.vectors) + bytes_str = Base.format_bytes(total_bytes) + + # Header + printstyled(io, prefix, type_name, color=:cyan) + println(io) + + # Stats + printstyled(io, prefix, " slots: ", color=:dark_gray) + printstyled(io, n_arrays, color=:blue) + printstyled(io, " (active: ", color=:dark_gray) + printstyled(io, tp.n_active, color=:blue) + printstyled(io, ")\n", color=:dark_gray) + + printstyled(io, prefix, " bits: ", color=:dark_gray) + printstyled(io, total_bits, color=:blue) + printstyled(io, " ($bytes_str)\n", color=:dark_gray) + return nothing +end + """ pool_stats(pool::AdaptiveArrayPool; io::IO=stdout) @@ -141,8 +178,13 @@ function pool_stats(pool::AdaptiveArrayPool; io::IO=stdout) foreach_fixed_slot(pool) do tp if !isempty(tp.vectors) has_content = true - T = typeof(tp).parameters[1] # Extract T from TypedPool{T} - pool_stats(tp; io, indent=2, name="$T (fixed)") + name = if tp isa BitTypedPool + "Bit (fixed)" + else + T = typeof(tp).parameters[1] # Extract T from TypedPool{T} + "$T (fixed)" + end + pool_stats(tp; io, indent=2, name) end end @@ -228,6 +270,30 @@ function Base.show(io::IO, ::MIME"text/plain", tp::TypedPool{T}) where {T} pool_stats(tp; io, name="TypedPool{$T}") end +# Compact one-line show for BitTypedPool +function Base.show(io::IO, tp::BitTypedPool) + n_vectors = length(tp.vectors) + if n_vectors == 0 + print(io, "BitTypedPool(empty)") + else + total_bits = sum(length(v) for v in tp.vectors) + print(io, "BitTypedPool(slots=$n_vectors, active=$(tp.n_active), bits=$total_bits)") + end +end + +# Multi-line show for BitTypedPool +function Base.show(io::IO, ::MIME"text/plain", tp::BitTypedPool) + n_vectors = length(tp.vectors) + println(io, "BitTypedPool:") + println(io, " slots: $n_vectors") + println(io, " active: $(tp.n_active)") + if n_vectors > 0 + total_bits = sum(length(v) for v in tp.vectors) + total_bytes = sum(sizeof(v.chunks) for v in tp.vectors) + println(io, " bits: $total_bits ($(Base.format_bytes(total_bytes)))") + end +end + # Compact one-line show for AdaptiveArrayPool function Base.show(io::IO, pool::AdaptiveArrayPool) n_types = Ref(0) diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index a5dbeca..e5394de 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -445,24 +445,60 @@ @test outer_result == (100, 0) end - @testset "unsafe_acquire! not supported" begin + @testset "unsafe_acquire! returns BitVector with shared chunks" begin pool = AdaptiveArrayPool() - # unsafe_acquire! with Bit should throw a clear error - @test_throws ArgumentError unsafe_acquire!(pool, Bit, 100) - @test_throws ArgumentError unsafe_acquire!(pool, Bit, 10, 10) + # unsafe_acquire! 
with Bit returns a real BitVector (not SubArray) + bv = unsafe_acquire!(pool, Bit, 100) + @test bv isa BitVector + @test length(bv) == 100 - # Tuple form (covers acquire.jl:251) - @test_throws ArgumentError unsafe_acquire!(pool, Bit, (10, 10)) + # N-D returns BitArray (reshape of BitVector becomes BitArray in Julia) + ba = unsafe_acquire!(pool, Bit, 10, 10) + @test ba isa BitMatrix # reshape(BitVector, dims) → BitArray + @test size(ba) == (10, 10) - # Verify the error message is helpful - try - unsafe_acquire!(pool, Bit, 100) - catch e - @test e isa ArgumentError - @test occursin("unsafe_acquire!", e.msg) - @test occursin("Bit", e.msg) - @test occursin("acquire!", e.msg) # Suggests alternative + # Tuple form + ba_tuple = unsafe_acquire!(pool, Bit, (10, 10)) + @test ba_tuple isa BitMatrix + @test size(ba_tuple) == (10, 10) + + # Verify chunks sharing (key feature!) + @with_pool pool2 begin + bv2 = unsafe_acquire!(pool2, Bit, 100) + pool_bv = pool2.bits.vectors[1] + @test bv2.chunks === pool_bv.chunks # Same chunks object! + + # Verify data is shared + bv2[1] = true + @test pool_bv[1] == true + bv2[1] = false + @test pool_bv[1] == false + end + end + + @testset "unsafe_acquire! SIMD performance" begin + # Verify that unsafe_acquire! preserves SIMD-optimized operations + pool = AdaptiveArrayPool() + + @with_pool pool begin + n = 10000 + + # Setup: fill with known pattern + bv_unsafe = unsafe_acquire!(pool, Bit, n) + fill!(bv_unsafe, true) + + # count() should work correctly + @test count(bv_unsafe) == n + + # Verify it's using the fast path (type check) + @test bv_unsafe isa BitVector + + # Compare with acquire! (SubArray) + bv_view = acquire!(pool, Bit, n) + fill!(bv_view, true) + @test count(bv_view) == n + @test bv_view isa SubArray end end From eeeb815ef127e2e8095031cae93a1ab4595b90cc Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 14:47:50 -0800 Subject: [PATCH 03/13] refactor: extract BitArray acquisition logic to dedicated file Separate BitArray-specific code from acquire.jl into bitarray.jl for improved maintainability and code organization. Moved to src/bitarray.jl: - allocate_vector(::BitTypedPool, n) dispatch - Base.zero/one(::Type{Bit}) overloads - get_bitvector_wrapper! (SIMD-optimized chunks sharing) - _unsafe_acquire_impl! for Bit type - DisabledPool fallbacks for Bit type No functional changes - all tests pass with same coverage. --- src/AdaptiveArrayPools.jl | 3 + src/acquire.jl | 142 ------------------------------- src/bitarray.jl | 174 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 177 insertions(+), 142 deletions(-) create mode 100644 src/bitarray.jl diff --git a/src/AdaptiveArrayPools.jl b/src/AdaptiveArrayPools.jl index 61f691a..7092822 100644 --- a/src/AdaptiveArrayPools.jl +++ b/src/AdaptiveArrayPools.jl @@ -28,6 +28,9 @@ include("utils.jl") # Acquisition operations: get_view!, acquire!, unsafe_acquire!, aliases include("acquire.jl") +# BitArray-specific acquisition (SIMD-optimized BitVector operations) +include("bitarray.jl") + # Convenience functions: zeros!, ones!, similar! 
include("convenience.jl") diff --git a/src/acquire.jl b/src/acquire.jl index 286a632..428738b 100644 --- a/src/acquire.jl +++ b/src/acquire.jl @@ -6,128 +6,12 @@ @inline allocate_vector(::AbstractTypedPool{T,Vector{T}}, n::Int) where {T} = Vector{T}(undef, n) -# BitTypedPool allocates BitVector (used when acquiring with Bit type) -@inline allocate_vector(::BitTypedPool, n::Int) = BitVector(undef, n) - -# Bit type returns Bool element type for fill operations (zero/one) -@inline Base.zero(::Type{Bit}) = false -@inline Base.one(::Type{Bit}) = true - # Wrap flat view into N-D array (dispatch point for extensions) @inline function wrap_array(::AbstractTypedPool{T,Vector{T}}, flat_view, dims::NTuple{N,Int}) where {T,N} unsafe_wrap(Array{T,N}, pointer(flat_view), dims) end -# ============================================================================== -# BitVector Wrapper (chunks sharing for SIMD performance) -# ============================================================================== - -""" - get_bitvector_wrapper!(tp::BitTypedPool, n::Int) -> BitVector - -Get a BitVector that shares `chunks` with the pooled BitVector. - -Unlike `get_view!` which returns a `SubArray` (loses SIMD optimizations), -this returns a real `BitVector` with shared chunks, preserving native -BitVector performance (~140x faster for `count()`, `sum()`, etc.). - -## Implementation -Creates a new BitVector shell and replaces its `chunks` field with the -pooled BitVector's chunks. Uses N-way cache for wrapper reuse. - -## Safety -The returned BitVector is only valid within the `@with_pool` scope. -Do NOT use after the scope ends (use-after-free risk). -""" -function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) - tp.n_active += 1 - idx = tp.n_active - - # 1. Pool expansion needed (new slot) - if idx > length(tp.vectors) - pool_bv = BitVector(undef, n) - push!(tp.vectors, pool_bv) - push!(tp.views, view(pool_bv, 1:n)) - push!(tp.view_lengths, n) - - # Create wrapper sharing chunks - wrapper = BitVector(undef, n) - wrapper.chunks = pool_bv.chunks - - # Expand N-way cache (CACHE_WAYS entries per slot) - for _ in 1:CACHE_WAYS - push!(tp.nd_arrays, nothing) - push!(tp.nd_dims, nothing) - push!(tp.nd_ptrs, UInt(0)) - end - push!(tp.nd_next_way, 0) - - # Cache in first way - base = (idx - 1) * CACHE_WAYS + 1 - @inbounds tp.nd_arrays[base] = wrapper - @inbounds tp.nd_dims[base] = n - @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) - - # Warn at powers of 2 (possible missing rewind!) - if idx >= 512 && (idx & (idx - 1)) == 0 - total_bits = sum(length, tp.vectors) - @warn "BitTypedPool growing large ($idx arrays, ~$(total_bits ÷ 8) bytes). Missing rewind!()?" - end - - return wrapper - end - - # 2. Check N-way cache for hit - @inbounds pool_bv = tp.vectors[idx] - current_ptr = UInt(pointer(pool_bv.chunks)) - - # Ensure cache slots exist for this index - n_slots_cached = length(tp.nd_next_way) - while idx > n_slots_cached - for _ in 1:CACHE_WAYS - push!(tp.nd_arrays, nothing) - push!(tp.nd_dims, nothing) - push!(tp.nd_ptrs, UInt(0)) - end - push!(tp.nd_next_way, 0) - n_slots_cached += 1 - end - - base = (idx - 1) * CACHE_WAYS - - # Linear search across all ways - for k in 1:CACHE_WAYS - cache_idx = base + k - @inbounds cached_n = tp.nd_dims[cache_idx] - @inbounds cached_ptr = tp.nd_ptrs[cache_idx] - - if cached_n == n && cached_ptr == current_ptr - return @inbounds tp.nd_arrays[cache_idx]::BitVector - end - end - - # 3. 
Cache miss - resize pool_bv if needed and create new wrapper - if length(pool_bv) < n - resize!(pool_bv, n) - @inbounds tp.views[idx] = view(pool_bv, 1:n) - @inbounds tp.view_lengths[idx] = n - end - - wrapper = BitVector(undef, n) - wrapper.chunks = pool_bv.chunks - - # Round-robin replacement - @inbounds way_offset = tp.nd_next_way[idx] - target_idx = base + way_offset + 1 - @inbounds tp.nd_arrays[target_idx] = wrapper - @inbounds tp.nd_dims[target_idx] = n - @inbounds tp.nd_ptrs[target_idx] = UInt(pointer(pool_bv.chunks)) - @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS - - return wrapper -end - # ============================================================================== # Helper: Overflow-Safe Product # ============================================================================== @@ -344,22 +228,6 @@ end # Similar-style @inline _unsafe_acquire_impl!(pool::AbstractArrayPool, x::AbstractArray) = _unsafe_acquire_impl!(pool, eltype(x), size(x)) -# Bit type: returns BitVector with shared chunks (SIMD optimized) -@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) - tp = get_typed_pool!(pool, Bit)::BitTypedPool - return get_bitvector_wrapper!(tp, n) -end - -@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} - total = prod(dims) - bv = _unsafe_acquire_impl!(pool, Bit, total) - return reshape(bv, dims) # ReshapedArray{Bool,N,BitVector,...} -end - -@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} - _unsafe_acquire_impl!(pool, Bit, dims...) -end - # ============================================================================== # Acquisition API (User-facing with untracked marking) # ============================================================================== @@ -560,16 +428,6 @@ const _acquire_array_impl! = _unsafe_acquire_impl! @inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = Array{T,N}(undef, dims) @inline unsafe_acquire!(::DisabledPool{:cpu}, x::AbstractArray) = similar(x) -# --- acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- -@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) -@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) -@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) - -# --- unsafe_acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- -@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) -@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) -@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) - # --- Generic DisabledPool fallbacks (unknown backend → error) --- @inline acquire!(::DisabledPool{B}, _args...) where {B} = _throw_backend_not_loaded(B) @inline unsafe_acquire!(::DisabledPool{B}, _args...) 
where {B} = _throw_backend_not_loaded(B) diff --git a/src/bitarray.jl b/src/bitarray.jl new file mode 100644 index 0000000..144389d --- /dev/null +++ b/src/bitarray.jl @@ -0,0 +1,174 @@ +# ============================================================================== +# BitArray Acquisition (SIMD-Optimized BitVector Operations) +# ============================================================================== +# +# This file contains BitArray-specific pool operations, separated from the +# generic Array acquisition code in acquire.jl for maintainability. +# +# Key components: +# - allocate_vector(::BitTypedPool, n) - BitVector allocation dispatch +# - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type +# - get_bitvector_wrapper! - SIMD-optimized BitVector with shared chunks +# - _unsafe_acquire_impl! for Bit - Raw BitVector/BitArray acquisition +# - DisabledPool fallbacks for Bit type +# +# Design rationale: +# - BitVector cannot use unsafe_wrap like Array, so it needs a different +# strategy for returning native BitVector instances. +# - The "chunks sharing" approach creates a new BitVector shell and replaces +# its internal chunks field, preserving ~140x faster SIMD operations. +# ============================================================================== + +# ============================================================================== +# Allocation Dispatch Points (BitArray-specific) +# ============================================================================== + +# BitTypedPool allocates BitVector (used when acquiring with Bit type) +@inline allocate_vector(::BitTypedPool, n::Int) = BitVector(undef, n) + +# Bit type returns Bool element type for fill operations (zero/one) +@inline Base.zero(::Type{Bit}) = false +@inline Base.one(::Type{Bit}) = true + +# ============================================================================== +# BitVector Wrapper (chunks sharing for SIMD performance) +# ============================================================================== + +""" + get_bitvector_wrapper!(tp::BitTypedPool, n::Int) -> BitVector + +Get a BitVector that shares `chunks` with the pooled BitVector. + +Unlike `get_view!` which returns a `SubArray` (loses SIMD optimizations), +this returns a real `BitVector` with shared chunks, preserving native +BitVector performance (~140x faster for `count()`, `sum()`, etc.). + +## Implementation +Creates a new BitVector shell and replaces its `chunks` field with the +pooled BitVector's chunks. Uses N-way cache for wrapper reuse. + +## Safety +The returned BitVector is only valid within the `@with_pool` scope. +Do NOT use after the scope ends (use-after-free risk). +""" +function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) + tp.n_active += 1 + idx = tp.n_active + + # 1. Pool expansion needed (new slot) + if idx > length(tp.vectors) + pool_bv = BitVector(undef, n) + push!(tp.vectors, pool_bv) + push!(tp.views, view(pool_bv, 1:n)) + push!(tp.view_lengths, n) + + # Create wrapper sharing chunks + wrapper = BitVector(undef, n) + wrapper.chunks = pool_bv.chunks + + # Expand N-way cache (CACHE_WAYS entries per slot) + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + + # Cache in first way + base = (idx - 1) * CACHE_WAYS + 1 + @inbounds tp.nd_arrays[base] = wrapper + @inbounds tp.nd_dims[base] = n + @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) + + # Warn at powers of 2 (possible missing rewind!) 
+ if idx >= 512 && (idx & (idx - 1)) == 0 + total_bits = sum(length, tp.vectors) + @warn "BitTypedPool growing large ($idx arrays, ~$(total_bits ÷ 8) bytes). Missing rewind!()?" + end + + return wrapper + end + + # 2. Check N-way cache for hit + @inbounds pool_bv = tp.vectors[idx] + current_ptr = UInt(pointer(pool_bv.chunks)) + + # Ensure cache slots exist for this index + n_slots_cached = length(tp.nd_next_way) + while idx > n_slots_cached + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + n_slots_cached += 1 + end + + base = (idx - 1) * CACHE_WAYS + + # Linear search across all ways + for k in 1:CACHE_WAYS + cache_idx = base + k + @inbounds cached_n = tp.nd_dims[cache_idx] + @inbounds cached_ptr = tp.nd_ptrs[cache_idx] + + if cached_n == n && cached_ptr == current_ptr + return @inbounds tp.nd_arrays[cache_idx]::BitVector + end + end + + # 3. Cache miss - resize pool_bv if needed and create new wrapper + if length(pool_bv) < n + resize!(pool_bv, n) + @inbounds tp.views[idx] = view(pool_bv, 1:n) + @inbounds tp.view_lengths[idx] = n + end + + wrapper = BitVector(undef, n) + wrapper.chunks = pool_bv.chunks + + # Round-robin replacement + @inbounds way_offset = tp.nd_next_way[idx] + target_idx = base + way_offset + 1 + @inbounds tp.nd_arrays[target_idx] = wrapper + @inbounds tp.nd_dims[target_idx] = n + @inbounds tp.nd_ptrs[target_idx] = UInt(pointer(pool_bv.chunks)) + @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS + + return wrapper +end + +# ============================================================================== +# Unsafe Acquire Implementation (Bit type) +# ============================================================================== + +# Bit type: returns BitVector with shared chunks (SIMD optimized) +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitvector_wrapper!(tp, n) +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + total = prod(dims) + bv = _unsafe_acquire_impl!(pool, Bit, total) + return reshape(bv, dims) # ReshapedArray{Bool,N,BitVector,...} +end + +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + _unsafe_acquire_impl!(pool, Bit, dims...) +end + +# ============================================================================== +# DisabledPool Fallbacks (Bit type) +# ============================================================================== + +# --- acquire! for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) + +# --- unsafe_acquire! 
for DisabledPool{:cpu} with Bit type (returns BitArray) --- +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, n::Int) = BitVector(undef, n) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::Vararg{Int,N}) where {N} = BitArray{N}(undef, dims) +@inline unsafe_acquire!(::DisabledPool{:cpu}, ::Type{Bit}, dims::NTuple{N,Int}) where {N} = BitArray{N}(undef, dims) From 322ebe4a3eefe16eb8479480c061d74ab447a824 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 16:09:24 -0800 Subject: [PATCH 04/13] feat: unify Bit type API to always return BitVector for SIMD performance Both acquire! and unsafe_acquire! now return BitVector for Bit type, eliminating the need for users to choose between APIs to get optimal performance. The _acquire_impl! for Bit now delegates to _unsafe_acquire_impl!, ensuring ~140x faster SIMD operations (count, sum, bitwise) are always used. Also fixes BitVector wrapper sizing to use exact length (!=) instead of minimum length (<), ensuring fill!/count! iterate only over relevant chunks. --- src/bitarray.jl | 66 ++++++++++++++++++++++---- src/types.jl | 74 +++++++++++++++++++---------- test/test_bitarray.jl | 108 +++++++++++++++++++++++++++++------------- 3 files changed, 180 insertions(+), 68 deletions(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index 144389d..fd64bff 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -1,5 +1,5 @@ # ============================================================================== -# BitArray Acquisition (SIMD-Optimized BitVector Operations) +# BitArray Acquisition (Unified BitVector API) # ============================================================================== # # This file contains BitArray-specific pool operations, separated from the @@ -9,14 +9,34 @@ # - allocate_vector(::BitTypedPool, n) - BitVector allocation dispatch # - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type # - get_bitvector_wrapper! - SIMD-optimized BitVector with shared chunks +# - _acquire_impl! for Bit - Delegates to _unsafe_acquire_impl! for performance # - _unsafe_acquire_impl! for Bit - Raw BitVector/BitArray acquisition # - DisabledPool fallbacks for Bit type # -# Design rationale: -# - BitVector cannot use unsafe_wrap like Array, so it needs a different -# strategy for returning native BitVector instances. -# - The "chunks sharing" approach creates a new BitVector shell and replaces -# its internal chunks field, preserving ~140x faster SIMD operations. +# Design Decision: Unified BitVector Return Type +# ============================================= +# Unlike regular types where acquire! returns SubArray and unsafe_acquire! +# returns Array, for Bit type BOTH return BitVector. This design choice is +# intentional for several reasons: +# +# 1. **SIMD Performance**: BitVector operations like `count()`, `sum()`, and +# bitwise operations are ~140x faster than their SubArray equivalents +# because they use SIMD-optimized chunked algorithms. +# +# 2. **API Simplicity**: Users always get BitVector regardless of which API +# they call. No need to remember "use unsafe_acquire! for performance". +# +# 3. **Semantic Clarity**: The "unsafe" in unsafe_acquire! refers to memory +# safety concerns (use-after-free risk). BitVector already handles memory +# efficiently (1 bit per element), so the naming would be misleading. +# +# 4. **Backwards Compatibility**: Code using trues!/falses! just works with +# optimal performance - these convenience functions now return BitVector. 
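In practice the unification means every `Bit`-typed request comes back as a native bit array. A short sketch under the same assumptions as the tests in this series (exported `acquire!`/`unsafe_acquire!`/`trues!` names, illustrative sizes):

```julia
using AdaptiveArrayPools

pool = AdaptiveArrayPool()

@with_pool pool begin
    a = acquire!(pool, Bit, 256)          # BitVector (delegates to the unsafe path)
    b = unsafe_acquire!(pool, Bit, 256)   # BitVector as well - same return type
    t = trues!(pool, 256)                 # BitVector filled with true
    m = acquire!(pool, Bit, 16, 16)       # BitMatrix via reshape of pooled bits

    @assert a isa BitVector && b isa BitVector && t isa BitVector
    @assert m isa BitMatrix
    @assert count(t) == 256               # chunked SIMD count, no per-bit loop
end
```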
+# +# Implementation: +# - _acquire_impl!(pool, Bit, ...) delegates to _unsafe_acquire_impl! +# - get_bitvector_wrapper! creates BitVector shells sharing pool's chunks +# - N-D requests return reshaped BitArrays (reshape preserves chunk sharing) # ============================================================================== # ============================================================================== @@ -118,8 +138,11 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) end end - # 3. Cache miss - resize pool_bv if needed and create new wrapper - if length(pool_bv) < n + # 3. Cache miss - resize pool_bv to EXACTLY n elements and create new wrapper + # Unlike regular arrays where we only grow, BitVector wrappers MUST have exactly + # the right number of chunks. Otherwise fill!()/count() iterate over all chunks, + # not just the bits within wrapper.len, causing incorrect behavior. + if length(pool_bv) != n resize!(pool_bv, n) @inbounds tp.views[idx] = view(pool_bv, 1:n) @inbounds tp.view_lengths[idx] = n @@ -139,6 +162,31 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) return wrapper end +# ============================================================================== +# Acquire Implementation (Bit type → delegates to unsafe_acquire for performance) +# ============================================================================== +# +# Unlike other types where acquire! returns SubArray (view-based) and +# unsafe_acquire! returns Array (raw), Bit type always returns BitVector. +# This is because BitVector's SIMD-optimized operations (count, sum, etc.) +# are ~140x faster than SubArray equivalents. +# +# The delegation is transparent: users calling acquire!(pool, Bit, n) get +# BitVector without needing to know about unsafe_acquire!. + +# Bit type: delegates to _unsafe_acquire_impl! for SIMD performance +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) + return _unsafe_acquire_impl!(pool, Bit, n) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} + return _unsafe_acquire_impl!(pool, Bit, dims...) +end + +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} + return _unsafe_acquire_impl!(pool, Bit, dims...) +end + # ============================================================================== # Unsafe Acquire Implementation (Bit type) # ============================================================================== @@ -152,7 +200,7 @@ end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} total = prod(dims) bv = _unsafe_acquire_impl!(pool, Bit, total) - return reshape(bv, dims) # ReshapedArray{Bool,N,BitVector,...} + return reshape(bv, dims) # BitArray{N} (Julia's reshape on BitVector returns BitArray) end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} diff --git a/src/types.jl b/src/types.jl index 89f2027..51bab0f 100644 --- a/src/types.jl +++ b/src/types.jl @@ -225,28 +225,45 @@ bit-packed arrays (1 bit per element vs 1 byte for `Vector{Bool}`). 
## Usage ```julia @with_pool pool begin - # BitVector view (1 bit per element, ~8x memory savings) + # BitVector (1 bit per element, ~8x memory savings) bv = acquire!(pool, Bit, 1000) # vs Vector{Bool} (1 byte per element) vb = acquire!(pool, Bool, 1000) # Convenience functions work too - mask = zeros!(pool, Bit, 100) # BitVector filled with false - flags = ones!(pool, Bit, 100) # BitVector filled with true + mask = falses!(pool, 100) # BitVector filled with false + flags = trues!(pool, 100) # BitVector filled with true end ``` -## Return Types -- **1D**: `SubArray{Bool,1,BitVector,...}` -- **N-D**: `ReshapedArray{Bool,N,...}` (reshaped view of 1D BitVector) +## Return Types (Unified for Performance) +Unlike other types, `Bit` always returns native `BitVector`/`BitArray`: +- **1D**: `BitVector` (both `acquire!` and `unsafe_acquire!`) +- **N-D**: `BitArray{N}` (reshaped, preserves SIMD optimization) -## Performance Note -`unsafe_acquire!(pool, Bit, n)` returns a real `BitVector` with shared chunks, -preserving SIMD-optimized operations like `count()` (~140x faster than SubArray). -Use this when you need native BitVector performance. +This design ensures users always get SIMD-optimized performance without +needing to remember which API to use. -See also: [`acquire!`](@ref), [`BitTypedPool`](@ref) +## Performance +`BitVector` operations like `count()`, `sum()`, and bitwise operations are +~140x faster than equivalent operations on `SubArray{Bool}` because they +use SIMD-optimized algorithms on packed 64-bit chunks. + +```julia +@with_pool pool begin + bv = acquire!(pool, Bit, 10000) + fill!(bv, true) + count(bv) # Uses fast SIMD path automatically +end +``` + +## Memory Safety +The returned `BitVector` shares its internal `chunks` array with the pool. +It is only valid within the `@with_pool` scope - using it after the scope +ends leads to undefined behavior (use-after-free risk). + +See also: [`trues!`](@ref), [`falses!`](@ref), [`BitTypedPool`](@ref) """ struct Bit end @@ -262,18 +279,21 @@ Specialized pool for `BitVector` arrays with memory reuse. Unlike `TypedPool{Bool}` which stores `Vector{Bool}` (1 byte per element), this pool stores `BitVector` (1 bit per element, ~8x memory efficiency). -## Acquisition Methods -- `acquire!(pool, Bit, n)` → `SubArray{Bool,1,BitVector,...}` (view-based) -- `unsafe_acquire!(pool, Bit, n)` → `BitVector` (chunks-sharing, SIMD optimized) +## Unified API (Always Returns BitVector) +Unlike other types, both `acquire!` and `unsafe_acquire!` return `BitVector` +for the `Bit` type. This design ensures users always get SIMD-optimized +performance without needing to choose between APIs. -Use `unsafe_acquire!` when you need native BitVector operations like `count()`, -`sum()`, or bitwise operations - these are ~140x faster than SubArray equivalents. 
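The exact-length resize introduced in this patch (`length(pool_bv) != n` instead of `<`) exists because Julia's `BitArray` routines work chunk-wise and assume the chunk count matches the stated bit length. A hypothetical sketch of the failure mode it guards against; the direct field access is for illustration only:

```julia
# Backing vector left at an older, larger size: 1000 bits -> 16 UInt64 chunks
pool_bv = BitVector(undef, 1000)
fill!(pool_bv, true)

# A 10-bit wrapper should carry exactly 1 chunk ...
wrapper = BitVector(undef, 10)
wrapper.chunks = pool_bv.chunks     # ... but now drags along all 16 chunks

# Chunk-wise operations scan every chunk regardless of `length(wrapper)`,
# so the count includes bits far past index 10:
count(wrapper)                      # 1000 here, even though length(wrapper) == 10
```

Resizing the pooled vector to exactly `n` bits before sharing its chunks keeps the wrapper's chunk count consistent with its length, so `fill!` and `count` only touch the relevant bits.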
+- `acquire!(pool, Bit, n)` → `BitVector` (SIMD optimized) +- `unsafe_acquire!(pool, Bit, n)` → `BitVector` (same behavior) +- `trues!(pool, n)` → `BitVector` filled with `true` +- `falses!(pool, n)` → `BitVector` filled with `false` ## Fields - `vectors`: Backing `BitVector` storage -- `views`: Cached `SubArray` views for `acquire!` +- `views`: Cached `SubArray` views (legacy, maintained for compatibility) - `view_lengths`: Cached lengths for fast comparison -- `nd_arrays`: Cached wrapper BitVectors for `unsafe_acquire!` (chunks sharing) +- `nd_arrays`: Cached wrapper BitVectors (chunks sharing) - `nd_dims`: Cached lengths for wrapper cache validation - `nd_ptrs`: Cached chunk pointers for invalidation detection - `nd_next_way`: Round-robin counter for N-way cache @@ -283,19 +303,21 @@ Use `unsafe_acquire!` when you need native BitVector operations like `count()`, ## Usage ```julia @with_pool pool begin - # View-based (standard) - bv = acquire!(pool, Bit, 100) # SubArray{Bool,1,BitVector,...} - - # SIMD-optimized (for performance-critical code) - bv_fast = unsafe_acquire!(pool, Bit, 100) # BitVector (real) - count(bv_fast) # ~140x faster than count(bv) + # All return BitVector with SIMD performance + bv = acquire!(pool, Bit, 100) # BitVector + count(bv) # Fast SIMD path # Convenience functions - t = trues!(pool, 50) # Filled with true - f = falses!(pool, 50) # Filled with false + t = trues!(pool, 50) # BitVector filled with true + f = falses!(pool, 50) # BitVector filled with false end ``` +## Performance +Operations like `count()`, `sum()`, and bitwise operations are ~140x faster +than equivalent operations on `SubArray{Bool}` because `BitVector` uses +SIMD-optimized algorithms on packed 64-bit chunks. + See also: [`trues!`](@ref), [`falses!`](@ref), [`Bit`](@ref) """ mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index e5394de..3e0a267 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -26,13 +26,14 @@ @test isempty(pool.bits.vectors) end - @testset "acquire!(pool, Bit, n) - 1D" begin + @testset "acquire!(pool, Bit, n) - 1D (returns BitVector for SIMD performance)" begin pool = AdaptiveArrayPool() bv = acquire!(pool, Bit, 100) @test length(bv) == 100 @test eltype(bv) == Bool - @test bv isa SubArray{Bool, 1, BitVector} + # Returns BitVector (not SubArray) for SIMD-optimized operations + @test bv isa BitVector @test pool.bits.n_active == 1 # Write and read back @@ -45,6 +46,7 @@ # Second acquire bv2 = acquire!(pool, Bit, 50) @test length(bv2) == 50 + @test bv2 isa BitVector @test pool.bits.n_active == 2 # Independent values @@ -53,14 +55,15 @@ @test count(bv) == 99 # bv unchanged end - @testset "acquire!(pool, Bit, dims...) - N-D" begin + @testset "acquire!(pool, Bit, dims...) 
- N-D (returns BitArray for SIMD performance)" begin pool = AdaptiveArrayPool() - # 2D + # 2D - returns BitMatrix (Julia's reshape(BitVector, dims) returns BitArray) ba2 = acquire!(pool, Bit, 10, 10) @test size(ba2) == (10, 10) @test eltype(ba2) == Bool - @test ba2 isa Base.ReshapedArray + # Note: reshape(BitVector, dims) returns BitArray{N}, not ReshapedArray + @test ba2 isa BitMatrix @test pool.bits.n_active == 1 # Test indexing @@ -75,108 +78,126 @@ # 3D ba3 = acquire!(pool, Bit, 4, 5, 3) @test size(ba3) == (4, 5, 3) + @test ba3 isa BitArray{3} @test pool.bits.n_active == 2 # Tuple form ba_tuple = acquire!(pool, Bit, (3, 4, 2)) @test size(ba_tuple) == (3, 4, 2) + @test ba_tuple isa BitArray{3} @test pool.bits.n_active == 3 end - @testset "ones!(pool, Bit, dims...) - filled with true" begin + @testset "ones!(pool, Bit, dims...) - BitVector filled with true" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector t1 = ones!(pool, Bit, 100) @test length(t1) == 100 @test all(t1) + @test t1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) t2 = ones!(pool, Bit, 10, 10) @test size(t2) == (10, 10) @test all(t2) @test count(t2) == 100 + @test t2 isa BitMatrix # Tuple form t3 = ones!(pool, Bit, (5, 5, 4)) @test size(t3) == (5, 5, 4) @test all(t3) + @test t3 isa BitArray{3} end - @testset "zeros!(pool, Bit, dims...) - filled with false" begin + @testset "zeros!(pool, Bit, dims...) - BitVector filled with false" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector f1 = zeros!(pool, Bit, 100) @test length(f1) == 100 @test !any(f1) + @test f1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) f2 = zeros!(pool, Bit, 10, 10) @test size(f2) == (10, 10) @test !any(f2) @test count(f2) == 0 + @test f2 isa BitMatrix # Tuple form f3 = zeros!(pool, Bit, (5, 5, 4)) @test size(f3) == (5, 5, 4) @test !any(f3) + @test f3 isa BitArray{3} end - @testset "trues!(pool, dims...) - convenience for BitArray filled with true" begin + @testset "trues!(pool, dims...) - BitVector filled with true (SIMD optimized)" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector t1 = trues!(pool, 100) @test length(t1) == 100 @test all(t1) @test eltype(t1) == Bool + @test t1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) t2 = trues!(pool, 10, 10) @test size(t2) == (10, 10) @test all(t2) @test count(t2) == 100 + @test t2 isa BitMatrix # Tuple form t3 = trues!(pool, (5, 5, 4)) @test size(t3) == (5, 5, 4) @test all(t3) + @test t3 isa BitArray{3} # Equivalent to ones!(pool, Bit, ...) t4 = trues!(pool, 50) t5 = ones!(pool, Bit, 50) @test all(t4 .== t5) + @test t4 isa BitVector + @test t5 isa BitVector end - @testset "falses!(pool, dims...) - convenience for BitArray filled with false" begin + @testset "falses!(pool, dims...) - BitVector filled with false (SIMD optimized)" begin pool = AdaptiveArrayPool() - # 1D + # 1D - returns BitVector f1 = falses!(pool, 100) @test length(f1) == 100 @test !any(f1) @test eltype(f1) == Bool + @test f1 isa BitVector @test pool.bits.n_active == 1 - # 2D + # 2D - returns BitMatrix (reshape of BitVector) f2 = falses!(pool, 10, 10) @test size(f2) == (10, 10) @test !any(f2) @test count(f2) == 0 + @test f2 isa BitMatrix # Tuple form f3 = falses!(pool, (5, 5, 4)) @test size(f3) == (5, 5, 4) @test !any(f3) + @test f3 isa BitArray{3} # Equivalent to zeros!(pool, Bit, ...) 
f4 = falses!(pool, 50) f5 = zeros!(pool, Bit, 50) @test all(f4 .== f5) + @test f4 isa BitVector + @test f5 isa BitVector end @testset "State management" begin @@ -405,14 +426,14 @@ @testset "Mixed Bool types" begin pool = AdaptiveArrayPool() - # Vector{Bool} via acquire! with Bool + # Vector{Bool} via acquire! with Bool - returns SubArray (view) vb = acquire!(pool, Bool, 100) @test vb isa SubArray{Bool, 1, Vector{Bool}} @test pool.bool.n_active == 1 - # BitVector via acquire! with Bit + # BitVector via acquire! with Bit - returns BitVector (for SIMD) bv = acquire!(pool, Bit, 100) - @test bv isa SubArray{Bool, 1, BitVector} + @test bv isa BitVector # Note: Bit returns BitVector, not SubArray @test pool.bits.n_active == 1 # Both should work independently @@ -477,28 +498,28 @@ end end - @testset "unsafe_acquire! SIMD performance" begin - # Verify that unsafe_acquire! preserves SIMD-optimized operations + @testset "Unified BitVector API - both acquire! and unsafe_acquire! return BitVector" begin + # Both acquire! and unsafe_acquire! return BitVector for Bit type + # This is a deliberate design choice for SIMD performance pool = AdaptiveArrayPool() @with_pool pool begin n = 10000 - # Setup: fill with known pattern + # unsafe_acquire! returns BitVector bv_unsafe = unsafe_acquire!(pool, Bit, n) fill!(bv_unsafe, true) - - # count() should work correctly @test count(bv_unsafe) == n - - # Verify it's using the fast path (type check) @test bv_unsafe isa BitVector - # Compare with acquire! (SubArray) - bv_view = acquire!(pool, Bit, n) - fill!(bv_view, true) - @test count(bv_view) == n - @test bv_view isa SubArray + # acquire! ALSO returns BitVector (not SubArray) + bv_acquire = acquire!(pool, Bit, n) + fill!(bv_acquire, true) + @test count(bv_acquire) == n + @test bv_acquire isa BitVector # Same type as unsafe_acquire! + + # Both benefit from SIMD-optimized count() + # (No performance difference since both return BitVector) end end @@ -517,6 +538,12 @@ @test eltype(v_bool) == Bool @test eltype(v_bit) == Bool + # Note: acquire! returns SubArray for most types, but BitVector for Bit + @test v_f64 isa SubArray + @test v_i32 isa SubArray + @test v_bool isa SubArray + @test v_bit isa BitVector # Special case for SIMD performance + # zeros!/ones! work consistently z_f64 = zeros!(pool, Float64, 10) z_bit = zeros!(pool, Bit, 10) @@ -527,29 +554,37 @@ @test !any(z_bit) @test all(o_f64 .== 1.0) @test all(o_bit) + + # Type consistency for convenience functions + @test z_bit isa BitVector + @test o_bit isa BitVector end - @testset "NTuple form coverage" begin + @testset "NTuple form coverage (all return BitArray types)" begin pool = AdaptiveArrayPool() # Test NTuple forms for trues!/falses! (covers _trues_impl! and _falses_impl! NTuple overloads) t_tuple = trues!(pool, (5, 5)) @test size(t_tuple) == (5, 5) @test all(t_tuple) + @test t_tuple isa BitMatrix f_tuple = falses!(pool, (5, 5)) @test size(f_tuple) == (5, 5) @test !any(f_tuple) + @test f_tuple isa BitMatrix # Test NTuple forms for zeros!/ones! with Bit type # (covers _zeros_impl! and _ones_impl! 
with Bit NTuple overloads) z_bit_tuple = zeros!(pool, Bit, (4, 4)) @test size(z_bit_tuple) == (4, 4) @test !any(z_bit_tuple) + @test z_bit_tuple isa BitMatrix o_bit_tuple = ones!(pool, Bit, (4, 4)) @test size(o_bit_tuple) == (4, 4) @test all(o_bit_tuple) + @test o_bit_tuple isa BitMatrix end @testset "Generic DisabledPool fallback for unknown backend" begin @@ -600,10 +635,17 @@ z = AdaptiveArrayPools._zeros_impl!(pool, Bit, (3, 3)) @test size(z) == (3, 3) @test !any(z) + @test z isa BitMatrix o = AdaptiveArrayPools._ones_impl!(pool, Bit, (3, 3)) @test size(o) == (3, 3) @test all(o) + @test o isa BitMatrix + + # Test _acquire_impl! returns BitVector (not SubArray) + bv = AdaptiveArrayPools._acquire_impl!(pool, Bit, 100) + @test bv isa BitVector + @test length(bv) == 100 end end # BitArray Support From 2434629008daa8b6f815daea10e55b0b3984cebc Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 16:21:36 -0800 Subject: [PATCH 05/13] fix: replace prod with safe_prod in _unsafe_acquire_impl! for improved safety --- src/bitarray.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index fd64bff..353c9e9 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -198,7 +198,7 @@ end end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} - total = prod(dims) + total = safe_prod(dims) bv = _unsafe_acquire_impl!(pool, Bit, total) return reshape(bv, dims) # BitArray{N} (Julia's reshape on BitVector returns BitArray) end From c63a84363185a9199238d35013e9c1412d208f45 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 16:37:16 -0800 Subject: [PATCH 06/13] fix: update performance metrics for BitVector operations in documentation --- src/bitarray.jl | 6 +++--- src/types.jl | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index 353c9e9..2a061a7 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -20,7 +20,7 @@ # intentional for several reasons: # # 1. **SIMD Performance**: BitVector operations like `count()`, `sum()`, and -# bitwise operations are ~140x faster than their SubArray equivalents +# bitwise operations are ~(10x ~ 100x) faster than their SubArray equivalents # because they use SIMD-optimized chunked algorithms. # # 2. **API Simplicity**: Users always get BitVector regardless of which API @@ -61,7 +61,7 @@ Get a BitVector that shares `chunks` with the pooled BitVector. Unlike `get_view!` which returns a `SubArray` (loses SIMD optimizations), this returns a real `BitVector` with shared chunks, preserving native -BitVector performance (~140x faster for `count()`, `sum()`, etc.). +BitVector performance (~(10x ~ 100x) faster for `count()`, `sum()`, etc.). ## Implementation Creates a new BitVector shell and replaces its `chunks` field with the @@ -169,7 +169,7 @@ end # Unlike other types where acquire! returns SubArray (view-based) and # unsafe_acquire! returns Array (raw), Bit type always returns BitVector. # This is because BitVector's SIMD-optimized operations (count, sum, etc.) -# are ~140x faster than SubArray equivalents. +# are ~(10x ~ 100x) faster than SubArray equivalents. # # The delegation is transparent: users calling acquire!(pool, Bit, n) get # BitVector without needing to know about unsafe_acquire!. diff --git a/src/types.jl b/src/types.jl index 51bab0f..0f2e8b5 100644 --- a/src/types.jl +++ b/src/types.jl @@ -247,7 +247,7 @@ needing to remember which API to use. 
## Performance `BitVector` operations like `count()`, `sum()`, and bitwise operations are -~140x faster than equivalent operations on `SubArray{Bool}` because they +~(10x ~ 100x) faster than equivalent operations on `SubArray{Bool}` because they use SIMD-optimized algorithms on packed 64-bit chunks. ```julia @@ -314,7 +314,7 @@ end ``` ## Performance -Operations like `count()`, `sum()`, and bitwise operations are ~140x faster +Operations like `count()`, `sum()`, and bitwise operations are ~(10x ~ 100x) faster than equivalent operations on `SubArray{Bool}` because `BitVector` uses SIMD-optimized algorithms on packed 64-bit chunks. From 8ef74e5796893e1d140155d5c9b14a3715249603 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 16:43:58 -0800 Subject: [PATCH 07/13] docs: clarify BitVector and BitArray usage in documentation for performance optimization --- docs/src/features/bit-arrays.md | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/docs/src/features/bit-arrays.md b/docs/src/features/bit-arrays.md index ec21d39..25b1823 100644 --- a/docs/src/features/bit-arrays.md +++ b/docs/src/features/bit-arrays.md @@ -14,7 +14,7 @@ To distinguish between standard boolean arrays (`Vector{Bool}`, 1 byte/element) ## Usage ### 1D Arrays (BitVector) -For 1D arrays, `acquire!` returns a view into a pooled `BitVector`. +For 1D arrays, `acquire!` returns a native `BitVector`. This design choice enables full SIMD optimization, making operations significantly faster (10x~100x) than using views. ```julia @with_pool pool begin @@ -25,17 +25,17 @@ For 1D arrays, `acquire!` returns a view into a pooled `BitVector`. bv .= true bv[1] = false - # Supports standard operations + # Supports standard operations with full SIMD acceleration count(bv) end ``` -### N-D Arrays (BitArray / Reshaped) -For multi-dimensional arrays, `acquire!` returns a `ReshapedArray` wrapper around the linear `BitVector`. This maintains zero-allocation efficiency while providing N-D indexing. +### N-D Arrays (BitArray) +For multi-dimensional arrays, `acquire!` returns a `BitArray{N}` (specifically `BitMatrix` for 2D). This preserves the packed memory layout and SIMD benefits while providing N-D indexing. ```julia @with_pool pool begin - # 100x100 bit matrix + # 100x100 bit matrix (returns BitMatrix) mask = zeros!(pool, Bit, 100, 100) mask[5, 5] = true @@ -68,11 +68,17 @@ end Note: `zeros!(pool, Bit, ...)` and `ones!(pool, Bit, ...)` are also supported (aliased to `falses!` and `trues!`). ``` -## How It Works +## Performance & Safety -The pool maintains a separate `BitTypedPool` specifically for `BitVector` storage. -- **Sentinel**: `acquire!(..., Bit, ...)` dispatches to this special pool. -- **Views**: 1D returns `SubArray{Bool, 1, BitVector, ...}`. -- **Reshaping**: N-D returns `ReshapedArray{Bool, N, SubArray{...}}`. +### Why Native BitVector? +The pool returns native `BitVector`/`BitArray` types instead of `SubArray` views for **performance**. +Operations like `count()`, `sum()`, and bitwise broadcasting are **10x~100x faster** on native bit arrays because they utilize SIMD instructions on packed 64-bit chunks. -This ensures that even for complex shapes, the underlying storage is always a compact `BitVector` reused from the pool. +### ⚠️ Important: Do Not Resize + +While the returned arrays are standard `BitVector` types, they share their underlying memory chunks with the pool. + +!!! 
warning "Do Not Resize" + **NEVER** resize (`push!`, `pop!`, `resize!`) a pooled `BitVector` or `BitArray`. + + The underlying memory is owned and managed by the pool. Resizing it will detach it from the pool or potentially corrupt the shared state. Treat these arrays as **fixed-size** scratch buffers only. From ade8f99683428d86660bc3d9b9b6c657e3afd899 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 21:45:47 -0800 Subject: [PATCH 08/13] feat: enhance BitArray and BitTypedPool functionality with improved statistics and testing --- src/bitarray.jl | 23 ++------- src/utils.jl | 113 +++++++++++------------------------------- test/test_bitarray.jl | 17 ++++++- test/test_utils.jl | 48 ++++++++++++++++++ 4 files changed, 97 insertions(+), 104 deletions(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index 2a061a7..5a8488f 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -6,7 +6,6 @@ # generic Array acquisition code in acquire.jl for maintainability. # # Key components: -# - allocate_vector(::BitTypedPool, n) - BitVector allocation dispatch # - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type # - get_bitvector_wrapper! - SIMD-optimized BitVector with shared chunks # - _acquire_impl! for Bit - Delegates to _unsafe_acquire_impl! for performance @@ -40,12 +39,9 @@ # ============================================================================== # ============================================================================== -# Allocation Dispatch Points (BitArray-specific) +# Fill Value Dispatch (BitArray-specific) # ============================================================================== -# BitTypedPool allocates BitVector (used when acquiring with Bit type) -@inline allocate_vector(::BitTypedPool, n::Int) = BitVector(undef, n) - # Bit type returns Bool element type for fill operations (zero/one) @inline Base.zero(::Type{Bit}) = false @inline Base.one(::Type{Bit}) = true @@ -109,22 +105,9 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) return wrapper end - # 2. Check N-way cache for hit + # 2. Check N-way cache for hit (cache slots always exist - created with vector slot above) @inbounds pool_bv = tp.vectors[idx] current_ptr = UInt(pointer(pool_bv.chunks)) - - # Ensure cache slots exist for this index - n_slots_cached = length(tp.nd_next_way) - while idx > n_slots_cached - for _ in 1:CACHE_WAYS - push!(tp.nd_arrays, nothing) - push!(tp.nd_dims, nothing) - push!(tp.nd_ptrs, UInt(0)) - end - push!(tp.nd_next_way, 0) - n_slots_cached += 1 - end - base = (idx - 1) * CACHE_WAYS # Linear search across all ways @@ -204,7 +187,7 @@ end end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::NTuple{N,Int}) where {N} - _unsafe_acquire_impl!(pool, Bit, dims...) + return _unsafe_acquire_impl!(pool, Bit, dims...) end # ============================================================================== diff --git a/src/utils.jl b/src/utils.jl index cf12ea0..f252aaa 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -79,51 +79,24 @@ _validate_pool_return(val, ::DisabledPool) = nothing # Statistics & Pretty Printing # ============================================================================== -""" - pool_stats(tp::TypedPool{T}; io::IO=stdout, indent::Int=0, name::String="") - -Print statistics for a single TypedPool. -""" -function pool_stats(tp::TypedPool{T}; io::IO=stdout, indent::Int=0, name::String="") where {T} - prefix = " "^indent - type_name = isempty(name) ? 
string(T) : name - - n_arrays = length(tp.vectors) - if n_arrays == 0 - printstyled(io, prefix, type_name, color=:cyan) - printstyled(io, " (empty)\n", color=:dark_gray) - return - end +# --- Helper functions for pool_stats (type-specific behavior) --- +_default_type_name(::TypedPool{T}) where {T} = string(T) +_default_type_name(::BitTypedPool) = "Bit" - total_elements = sum(length(v) for v in tp.vectors) - total_bytes = sum(Base.summarysize(v) for v in tp.vectors) - bytes_str = Base.format_bytes(total_bytes) +_vector_bytes(v::Vector) = Base.summarysize(v) +_vector_bytes(v::BitVector) = sizeof(v.chunks) - # Header - printstyled(io, prefix, type_name, color=:cyan) - println(io) - - # Stats - printstyled(io, prefix, " slots: ", color=:dark_gray) - printstyled(io, n_arrays, color=:blue) - printstyled(io, " (active: ", color=:dark_gray) - printstyled(io, tp.n_active, color=:blue) - printstyled(io, ")\n", color=:dark_gray) - - printstyled(io, prefix, " elements: ", color=:dark_gray) - printstyled(io, total_elements, color=:blue) - printstyled(io, " ($bytes_str)\n", color=:dark_gray) - return nothing -end +_count_label(::TypedPool) = "elements" +_count_label(::BitTypedPool) = "bits" """ - pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String="") + pool_stats(tp::AbstractTypedPool; io::IO=stdout, indent::Int=0, name::String="") -Print statistics for a BitTypedPool. +Print statistics for a TypedPool or BitTypedPool. """ -function pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String="") +function pool_stats(tp::AbstractTypedPool; io::IO=stdout, indent::Int=0, name::String="") prefix = " "^indent - type_name = isempty(name) ? "Bit" : name + type_name = isempty(name) ? _default_type_name(tp) : name n_arrays = length(tp.vectors) if n_arrays == 0 @@ -132,8 +105,8 @@ function pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String return end - total_bits = sum(length(v) for v in tp.vectors) - total_bytes = sum(sizeof(v.chunks) for v in tp.vectors) + total_count = sum(length(v) for v in tp.vectors) + total_bytes = sum(_vector_bytes(v) for v in tp.vectors) bytes_str = Base.format_bytes(total_bytes) # Header @@ -147,8 +120,8 @@ function pool_stats(tp::BitTypedPool; io::IO=stdout, indent::Int=0, name::String printstyled(io, tp.n_active, color=:blue) printstyled(io, ")\n", color=:dark_gray) - printstyled(io, prefix, " bits: ", color=:dark_gray) - printstyled(io, total_bits, color=:blue) + printstyled(io, prefix, " ", _count_label(tp), ": ", color=:dark_gray) + printstyled(io, total_count, color=:blue) printstyled(io, " ($bytes_str)\n", color=:dark_gray) return nothing end @@ -178,12 +151,7 @@ function pool_stats(pool::AdaptiveArrayPool; io::IO=stdout) foreach_fixed_slot(pool) do tp if !isempty(tp.vectors) has_content = true - name = if tp isa BitTypedPool - "Bit (fixed)" - else - T = typeof(tp).parameters[1] # Extract T from TypedPool{T} - "$T (fixed)" - end + name = _default_type_name(tp) * " (fixed)" pool_stats(tp; io, indent=2, name) end end @@ -217,10 +185,7 @@ function pool_stats(; io::IO=stdout) pool_stats(:cpu; io) # Show CUDA pools if extension is loaded and pools exist try - pools = get_task_local_cuda_pools() - for pool in values(pools) - pool_stats(pool; io) - end + pool_stats(Val(:cuda); io) catch e e isa MethodError || rethrow() # CUDA extension not loaded - silently skip @@ -254,44 +219,26 @@ end # Base.show (delegates to pool_stats) # ============================================================================== -# Compact one-line show for 
TypedPool -function Base.show(io::IO, tp::TypedPool{T}) where {T} - n_vectors = length(tp.vectors) - if n_vectors == 0 - print(io, "TypedPool{$T}(empty)") - else - total = sum(length(v) for v in tp.vectors) - print(io, "TypedPool{$T}(slots=$n_vectors, active=$(tp.n_active), elements=$total)") - end -end +# --- Helper for Base.show (full type name for display) --- +_show_type_name(::TypedPool{T}) where {T} = "TypedPool{$T}" +_show_type_name(::BitTypedPool) = "BitTypedPool" -# Multi-line show for TypedPool -function Base.show(io::IO, ::MIME"text/plain", tp::TypedPool{T}) where {T} - pool_stats(tp; io, name="TypedPool{$T}") -end - -# Compact one-line show for BitTypedPool -function Base.show(io::IO, tp::BitTypedPool) +# Compact one-line show for all AbstractTypedPool +function Base.show(io::IO, tp::AbstractTypedPool) + name = _show_type_name(tp) n_vectors = length(tp.vectors) if n_vectors == 0 - print(io, "BitTypedPool(empty)") + print(io, "$name(empty)") else - total_bits = sum(length(v) for v in tp.vectors) - print(io, "BitTypedPool(slots=$n_vectors, active=$(tp.n_active), bits=$total_bits)") + total = sum(length(v) for v in tp.vectors) + label = _count_label(tp) + print(io, "$name(slots=$n_vectors, active=$(tp.n_active), $label=$total)") end end -# Multi-line show for BitTypedPool -function Base.show(io::IO, ::MIME"text/plain", tp::BitTypedPool) - n_vectors = length(tp.vectors) - println(io, "BitTypedPool:") - println(io, " slots: $n_vectors") - println(io, " active: $(tp.n_active)") - if n_vectors > 0 - total_bits = sum(length(v) for v in tp.vectors) - total_bytes = sum(sizeof(v.chunks) for v in tp.vectors) - println(io, " bits: $total_bits ($(Base.format_bytes(total_bytes)))") - end +# Multi-line show for all AbstractTypedPool +function Base.show(io::IO, ::MIME"text/plain", tp::AbstractTypedPool) + pool_stats(tp; io, name=_show_type_name(tp)) end # Compact one-line show for AdaptiveArrayPool diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index 3e0a267..d6cb3d3 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -258,7 +258,7 @@ end @testset "DisabledPool fallback" begin - # acquire! with Bit + # --- acquire! with Bit --- bv = acquire!(DISABLED_CPU, Bit, 100) @test bv isa BitVector @test length(bv) == 100 @@ -273,6 +273,21 @@ @test ba_tuple isa BitArray{2} @test size(ba_tuple) == (5, 5) + # --- unsafe_acquire! with Bit (covers bitarray.jl:206-208) --- + ubv = unsafe_acquire!(DISABLED_CPU, Bit, 100) + @test ubv isa BitVector + @test length(ubv) == 100 + + # N-D + uba = unsafe_acquire!(DISABLED_CPU, Bit, 10, 10) + @test uba isa BitArray{2} + @test size(uba) == (10, 10) + + # Tuple form + uba_tuple = unsafe_acquire!(DISABLED_CPU, Bit, (5, 5)) + @test uba_tuple isa BitArray{2} + @test size(uba_tuple) == (5, 5) + # ones! 
with Bit (like trues) t = ones!(DISABLED_CPU, Bit, 50) @test t isa BitVector diff --git a/test/test_utils.jl b/test/test_utils.jl index ddbde0c..d668761 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -266,6 +266,54 @@ end @test occursin("empty", output) end + @testset "pool_stats for BitTypedPool" begin + import AdaptiveArrayPools: BitTypedPool + + # Empty BitTypedPool + btp = BitTypedPool() + output = @capture_out pool_stats(btp) + @test occursin("Bit", output) + @test occursin("empty", output) + + # BitTypedPool with content (via AdaptiveArrayPool) + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # Acquire some BitVectors + bv1 = acquire!(pool, Bit, 100) + bv2 = acquire!(pool, Bit, 200) + + output = @capture_out pool_stats(pool) + @test occursin("Bit (fixed)", output) + @test occursin("slots: 2", output) + @test occursin("active: 2", output) + @test occursin("bits:", output) # BitTypedPool uses "bits" label, not "elements" + @test occursin("300", output) # Total bits: 100 + 200 + + rewind!(pool) + + # Test direct BitTypedPool stats + btp2 = BitTypedPool() + # Manually add vectors for testing + push!(btp2.vectors, BitVector(undef, 64)) + btp2.n_active = 1 + + output = @capture_out pool_stats(btp2) + @test occursin("Bit", output) + @test occursin("slots: 1", output) + @test occursin("bits: 64", output) + end + + @testset "direct call of internal helpers" begin + import AdaptiveArrayPools: _default_type_name, _vector_bytes, _count_label, TypedPool, BitTypedPool + @test _default_type_name(TypedPool{Float64}()) == "Float64" + @test _default_type_name(BitTypedPool()) == "Bit" + @test _vector_bytes([1, 2, 3]) == Base.summarysize([1, 2, 3]) + @test _vector_bytes(BitVector(undef, 100)) == sizeof(BitVector(undef, 100).chunks) + @test _count_label(TypedPool{Int}()) == "elements" + @test _count_label(BitTypedPool()) == "bits" + end + @testset "_validate_pool_return with N-D arrays" begin pool = AdaptiveArrayPool() checkpoint!(pool) From 08491acf8d003095be2dee01ad231b181503109e Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 22:11:40 -0800 Subject: [PATCH 09/13] feat: add warning for BitTypedPool growth and enhance tests for Bit and BitMatrix acquisition --- src/bitarray.jl | 4 ++-- test/test_bitarray.jl | 25 +++++++++++++++++++++++++ test/test_utils.jl | 16 ++++++++++++++-- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index 5a8488f..7f0ccc6 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -98,8 +98,8 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) # Warn at powers of 2 (possible missing rewind!) if idx >= 512 && (idx & (idx - 1)) == 0 - total_bits = sum(length, tp.vectors) - @warn "BitTypedPool growing large ($idx arrays, ~$(total_bits ÷ 8) bytes). Missing rewind!()?" + total_bytes = sum(_vector_bytes, tp.vectors) + @warn "BitTypedPool growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" 
end return wrapper diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index d6cb3d3..12a2e5b 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -661,6 +661,31 @@ bv = AdaptiveArrayPools._acquire_impl!(pool, Bit, 100) @test bv isa BitVector @test length(bv) == 100 + + bv = AdaptiveArrayPools._acquire_impl!(pool, Bit, (10, 10)) + @test bv isa BitMatrix + @test size(bv) == (10, 10) + end + @testset "BitTypedPool growth warning at 512 arrays" begin + # Use a fresh pool to ensure we start from 0 + pool = AdaptiveArrayPool() + + @test pooling_enabled(pool) == true + + # Acquire 511 arrays without rewind - no warning yet + for i in 1:511 + acquire!(pool, Bit, 10) + end + @test pool.bits.n_active == 511 + + # The 512th acquire should trigger a warning + @test_logs (:warn, r"BitTypedPool growing large \(512 arrays") begin + acquire!(pool, Bit, 10) + end + @test pool.bits.n_active == 512 + + # Clean up + empty!(pool) end end # BitArray Support diff --git a/test/test_utils.jl b/test/test_utils.jl index d668761..4efd0d2 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -196,8 +196,8 @@ end rewind!(pool) end - @testset "Base.show for TypedPool" begin - import AdaptiveArrayPools: TypedPool + @testset "Base.show for TypedPool & BitTypedPool" begin + import AdaptiveArrayPools: TypedPool, BitTypedPool # Empty TypedPool - compact show tp_empty = TypedPool{Float64}() @@ -210,6 +210,8 @@ end acquire!(pool, Float64, 100) acquire!(pool, Float64, 50) + acquire!(pool, Bit, 10) + output = sprint(show, pool.float64) @test occursin("TypedPool{Float64}", output) @test occursin("slots=2", output) @@ -222,6 +224,16 @@ end @test occursin("slots:", output) @test occursin("active:", output) + # BitTypedPool - compact show + output = sprint(show, pool.bits) + @test output == "BitTypedPool(slots=1, active=1, bits=10)" + # Multi-line show (MIME"text/plain") + output = sprint(show, MIME("text/plain"), pool.bits) + @test occursin("BitTypedPool", output) + @test occursin("slots:", output) + @test occursin("active:", output) + @test occursin("bits:", output) + rewind!(pool) end From 4ab1809b3d037a9af99b6ab2535da324c4e90dcc Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 23:28:06 -0800 Subject: [PATCH 10/13] feat: update documentation for BitArray support, enhancing clarity on usage and performance optimizations --- docs/src/features/bit-arrays.md | 60 ++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/docs/src/features/bit-arrays.md b/docs/src/features/bit-arrays.md index 25b1823..61519b9 100644 --- a/docs/src/features/bit-arrays.md +++ b/docs/src/features/bit-arrays.md @@ -1,6 +1,6 @@ -# BitVector Support +# BitArray Support -AdaptiveArrayPools.jl includes specialized support for `BitArray` (specifically `BitVector`), enabling **~8x memory savings** for boolean arrays compared to standard `Vector{Bool}`. +AdaptiveArrayPools.jl includes specialized support for `BitArray` (including `BitVector` and N-dimensional `BitArray{N}`), enabling **~8x memory savings** for boolean arrays compared to standard `Vector{Bool}`. ## The `Bit` Sentinel Type @@ -20,11 +20,11 @@ For 1D arrays, `acquire!` returns a native `BitVector`. 
This design choice enabl @with_pool pool begin # Acquire a BitVector of length 1000 bv = acquire!(pool, Bit, 1000) - + # Use like normal bv .= true bv[1] = false - + # Supports standard operations with full SIMD acceleration count(bv) end @@ -37,8 +37,11 @@ For multi-dimensional arrays, `acquire!` returns a `BitArray{N}` (specifically ` @with_pool pool begin # 100x100 bit matrix (returns BitMatrix) mask = zeros!(pool, Bit, 100, 100) - + mask[5, 5] = true + + # 3D BitArray + volume = acquire!(pool, Bit, 10, 10, 10) end ``` @@ -50,35 +53,66 @@ For specific `BitVector` operations, prefer `trues!` and `falses!` which mirror @with_pool pool begin # Filled with false (equivalent to `falses(256)`) mask = falses!(pool, 256) - + # Filled with true (equivalent to `trues(256)`) flags = trues!(pool, 256) - + # Multidimensional grid = trues!(pool, 100, 100) - + # Similar to existing BitArray A = BitVector(undef, 50) B = similar!(pool, A) # Reuses eltype(A) -> Bool - + # To explicit get Bit-packed from pool irrespective of source - C = similar!(pool, A, Bit) + C = similar!(pool, A, Bit) end +``` Note: `zeros!(pool, Bit, ...)` and `ones!(pool, Bit, ...)` are also supported (aliased to `falses!` and `trues!`). -``` ## Performance & Safety -### Why Native BitVector? +### Why Native BitArray? The pool returns native `BitVector`/`BitArray` types instead of `SubArray` views for **performance**. Operations like `count()`, `sum()`, and bitwise broadcasting are **10x~100x faster** on native bit arrays because they utilize SIMD instructions on packed 64-bit chunks. +### N-D Caching & Zero Allocation + +The pool uses an N-way associative cache to efficiently reuse `BitArray{N}` instances: + +| Scenario | Allocation | +|----------|------------| +| First call with new dims | ~944 bytes (new `BitArray{N}` created) | +| Subsequent call with same dims | **0 bytes** (cached instance reused) | +| Same ndims, different dims | **0 bytes** (dims/len fields modified in-place) | +| Different ndims | ~944 bytes (new `BitArray{N}` created and cached) | + +Unlike regular `Array` where dimensions are immutable, `BitArray` allows in-place modification of its `dims` and `len` fields. The pool exploits this to achieve **zero allocation** on repeated calls with matching dimensionality. + +```julia +@with_pool pool begin + # First call: allocates BitMatrix wrapper (~944 bytes) + m1 = acquire!(pool, Bit, 100, 100) + + # Rewind to reuse the same slot + rewind!(pool) + + # Same dims: 0 allocation (exact cache hit) + m2 = acquire!(pool, Bit, 100, 100) + + rewind!(pool) + + # Different dims but same ndims: 0 allocation (dims modified in-place) + m3 = acquire!(pool, Bit, 50, 200) +end +``` + ### ⚠️ Important: Do Not Resize While the returned arrays are standard `BitVector` types, they share their underlying memory chunks with the pool. !!! warning "Do Not Resize" **NEVER** resize (`push!`, `pop!`, `resize!`) a pooled `BitVector` or `BitArray`. - + The underlying memory is owned and managed by the pool. Resizing it will detach it from the pool or potentially corrupt the shared state. Treat these arrays as **fixed-size** scratch buffers only. From fa1513ad93da9d646fbc291c6d0b6bf52cc3504f Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Fri, 30 Jan 2026 23:29:09 -0800 Subject: [PATCH 11/13] refactor(bitarray): implement N-D caching for zero-allocation BitArray reuse - Replace get_bitvector_wrapper! with get_bitarray! 
supporting proper N-D caching - Exploit BitArray.dims mutability for same-ndims reuse (0 allocation) - Add isa type check before equality to prevent Vector{Any} boxing - Remove unused views/view_lengths fields from BitTypedPool - Split empty! into separate BitTypedPool and TypedPool methods - Add test verifying zero-allocation on cached BitArray retrieval --- src/bitarray.jl | 135 +++++++++++++++++++++++------------------- src/state.jl | 28 ++++++++- src/types.jl | 26 +++----- test/test_bitarray.jl | 25 ++++++++ 4 files changed, 134 insertions(+), 80 deletions(-) diff --git a/src/bitarray.jl b/src/bitarray.jl index 7f0ccc6..f2263ac 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -1,5 +1,5 @@ # ============================================================================== -# BitArray Acquisition (Unified BitVector API) +# BitArray Acquisition (N-D Cached BitArray API) # ============================================================================== # # This file contains BitArray-specific pool operations, separated from the @@ -7,35 +7,35 @@ # # Key components: # - Base.zero/one(::Type{Bit}) - Fill value dispatch for Bit sentinel type -# - get_bitvector_wrapper! - SIMD-optimized BitVector with shared chunks +# - get_bitarray! - N-D BitArray with shared chunks and N-way caching # - _acquire_impl! for Bit - Delegates to _unsafe_acquire_impl! for performance -# - _unsafe_acquire_impl! for Bit - Raw BitVector/BitArray acquisition +# - _unsafe_acquire_impl! for Bit - Raw BitArray acquisition with caching # - DisabledPool fallbacks for Bit type # -# Design Decision: Unified BitVector Return Type +# Design Decision: Unified BitArray Return Type # ============================================= # Unlike regular types where acquire! returns SubArray and unsafe_acquire! -# returns Array, for Bit type BOTH return BitVector. This design choice is +# returns Array, for Bit type BOTH return BitArray{N}. This design choice is # intentional for several reasons: # -# 1. **SIMD Performance**: BitVector operations like `count()`, `sum()`, and +# 1. **SIMD Performance**: BitArray operations like `count()`, `sum()`, and # bitwise operations are ~(10x ~ 100x) faster than their SubArray equivalents # because they use SIMD-optimized chunked algorithms. # -# 2. **API Simplicity**: Users always get BitVector regardless of which API +# 2. **API Simplicity**: Users always get BitArray regardless of which API # they call. No need to remember "use unsafe_acquire! for performance". # -# 3. **Semantic Clarity**: The "unsafe" in unsafe_acquire! refers to memory -# safety concerns (use-after-free risk). BitVector already handles memory -# efficiently (1 bit per element), so the naming would be misleading. +# 3. **N-D Caching**: BitArray{N} can be reused by modifying dims/len fields +# when ndims matches, achieving 0 allocation on repeated calls. This is +# unique to BitArray - regular Array cannot modify dims in place. # # 4. **Backwards Compatibility**: Code using trues!/falses! just works with -# optimal performance - these convenience functions now return BitVector. +# optimal performance - these convenience functions return BitVector. # # Implementation: # - _acquire_impl!(pool, Bit, ...) delegates to _unsafe_acquire_impl! -# - get_bitvector_wrapper! creates BitVector shells sharing pool's chunks -# - N-D requests return reshaped BitArrays (reshape preserves chunk sharing) +# - get_bitarray! 
creates BitArray shells sharing pool's chunks +# - N-way cache stores BitArray{N} entries, reused via dims modification # ============================================================================== # ============================================================================== @@ -47,40 +47,45 @@ @inline Base.one(::Type{Bit}) = true # ============================================================================== -# BitVector Wrapper (chunks sharing for SIMD performance) +# BitArray Acquisition (N-D caching with chunks sharing) # ============================================================================== """ - get_bitvector_wrapper!(tp::BitTypedPool, n::Int) -> BitVector + get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) -> BitArray{N} -Get a BitVector that shares `chunks` with the pooled BitVector. +Get a BitArray{N} that shares `chunks` with the pooled BitVector. -Unlike `get_view!` which returns a `SubArray` (loses SIMD optimizations), -this returns a real `BitVector` with shared chunks, preserving native -BitVector performance (~(10x ~ 100x) faster for `count()`, `sum()`, etc.). +Uses N-way cache for BitArray reuse. Unlike Array which requires unsafe_wrap +for each shape, BitArray can reuse cached entries by modifying `dims`/`len` +fields when ndims matches (0 bytes allocation). -## Implementation -Creates a new BitVector shell and replaces its `chunks` field with the -pooled BitVector's chunks. Uses N-way cache for wrapper reuse. +## Cache Strategy +- **Exact match**: Return cached BitArray directly (0 bytes) +- **Same ndims**: Modify dims/len/chunks of cached entry (0 bytes) +- **Different ndims**: Create new BitArray{N} and cache it (~944 bytes) + +## Implementation Notes +- BitVector (N=1): `size()` uses `len` field, `dims` is ignored +- BitArray{N>1}: `size()` uses `dims` field +- All BitArrays share `chunks` with the pool's backing BitVector ## Safety -The returned BitVector is only valid within the `@with_pool` scope. +The returned BitArray is only valid within the `@with_pool` scope. Do NOT use after the scope ends (use-after-free risk). """ -function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) +function get_bitarray!(tp::BitTypedPool, dims::NTuple{N,Int}) where {N} + total_len = safe_prod(dims) tp.n_active += 1 idx = tp.n_active # 1. Pool expansion needed (new slot) if idx > length(tp.vectors) - pool_bv = BitVector(undef, n) + pool_bv = BitVector(undef, total_len) push!(tp.vectors, pool_bv) - push!(tp.views, view(pool_bv, 1:n)) - push!(tp.view_lengths, n) - # Create wrapper sharing chunks - wrapper = BitVector(undef, n) - wrapper.chunks = pool_bv.chunks + # Create BitArray sharing chunks + ba = BitArray{N}(undef, dims) + ba.chunks = pool_bv.chunks # Expand N-way cache (CACHE_WAYS entries per slot) for _ in 1:CACHE_WAYS @@ -92,8 +97,8 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) # Cache in first way base = (idx - 1) * CACHE_WAYS + 1 - @inbounds tp.nd_arrays[base] = wrapper - @inbounds tp.nd_dims[base] = n + @inbounds tp.nd_arrays[base] = ba + @inbounds tp.nd_dims[base] = dims @inbounds tp.nd_ptrs[base] = UInt(pointer(pool_bv.chunks)) # Warn at powers of 2 (possible missing rewind!) @@ -102,49 +107,59 @@ function get_bitvector_wrapper!(tp::BitTypedPool, n::Int) @warn "BitTypedPool growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" end - return wrapper + return ba end - # 2. Check N-way cache for hit (cache slots always exist - created with vector slot above) + # 2. 
Ensure pool_bv has correct size @inbounds pool_bv = tp.vectors[idx] + if length(pool_bv) != total_len + resize!(pool_bv, total_len) + end current_ptr = UInt(pointer(pool_bv.chunks)) base = (idx - 1) * CACHE_WAYS - # Linear search across all ways + # 3. Check N-way cache for hit for k in 1:CACHE_WAYS cache_idx = base + k - @inbounds cached_n = tp.nd_dims[cache_idx] + @inbounds cached_dims = tp.nd_dims[cache_idx] @inbounds cached_ptr = tp.nd_ptrs[cache_idx] - if cached_n == n && cached_ptr == current_ptr - return @inbounds tp.nd_arrays[cache_idx]::BitVector + # Must check isa FIRST for type stability (avoids boxing in == comparison) + if cached_dims isa NTuple{N,Int} && cached_ptr == current_ptr + if cached_dims == dims + # Exact match - return cached BitArray directly (0 alloc) + return @inbounds tp.nd_arrays[cache_idx]::BitArray{N} + else + # Same ndims but different dims - reuse by modifying fields (0 alloc!) + ba = @inbounds tp.nd_arrays[cache_idx]::BitArray{N} + ba.len = total_len + ba.dims = dims + ba.chunks = pool_bv.chunks + # Update cache metadata + @inbounds tp.nd_dims[cache_idx] = dims + return ba + end end end - # 3. Cache miss - resize pool_bv to EXACTLY n elements and create new wrapper - # Unlike regular arrays where we only grow, BitVector wrappers MUST have exactly - # the right number of chunks. Otherwise fill!()/count() iterate over all chunks, - # not just the bits within wrapper.len, causing incorrect behavior. - if length(pool_bv) != n - resize!(pool_bv, n) - @inbounds tp.views[idx] = view(pool_bv, 1:n) - @inbounds tp.view_lengths[idx] = n - end - - wrapper = BitVector(undef, n) - wrapper.chunks = pool_bv.chunks + # 4. Cache miss - create new BitArray{N} + ba = BitArray{N}(undef, dims) + ba.chunks = pool_bv.chunks # Round-robin replacement @inbounds way_offset = tp.nd_next_way[idx] target_idx = base + way_offset + 1 - @inbounds tp.nd_arrays[target_idx] = wrapper - @inbounds tp.nd_dims[target_idx] = n - @inbounds tp.nd_ptrs[target_idx] = UInt(pointer(pool_bv.chunks)) + @inbounds tp.nd_arrays[target_idx] = ba + @inbounds tp.nd_dims[target_idx] = dims + @inbounds tp.nd_ptrs[target_idx] = current_ptr @inbounds tp.nd_next_way[idx] = (way_offset + 1) % CACHE_WAYS - return wrapper + return ba end +# Convenience: 1D case wraps to tuple +@inline get_bitarray!(tp::BitTypedPool, n::Int) = get_bitarray!(tp, (n,)) + # ============================================================================== # Acquire Implementation (Bit type → delegates to unsafe_acquire for performance) # ============================================================================== @@ -174,20 +189,20 @@ end # Unsafe Acquire Implementation (Bit type) # ============================================================================== -# Bit type: returns BitVector with shared chunks (SIMD optimized) +# Bit type: returns BitArray{N} with shared chunks (SIMD optimized, N-D cached) @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) tp = get_typed_pool!(pool, Bit)::BitTypedPool - return get_bitvector_wrapper!(tp, n) + return get_bitarray!(tp, n) end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, dims::Vararg{Int,N}) where {N} - total = safe_prod(dims) - bv = _unsafe_acquire_impl!(pool, Bit, total) - return reshape(bv, dims) # BitArray{N} (Julia's reshape on BitVector returns BitArray) + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, dims) end @inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, 
dims::NTuple{N,Int}) where {N} - return _unsafe_acquire_impl!(pool, Bit, dims...) + tp = get_typed_pool!(pool, Bit)::BitTypedPool + return get_bitarray!(tp, dims) end # ============================================================================== diff --git a/src/state.jl b/src/state.jl index 9cb09ab..c9b2a66 100644 --- a/src/state.jl +++ b/src/state.jl @@ -206,12 +206,34 @@ end # ============================================================================== """ - empty!(tp::AbstractTypedPool) + empty!(tp::BitTypedPool) -Clear all internal storage, releasing all memory. +Clear all internal storage for BitTypedPool, releasing all memory. Restores sentinel values for 1-based sentinel pattern. """ -function Base.empty!(tp::AbstractTypedPool) +function Base.empty!(tp::BitTypedPool) + empty!(tp.vectors) + # Clear N-way wrapper cache + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + tp.n_active = 0 + # Restore sentinel values (1-based sentinel pattern) + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) # Sentinel: n_active=0 at depth=0 + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) # Sentinel: depth=0 = no checkpoint + return tp +end + +""" + empty!(tp::TypedPool) + +Clear all internal storage for TypedPool, releasing all memory. +Restores sentinel values for 1-based sentinel pattern. +""" +function Base.empty!(tp::TypedPool) empty!(tp.vectors) empty!(tp.views) empty!(tp.view_lengths) diff --git a/src/types.jl b/src/types.jl index 0f2e8b5..957eb92 100644 --- a/src/types.jl +++ b/src/types.jl @@ -291,8 +291,6 @@ performance without needing to choose between APIs. ## Fields - `vectors`: Backing `BitVector` storage -- `views`: Cached `SubArray` views (legacy, maintained for compatibility) -- `view_lengths`: Cached lengths for fast comparison - `nd_arrays`: Cached wrapper BitVectors (chunks sharing) - `nd_dims`: Cached lengths for wrapper cache validation - `nd_ptrs`: Cached chunk pointers for invalidation detection @@ -324,17 +322,14 @@ mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} # --- Storage --- vectors::Vector{BitVector} - # --- 1D Cache (1:1 mapping) --- - views::Vector{SubArray{Bool, 1, BitVector, Tuple{UnitRange{Int64}}, true}} - view_lengths::Vector{Int} - - # --- N-D Array Cache (empty, for empty! compatibility) --- - # BitArray cannot use unsafe_wrap, so no N-D caching is possible. - # These fields exist only for compatibility with empty!(::AbstractTypedPool). - nd_arrays::Vector{Any} - nd_dims::Vector{Any} - nd_ptrs::Vector{UInt} - nd_next_way::Vector{Int} + # --- 1D BitVector Wrapper Cache (N-way set associative) --- + # Unlike TypedPool which uses views for 1D and nd_* for N-D, + # BitTypedPool uses nd_* for 1D wrapper caching (BitVector with shared chunks). + # No views needed since we always return BitVector, not SubArray. 
+ nd_arrays::Vector{Any} # BitVector wrappers + nd_dims::Vector{Any} # requested lengths (Int, not tuple) + nd_ptrs::Vector{UInt} # pointer validation + nd_next_way::Vector{Int} # round-robin counter per slot # --- State Management (1-based sentinel pattern) --- n_active::Int @@ -345,10 +340,7 @@ end BitTypedPool() = BitTypedPool( # Storage BitVector[], - # 1D Cache - SubArray{Bool, 1, BitVector, Tuple{UnitRange{Int64}}, true}[], - Int[], - # N-D Array Cache (empty, for compatibility) + # 1D BitVector Wrapper Cache (N-way) Any[], Any[], UInt[], diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index 12a2e5b..d848dcb 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -688,4 +688,29 @@ empty!(pool) end + @testset "N-D BitArray caching - zero allocation on reuse" begin + # Test that N-D caching works: first call may allocate, subsequent calls should not + # This verifies the optimization where BitArray{N}.dims can be modified in-place + + pool = get_task_local_pool() + empty!(pool) # Start fresh + @with_pool pool function foo() + # Warmup to populate cache + bv = acquire!(pool, Bit, 100) + ba2 = acquire!(pool, Bit, 10, 10) + ba3 = acquire!(pool, Bit, 5, 5, 4) + + tt1 = trues!(pool, 256) + tt2 = ones!(pool, 10, 20) + ff1 = falses!(pool, 100, 5) + ff2 = zeros!(pool, 100) + + C = similar!(pool, tt1) + end + + @test (@allocated foo()) > 0 # First call allocates + @test (@allocated foo()) == 0 # Subsequent calls reuse cached arrays + @test (@allocated foo()) == 0 # Further calls also zero allocation + end + end # BitArray Support From a9f9f5c3c93539a6d120a0edd65cb2be0bdfd745 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Sat, 31 Jan 2026 09:16:26 -0800 Subject: [PATCH 12/13] feat(bitarray): enhance N-D BitArray caching and add allocation tests --- src/bitarray.jl | 8 ++--- src/types.jl | 11 +++--- test/runtests.jl | 1 + test/test_allocation.jl | 30 ++++++++++++++++ test/test_bitarray.jl | 76 ++++++++++++++++++++++++++++++++--------- 5 files changed, 100 insertions(+), 26 deletions(-) create mode 100644 test/test_allocation.jl diff --git a/src/bitarray.jl b/src/bitarray.jl index f2263ac..eb9addf 100644 --- a/src/bitarray.jl +++ b/src/bitarray.jl @@ -165,12 +165,12 @@ end # ============================================================================== # # Unlike other types where acquire! returns SubArray (view-based) and -# unsafe_acquire! returns Array (raw), Bit type always returns BitVector. -# This is because BitVector's SIMD-optimized operations (count, sum, etc.) +# unsafe_acquire! returns Array (raw), Bit type always returns BitArray{N}. +# This is because BitArray's SIMD-optimized operations (count, sum, etc.) # are ~(10x ~ 100x) faster than SubArray equivalents. # -# The delegation is transparent: users calling acquire!(pool, Bit, n) get -# BitVector without needing to know about unsafe_acquire!. +# The delegation is transparent: users calling acquire!(pool, Bit, dims...) get +# BitArray{N} without needing to know about unsafe_acquire!. # Bit type: delegates to _unsafe_acquire_impl! 
for SIMD performance @inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{Bit}, n::Int) diff --git a/src/types.jl b/src/types.jl index 957eb92..0b6f62f 100644 --- a/src/types.jl +++ b/src/types.jl @@ -322,12 +322,13 @@ mutable struct BitTypedPool <: AbstractTypedPool{Bool, BitVector} # --- Storage --- vectors::Vector{BitVector} - # --- 1D BitVector Wrapper Cache (N-way set associative) --- + # --- N-D BitArray Cache (N-way set associative) --- # Unlike TypedPool which uses views for 1D and nd_* for N-D, - # BitTypedPool uses nd_* for 1D wrapper caching (BitVector with shared chunks). - # No views needed since we always return BitVector, not SubArray. - nd_arrays::Vector{Any} # BitVector wrappers - nd_dims::Vector{Any} # requested lengths (Int, not tuple) + # BitTypedPool uses nd_* for ALL dimensions (1D, 2D, 3D, etc.). + # No views needed since we always return BitArray{N}, not SubArray. + # BitArray.dims is mutable, enabling 0-alloc reuse for same-ndims requests. + nd_arrays::Vector{Any} # Cached BitArray{N} instances + nd_dims::Vector{Any} # Cached dims (NTuple{N,Int}) nd_ptrs::Vector{UInt} # pointer validation nd_next_way::Vector{Int} # round-robin counter per slot diff --git a/test/runtests.jl b/test/runtests.jl index c4417de..2525187 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -27,6 +27,7 @@ else include("test_convenience.jl") include("test_bitarray.jl") include("test_coverage.jl") + include("test_allocation.jl") # CUDA extension tests (auto-detect, skip with TEST_CUDA=false) if get(ENV, "TEST_CUDA", "true") != "false" diff --git a/test/test_allocation.jl b/test/test_allocation.jl new file mode 100644 index 0000000..270fb8f --- /dev/null +++ b/test/test_allocation.jl @@ -0,0 +1,30 @@ +@with_pool pool function foo() + float64_vec = acquire!(pool, Float64, 10) + float32_vec = acquire!(pool, Float32, 10) + + float64_mat = acquire!(pool, Float64, 10, 10) + float32_mat = acquire!(pool, Float32, 10, 10) + + bv = acquire!(pool, Bit, 100) + ba2 = acquire!(pool, Bit, 10, 10) + ba3 = acquire!(pool, Bit, 5, 5, 4) + + tt1 = trues!(pool, 256) + tt2 = ones!(pool, Bit, 10, 20) + ff1 = falses!(pool, 100, 5) + ff2 = zeros!(pool, Bit, 100) + + C = similar!(pool, tt1) +end + + +@testset "zero allocation on reuse" begin + + alloc1 = @allocated foo() + alloc2 = @allocated foo() + alloc3 = @allocated foo() + + @test alloc1 > 0 # First call allocates + @test alloc2 == 0 # Subsequent calls reuse cached arrays + @test alloc3 == 0 # Further calls also zero allocation +end \ No newline at end of file diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index d848dcb..08e9fca 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -688,29 +688,71 @@ empty!(pool) end - @testset "N-D BitArray caching - zero allocation on reuse" begin - # Test that N-D caching works: first call may allocate, subsequent calls should not - # This verifies the optimization where BitArray{N}.dims can be modified in-place + @testset "N-D BitArray caching - same ndims different dims" begin + # Test the optimization where BitArray{N}.dims can be modified in-place + # when ndims matches but dims differ (e.g., (10,10) → (5,20)) pool = get_task_local_pool() - empty!(pool) # Start fresh - @with_pool pool function foo() - # Warmup to populate cache - bv = acquire!(pool, Bit, 100) - ba2 = acquire!(pool, Bit, 10, 10) - ba3 = acquire!(pool, Bit, 5, 5, 4) + empty!(pool) + + # Test correctness: verify dims are updated correctly + @with_pool pool begin + m1 = acquire!(pool, Bit, 10, 10) + @test size(m1) == (10, 
10) + rewind!(pool) + + m2 = acquire!(pool, Bit, 5, 20) # Same ndims, different dims + @test size(m2) == (5, 20) + rewind!(pool) + + m3 = acquire!(pool, Bit, 25, 4) + @test size(m3) == (25, 4) + end + + # Test zero-allocation: separate function without assertions/returns + @with_pool pool function bar_alloc() + acquire!(pool, Bit, 10, 10) + rewind!(pool) + acquire!(pool, Bit, 5, 20) + rewind!(pool) + acquire!(pool, Bit, 25, 4) + nothing + end - tt1 = trues!(pool, 256) - tt2 = ones!(pool, 10, 20) - ff1 = falses!(pool, 100, 5) - ff2 = zeros!(pool, 100) + bar_alloc() # Warmup + @test (@allocated bar_alloc()) == 0 + end + + @testset "N-D BitArray caching - cache eviction (round-robin)" begin + # Test that round-robin replacement works correctly when cache is full + # CACHE_WAYS determines how many different ndims can be cached per slot + + pool = get_task_local_pool() + empty!(pool) - C = similar!(pool, tt1) + # Test that different ndims each allocate initially, but reuse on repeat + @with_pool pool function test_ndims_caching() + # These all use slot 1, each with different ndims + acquire!(pool, Bit, 100) # 1D + acquire!(pool, Bit, 10, 10) # 2D + acquire!(pool, Bit, 5, 5, 4) # 3D + acquire!(pool, Bit, 5, 2, 2, 5) # 4D + nothing end - @test (@allocated foo()) > 0 # First call allocates - @test (@allocated foo()) == 0 # Subsequent calls reuse cached arrays - @test (@allocated foo()) == 0 # Further calls also zero allocation + test_ndims_caching() # Warmup + @test (@allocated test_ndims_caching()) == 0 + + # Test that cache eviction doesn't break functionality + @with_pool pool begin + # Exceed CACHE_WAYS with different ndims to force eviction + for n in 1:6 + dims = ntuple(_ -> 2, n) + ba = acquire!(pool, Bit, dims...) + @test ndims(ba) == n + @test size(ba) == dims + end + end end end # BitArray Support From 8e0ff199a93fa4e9724c517dd6f347c2e53b5e7e Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Sat, 31 Jan 2026 09:17:58 -0800 Subject: [PATCH 13/13] feat(bitarray): remove N-D BitArray caching tests for optimization verification --- test/test_bitarray.jl | 67 ------------------------------------------- 1 file changed, 67 deletions(-) diff --git a/test/test_bitarray.jl b/test/test_bitarray.jl index 08e9fca..12a2e5b 100644 --- a/test/test_bitarray.jl +++ b/test/test_bitarray.jl @@ -688,71 +688,4 @@ empty!(pool) end - @testset "N-D BitArray caching - same ndims different dims" begin - # Test the optimization where BitArray{N}.dims can be modified in-place - # when ndims matches but dims differ (e.g., (10,10) → (5,20)) - - pool = get_task_local_pool() - empty!(pool) - - # Test correctness: verify dims are updated correctly - @with_pool pool begin - m1 = acquire!(pool, Bit, 10, 10) - @test size(m1) == (10, 10) - rewind!(pool) - - m2 = acquire!(pool, Bit, 5, 20) # Same ndims, different dims - @test size(m2) == (5, 20) - rewind!(pool) - - m3 = acquire!(pool, Bit, 25, 4) - @test size(m3) == (25, 4) - end - - # Test zero-allocation: separate function without assertions/returns - @with_pool pool function bar_alloc() - acquire!(pool, Bit, 10, 10) - rewind!(pool) - acquire!(pool, Bit, 5, 20) - rewind!(pool) - acquire!(pool, Bit, 25, 4) - nothing - end - - bar_alloc() # Warmup - @test (@allocated bar_alloc()) == 0 - end - - @testset "N-D BitArray caching - cache eviction (round-robin)" begin - # Test that round-robin replacement works correctly when cache is full - # CACHE_WAYS determines how many different ndims can be cached per slot - - pool = get_task_local_pool() - empty!(pool) - - # Test that 
different ndims each allocate initially, but reuse on repeat - @with_pool pool function test_ndims_caching() - # These all use slot 1, each with different ndims - acquire!(pool, Bit, 100) # 1D - acquire!(pool, Bit, 10, 10) # 2D - acquire!(pool, Bit, 5, 5, 4) # 3D - acquire!(pool, Bit, 5, 2, 2, 5) # 4D - nothing - end - - test_ndims_caching() # Warmup - @test (@allocated test_ndims_caching()) == 0 - - # Test that cache eviction doesn't break functionality - @with_pool pool begin - # Exceed CACHE_WAYS with different ndims to force eviction - for n in 1:6 - dims = ntuple(_ -> 2, n) - ba = acquire!(pool, Bit, dims...) - @test ndims(ba) == n - @test size(ba) == dims - end - end - end - end # BitArray Support
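
For quick verification after applying the series, a minimal usage sketch of the `Bit` fast path (illustrative only, not part of any patch above): it assumes the exported API exercised in the docs and tests in this series — `@with_pool`, `acquire!`, `Bit`, `falses!` — and `bit_masks_demo` is a placeholder name; the allocation figures are expectations taken from `test/test_allocation.jl`, not guarantees.

```julia
using AdaptiveArrayPools

# Pattern mirrors test/test_allocation.jl: @with_pool binds `pool` for the function body.
@with_pool pool function bit_masks_demo()
    # 1D: a native BitVector whose chunks live in the pool (SIMD-friendly count/sum)
    bv = acquire!(pool, Bit, 1_000)
    bv .= true

    # 2D: a BitMatrix sharing pooled storage (falses! is the zeros!(pool, Bit, ...) alias)
    mask = falses!(pool, 100, 100)
    mask[5, 5] = true

    return count(bv) + count(mask)
end

bit_masks_demo()             # first call allocates: pool slots plus N-way cache entries
@allocated bit_masks_demo()  # expected to be 0 on reuse, per the tests added in PATCH 12
```

As the documentation changes in PATCH 07 and PATCH 10 stress, the returned `BitVector`/`BitMatrix` are fixed-size scratch buffers: never resize them and never let them escape the `@with_pool` scope.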