From 5de5de39c4e01e9cecf62a7cc60d8d6a8417cd38 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 10:14:28 -0800 Subject: [PATCH 1/8] docs: add GitHub Pages documentation with Documenter.jl - Add .github/workflows/Documentation.yml for automated deployment - Add docs/Project.toml with Documenter.jl dependencies - Add docs/make.jl with path rewriting and page organization - Add new documentation pages: - guide/getting-started.md: Quick start guide - advanced/macro-internals.md: How @with_pool works - advanced/internals.md: Pool architecture overview - Organize existing docs into structured hierarchy: - Guide: Getting Started, Safety Rules - Usage: API Reference, Configuration, @maybe_with_pool, CUDA - Advanced: Multi-threading, Macro Internals, Internals --- .github/workflows/Documentation.yml | 53 +++++ docs/Project.toml | 7 + docs/make.jl | 143 ++++++++++++++ docs/src/advanced/internals.md | 136 +++++++++++++ docs/src/advanced/macro-internals.md | 256 ++++++++++++++++++++++++ docs/src/advanced/multi-threading.md | 284 +++++++++++++++++++++++++++ docs/src/guide/getting-started.md | 116 +++++++++++ docs/src/guide/safety.md | 110 +++++++++++ docs/src/index.md | 117 +++++++++++ docs/src/usage/api.md | 111 +++++++++++ docs/src/usage/configuration.md | 102 ++++++++++ docs/src/usage/cuda.md | 123 ++++++++++++ docs/src/usage/maybe_with_pool.md | 53 +++++ 13 files changed, 1611 insertions(+) create mode 100644 .github/workflows/Documentation.yml create mode 100644 docs/Project.toml create mode 100644 docs/make.jl create mode 100644 docs/src/advanced/internals.md create mode 100644 docs/src/advanced/macro-internals.md create mode 100644 docs/src/advanced/multi-threading.md create mode 100644 docs/src/guide/getting-started.md create mode 100644 docs/src/guide/safety.md create mode 100644 docs/src/index.md create mode 100644 docs/src/usage/api.md create mode 100644 docs/src/usage/configuration.md create mode 100644 docs/src/usage/cuda.md create mode 100644 docs/src/usage/maybe_with_pool.md diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml new file mode 100644 index 0000000..5e1668d --- /dev/null +++ b/.github/workflows/Documentation.yml @@ -0,0 +1,53 @@ +name: Documentation + +on: + push: + branches: + - master + tags: '*' + pull_request: + branches: + - master + workflow_dispatch: + +concurrency: + group: ${{ github.ref }}-docs + cancel-in-progress: true + +jobs: + build: + permissions: + contents: write + statuses: write + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: julia-actions/setup-julia@latest + + - uses: julia-actions/cache@v2 + + - name: Add FuseRegistry + run: | + rm -rf ~/.julia/registries/FuseRegistry + julia -e 'using Pkg; Pkg.Registry.add(RegistrySpec(url="https://github.com/ProjectTorreyPines/FuseRegistry.jl.git")); Pkg.Registry.add("General"); Pkg.Registry.update()' + + - name: Replace git@github.com with https in Package.toml files + run: | + find ~/.julia/registries/FuseRegistry -type f -name 'Package.toml' -exec sed -i 's|git@github.com:|https://project-torrey-pines:${{secrets.PTP_READ_TOKEN}}@github.com/|g' {} + + + - name: Install dependencies + run: | + julia --project=docs -e ' + using Pkg + Pkg.activate("docs") + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate() + ' + + - name: Build and deploy + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} + run: julia --project=docs/ docs/make.jl diff --git a/docs/Project.toml 
b/docs/Project.toml new file mode 100644 index 0000000..d21f8f7 --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,7 @@ +[deps] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +AdaptiveArrayPools = "5768322a-0810-4546-8322-123456789abc" +LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" + +[compat] +Documenter = "1" diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..7517dbd --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,143 @@ +using Documenter +using AdaptiveArrayPools + +# ============================================ +# Helper: Conditional write (for LiveServer compatibility) +# ============================================ + +""" +Write file only if content changed (prevents LiveServer infinite loop). +""" +function write_if_changed(path::String, content::String) + if isfile(path) && read(path, String) == content + return # Content unchanged, skip write + end + write(path, content) +end + +""" +Copy file only if content changed (prevents mtime update triggering rebuild). +""" +function cp_if_changed(src::String, dst::String) + if isfile(dst) && read(src) == read(dst) + return # Content unchanged, skip copy + end + cp(src, dst; force=true) +end + +# ============================================ +# Helper: Rewrite relative paths in README +# ============================================ + +""" +Rewrite relative paths in README.md for Documenter structure. + +Converts: +- `docs/api.md` → `usage/api.md` +- `docs/cuda.md` → `usage/cuda.md` +- `docs/safety.md` → `guide/safety.md` +- `docs/multi-threading.md` → `advanced/multi-threading.md` +- `docs/configuration.md` → `usage/configuration.md` +- `docs/maybe_with_pool.md` → `usage/maybe_with_pool.md` + +Also handles anchor links (e.g., `docs/api.md#convenience-functions`). +""" +function rewrite_readme_paths(content::String) + repo_url = "https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl" + + # Usage docs (with optional anchors) + content = replace(content, r"\(docs/api\.md(#[^)]+)?\)" => s"(usage/api.md\1)") + content = replace(content, r"\(docs/cuda\.md(#[^)]+)?\)" => s"(usage/cuda.md\1)") + content = replace(content, r"\(docs/configuration\.md(#[^)]+)?\)" => s"(usage/configuration.md\1)") + content = replace(content, r"\(docs/maybe_with_pool\.md(#[^)]+)?\)" => s"(usage/maybe_with_pool.md\1)") + + # Guide docs + content = replace(content, r"\(docs/safety\.md(#[^)]+)?\)" => s"(guide/safety.md\1)") + + # Advanced docs + content = replace(content, r"\(docs/multi-threading\.md(#[^)]+)?\)" => s"(advanced/multi-threading.md\1)") + + # LICENSE link → GitHub + content = replace(content, "(LICENSE)" => "($(repo_url)/blob/master/LICENSE)") + + return content +end + +# ============================================ +# Step 1: Setup directories +# ============================================ + +const DOCS_DIR = @__DIR__ +const DOCS_SRC = joinpath(DOCS_DIR, "src") + +# Create directory structure +mkpath(DOCS_SRC) +mkpath(joinpath(DOCS_SRC, "guide")) +mkpath(joinpath(DOCS_SRC, "usage")) +mkpath(joinpath(DOCS_SRC, "advanced")) + +# ============================================ +# Step 2: Copy and transform content +# ============================================ + +# README.md → index.md (with path rewriting) +readme_content = read(joinpath(DOCS_DIR, "../README.md"), String) +write_if_changed(joinpath(DOCS_SRC, "index.md"), rewrite_readme_paths(readme_content)) + +# Copy existing docs to their new locations (with path fixes) + +# Guide section - fix relative links +safety_content = read(joinpath(DOCS_DIR, "safety.md"), 
String) +safety_content = replace(safety_content, "(multi-threading.md)" => "(../advanced/multi-threading.md)") +write_if_changed(joinpath(DOCS_SRC, "guide/safety.md"), safety_content) + +# Usage section +cp_if_changed(joinpath(DOCS_DIR, "api.md"), joinpath(DOCS_SRC, "usage/api.md")) +cp_if_changed(joinpath(DOCS_DIR, "configuration.md"), joinpath(DOCS_SRC, "usage/configuration.md")) +cp_if_changed(joinpath(DOCS_DIR, "maybe_with_pool.md"), joinpath(DOCS_SRC, "usage/maybe_with_pool.md")) +cp_if_changed(joinpath(DOCS_DIR, "cuda.md"), joinpath(DOCS_SRC, "usage/cuda.md")) + +# Advanced section +cp_if_changed(joinpath(DOCS_DIR, "multi-threading.md"), joinpath(DOCS_SRC, "advanced/multi-threading.md")) + +# ============================================ +# Step 3: Build documentation +# ============================================ + +makedocs( + sitename = "AdaptiveArrayPools.jl", + authors = "Min-Gu Yoo", + modules = [AdaptiveArrayPools], + format = Documenter.HTML( + prettyurls = get(ENV, "CI", nothing) == "true", + canonical = "https://projecttorreypines.github.io/AdaptiveArrayPools.jl", + assets = String[], + ), + pages = [ + "Home" => "index.md", + "Guide" => [ + "Getting Started" => "guide/getting-started.md", + "Safety Rules" => "guide/safety.md", + ], + "Usage" => [ + "API Reference" => "usage/api.md", + "Configuration" => "usage/configuration.md", + "@maybe_with_pool" => "usage/maybe_with_pool.md", + "CUDA Support" => "usage/cuda.md", + ], + "Advanced" => [ + "Multi-threading" => "advanced/multi-threading.md", + "How @with_pool Works" => "advanced/macro-internals.md", + "Internals" => "advanced/internals.md", + ], + ], + doctest = false, # Doctests not set up in existing docs + checkdocs = :none, # Using manual API tables, not @autodocs + warnonly = [:cross_references, :missing_docs], +) + +deploydocs( + repo = "github.com/ProjectTorreyPines/AdaptiveArrayPools.jl.git", + devbranch = "master", + push_preview = false, # Deploy only on master/tag, not on PR +) diff --git a/docs/src/advanced/internals.md b/docs/src/advanced/internals.md new file mode 100644 index 0000000..3722203 --- /dev/null +++ b/docs/src/advanced/internals.md @@ -0,0 +1,136 @@ +# Internals + +This page provides an overview of the internal architecture of AdaptiveArrayPools.jl. For detailed design documents (in Korean), see the `design/` folder in the repository. + +## Checkpoint/Rewind Lifecycle + +The core mechanism that enables zero-allocation reuse: + +``` +@with_pool pool function foo() + │ + ├─► checkpoint!(pool) # Save current state (n_active counters) + │ + │ A = acquire!(pool, ...) # n_active += 1 + │ B = acquire!(pool, ...) # n_active += 1 + │ C = acquire!(pool, ...) # n_active += 1 + │ ... compute ... + │ + └─► rewind!(pool) # Restore n_active → all arrays recycled +end +``` + +On repeated calls, the same memory is reused without any allocation. + +## Fixed-Slot Type Dispatch + +To achieve zero-lookup overhead, common types have dedicated struct fields: + +```julia +struct AdaptiveArrayPool + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + others::IdDict{DataType, Any} # Fallback for rare types +end +``` + +When you call `acquire!(pool, Float64, n)`, the compiler inlines directly to `pool.float64` — no dictionary lookup, no type instability. 
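The sketch below shows how this kind of dispatch can be expressed. It is a simplified illustration of the idea rather than the package's exact implementation; the real `get_typed_pool!` covers every fixed slot, and the no-argument `TypedPool{T}()` constructor is assumed here.

```julia
# Sketch: one specialized method per fixed slot, so a call like
# get_typed_pool!(pool, Float64) compiles down to a plain field access.
@inline get_typed_pool!(pool::AdaptiveArrayPool, ::Type{Float64}) = pool.float64
@inline get_typed_pool!(pool::AdaptiveArrayPool, ::Type{Float32}) = pool.float32
# ...one method per fixed slot...

# Rare element types fall back to the IdDict (a lookup, but only off the hot path).
@inline function get_typed_pool!(pool::AdaptiveArrayPool, ::Type{T}) where {T}
    tp = get!(() -> TypedPool{T}(), pool.others, T)  # assumed no-arg constructor
    return tp::TypedPool{T}  # concrete assertion keeps callers type-stable
end
```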
+ +## N-Way Set Associative Cache + +For `unsafe_acquire!` (which returns native `Array` types), we use an N-way cache to reduce header allocation: + +``` + CACHE_WAYS = 4 (default) + ┌────┬────┬────┬────┐ +Slot 0 (Float64): │way0│way1│way2│way3│ ← round-robin eviction + └────┴────┴────┴────┘ + ┌────┬────┬────┬────┐ +Slot 1 (Float32): │way0│way1│way2│way3│ + └────┴────┴────┴────┘ + ... +``` + +### Cache Lookup Pseudocode + +```julia +function unsafe_acquire!(pool, T, dims...) + typed_pool = get_typed_pool!(pool, T) + slot = n_active + 1 + base = (slot - 1) * CACHE_WAYS + + # Search all ways for matching dimensions + for k in 1:CACHE_WAYS + idx = base + k + if dims == typed_pool.nd_dims[idx] + # Cache hit! Check if underlying vector was resized + if pointer matches + return typed_pool.nd_arrays[idx] + end + end + end + + # Cache miss: create new Array header, store in next way (round-robin) + way = typed_pool.nd_next_way[slot] + typed_pool.nd_next_way[slot] = (way + 1) % CACHE_WAYS + # ... create and cache Array ... +end +``` + +**Key insight**: Even on cache miss, only the `Array` header (~80-144 bytes) is allocated. The actual data memory is always reused from the pool. + +## View vs Array Return Types + +Type stability is critical for performance. AdaptiveArrayPools provides two APIs: + +| API | 1D Return | N-D Return | Allocation | +|-----|-----------|------------|------------| +| `acquire!` | `SubArray{T,1}` | `ReshapedArray{T,N}` | Always 0 bytes | +| `unsafe_acquire!` | `Vector{T}` | `Array{T,N}` | 0 bytes (hit) / ~100 bytes (miss) | + +### Why Two APIs? + +**`acquire!` (views)** — The compiler can eliminate view wrappers entirely through SROA (Scalar Replacement of Aggregates) and escape analysis. This is why 1D `SubArray` and N-D `ReshapedArray` achieve true zero allocation. + +**`unsafe_acquire!` (arrays)** — Sometimes you need a concrete `Array` type: +- FFI/C interop requiring `Ptr{T}` from contiguous memory +- Type signatures that explicitly require `Array{T,N}` +- Avoiding runtime dispatch in polymorphic code + +## Typed Checkpoint/Rewind Optimization + +When the `@with_pool` macro can statically determine which types are used, it generates optimized code: + +```julia +# If only Float64 is used in the block: +checkpoint!(pool, Float64) # ~77% faster than full checkpoint +# ... compute ... +rewind!(pool, Float64) +``` + +This avoids iterating over all type slots and the `others` IdDict. + +## 1-Based Sentinel Pattern + +Internal state vectors use a sentinel at index 0 to eliminate `isempty()` checks: + +```julia +_checkpoint_n_active = [0] # Sentinel at depth=0 +_checkpoint_depths = [0] # Global scope marker +``` + +This pattern reduces branching in hot paths where every nanosecond counts. 
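The practical payoff is that the hot-path read of the most recent checkpoint never needs an emptiness guard. A minimal illustration of the idea, using the field names sketched above (the actual `checkpoint!`/`rewind!` code does more bookkeeping):

```julia
# Without the sentinel, reading the last checkpoint needs a branch:
#   isempty(tp._checkpoint_n_active) ? 0 : tp._checkpoint_n_active[end]
# With the sentinel entry always present, the read is branch-free:
@inline last_checkpoint(tp::TypedPool) = @inbounds tp._checkpoint_n_active[end]
```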
+ +## Further Reading + +For detailed design documents (in Korean): +- `design/hybrid_api_design.md` — Two-API strategy rationale +- `design/cuda_extension_design.md` — CUDA backend architecture +- `design/untracked_acquire_design.md` — Untracked acquire detection +- `design/fixed_slots_codegen_design.md` — Code generation for fixed slots +- `design/nd_array_approach_comparison.md` — N-way cache design comparison diff --git a/docs/src/advanced/macro-internals.md b/docs/src/advanced/macro-internals.md new file mode 100644 index 0000000..e7aa69d --- /dev/null +++ b/docs/src/advanced/macro-internals.md @@ -0,0 +1,256 @@ +# How `@with_pool` Works + +This page explains the internal mechanics of the `@with_pool` macro for advanced users and contributors who want to understand the optimization strategies. + +## Overview + +The `@with_pool` macro provides automatic lifecycle management with three key optimizations: + +1. **Try-Finally Safety** — Guarantees cleanup even on exceptions +2. **Typed Checkpoint/Rewind** — Only saves/restores used types (~77% faster) +3. **Untracked Acquire Detection** — Safely handles `acquire!` calls outside macro visibility + +## Basic Lifecycle Flow + +``` +┌─────────────────────────────────────────────────────────────┐ +│ @with_pool pool function foo(x) │ +│ A = acquire!(pool, Float64, 100) │ +│ B = similar!(pool, A) │ +│ return sum(A) + sum(B) │ +│ end │ +└─────────────────────────────────────────────────────────────┘ + ↓ + ┌───────────────────────────────────┐ + │ Macro Transformation │ + └───────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ function foo(x) │ +│ pool = get_task_local_pool() │ +│ checkpoint!(pool, Float64) # ← Type-specific │ +│ try │ +│ A = _acquire_impl!(pool, Float64, 100) │ +│ B = _similar_impl!(pool, A) │ +│ return sum(A) + sum(B) │ +│ finally │ +│ rewind!(pool, Float64) # ← Type-specific │ +│ end │ +│ end │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Key Points + +- **`try-finally`** ensures `rewind!` executes even if an exception occurs +- `acquire!` → `_acquire_impl!` transformation bypasses untracked marking overhead +- Type-specific `checkpoint!(pool, Float64)` is ~77% faster than full checkpoint + +## Type Extraction: Static Analysis at Compile Time + +The macro analyzes the AST to extract types used in `acquire!` calls: + +```julia +# Macro sees these acquire! calls: +@with_pool pool begin + A = acquire!(pool, Float64, 10, 10) # → extracts Float64 + B = zeros!(pool, ComplexF64, 100) # → extracts ComplexF64 + C = similar!(pool, A) # → extracts eltype(A) → Float64 +end + +# Generated code uses typed checkpoint/rewind: +checkpoint!(pool, Float64, ComplexF64) +try + ... +finally + rewind!(pool, Float64, ComplexF64) +end +``` + +### Type Extraction Rules + +| Call Pattern | Extracted Type | +|--------------|----------------| +| `acquire!(pool, Float64, dims...)` | `Float64` | +| `acquire!(pool, x)` | `eltype(x)` (if x is external) | +| `zeros!(pool, dims...)` | `default_eltype(pool)` | +| `zeros!(pool, Float32, dims...)` | `Float32` | +| `similar!(pool, x)` | `eltype(x)` | +| `similar!(pool, x, Int64, ...)` | `Int64` | + +### When Type Extraction Fails → Full Checkpoint + +The macro falls back to full `checkpoint!(pool)` when: + +```julia +@with_pool pool begin + T = eltype(data) # T defined locally AFTER checkpoint + A = acquire!(pool, T, 100) # Can't use T at checkpoint time! 
+end +# → Falls back to checkpoint!(pool) / rewind!(pool) + +@with_pool pool begin + local_arr = compute() # local_arr defined AFTER checkpoint + B = similar!(pool, local_arr) # eltype(local_arr) unavailable +end +# → Falls back to checkpoint!(pool) / rewind!(pool) +``` + +## Untracked Acquire Detection + +### The Problem + +The macro can only see `acquire!` calls **directly in its AST**. Calls inside helper functions are invisible: + +```julia +function helper!(pool) + return zeros!(pool, Float64, 100) # Macro can't see this! +end + +@with_pool pool begin + A = acquire!(pool, Int64, 10) # ← Macro sees this (Int64) + B = helper!(pool) # ← Macro can't see Float64 inside! +end + +# If only checkpoint!(pool, Int64), Float64 arrays won't be rewound! +``` + +### The Solution: `_untracked_flags` + +Every `acquire!` call (and convenience functions) marks itself as "untracked": + +```julia +# Public API (called from user code outside macro) +@inline function acquire!(pool, ::Type{T}, n::Int) where {T} + _mark_untracked!(pool) # ← Sets flag! + _acquire_impl!(pool, T, n) +end + +# Macro-transformed calls skip the marking +# (because macro already knows about them) +_acquire_impl!(pool, T, n) # ← No flag +``` + +### Flow Diagram + +``` +@with_pool pool begin State of pool._untracked_flags + │ ───────────────────────────────── + ├─► checkpoint!(pool, Int64) depth=2, flag[2]=false + │ + │ A = _acquire_impl!(...) (macro-transformed, no flag set) + │ B = helper!(pool) + │ └─► zeros!(pool, Float64, N) + │ └─► _mark_untracked!(pool) flag[2]=TRUE ←──┐ + │ │ + │ ... more code ... │ + │ │ + └─► rewind! check: │ + if pool._untracked_flags[2] ─────────────────────────┘ + rewind!(pool) # Full rewind (safe) + else + rewind!(pool, Int64) # Typed rewind (fast) + end +end +``` + +### Why This Works + +1. **Macro-tracked calls**: Transformed to `_acquire_impl!` → no flag → typed rewind +2. **Untracked calls**: Use public API → sets flag → triggers full rewind +3. **Result**: Always safe, with optimization when possible + +## Nested `@with_pool` Handling + +Each `@with_pool` maintains its own checkpoint depth: + +``` +@with_pool p1 begin depth: 1 → 2 + v1 = acquire!(p1, Float64, 10) + │ + ├─► @with_pool p2 begin depth: 2 → 3 + │ v2 = acquire!(p2, Int64, 5) + │ helper!(p2) # sets flag[3]=true + │ sum(v2) + │ end depth: 3 → 2, flag[3] checked + │ + │ # v1 still valid here! + sum(v1) +end depth: 2 → 1, flag[2] checked +``` + +### Depth Tracking Data Structures + +```julia +struct AdaptiveArrayPool + # ... type pools ... + _current_depth::Int # Current scope depth (1 = global) + _untracked_flags::Vector{Bool} # Per-depth flag array +end + +# Initialized with sentinel: +_current_depth = 1 # Global scope +_untracked_flags = [false] # Sentinel for depth=1 +``` + +## Performance Impact + +| Scenario | Checkpoint Method | Relative Speed | +|----------|-------------------|----------------| +| 1 type, no untracked | `checkpoint!(pool, T)` | **~77% faster** | +| Multiple types, no untracked | `checkpoint!(pool, T1, T2, ...)` | **~50% faster** | +| Any untracked acquire | `checkpoint!(pool)` | Baseline | + +The optimization matters most in tight loops with many iterations. 
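One practical consequence of the table above, shown as a hedged sketch (it assumes the helper can accept a pre-acquired buffer): keeping every `acquire!` call in the macro body keeps it visible to the macro, so tight loops stay on the typed fast path.

```julia
# The helper receives a buffer, so the only acquire! is inside the macro body.
# The macro sees Float64 statically → checkpoint!(pool, Float64) / rewind!(pool, Float64).
helper_fill!(buf, x) = (buf .= x; sum(buf))

@with_pool pool function tight_loop(data)
    buf = acquire!(pool, Float64, length(data))
    return helper_fill!(buf, data)
end

# By contrast, a helper that calls zeros!(pool, ...) internally would set the
# untracked flag and force the full (slower, but still safe) checkpoint/rewind.
```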
+ +## Code Generation Summary + +```julia +# INPUT +@with_pool pool function compute(data) + A = acquire!(pool, Float64, length(data)) + result = helper!(pool, A) # May have untracked acquires + return result +end + +# OUTPUT (simplified) +function compute(data) + pool = get_task_local_pool() + + # Check if parent scope had untracked (for nested pools) + if pool._untracked_flags[pool._current_depth] + checkpoint!(pool) # Full checkpoint + else + checkpoint!(pool, Float64) # Typed checkpoint + end + + try + A = _acquire_impl!(pool, Float64, length(data)) + result = helper!(pool, A) + return result + finally + # Check if untracked acquires occurred in this scope + if pool._untracked_flags[pool._current_depth] + rewind!(pool) # Full rewind + else + rewind!(pool, Float64) # Typed rewind + end + end +end +``` + +## Key Internal Functions + +| Function | Purpose | +|----------|---------| +| `_extract_acquire_types(expr, pool_name)` | AST walk to find types | +| `_filter_static_types(types, local_vars)` | Filter out locally-defined types | +| `_transform_acquire_calls(expr, pool_name)` | Replace `acquire!` → `_acquire_impl!` | +| `_mark_untracked!(pool)` | Set untracked flag for current depth | +| `_generate_typed_checkpoint_call(pool, types)` | Generate `checkpoint!(pool, T...)` | + +## See Also + +- [Internals](internals.md) — Overview of pool architecture +- [Safety Rules](../guide/safety.md) — Scope rules and best practices +- [Configuration](../usage/configuration.md) — Performance tuning options diff --git a/docs/src/advanced/multi-threading.md b/docs/src/advanced/multi-threading.md new file mode 100644 index 0000000..4135019 --- /dev/null +++ b/docs/src/advanced/multi-threading.md @@ -0,0 +1,284 @@ +# Multi-Threading Guide + +AdaptiveArrayPools uses `task_local_storage()` for **task-local isolation**: each Julia Task gets its own independent pool. This design ensures thread safety when used correctly. + +## Table of Contents + +- [Understanding Julia's Task/Thread Model](#understanding-julias-taskthread-model) +- [How Pools Work with @threads](#how-pools-work-with-threads) +- [Safe Patterns](#safe-patterns) +- [Unsafe Patterns](#unsafe-patterns) +- [Why Task-Local (Not Thread-Local)?](#why-task-local-not-thread-local) +- [User Responsibility](#user-responsibility) + +--- + +## Understanding Julia's Task/Thread Model + +Julia uses an **M:N threading model** where multiple Tasks (lightweight coroutines) can run on multiple OS threads. + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Julia Process │ +│ │ +│ Thread 1 Thread 2 Thread 3 │ +│ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │ Task A │ │ Task C │ │ Task E │ │ +│ │ (TLS-A) │ │ (TLS-C) │ │ (TLS-E) │ │ +│ └─────────┘ └─────────┘ └─────────┘ │ +│ ┌─────────┐ ┌─────────┐ │ +│ │ Task B │ │ Task D │ │ +│ │ (TLS-B) │ │ (TLS-D) │ │ +│ └─────────┘ └─────────┘ │ +└─────────────────────────────────────────────────────────────┘ +``` + +Key concepts: + +| Concept | Description | +|---------|-------------| +| **Thread** | OS-level execution unit. Fixed count at Julia startup. | +| **Task** | Julia's lightweight coroutine (Green Thread). Created dynamically. | +| **task_local_storage()** | Per-Task storage. Each Task has its own isolated TLS. | + +### Important: One Thread Can Run Multiple Tasks + +A single thread can execute multiple Tasks by switching between them at **yield points** (I/O, `sleep()`, `yield()`, etc.): + +```julia +# Both tasks run on Thread 1, interleaved! 
+task_a = @spawn begin + println("A start") + sleep(0.1) # yield point - switch to Task B + println("A end") +end + +task_b = @spawn begin + println("B start") + sleep(0.1) # yield point - switch back to Task A + println("B end") +end + +# Output (single thread): +# A start +# B start +# A end +# B end +``` + +--- + +## How Pools Work with @threads + +When you use `Threads.@threads`, Julia distributes iterations across threads. Each thread gets **one Task** that processes its assigned iterations. + +``` +Threads.@threads for i in 1:100_000 (4 threads) +│ +├─ Thread 1: Task-1 → Pool-1 +│ └─ Processes i = 1..25,000 (same pool reused for all!) +│ +├─ Thread 2: Task-2 → Pool-2 +│ └─ Processes i = 25,001..50,000 +│ +├─ Thread 3: Task-3 → Pool-3 +│ └─ Processes i = 50,001..75,000 +│ +└─ Thread 4: Task-4 → Pool-4 + └─ Processes i = 75,001..100,000 + +Total: 4 pools created, each reused ~25,000 times +``` + +### Key Insight + +- `@threads` creates **one Task per thread** (not one per iteration!) +- Each Task has its own `task_local_storage()` → its own pool +- Within one `@threads` block, pools are efficiently reused +- Calling `@threads` **multiple times** creates new Tasks → new pools each time + +--- + +## Safe Patterns + +### Pattern 1: `@with_pool` Inside `@threads` + +```julia +Threads.@threads for i in 1:N + @with_pool pool begin + a = acquire!(pool, Float64, 100) + # ... computation ... + end # pool automatically rewinds +end +``` + +Each thread's Task gets its own pool. Safe and efficient. + +### Pattern 2: Function Defined with `@with_pool` + +```julia +# Define function with @with_pool +@with_pool pool function inner_work(x) + tmp = acquire!(pool, Float64, length(x)) + tmp .= x + return sum(tmp) +end + +# Call from @threads - each thread gets its own pool +Threads.@threads for i in 1:N + result = inner_work(data[i]) +end +``` + +The pool is created per-Task when the function is called, not when defined. + +### Pattern 3: Nested Functions + +```julia +@with_pool outer_pool function outer_work(data) + # outer_pool belongs to Main Task + tmp = acquire!(outer_pool, Float64, 100) + + Threads.@threads for i in 1:length(data) + # inner_work creates its own pool per thread + inner_work(data[i]) # Inner pool ≠ outer_pool (safe!) + end +end +``` + +Outer and inner pools are completely independent. + +--- + +## Unsafe Patterns + +### Pattern 1: `@with_pool` Outside `@threads` + +```julia +# ❌ DANGER: Race condition! +@with_pool pool Threads.@threads for i in 1:N + a = acquire!(pool, Float64, 100) # All threads share ONE pool! +end +``` + +**Why it fails**: `pool` is created in the Main Task's TLS. All threads access the same pool simultaneously. + +### Pattern 2: Sharing Pool Reference + +```julia +# ❌ DANGER: Race condition! +pool = get_task_local_pool() # Main Task's pool +Threads.@threads for i in 1:N + a = acquire!(pool, Float64, 100) # Shared access! +end +``` + +### Pattern 3: Passing Pool to `@spawn` + +```julia +# ❌ DANGER: Race condition! +@with_pool pool begin + tasks = [Threads.@spawn begin + a = acquire!(pool, Float64, 100) # Multiple tasks, one pool! + end for _ in 1:4] + wait.(tasks) +end +``` + +--- + +## Why Task-Local (Not Thread-Local)? + +You might wonder: "Why not use thread-local pools? They persist across `@threads` calls!" + +### The Stack Discipline Problem + +AdaptiveArrayPools uses `checkpoint!` and `rewind!` - a **stack-based** allocation system: + +```julia +@with_pool pool begin + checkpoint!(pool) # Push current state + a = acquire!(pool, ...) 
+ b = acquire!(pool, ...) + # ... + rewind!(pool) # Pop and restore state (LIFO!) +end +``` + +This requires **strict LIFO ordering**: the Task that checkpoints first must rewind last. + +### Why Thread-Local Fails with `@spawn` + +With `@spawn`, multiple Tasks can interleave on the same thread: + +``` +Thread 1 (with Thread-Local Pool): + +Time → +Task A: checkpoint! ──── acquire! ──── sleep ────────────── rewind! +Task B: checkpoint! ──── acquire! ──── sleep ──── rewind! + ↑ + A finishes first! +``` + +**Stack corruption occurs:** + +1. Task A: `checkpoint!` → stack = `[0]` +2. Task B: `checkpoint!` → stack = `[0, 1]` +3. Task A: `rewind!` → pops `1` (B's checkpoint!) → stack = `[0]` +4. Task B: `rewind!` → pops `0` (A's checkpoint!) → **WRONG!** + +**Result**: B's arrays may be reused while B is still using them → memory corruption. + +### Locks Don't Help + +Adding locks only prevents **simultaneous access**, not **LIFO violations**. The stack still gets corrupted because Tasks finish in unpredictable order. + +### Task-Local: The Only Safe Solution + +With Task-local pools: +- Each Task has its own pool +- Each pool has its own stack +- No interleaving possible → LIFO always preserved + +--- + +## User Responsibility + +### The Core Rule + +> **Pool objects must not be shared across Tasks.** + +This library prioritizes **zero-overhead performance** over runtime safety checks. No locks are added because: + +1. Locks would defeat the purpose of zero-allocation pooling +2. Even with locks, stack corruption would occur (LIFO violations) + +### Quick Reference + +| Pattern | Safety | Reason | +|---------|--------|--------| +| `@with_pool` inside `@threads` | ✅ Safe | Each Task gets own pool | +| `@with_pool` outside `@threads` | ❌ Unsafe | All threads share one pool | +| Function with `@with_pool` called from `@threads` | ✅ Safe | Pool created per-Task at call time | +| Passing pool to `@spawn` | ❌ Unsafe | Multiple Tasks access same pool | +| Nested `@with_pool` (outer/inner) | ✅ Safe | Each level has independent pool | + +### Debugging Tips + +If you encounter unexpected behavior: + +1. **Check pool placement**: Is `@with_pool` inside or outside `@threads`? +2. **Check pool sharing**: Is the same pool variable accessed from multiple Tasks? +3. **Enable POOL_DEBUG**: `POOL_DEBUG[] = true` catches some (not all) misuse patterns + +--- + +## Summary + +- AdaptiveArrayPools uses **Task-local isolation** for thread safety +- Each Julia Task gets its own independent pool via `task_local_storage()` +- `@threads` creates one Task per thread → pools are reused within the block +- **Always place `@with_pool` inside `@threads`**, not outside +- Thread-local pools are **not an alternative** due to stack discipline requirements +- Correct usage is the user's responsibility (no runtime checks for performance) diff --git a/docs/src/guide/getting-started.md b/docs/src/guide/getting-started.md new file mode 100644 index 0000000..76bb0c3 --- /dev/null +++ b/docs/src/guide/getting-started.md @@ -0,0 +1,116 @@ +# Getting Started + +This guide will help you get up and running with AdaptiveArrayPools.jl in minutes. + +## Installation + +```julia +using Pkg +Pkg.Registry.add(Pkg.RegistrySpec(url="https://github.com/ProjectTorreyPines/FuseRegistry.jl.git")) +Pkg.add("AdaptiveArrayPools") +``` + +## Basic Usage + +The core workflow is simple: +1. Wrap your function with `@with_pool` +2. Replace allocations with `acquire!` or convenience functions +3. 
Return computed values (scalars, copies), not the arrays themselves + +### Before (Standard Julia) + +```julia +function compute(n) + A = rand(n, n) # allocates + B = rand(n, n) # allocates + C = A * B # allocates + return sum(C) +end + +for i in 1:10_000 + compute(100) # 90k allocations, 2.75 GiB, 31% GC time +end +``` + +### After (With Pooling) + +```julia +using AdaptiveArrayPools, LinearAlgebra, Random + +@with_pool pool function compute_pooled(n) + A = acquire!(pool, Float64, n, n) # reuses memory + B = similar!(pool, A) + C = similar!(pool, A) + + rand!(A); rand!(B) + mul!(C, A, B) + return sum(C) +end + +compute_pooled(100) # warmup (first call allocates) +for i in 1:10_000 + compute_pooled(100) # zero allocations, 0% GC +end +``` + +## Convenience Functions + +Common initialization patterns have shortcuts: + +| Function | Equivalent to | +|----------|---------------| +| `zeros!(pool, 10)` | `acquire!` + `fill!(0)` | +| `ones!(pool, Float32, 3, 3)` | `acquire!` + `fill!(1)` | +| `similar!(pool, A)` | `acquire!` matching `eltype(A)`, `size(A)` | + +```julia +@with_pool pool function example(n) + A = zeros!(pool, n, n) # zero-initialized + B = ones!(pool, Float32, n) # Float32 ones + C = similar!(pool, A) # same type and size as A + # ... +end +``` + +## Return Types + +`acquire!` and convenience functions return **view types** (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK: + +```julia +A = acquire!(pool, Float64, 10, 10) # ReshapedArray{Float64,2} +mul!(C, A, B) # works perfectly with BLAS +``` + +If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`: + +```julia +A = unsafe_acquire!(pool, Float64, 10, 10) # Array{Float64,2} +``` + +## Important Safety Rules + +Arrays from the pool are **only valid within the `@with_pool` scope**: + +```julia +# DO NOT return pool-backed arrays +@with_pool pool function bad_example() + A = acquire!(pool, Float64, 10) + return A # WRONG - A will be recycled after this scope! +end + +# Return computed values instead +@with_pool pool function good_example() + A = acquire!(pool, Float64, 10) + return sum(A) # OK - returning a scalar +end +``` + +For complete safety guidelines, see [Safety Rules](safety.md). + +## Next Steps + +- [Safety Rules](safety.md) - Complete scope rules and anti-patterns +- [API Reference](../usage/api.md) - Full function and macro reference +- [Configuration](../usage/configuration.md) - Preferences and cache tuning +- [Multi-threading](../advanced/multi-threading.md) - Task/thread safety patterns +- [CUDA Support](../usage/cuda.md) - GPU backend usage diff --git a/docs/src/guide/safety.md b/docs/src/guide/safety.md new file mode 100644 index 0000000..e4eedc1 --- /dev/null +++ b/docs/src/guide/safety.md @@ -0,0 +1,110 @@ +# Safety Guide + +AdaptiveArrayPools achieves zero allocation by reusing memory across calls. This requires one simple rule: **acquired arrays are only valid within their `@with_pool` scope**. + +## The Scope Rule + +When `@with_pool` ends, all arrays acquired within that scope are recycled. Using them after the scope ends leads to undefined behavior. 
+ +```julia +@with_pool pool begin + v = acquire!(pool, Float64, 100) + + result = sum(v) # ✅ compute and return values + copied = copy(v) # ✅ copy if you need data outside +end +# v is no longer valid here +``` + +## What NOT to Do + +### Don't return pool-backed arrays + +```julia +# ❌ Wrong: returning the array itself +@with_pool pool function bad_example() + v = acquire!(pool, Float64, 100) + return v # v will be recycled after this returns! +end + +# ✅ Correct: return computed values or copies +@with_pool pool function good_example() + v = acquire!(pool, Float64, 100) + return sum(v) # scalar result +end +``` + +### Don't store in globals or closures + +```julia +# ❌ Wrong: storing in global +global_ref = nothing +@with_pool pool begin + global_ref = acquire!(pool, Float64, 100) +end +# global_ref now points to recycled memory + +# ❌ Wrong: capturing in closure +@with_pool pool begin + v = acquire!(pool, Float64, 100) + callback = () -> sum(v) # v captured but will be invalid +end +``` + +### Don't resize or push! to unsafe_acquire! arrays + +```julia +@with_pool pool begin + v = unsafe_acquire!(pool, Float64, 100) + # ❌ These break pool memory management: + # resize!(v, 200) + # push!(v, 1.0) + # append!(v, [1.0, 2.0]) +end +``` + +## Debugging with POOL_DEBUG + +Enable runtime safety checks during development: + +```julia +using AdaptiveArrayPools +AdaptiveArrayPools.POOL_DEBUG[] = true + +@with_pool pool function test() + v = acquire!(pool, Float64, 100) + return v # Will warn about returning pool-backed array +end +``` + +## acquire! vs unsafe_acquire! + +| Function | Returns | Best For | +|----------|---------|----------| +| `acquire!` | View types (`SubArray`, `ReshapedArray`) | General use, BLAS/LAPACK | +| `unsafe_acquire!` | Native `Array`/`CuArray` | FFI, type constraints | + +Both follow the same scope rules. Use `acquire!` by default—views work with all standard Julia linear algebra operations. + +## Thread Safety + +Pools are task-local, so each thread automatically gets its own pool: + +```julia +# ✅ Safe: each task has independent pool +Threads.@threads for i in 1:N + @with_pool pool begin + a = acquire!(pool, Float64, 100) + # work with a... + end +end + +# ❌ Unsafe: pool created outside threaded region +@with_pool pool begin + Threads.@threads for i in 1:N + a = acquire!(pool, Float64, 100) # race condition! + end +end +``` + +See [Multi-Threading](../advanced/multi-threading.md) for more patterns. diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..c89a222 --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,117 @@ +[![CI](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/actions/workflows/CI.yml/badge.svg)](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/actions/workflows/CI.yml) +[![codecov](https://codecov.io/github/projecttorreypines/adaptivearraypools.jl/graph/badge.svg?token=ZL0U0OvnL2)](https://codecov.io/github/projecttorreypines/adaptivearraypools.jl) + +# AdaptiveArrayPools.jl + +**Zero-allocation temporary arrays for Julia.** + +A lightweight library that lets you write natural, allocation-style code while automatically reusing memory behind the scenes. Eliminates GC pressure in hot loops without the complexity of manual buffer management. 
+ +**Supported backends:** +- **CPU** — `Array`, works out of the box +- **CUDA** — `CuArray`, loads automatically when [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) is available + +## The Problem + +In performance-critical code, temporary array allocations inside loops create massive GC pressure: + +```julia +function compute(n) + A = rand(n, n) # allocates + B = rand(n, n) # allocates + C = A * B # allocates + return sum(C) +end + +for i in 1:10_000 + compute(100) # ⚠️ 90k allocations, 2.75 GiB, 31% GC time +end +``` + +The traditional fix—passing pre-allocated buffers—works for simple cases but quickly becomes impractical: + +- **API pollution**: Every function needs extra buffer arguments, breaking clean interfaces +- **Nested calls**: Buffers must be threaded through entire call stacks, even third-party code +- **Dynamic shapes**: Hard to pre-allocate when array sizes depend on runtime values +- **Package boundaries**: You can't easily pass buffers into library functions you don't control + +## The Solution + +Wrap your function with `@with_pool` and replace allocations with `acquire!` or convenience functions: + +```julia +using AdaptiveArrayPools, LinearAlgebra, Random + +@with_pool pool function compute_pooled(n) + A = acquire!(pool, Float64, n, n) # reuses memory from pool + B = similar!(pool, A) + C = similar!(pool, A) + + rand!(A); rand!(B) + mul!(C, A, B) + return sum(C) +end + +compute_pooled(100) # warmup +for i in 1:10_000 + compute_pooled(100) # ✅ Zero allocations, 0% GC +end +``` + +| Metric | Standard | **AdaptiveArrayPools** | Improvement | +|--------|----------|------------------------|-------------| +| Time | 787 ms | **525 ms** | 1.5× faster | +| Allocations | ⚠️ 90,000 (2.75 GiB) | ✅ **0** | 100% eliminated | +| GC Time | ⚠️ 31% | ✅ **0%** | No GC pauses | + +> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](usage/cuda.md). + +## How It Works + +`@with_pool` automatically manages memory lifecycle for you: + +1. **Checkpoint** — Saves current pool state when entering the block +2. **Acquire** — `acquire!` returns arrays backed by pooled memory +3. **Rewind** — When the block ends, all acquired arrays are recycled for reuse + +This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors. + +`acquire!` returns lightweight views (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK. If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`—see [API Reference](usage/api.md). + +> **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](guide/safety.md). + +**Thread-safe by design**: Each Julia Task gets its own independent pool—no locks needed. See [Multi-Threading](advanced/multi-threading.md) for patterns. + +### Convenience Functions + +Common initialization patterns have convenience functions: + +| Function | Equivalent to | +|----------|---------------| +| `zeros!(pool, 10)` | `acquire!` + `fill!(0)` | +| `ones!(pool, Float32, 3, 3)` | `acquire!` + `fill!(1)` | +| `similar!(pool, A)` | `acquire!` matching `eltype(A)`, `size(A)` | + +These return views like `acquire!`. For raw `Array` types, use `unsafe_acquire!` or its convenience variants (`unsafe_zeros!`, `unsafe_ones!`, `unsafe_similar!`). See [API Reference](usage/api.md#convenience-functions). 
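For example (a brief illustrative sketch; as with `acquire!`, these arrays are only valid inside the block):

```julia
@with_pool pool begin
    A = zeros!(pool, 100)           # Float64 zeros (pool-backed view)
    B = ones!(pool, Float32, 8, 8)  # Float32 ones
    C = similar!(pool, B)           # uninitialized, same eltype and size as B
    C .= B .* 2
    sum(A) + sum(C)
end
```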
+ +## Installation + +```julia +using Pkg +Pkg.Registry.add(Pkg.RegistrySpec(url="https://github.com/ProjectTorreyPines/FuseRegistry.jl.git")) +Pkg.add("AdaptiveArrayPools") +``` + +## Documentation + +| Guide | Description | +|-------|-------------| +| [API Reference](usage/api.md) | Complete function and macro reference | +| [CUDA Backend](usage/cuda.md) | GPU-specific usage and examples | +| [Safety Guide](guide/safety.md) | Scope rules and best practices | +| [Multi-Threading](advanced/multi-threading.md) | Task/thread safety patterns | +| [Configuration](usage/configuration.md) | Preferences and cache tuning | + +## License + +[Apache 2.0](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/LICENSE) diff --git a/docs/src/usage/api.md b/docs/src/usage/api.md new file mode 100644 index 0000000..4d3eb8c --- /dev/null +++ b/docs/src/usage/api.md @@ -0,0 +1,111 @@ +# API Reference + +## Macros + +| Macro | Description | +|-------|-------------| +| `@with_pool name expr` | **Recommended.** Injects a global, task-local pool named `name`. Automatically checkpoints and rewinds. | +| `@maybe_with_pool name expr` | Same as `@with_pool`, but can be toggled on/off at runtime via `MAYBE_POOLING_ENABLED[]`. | + +## Functions + +| Function | Description | +|----------|-------------| +| `acquire!(pool, T, dims...)` | Returns a view: `SubArray{T,1}` for 1D, `ReshapedArray{T,N}` for N-D. Always 0 bytes. | +| `acquire!(pool, T, dims::Tuple)` | Tuple overload for `acquire!` (e.g., `acquire!(pool, T, size(x))`). | +| `acquire!(pool, x::AbstractArray)` | Similar-style: acquires array matching `eltype(x)` and `size(x)`. | +| `unsafe_acquire!(pool, T, dims...)` | Returns native `Array`/`CuArray` (CPU: `Vector{T}` for 1D, `Array{T,N}` for N-D). Only for FFI/type constraints. | +| `unsafe_acquire!(pool, T, dims::Tuple)` | Tuple overload for `unsafe_acquire!`. | +| `unsafe_acquire!(pool, x::AbstractArray)` | Similar-style: acquires raw array matching `eltype(x)` and `size(x)`. | +| `acquire_view!(pool, T, dims...)` | Alias for `acquire!`. Returns view types. | +| `acquire_array!(pool, T, dims...)` | Alias for `unsafe_acquire!`. Returns Array for N-D. | +| `checkpoint!(pool)` | Saves the current pool state (stack pointer). | +| `checkpoint!(pool, T...)` | Type-specific checkpoint for optimized performance. | +| `rewind!(pool)` | Restores the pool to the last checkpoint, freeing all arrays acquired since then. | +| `rewind!(pool, T...)` | Type-specific rewind for optimized performance. | +| `pool_stats(pool)` | Prints detailed statistics about pool usage. | +| `get_task_local_pool()` | Returns the task-local pool instance. | +| `empty!(pool)` | Clears all internal storage, releasing all memory. | + +## Convenience Functions + +Shortcuts for common `acquire!` + initialization patterns. Default element type is `Float64` (CPU) or `Float32` (CUDA). + +### View-returning (like `acquire!`) + +| Function | Description | +|----------|-------------| +| `zeros!(pool, [T,] dims...)` | Zero-initialized view. Equivalent to `acquire!` + `fill!(0)`. | +| `ones!(pool, [T,] dims...)` | One-initialized view. Equivalent to `acquire!` + `fill!(1)`. | +| `similar!(pool, A)` | View matching `eltype(A)` and `size(A)`. | +| `similar!(pool, A, T)` | View with type `T`, size from `A`. | +| `similar!(pool, A, dims...)` | View with `eltype(A)`, specified dimensions. | +| `similar!(pool, A, T, dims...)` | View with type `T`, specified dimensions. 
| + +### Array-returning (like `unsafe_acquire!`) + +| Function | Description | +|----------|-------------| +| `unsafe_zeros!(pool, [T,] dims...)` | Zero-initialized raw `Array`. | +| `unsafe_ones!(pool, [T,] dims...)` | One-initialized raw `Array`. | +| `unsafe_similar!(pool, A, ...)` | Raw `Array` with same signatures as `similar!`. | + +All convenience functions support tuple dimensions: `zeros!(pool, (3, 4))`. + +**CUDA note**: Default type is `Float32` to match `CUDA.zeros()` behavior. + +## Types + +| Type | Description | +|------|-------------| +| `AdaptiveArrayPool` | The main pool type. Create with `AdaptiveArrayPool()`. | +| `DisabledPool{Backend}` | Sentinel type when pooling is disabled. Preserves backend context (`:cpu` or `:cuda`). | + +## Utility Functions + +| Function | Description | +|----------|-------------| +| `pooling_enabled(pool)` | Returns `true` if pool is active, `false` if `DisabledPool`. Use instead of `pool === nothing`. | +| `default_eltype(pool)` | Returns default element type: `Float64` (CPU) or `Float32` (CUDA). | + +## Constants + +| Constant | Description | +|----------|-------------| +| `USE_POOLING` | Compile-time constant. Set via `Preferences.jl` to disable all pooling. | +| `MAYBE_POOLING_ENABLED` | Runtime `Ref{Bool}`. Only affects `@maybe_with_pool`. | +| `POOL_DEBUG` | Runtime `Ref{Bool}`. Enable safety validation for debugging. | +| `CACHE_WAYS` | Compile-time constant. N-way cache size for `unsafe_acquire!` (default: 4, range: 1-16). | + +## Configuration Functions + +| Function | Description | +|----------|-------------| +| `set_cache_ways!(n)` | Set N-way cache size. Requires Julia restart. | + +## Safety Notes + +Arrays acquired from a pool are **only valid within the `@with_pool` scope**. Do not: +- Return pool-backed arrays from functions +- Store them in global variables +- Capture them in closures that outlive the scope +- Call `resize!`, `push!`, or `append!` on arrays from `unsafe_acquire!` + +Use `POOL_DEBUG[] = true` during development to catch direct returns of pool-backed arrays. + +## `acquire!` vs `unsafe_acquire!` + +| Function | 1D Return | N-D Return | Allocation | +|----------|-----------|------------|------------| +| `acquire!` | `SubArray{T,1}` | `ReshapedArray{T,N}` | Always 0 bytes (stack-based views) | +| `unsafe_acquire!` | `Vector{T}` | `Array{T,N}` | 0 bytes (hit) / ~100 bytes header (miss) | + +Both share the same underlying pool memory. Even on cache miss, only the `Array` header is allocated—**data memory is always reused from the pool**. **Use `acquire!` by default**—BLAS/LAPACK are fully optimized for `StridedArray`, so there's no performance difference. + +Use `unsafe_acquire!` only when you need a concrete `Array` type (FFI, type signatures, runtime dispatch). + +**Caching**: +- `acquire!` 1D uses simple 1:1 cache (reuses `SubArray` if same length) +- `unsafe_acquire!` (all dimensions) uses N-way cache (up to `CACHE_WAYS`, default: 4) per slot; exceeding this causes eviction + +> **Header size by dimensionality**: The `~100 bytes` is an average. Actual `Array` header allocation varies: 1D → 80 bytes, 2D-3D → 112 bytes, 4D-5D → 144 bytes. This is Julia's internal `Array` metadata; actual data memory is always reused from the pool. 
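A short illustration of the two APIs side by side (a sketch; return types as listed in the tables above):

```julia
using AdaptiveArrayPools

@with_pool pool begin
    v = acquire!(pool, Float64, 16)           # 1D view (SubArray{Float64,1,...})
    M = acquire!(pool, Float64, 4, 4)         # N-D view (ReshapedArray{Float64,2,...})
    A = unsafe_acquire!(pool, Float64, 4, 4)  # native Array{Float64,2}; header may allocate on a cache miss
    fill!(v, 1.0); fill!(M, 2.0); fill!(A, 3.0)
    # All three are backed by pooled memory and valid only inside this block.
    sum(v) + sum(M) + sum(A)
end
```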
diff --git a/docs/src/usage/configuration.md b/docs/src/usage/configuration.md new file mode 100644 index 0000000..fdc1d16 --- /dev/null +++ b/docs/src/usage/configuration.md @@ -0,0 +1,102 @@ +# Configuration + +AdaptiveArrayPools can be configured via `LocalPreferences.toml`: + +```toml +[AdaptiveArrayPools] +use_pooling = false # ⭐ Primary: Disable pooling entirely +cache_ways = 8 # Advanced: N-way cache size (default: 4) +``` + +## Compile-time: USE_POOLING (⭐ Primary) + +**The most important configuration.** Completely disable pooling to make `acquire!` behave like standard allocation. + +```toml +# LocalPreferences.toml +[AdaptiveArrayPools] +use_pooling = false +``` + +Or programmatically: + +```julia +using Preferences +Preferences.set_preferences!(AdaptiveArrayPools, "use_pooling" => false) +# Restart Julia for changes to take effect +``` + +When `USE_POOLING = false`: +- `pool` becomes `DisabledPool{backend}()` instead of an active pool +- All pool functions fall back to standard allocation +- Backend context is preserved: `:cuda` still returns `CuArray` + +```julia +# These become equivalent: +@with_pool pool acquire!(pool, Float64, n, n) → Matrix{Float64}(undef, n, n) +@with_pool pool acquire!(pool, Float64, n) → Vector{Float64}(undef, n) + +# With CUDA backend: +@with_pool :cuda pool zeros!(pool, 100) → CUDA.zeros(Float32, 100) +``` + +Use `pooling_enabled(pool)` to check if pooling is active. + +**Use cases:** +- **Debugging**: Compare behavior with/without pooling +- **Benchmarking**: Measure pooling overhead vs direct allocation +- **Gradual adoption**: Add `@with_pool` annotations now, enable pooling later +- **CI/Testing**: Run tests without pooling to isolate issues + +All pooling code is **completely eliminated at compile time** (zero overhead). + +## Runtime: MAYBE_POOLING_ENABLED + +Only affects `@maybe_with_pool`. Toggle without restart. + +```julia +MAYBE_POOLING_ENABLED[] = false # Disable +MAYBE_POOLING_ENABLED[] = true # Enable (default) +``` + +## Runtime: POOL_DEBUG + +Enable safety validation to catch direct returns of pool-backed arrays. + +```julia +POOL_DEBUG[] = true # Enable safety checks (development) +POOL_DEBUG[] = false # Disable (default, production) +``` + +When enabled, returning a pool-backed array from a `@with_pool` block will throw an error. + +## Compile-time: CACHE_WAYS + +Configure the N-way cache size for `unsafe_acquire!`. Higher values reduce cache eviction but increase memory per slot. + +```toml +# LocalPreferences.toml +[AdaptiveArrayPools] +cache_ways = 8 # Default: 4, Range: 1-16 +``` + +Or programmatically: + +```julia +using AdaptiveArrayPools +set_cache_ways!(8) +# Restart Julia for changes to take effect +``` + +**When to increase**: If your code alternates between more than 4 dimension patterns per pool slot, increase `cache_ways` to avoid cache eviction (~100 bytes header per miss). + +> **Scope**: `cache_ways` affects **all `unsafe_acquire!`** calls (including 1D). Only `acquire!` 1D uses simple 1:1 caching. + +## Summary + +| Setting | Scope | Restart? 
| Priority | Affects | +|---------|-------|----------|----------|---------| +| `use_pooling` | Compile-time | Yes | ⭐ Primary | All macros, `acquire!` behavior | +| `cache_ways` | Compile-time | Yes | Advanced | `unsafe_acquire!` N-D caching | +| `MAYBE_POOLING_ENABLED` | Runtime | No | Optional | `@maybe_with_pool` only | +| `POOL_DEBUG` | Runtime | No | Debug | Safety validation | diff --git a/docs/src/usage/cuda.md b/docs/src/usage/cuda.md new file mode 100644 index 0000000..c5778c8 --- /dev/null +++ b/docs/src/usage/cuda.md @@ -0,0 +1,123 @@ +# CUDA Backend + +AdaptiveArrayPools provides native CUDA support through a package extension that loads automatically when CUDA.jl is available. + +## Quick Start + +```julia +using AdaptiveArrayPools, CUDA + +# Use :cuda backend for GPU arrays +@with_pool :cuda pool function gpu_computation(n) + A = acquire!(pool, Float64, n, n) # CuArray view + B = acquire!(pool, Float64, n, n) # CuArray view + + fill!(A, 1.0) + fill!(B, 2.0) + + return sum(A .+ B) +end + +# Zero GPU allocation in hot loops +for i in 1:1000 + gpu_computation(100) # GPU memory reused from pool +end +``` + +## API + +The CUDA backend uses the same API as CPU, with `:cuda` backend specifier: + +| Macro/Function | Description | +|----------------|-------------| +| `@with_pool :cuda pool expr` | GPU pool with automatic checkpoint/rewind | +| `acquire!(pool, T, dims...)` | Returns `CuArray` view (always 0 bytes GPU alloc) | +| `unsafe_acquire!(pool, T, dims...)` | Returns raw `CuArray` (for FFI/type constraints) | +| `get_task_local_cuda_pool()` | Returns the task-local CUDA pool | +| `pool_stats(:cuda)` | Print CUDA pool statistics | + +## Return Types + +| Function | 1D Return | N-D Return | +|----------|-----------|------------| +| `acquire!` | `CuArray{T,1}` (view) | `CuArray{T,N}` (view) | +| `unsafe_acquire!` | `CuArray{T,1}` | `CuArray{T,N}` | + +## Allocation Behavior + +**GPU Memory**: Always 0 bytes allocation after warmup. The underlying `CuVector` is resized as needed and reused. + +**CPU Memory**: +- Cache hit (≤4 dimension patterns per slot): 0 bytes +- Cache miss (>4 patterns): ~100 bytes for wrapper metadata + +```julia +# Example: 4 patterns fit in 4-way cache → zero CPU allocation +dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) +for dims in dims_list + @with_pool :cuda p begin + A = acquire!(p, Float64, dims...) + # Use A... + end +end +``` + +## Fixed Slot Types + +Optimized types with pre-allocated slots (same as CPU): + +| Type | Field | +|------|-------| +| `Float64` | `.float64` | +| `Float32` | `.float32` | +| `Float16` | `.float16` | +| `Int64` | `.int64` | +| `Int32` | `.int32` | +| `ComplexF64` | `.complexf64` | +| `ComplexF32` | `.complexf32` | +| `Bool` | `.bool` | + +Other types use the fallback dictionary (`.others`). 
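For example (an illustrative sketch; requires a CUDA-capable device):

```julia
using AdaptiveArrayPools, CUDA

@with_pool :cuda pool begin
    x = zeros!(pool, 1024)            # default eltype on the CUDA backend is Float32
    h = acquire!(pool, Float16, 256)  # served by the dedicated .float16 slot
    u = acquire!(pool, UInt8, 64)     # uncommon type, falls back to .others
    fill!(h, Float16(1)); fill!(u, 0x01)
    sum(x)
end
```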
+ +## Limitations + +- **No `@maybe_with_pool :cuda`**: Runtime toggle not supported for CUDA backend +- **Task-local only**: Each Task gets its own CUDA pool, same as CPU +- **Same device**: All arrays in a pool use the same CUDA device + +## Example: Matrix Multiplication + +```julia +using AdaptiveArrayPools, CUDA, LinearAlgebra + +@with_pool :cuda pool function gpu_matmul(n) + A = acquire!(pool, Float64, n, n) + B = acquire!(pool, Float64, n, n) + C = acquire!(pool, Float64, n, n) + + rand!(A); rand!(B) + mul!(C, A, B) + + return sum(C) +end + +# Warmup +gpu_matmul(100) + +# Benchmark - zero GPU allocation +using BenchmarkTools +@benchmark gpu_matmul(1000) +``` + +## Debugging + +```julia +# Check pool state +pool_stats(:cuda) + +# Output: +# CuAdaptiveArrayPool (device 0) +# Float64 (fixed) [GPU] +# slots: 3 (active: 0) +# elements: 30000 (234.375 KiB) +``` diff --git a/docs/src/usage/maybe_with_pool.md b/docs/src/usage/maybe_with_pool.md new file mode 100644 index 0000000..39c31b7 --- /dev/null +++ b/docs/src/usage/maybe_with_pool.md @@ -0,0 +1,53 @@ +# @maybe_with_pool + +Runtime-toggleable pooling. Users can enable/disable via `MAYBE_POOLING_ENABLED[]`. + +## Usage + +```julia +@maybe_with_pool pool function compute(n) + v = acquire!(pool, Float64, n) + v .= 1.0 + sum(v) +end + +# Toggle at runtime +MAYBE_POOLING_ENABLED[] = false # Normal allocation +MAYBE_POOLING_ENABLED[] = true # Uses pool +``` + +## When to Use + +- Library code where end-users should control pooling behavior +- Debugging: disable pooling to isolate memory issues +- Benchmarking: compare pooled vs non-pooled performance + +## How It Works + +When `MAYBE_POOLING_ENABLED[] == false`: +- `pool` becomes `DisabledPool{backend}()` (e.g., `DisabledPool{:cpu}()` or `DisabledPool{:cuda}()`) +- All pool functions (`acquire!`, `zeros!`, etc.) fall back to standard allocation +- Backend context is preserved: `:cuda` → `CuArray`, `:cpu` → `Array` + +Use `pooling_enabled(pool)` to check if pooling is active: +```julia +@maybe_with_pool pool begin + if pooling_enabled(pool) + # Using pooled memory + else + # Using standard allocation (DisabledPool) + end +end +``` + +## vs @with_pool + +| | `@with_pool` | `@maybe_with_pool` | +|---|---|---| +| Runtime toggle | No | Yes | +| Overhead when disabled | None | Branch check | +| Use case | Application code | Library code | + +## Safety + +Same rules as `@with_pool`: arrays are only valid within the scope. Do not return or store them externally. 
From e98413423fe69c8d15bd8124593046106d735149 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 10:36:01 -0800 Subject: [PATCH 2/8] docs: add English design documents to docs/design - Translate 6 Korean/mixed documents to English: - fixed_slots_codegen_design.md (updated for 7 fixed slots) - hybrid_api_design.md - nd_array_approach_comparison.md - new_hybrid_api_design.md - untracked_acquire_design.md - macro-linenumbernode-improvement.md - Copy 2 existing English documents: - cuda_extension_design.md - vector_resize_memory_behavior.md - Verify codebase consistency with design specs --- docs/design/cuda_extension_design.md | 1266 +++++++++++++++++ docs/design/fixed_slots_codegen_design.md | 362 +++++ docs/design/hybrid_api_design.md | 441 ++++++ .../macro-linenumbernode-improvement.md | 369 +++++ docs/design/nd_array_approach_comparison.md | 432 ++++++ docs/design/new_hybrid_api_design.md | 140 ++ docs/design/untracked_acquire_design.md | 598 ++++++++ docs/design/vector_resize_memory_behavior.md | 247 ++++ 8 files changed, 3855 insertions(+) create mode 100644 docs/design/cuda_extension_design.md create mode 100644 docs/design/fixed_slots_codegen_design.md create mode 100644 docs/design/hybrid_api_design.md create mode 100644 docs/design/macro-linenumbernode-improvement.md create mode 100644 docs/design/nd_array_approach_comparison.md create mode 100644 docs/design/new_hybrid_api_design.md create mode 100644 docs/design/untracked_acquire_design.md create mode 100644 docs/design/vector_resize_memory_behavior.md diff --git a/docs/design/cuda_extension_design.md b/docs/design/cuda_extension_design.md new file mode 100644 index 0000000..5b31c2d --- /dev/null +++ b/docs/design/cuda_extension_design.md @@ -0,0 +1,1266 @@ +# AdaptiveArrayPools.jl CUDA Extension Design + +> **Status**: Draft v0.6 (Post-Review Revision) +> **Version**: 0.6 +> **Date**: 2024-12-14 +> **Authors**: Design discussion with AI assistance + +## 1. Executive Summary + +This document outlines the design for extending AdaptiveArrayPools.jl to support GPU arrays via CUDA.jl. The design prioritizes: + +1. **Zero overhead when CUDA not loaded** - Pure CPU code path unchanged +2. **Maximum code reuse** - Generic functions with minimal dispatch points +3. **Extensibility** - Abstract type hierarchy for future GPU backends +4. **Separate namespaces** - Independent CPU and GPU pools per task + +### Key Design Decision: Parametric Abstract Types + +Instead of duplicating code in the extension, we use parametric abstract types and generic functions. The extension only needs to define: +- Type definitions (~50 lines) +- One allocation method (~3 lines) +- Task-local getter (~15 lines) + +**Total extension code: ~70 lines** (vs ~300 lines with full duplication) + +--- + +## 2. Current Architecture Analysis + +### 2.1 Core Type Structure + +```julia +# Current: Concrete types only +mutable struct TypedPool{T} + vectors::Vector{Vector{T}} + views::Vector{SubArray{T,1,Vector{T},...}} + view_lengths::Vector{Int} + nd_arrays::Vector{Any} + nd_dims::Vector{Any} + nd_ptrs::Vector{UInt} + nd_next_way::Vector{Int} + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end +``` + +### 2.2 Code Reuse Analysis + +| Component | Operates On | GPU-Specific? 
| +|-----------|-------------|---------------| +| `get_view!` | vectors, n_active, cache | Only allocation | +| `get_nd_view!` | calls get_view!, reshape | **No** | +| `get_nd_array!` | calls get_view!, unsafe_wrap | Only wrap call | +| `checkpoint!` | n_active, checkpoint stacks | **No** | +| `rewind!` | n_active, checkpoint stacks | **No** | +| `reset!` | n_active, checkpoint stacks | **No** | +| `empty!` | all fields | Clear vectors only | + +**Key insight**: 95%+ of logic is type-agnostic. Only allocation/wrapping differs. + +--- + +## 3. Proposed Architecture + +### 3.1 Type Hierarchy + +``` + AbstractTypedPool{T, V<:AbstractVector{T}} + │ + ┌──────────────┼──────────────┐ + │ │ │ + TypedPool{T} CuTypedPool{T} [Future: ROCTypedPool{T}] + V = Vector{T} V = CuVector{T} V = ROCArray{T,1} + + + AbstractArrayPool + │ + ┌──────────────┼──────────────┐ + │ │ │ + AdaptiveArrayPool CuAdaptiveArrayPool [Future: ROCArrayPool] +``` + +### 3.2 Abstract Type Definitions + +```julia +# src/types.jl - New additions + +""" + AbstractTypedPool{T, V<:AbstractVector{T}} + +Abstract base for type-specific memory pools. +`T` is the element type, `V` is the storage vector type. + +Concrete subtypes must have these fields: +- vectors::Vector{V} +- views, view_lengths, nd_* fields +- n_active::Int +- _checkpoint_n_active::Vector{Int} +- _checkpoint_depths::Vector{Int} +""" +abstract type AbstractTypedPool{T, V<:AbstractVector{T}} end + +""" + AbstractArrayPool + +Abstract base for multi-type array pools. + +Concrete subtypes must have these fields: +- Fixed slot fields (type-specific TypedPools) +- others::IdDict{DataType,Any} +- _current_depth::Int +- _untracked_flags::Vector{Bool} +""" +abstract type AbstractArrayPool end + +# Storage type accessor (for generic code) +storage_type(::AbstractTypedPool{T,V}) where {T,V} = V +storage_type(::Type{<:AbstractTypedPool{T,V}}) where {T,V} = V +``` + +### 3.3 Concrete Types + +#### CPU (existing, modified to inherit) + +```julia +# src/types.jl + +mutable struct TypedPool{T} <: AbstractTypedPool{T, Vector{T}} + # Storage + vectors::Vector{Vector{T}} + views::Vector{SubArray{T,1,Vector{T},Tuple{UnitRange{Int64}},true}} + view_lengths::Vector{Int} + + # N-D cache + nd_arrays::Vector{Any} + nd_dims::Vector{Any} + nd_ptrs::Vector{UInt} + nd_next_way::Vector{Int} + + # State + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end + +mutable struct AdaptiveArrayPool <: AbstractArrayPool + # Fixed slots (CPU types) + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + + others::IdDict{DataType,Any} + _current_depth::Int + _untracked_flags::Vector{Bool} +end +``` + +#### GPU (extension - minimal definitions) + +> **[AI Review: Float16 & Device Safety]** +> 1. **Float16 Support**: Added `Float16` to fixed slots. This is critical for modern AI/ML workloads on GPU. +> 2. **Device Awareness**: Added `device_id::Int` to `CuAdaptiveArrayPool`. This is crucial for multi-GPU setups. A pool created on Device 0 cannot be safely used on Device 1. We must track which device owns the memory. + +> **[Post-Review v0.6: Critical Type Correction]** +> **`view(CuVector, 1:n)` returns `CuVector`, NOT `SubArray`!** +> +> GPUArrays.jl handles contiguous views via `derive()` which returns a new GPU array +> sharing the same memory buffer (see `~/.julia/packages/GPUArrays/.../src/host/base.jl:302`). 
+> This is fundamentally different from CPU where `view()` returns `SubArray`. +> +> **Implications for pool design**: +> 1. We **cannot cache views separately** from backing vectors on GPU +> 2. Instead, we store `CuVector{T}` directly and return slices via `view()` on each call +> 3. View creation is cheap (no allocation, just metadata), so no caching benefit +> 4. This simplifies the GPU pool: no `views` or `view_lengths` fields needed + +```julia +# ext/AdaptiveArrayPoolsCUDAExt/types.jl + +using CUDA + +# IMPORTANT: Unlike CPU, GPU views are derived CuArrays, not SubArrays. +# view(::CuVector{T}, ::UnitRange) -> CuVector{T} (shared memory, different offset/length) +# This means: +# 1. "views" vector would just hold more CuVectors (no savings) +# 2. We skip view caching entirely - just return view(vec, 1:n) each time +# 3. View creation is O(1) metadata operation, no GPU memory allocation + +mutable struct CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} + # Storage (GPU vectors) + vectors::Vector{CuVector{T}} + + # View length cache (for resize decision, but no view object cache) + # The actual view is created fresh each time since it's just metadata + view_lengths::Vector{Int} + + # N-D cache (same structure as CPU) + nd_arrays::Vector{Any} + nd_dims::Vector{Any} + nd_ptrs::Vector{UInt} + nd_next_way::Vector{Int} + + # State (identical to CPU) + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end + +# Constructor with sentinel pattern +function CuTypedPool{T}() where T + CuTypedPool{T}( + CuVector{T}[], Int[], # No views vector! + Any[], Any[], UInt[], Int[], + 0, [0], [0] + ) +end + +# GPU-optimized fixed slots (different from CPU!) +const GPU_FIXED_SLOT_FIELDS = ( + :float32, # Primary (GPU-optimized) + :float64, # Precision when needed + :float16, # ML inference (added per AI review) + :int32, # Indexing (GPU-preferred) + :int64, # Large indices + :complexf32, # FFT, signal processing + :complexf64, # High-precision complex + :bool, # Masks +) + +mutable struct CuAdaptiveArrayPool <: AbstractArrayPool + # Fixed slots (GPU-optimized order: Float32 first) + float32::CuTypedPool{Float32} + float64::CuTypedPool{Float64} + float16::CuTypedPool{Float16} # Added per AI review + int32::CuTypedPool{Int32} + int64::CuTypedPool{Int64} + complexf32::CuTypedPool{ComplexF32} + complexf64::CuTypedPool{ComplexF64} + bool::CuTypedPool{Bool} + + others::IdDict{DataType,Any} + _current_depth::Int + _untracked_flags::Vector{Bool} + + # Safety: Track which device this pool belongs to (use public API!) + device_id::Int +end + +function CuAdaptiveArrayPool() + dev = CUDA.device() + CuAdaptiveArrayPool( + CuTypedPool{Float32}(), CuTypedPool{Float64}(), CuTypedPool{Float16}(), + CuTypedPool{Int32}(), CuTypedPool{Int64}(), + CuTypedPool{ComplexF32}(), CuTypedPool{ComplexF64}(), + CuTypedPool{Bool}(), + IdDict{DataType,Any}(), 1, [false], + CUDA.deviceid(dev) # Use public API, not internal .handle + ) +end +``` + +--- + +## 4. Generic Functions with Minimal Dispatch + +### 4.1 Allocation Dispatch Point + +The **only** type-specific function needed: + +```julia +# src/acquire.jl - CPU default +""" + allocate_vector(tp::AbstractTypedPool{T}, n::Int) -> V + +Allocate a new vector of type V with n elements. +This is the single dispatch point for storage-specific allocation. +""" +@inline allocate_vector(::AbstractTypedPool{T,Vector{T}}, n::Int) where T = + Vector{T}(undef, n) + +# ext/ - GPU override (THE ONLY METHOD EXTENSION NEEDS TO ADD!) 
+@inline allocate_vector(::AbstractTypedPool{T,CuVector{T}}, n::Int) where T = + CuVector{T}(undef, n) +``` + +> **[AI Review: Interaction with CUDA.jl Allocator]** +> It is important to note that `CuVector{T}(undef, n)` uses `CUDA.jl`'s own internal memory pool. +> **Why do we need another pool?** +> 1. **Overhead Reduction**: Even cached CUDA allocations have Julia-side overhead (struct creation, finalizer registration). `AdaptiveArrayPools` reuses the *Julia objects* (`CuArray` structs) and views, reducing GC pressure and allocation latency further. +> 2. **Logical Grouping**: It allows "rewinding" a whole block of temporary allocations in one go, which `CUDA.jl`'s allocator doesn't support (it's `malloc`/`free` style). + +### 4.2 get_view! Implementation + +> **[Post-Review v0.6: CPU vs GPU Differences]** +> Due to type differences (`view(Vector, 1:n) → SubArray` vs `view(CuVector, 1:n) → CuVector`), +> the CPU and GPU implementations differ slightly. CPU caches view objects; GPU creates them fresh. + +#### CPU Version (existing, unchanged) + +```julia +# src/acquire.jl - CPU implementation (caches SubArray views) + +function get_view!(tp::AbstractTypedPool{T,Vector{T}}, n::Int) where {T} + tp.n_active += 1 + idx = tp.n_active + + # 1. Expand pool if needed + if idx > length(tp.vectors) + push!(tp.vectors, allocate_vector(tp, n)) + new_view = view(tp.vectors[idx], 1:n) + push!(tp.views, new_view) # Cache the SubArray + push!(tp.view_lengths, n) + # ... growth warning ... + return new_view + end + + # 2. Cache hit (return cached SubArray - ZERO ALLOC) + @inbounds cached_len = tp.view_lengths[idx] + if cached_len == n + return @inbounds tp.views[idx] + end + + # 3. Cache miss - resize and update cached view + @inbounds vec = tp.vectors[idx] + if length(vec) < n + resize!(vec, n) + end + new_view = view(vec, 1:n) + @inbounds tp.views[idx] = new_view + @inbounds tp.view_lengths[idx] = n + return new_view +end +``` + +#### GPU Version (extension) + +> **[Post-Review v0.6: resize! Cost Warning]** +> `resize!(::CuVector, n)` with capacity increase triggers: +> 1. New GPU buffer allocation +> 2. Async copy of existing elements (even if we don't need them!) +> +> For pools, we typically don't need old data. Consider using `CUDA.unsafe_free!` + fresh +> allocation instead, or just allocating oversized initially. This is a **performance +> optimization opportunity** for v1.1+. + +```julia +# ext/AdaptiveArrayPoolsCUDAExt/acquire.jl + +# GPU version: no view caching (view() returns CuVector, not SubArray) +function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, n::Int) where {T} + tp.n_active += 1 + idx = tp.n_active + + # 1. Expand pool if needed + if idx > length(tp.vectors) + push!(tp.vectors, allocate_vector(tp, n)) + push!(tp.view_lengths, n) + # Return fresh view (no caching - view creates CuVector metadata) + return view(tp.vectors[idx], 1:n) + end + + # 2. Check if resize needed + @inbounds cached_len = tp.view_lengths[idx] + @inbounds vec = tp.vectors[idx] + + if length(vec) < n + # WARNING: resize! on CuVector copies old data (wasteful for pools) + # TODO v1.1: Consider CUDA.unsafe_free! + fresh alloc instead + resize!(vec, n) + end + + @inbounds tp.view_lengths[idx] = n + + # Always create fresh view (O(1) metadata, no GPU allocation) + return view(vec, 1:n) +end +``` + +### 4.3 get_nd_view! Implementation + +> **[Post-Review v0.6: reshape Behavior on GPU]** +> `reshape(::CuVector, dims)` also uses GPUArrays' `derive()` mechanism, returning a +> `CuArray{T,N}` (not `ReshapedArray`). 
This is actually simpler - we get a proper +> GPU array that CUDA kernels can use directly. + +```julia +# src/acquire.jl - Works for both, but return types differ: +# - CPU: ReshapedArray{T,N,SubArray{...}} +# - GPU: CuArray{T,N} (via derive) + +@inline function get_nd_view!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) where {T,N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) + return reshape(flat_view, dims) # CPU: ReshapedArray, GPU: CuArray +end +``` + +### 4.4 Generic get_nd_array! (minimal dispatch) + +```julia +# src/acquire.jl + +# CPU version uses unsafe_wrap +@inline function wrap_array(::AbstractTypedPool{T,Vector{T}}, + flat_view, dims::NTuple{N,Int}) where {T,N} + unsafe_wrap(Array{T,N}, pointer(flat_view), dims) +end + +# ext/ - GPU version +@inline function wrap_array(::AbstractTypedPool{T,CuVector{T}}, + flat_view, dims::NTuple{N,Int}) where {T,N} + # Use reshape - returns CuArray{T,N} via GPUArrays derive() + reshape(flat_view, dims) +end + +# Generic implementation +@inline function get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) where {T,N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) + slot = tp.n_active + + # ... cache lookup logic (identical) ... + + # DISPATCH POINT for array wrapping + arr = wrap_array(tp, flat_view, dims) + + # ... cache update logic (identical) ... + + return arr +end +``` + +> **[Post-Review v0.6: GPU reshape Clarification]** +> `reshape(::CuArray, dims)` returns a `CuArray{T,N}` (via GPUArrays `derive()`), **NOT** +> `ReshapedArray`. This is actually better for GPU kernels - they work directly with +> `CuArray` without any wrapper overhead. The `derive()` mechanism shares the underlying +> GPU memory buffer with different offset/strides metadata. + +--- + +## 5. State Management (100% Reusable) + +### 5.1 Generic State Functions + +All state functions operate only on `n_active` and checkpoint vectors - pure CPU operations. + +```julia +# src/state.jl - These work for ANY AbstractTypedPool! + +@inline function _checkpoint_typed_pool!(tp::AbstractTypedPool, depth::Int) + push!(tp._checkpoint_n_active, tp.n_active) + push!(tp._checkpoint_depths, depth) + nothing +end + +@inline function _rewind_typed_pool!(tp::AbstractTypedPool, current_depth::Int) + # Orphan cleanup + while @inbounds tp._checkpoint_depths[end] > current_depth + pop!(tp._checkpoint_depths) + pop!(tp._checkpoint_n_active) + end + + # Restore + if @inbounds tp._checkpoint_depths[end] == current_depth + pop!(tp._checkpoint_depths) + tp.n_active = pop!(tp._checkpoint_n_active) + else + tp.n_active = @inbounds tp._checkpoint_n_active[end] + end + nothing +end + +function _reset_typed_pool!(tp::AbstractTypedPool) + tp.n_active = 0 + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) + tp +end + +# Concrete dispatches (trivial wrappers) +reset!(tp::TypedPool) = _reset_typed_pool!(tp) +reset!(tp::CuTypedPool) = _reset_typed_pool!(tp) # ext/ adds this +``` + +### 5.2 empty! 
(Type-Specific) + +`empty!` needs to clear storage, but the logic is identical: + +```julia +# src/state.jl - Generic implementation + +function Base.empty!(tp::AbstractTypedPool) + empty!(tp.vectors) + empty!(tp.views) # CPU only (GPU CuTypedPool has no views field) + empty!(tp.view_lengths) + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + _reset_typed_pool!(tp) + tp +end + +# GPU-specific version (no views field) +function Base.empty!(tp::CuTypedPool) + empty!(tp.vectors) + empty!(tp.view_lengths) + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + _reset_typed_pool!(tp) + tp +end +``` + +> **[Post-Review v0.6: GPU Memory Release Clarification]** +> `empty!(tp.vectors)` **removes Julia references** to `CuVector` objects. This does NOT +> guarantee immediate VRAM release! The actual GPU memory lifecycle is: +> +> 1. **Reference removed** → CuArray becomes GC-eligible +> 2. **GC runs** → CuArray finalizer queued +> 3. **Finalizer runs** → Returns memory to CUDA.jl's internal pool +> 4. **CUDA.jl pool decision** → May or may not release to driver +> +> For **immediate VRAM release**, use `CUDA.reclaim()` after `empty!()`: +> ```julia +> empty!(get_task_local_cuda_pool()) +> GC.gc() # Force finalizers to run +> CUDA.reclaim() # Request CUDA.jl to release cached memory +> ``` + +--- + +## 6. Task-Local Pool Design + +> **[AI Review: Multi-Device Safety]** +> The original design for `get_task_local_cuda_pool` was unsafe for multi-GPU workflows. If a task switches devices (e.g., `CUDA.device!(1)`), it must not use the pool created for Device 0. +> **Revised Design**: We use a `Dict{Int, CuAdaptiveArrayPool}` in task local storage to manage one pool per device per task. + +### 6.1 Separate Keys & Device Awareness + +```julia +# src/task_local_pool.jl +const _POOL_KEY = :ADAPTIVE_ARRAY_POOL + +@inline function get_task_local_pool() + pool = get(task_local_storage(), _POOL_KEY, nothing) + if pool === nothing + pool = AdaptiveArrayPool() + task_local_storage(_POOL_KEY, pool) + end + return pool::AdaptiveArrayPool +end + +# ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl +const _CU_POOL_KEY = :ADAPTIVE_ARRAY_POOL_CUDA + +@inline function get_task_local_cuda_pool() + # Get the dictionary of pools (one per device) + pools = get(task_local_storage(), _CU_POOL_KEY, nothing) + if pools === nothing + pools = Dict{Int, CuAdaptiveArrayPool}() + task_local_storage(_CU_POOL_KEY, pools) + end + + # Get current device ID using public API + dev_id = CUDA.deviceid(CUDA.device()) + + # Get or create pool for this device + if !haskey(pools, dev_id) + pools[dev_id] = CuAdaptiveArrayPool() # Constructor captures device_id + end + + return pools[dev_id] +end +``` + +> **[Post-Review v0.6: Public API for Device ID]** +> Always use `CUDA.deviceid(dev)` instead of `dev.handle`. The `.handle` field is internal +> and may change between CUDA.jl versions. `deviceid()` is the stable public API. + +### 6.2 Rationale for Separation + +| Scenario | Benefit | +|----------|---------| +| Mixed CPU/GPU workflow | Use both pools independently | +| GPU memory pressure | `empty!(cuda_pool)` without affecting CPU | +| Different lifecycles | CPU warm, GPU cleared per batch | +| **Multi-GPU** | **Safety**: Prevents cross-device access errors | +| Debugging | Clear distinction in profiling | + +--- + +## 7. 
Macro Design + +### 7.1 Recommended: Unified Macro with Backend Symbol + +```julia +# Unified API - single macro with optional backend symbol +@with_pool pool begin ... end # CPU (default, :cpu implied) +@with_pool :cuda pool begin ... end # GPU via CUDA +@with_pool :metal pool begin ... end # GPU via Metal (future) +@with_pool :cpu pool begin ... end # Explicit CPU + +# Without pool name (auto-generated) +@with_pool begin ... end # CPU default +@with_pool :cuda begin ... end # GPU +``` + +**Advantages:** +- Single macro to learn +- Easy backend switching (`:cuda` → `:metal`) +- Future-proof (just add new symbols in extensions) +- Clean, consistent API + +### 7.2 Implementation + +> **[Post-Review v0.6: Zero-Overhead Backend Selection]** +> The original `Dict{Symbol, Function}` registry has a critical flaw: runtime dictionary +> lookup weakens type inference, preventing the compiler from inlining the pool getter. +> This conflicts with our "zero overhead for CPU path" goal. +> +> **Solution**: Use `Val{:backend}` dispatch instead. Extensions add methods at load time, +> and the compiler can fully inline the call chain. + +```julia +# src/macros.jl - Val-based dispatch for zero overhead + +""" + _get_pool_for_backend(::Val{:cpu}) -> AdaptiveArrayPool + +Get task-local pool for the specified backend. Extensions add methods for their backends. +Using Val{Symbol} enables compile-time dispatch and full inlining. +""" +@inline _get_pool_for_backend(::Val{:cpu}) = get_task_local_pool() + +# Fallback with helpful error message +@noinline function _get_pool_for_backend(::Val{B}) where B + error("Pool backend :$B not found. Did you forget to load the extension (e.g., `using CUDA`)?") +end + +# Macro signatures +macro with_pool(backend::QuoteNode, pool_name, expr) + _generate_pool_code_with_backend(backend.value, pool_name, expr) +end + +macro with_pool(backend::QuoteNode, expr) + # Backend symbol without pool name + pool_name = gensym(:pool) + _generate_pool_code_with_backend(backend.value, pool_name, expr) +end + +macro with_pool(pool_name, expr) + # No backend = CPU default + _generate_pool_code_with_backend(:cpu, pool_name, expr) +end + +macro with_pool(expr) + # No backend, no pool name + pool_name = gensym(:pool) + _generate_pool_code_with_backend(:cpu, pool_name, expr) +end + +function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr) + transformed_expr = _transform_acquire_calls(expr, pool_name) + + # Use Val{backend} for compile-time dispatch - fully inlinable! + quote + local $(esc(pool_name)) = $_get_pool_for_backend($(Val{backend}())) + checkpoint!($(esc(pool_name))) + try + $(esc(transformed_expr)) + finally + rewind!($(esc(pool_name))) + end + end +end +``` + +> **Why Val{:backend} instead of Dict?** +> +> | Approach | Lookup Cost | Type Inference | Inlining | +> |----------|-------------|----------------|----------| +> | `Dict{Symbol,Function}` | O(1) hash | ❌ Returns `Function` | ❌ Dynamic call | +> | `Val{:cpu}` dispatch | O(0) compiled | ✅ Concrete type | ✅ Full inlining | +> +> With Val dispatch, `@with_pool :cpu` compiles to exactly the same code as the +> original non-backend version—zero overhead. + +### 7.3 Extension Registration + +```julia +# ext/AdaptiveArrayPoolsCUDAExt/macros.jl + +# Add method for :cuda backend via Val dispatch (no __init__ needed!) 
+@inline AdaptiveArrayPools._get_pool_for_backend(::Val{:cuda}) = get_task_local_cuda_pool() + +# Optional: Explicit macro alias for users who prefer it +macro with_cuda_pool(pool_name, expr) + esc(:(@with_pool :cuda $pool_name $expr)) +end + +macro with_cuda_pool(expr) + esc(:(@with_pool :cuda $expr)) +end + +export @with_cuda_pool # Optional explicit alias +``` + +> **Note**: With Val dispatch, no `__init__` registration is needed. The method is added +> when the extension module loads, and Julia's method dispatch handles the rest. + +### 7.4 Design Trade-offs + +| Approach | Pros | Cons | +|----------|------|------| +| **Unified** (`@with_pool :cuda`) | Single API, easy switching, extensible | Symbol must be literal | +| **Explicit** (`@with_cuda_pool`) | Clear intent, better autocomplete | Multiple macros to learn | +| **Hybrid** (both available) | User choice | Slight API redundancy | + +**Recommendation: Hybrid approach** - unified macro as primary API, explicit aliases optional. + +--- + +## 8. Package Extension Structure + +### 8.1 Project.toml Changes + +```toml +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + +[extensions] +AdaptiveArrayPoolsCUDAExt = "CUDA" +``` + +### 8.2 File Structure + +``` +AdaptiveArrayPools/ +├── src/ +│ ├── AdaptiveArrayPools.jl +│ ├── types.jl # + AbstractTypedPool{T,V}, AbstractArrayPool +│ ├── acquire.jl # + allocate_vector, wrap_array dispatch points +│ ├── state.jl # Generic _checkpoint/_rewind/_reset (unchanged logic) +│ ├── task_local_pool.jl # (unchanged) +│ ├── macros.jl # + _get_pool_for_backend(::Val{:cpu}) dispatch +│ └── utils.jl # (unchanged) +└── ext/ + └── AdaptiveArrayPoolsCUDAExt/ + ├── AdaptiveArrayPoolsCUDAExt.jl # ~25 lines + ├── types.jl # ~50 lines (no views field!) + ├── acquire.jl # ~30 lines (GPU-specific get_view!) + ├── dispatch.jl # ~35 lines (+ checkpoint correction) + ├── task_local_pool.jl # ~25 lines (multi-device, public API) + └── macros.jl # ~15 lines (@with_cuda_pool) +``` + +**Total extension: ~180 lines** (slightly more due to GPU-specific get_view!) + +### 8.3 Extension Entry Point + +```julia +# ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl + +module AdaptiveArrayPoolsCUDAExt + +using AdaptiveArrayPools +using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool, + allocate_vector, wrap_array, get_view!, + _checkpoint_typed_pool!, _rewind_typed_pool!, + _reset_typed_pool!, _get_pool_for_backend, + CACHE_WAYS, checkpoint!, rewind!, reset! +using CUDA + +include("types.jl") +include("acquire.jl") # GPU-specific get_view! +include("dispatch.jl") +include("task_local_pool.jl") +include("macros.jl") + +# Exports +export CuAdaptiveArrayPool, CuTypedPool +export get_task_local_cuda_pool +export @with_cuda_pool + +end # module +``` + +### 8.4 dispatch.jl + +```julia +# ext/AdaptiveArrayPoolsCUDAExt/dispatch.jl + +# THE KEY DISPATCH METHODS + +@inline AdaptiveArrayPools.allocate_vector( + ::AbstractTypedPool{T,CuVector{T}}, n::Int +) where T = CuVector{T}(undef, n) + +@inline AdaptiveArrayPools.wrap_array( + ::AbstractTypedPool{T,CuVector{T}}, flat_view, dims::NTuple{N,Int} +) where {T,N} = reshape(flat_view, dims) + +# get_typed_pool! 
dispatches for CuAdaptiveArrayPool +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Float32}) = p.float32 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Float64}) = p.float64 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Float16}) = p.float16 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Int32}) = p.int32 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Int64}) = p.int64 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{ComplexF32}) = p.complexf32 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{ComplexF64}) = p.complexf64 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Bool}) = p.bool + +# Fallback for other types (with checkpoint correction!) +@inline function AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{T}) where T + get!(p.others, T) do + tp = CuTypedPool{T}() + # CRITICAL: Match CPU behavior - auto-checkpoint new pool if inside @with_pool scope + # Without this, rewind! would corrupt state for dynamically-created pools + if p._current_depth > 1 + push!(tp._checkpoint_n_active, 0) # n_active starts at 0 + push!(tp._checkpoint_depths, p._current_depth) + end + tp + end::CuTypedPool{T} +end +``` + +> **[Post-Review v0.6: Checkpoint Correction for Dynamic Pools]** +> When a new `CuTypedPool{T}` is created inside a `@with_pool` scope (i.e., when +> `_current_depth > 1`), we must initialize its checkpoint state to match the current +> depth. Otherwise, `rewind!` would pop from an incorrect checkpoint stack state. +> +> This mirrors the CPU implementation in `src/types.jl:230-238`. + +--- + +## 9. Memory Layout Clarification + +### 9.1 Why `Vector{CuVector{T}}` (not `CuVector{CuVector{T}}`) + +``` +✅ Correct: Vector{CuVector{T}} + + CPU RAM GPU VRAM + ┌─────────────────┐ ┌─────────────────┐ + │ Vector │ │ │ + │ ├─ CuVec meta1 ─┼──────────┼─► data1 [...] │ + │ ├─ CuVec meta2 ─┼──────────┼─► data2 [...] │ + │ └─ CuVec meta3 ─┼──────────┼─► data3 [...] │ + └─────────────────┘ └─────────────────┘ + + Pool management: CPU Computation: GPU +``` + +### 9.2 What Lives Where + +| Component | Location | Reason | +|-----------|----------|--------| +| Pool struct | CPU | Julia runtime | +| `vectors::Vector{...}` | CPU | Pool indexing | +| CuVector metadata | CPU | Julia object wrapper | +| CuVector data | **GPU** | Actual computation | +| n_active, checkpoints | CPU | State management | + +--- + +## 10. Migration Path + +### 10.1 Phase 1: Abstract Types (Non-Breaking) + +**Changes to src/:** +```julia +# types.jl ++ abstract type AbstractTypedPool{T, V<:AbstractVector{T}} end ++ abstract type AbstractArrayPool end +- mutable struct TypedPool{T} ++ mutable struct TypedPool{T} <: AbstractTypedPool{T, Vector{T}} +- mutable struct AdaptiveArrayPool ++ mutable struct AdaptiveArrayPool <: AbstractArrayPool + +# acquire.jl ++ allocate_vector(::AbstractTypedPool{T,Vector{T}}, n) where T = Vector{T}(undef, n) ++ wrap_array(::AbstractTypedPool{T,Vector{T}}, view, dims) where {T,N} = unsafe_wrap(...) +# Change get_view!, get_nd_array! signatures to use AbstractTypedPool + +# state.jl +# Change _checkpoint_typed_pool!, _rewind_typed_pool! 
to use AbstractTypedPool +``` + +**Breaking potential**: None (only adding supertypes and using more general signatures) + +### 10.2 Phase 2: CUDA Extension + +**New files in ext/:** +- Minimal implementation as described above + +**Breaking potential**: None (purely additive) + +### 10.3 Phase 3: Macro Enhancement (Optional) + +- Consider Option B unified macro +- Add `@with_cuda_pool` first, evaluate need for unification + +--- + +## 11. Example Usage (Target API) + +### 11.1 Basic Usage - Unified Macro + +```julia +using AdaptiveArrayPools +using CUDA # Triggers extension loading, registers :cuda backend + +# CPU workflow (default, unchanged) +function cpu_compute(data) + @with_pool pool begin + tmp = acquire!(pool, Float64, length(data)) + tmp .= data + sum(tmp) + end +end + +# GPU workflow - using :cuda backend symbol +function gpu_compute(data::CuVector) + @with_pool :cuda pool begin + A = acquire!(pool, Float32, 1000, 1000) # Returns CuMatrix{Float32} + B = acquire!(pool, Float32, 1000, 1000) + + A .= CUDA.rand(1000, 1000) + B .= A .* 2 + + sum(B) + end +end + +# Explicit CPU backend (equivalent to default) +function explicit_cpu_compute(data) + @with_pool :cpu pool begin + tmp = acquire!(pool, Float64, length(data)) + tmp .= data + sum(tmp) + end +end +``` + +### 11.2 Mixed CPU/GPU Workflow + +```julia +function mixed_compute(host_data::Vector{Float32}) + # CPU pool for staging + @with_pool cpu_pool begin + staging = acquire!(cpu_pool, Float32, length(host_data)) + staging .= host_data + + # Nested GPU pool + @with_pool :cuda gpu_pool begin + device_data = acquire!(gpu_pool, Float32, length(staging)) + copyto!(device_data, staging) # CPU → GPU + device_data .= device_data .^ 2 + copyto!(staging, device_data) # GPU → CPU + end # GPU pool rewinds here + + sum(staging) + end # CPU pool rewinds here +end +``` + +### 11.3 Without Pool Name (Auto-generated) + +```julia +# When you don't need to reference the pool directly +function simple_gpu_compute() + @with_pool :cuda begin + # pool name auto-generated, use get_task_local_cuda_pool() if needed + A = acquire!(get_task_local_cuda_pool(), Float32, 100, 100) + sum(A) + end +end + +# Or use the explicit getter within the block +function gpu_with_getter() + @with_pool :cuda begin + pool = get_task_local_cuda_pool() + A = acquire!(pool, Float32, 100, 100) + B = acquire!(pool, Float32, 100, 100) + A .+ B + end +end +``` + +### 11.4 Backend Switching (Same Code, Different Backend) + +```julia +# Parameterized backend - useful for testing/benchmarking +function compute_on_backend(data, backend::Symbol) + if backend == :cpu + @with_pool pool begin + tmp = acquire!(pool, Float32, length(data)) + tmp .= data + sum(tmp) + end + elseif backend == :cuda + @with_pool :cuda pool begin + tmp = acquire!(pool, Float32, length(data)) + tmp .= data + sum(tmp) + end + end +end + +# Note: Backend symbol must be literal in macro (compile-time) +# For runtime dispatch, use explicit pool getters: +function runtime_backend_dispatch(data, use_gpu::Bool) + pool = use_gpu ? 
get_task_local_cuda_pool() : get_task_local_pool() + checkpoint!(pool) + try + tmp = acquire!(pool, Float32, length(data)) + tmp .= data + sum(tmp) + finally + rewind!(pool) + end +end +``` + +### 11.5 Explicit Pool Management (Advanced) + +```julia +# Manual checkpoint/rewind for fine-grained control +function explicit_pool_management() + cpu = get_task_local_pool() + gpu = get_task_local_cuda_pool() + + # Checkpoint both pools + checkpoint!(cpu) + checkpoint!(gpu) + try + cpu_buf = acquire!(cpu, Float64, 1000) + gpu_buf = acquire!(gpu, Float32, 1000) + + # ... computation ... + + finally + # Rewind in reverse order (LIFO) + rewind!(gpu) + rewind!(cpu) + end +end + +# Clear GPU memory when under pressure +function memory_sensitive_workflow() + @with_pool :cuda pool begin + # Heavy GPU computation + A = acquire!(pool, Float32, 10000, 10000) + # ... + end + + # Explicitly free GPU memory if needed + empty!(get_task_local_cuda_pool()) + + # Continue with CPU work + @with_pool pool begin + # CPU pool unaffected + end +end +``` + +### 11.6 Future: Multiple GPU Backends + +```julia +# When Metal.jl extension is added (future) +using Metal # Registers :metal backend + +function apple_silicon_compute() + @with_pool :metal pool begin + A = acquire!(pool, Float32, 1000, 1000) # MtlMatrix{Float32} + # Metal-specific computation + end +end +``` + +> **[Post-Review v0.6: Backend Symbol Must Be Literal]** +> The macro `@with_pool :backend` requires a **literal symbol** (`:cuda`, `:metal`), +> not a variable containing a symbol. This is a Julia macro limitation—the backend +> is resolved at macro expansion time (compile time), not runtime. +> +> **This does NOT work:** +> ```julia +> const GPU_BACKEND = Sys.isapple() ? :metal : :cuda +> @with_pool GPU_BACKEND pool begin ... end # ERROR: GPU_BACKEND is not a QuoteNode +> ``` +> +> **For runtime backend selection, use explicit pool getters:** +> ```julia +> function portable_gpu_compute(use_metal::Bool) +> pool = use_metal ? get_task_local_metal_pool() : get_task_local_cuda_pool() +> checkpoint!(pool) +> try +> A = acquire!(pool, Float32, 1000, 1000) +> # ... computation ... +> finally +> rewind!(pool) +> end +> end +> ``` +> +> **Or use `@static` for compile-time platform selection:** +> ```julia +> function portable_gpu_compute() +> @static if Sys.isapple() +> @with_pool :metal pool begin +> # Metal path +> end +> else +> @with_pool :cuda pool begin +> # CUDA path +> end +> end +> end +> ``` + +--- + +## 12. Open Questions + +### 12.1 Resolved + +1. **Code duplication in extension** → Solved with parametric abstract types +2. **Macro approach** → Hybrid: unified `@with_pool :cuda` + optional `@with_cuda_pool` +3. **Memory layout** → `Vector{CuVector{T}}` is correct +4. **Float16 support** → **Added** to GPU fixed slots (per AI review) +5. **Multi-Device Safety** → **Solved** with `Dict{Int, Pool}` in task local storage (per AI review) +6. **unsafe_wrap for GPU** → Use `reshape` instead (per AI review) +7. **[v0.6] GPU view type** → `view(CuVector, 1:n)` returns `CuVector`, not `SubArray`. Pool design simplified. +8. **[v0.6] Zero-overhead backend selection** → `Val{:backend}` dispatch instead of Dict registry +9. **[v0.6] GPU checkpoint correction** → Added to `get_typed_pool!` fallback for `others` dict +10. **[v0.6] Device ID API** → Use `CUDA.deviceid(dev)` instead of internal `.handle` +11. 
**[v0.6] Backend symbol literal requirement** → Documented; `@static if` for platform selection + +### 12.2 Stream Synchronization (Critical Safety Documentation) + +> **[Post-Review v0.6: Expanded Safety Documentation]** + +**The Problem**: `rewind!` logically "frees" pooled memory. If a GPU kernel is still +running asynchronously using that memory, and the pool re-issues it for a new allocation, +**data corruption** or **use-after-free** occurs. + +**When It's Safe** (no synchronization needed): +- Single Task, default stream: Julia tasks typically use CUDA's default stream, which + serializes operations. `rewind!` happens after all prior operations complete. +- `CUDA.@sync` inside the block: Explicit synchronization before rewind. + +**When It's DANGEROUS** (must synchronize): + +1. **Passing arrays to other Tasks**: + ```julia + @with_pool :cuda pool begin + A = acquire!(pool, Float32, 1000) + @spawn begin + # DANGER: This task may still be using A after rewind! + expensive_computation!(A) + end + end # rewind! happens here - A is now invalid! + ``` + **Fix**: Wait for spawned task before exiting scope. + +2. **Explicit async streams**: + ```julia + @with_pool :cuda pool begin + A = acquire!(pool, Float32, 1000) + stream = CUDA.stream() + CUDA.@sync stream begin + # Kernel launched on non-default stream + my_kernel!(A; stream) + end + # If no @sync: kernel may still be running when rewind! executes + end + ``` + **Fix**: `CUDA.synchronize(stream)` or use `CUDA.@sync` before scope ends. + +3. **Kernel launch then immediate exit**: + ```julia + @with_pool :cuda pool begin + A = acquire!(pool, Float32, 1000) + @cuda threads=1024 my_kernel!(A) + # Kernel is async! May still be running... + end # rewind! immediately follows! + ``` + **Fix**: `CUDA.synchronize()` or `CUDA.@sync @cuda ...` + +**Recommendation for Documentation**: +```julia +# GPU POOLING SAFETY RULES +# +# 1. DO NOT pass pooled arrays to other Tasks without synchronization +# 2. DO synchronize before @with_pool block ends if using async streams +# 3. PREFER `CUDA.@sync` around kernel launches in pooled scopes +# 4. WHEN IN DOUBT: `CUDA.synchronize()` before the block ends +``` + +### 12.3 Still Open + +1. **Typed checkpoint for GPU**: Reuse existing macro logic? + - Should work with minimal changes + - Need to export `_transform_acquire_calls` etc. + +2. **resize! optimization for GPU** (v1.1+): + - Current: `resize!(CuVector, n)` copies old data (wasteful for pools) + - Consider: `CUDA.unsafe_free!` + fresh allocation, or pre-allocate oversized + +3. **Multi-backend single macro**: Support multiple pools in one call? + - Tuple syntax: `@with_pool (:cpu, cpu_pool) (:cuda, cuda_pool) begin ... end` + - Pro: Cleaner for mixed workflows, guaranteed proper rewind order + - Con: More complex macro implementation, less common use case + - Alternative: Nested `@with_pool` blocks (current approach) + - > **[AI Review]**: The tuple syntax is elegant but maybe over-engineering for V1. + +--- + +## 13. 
Summary: What Changes Where + +### src/ Changes (Phase 1) + +| File | Changes | +|------|---------| +| types.jl | Add abstract types, inherit from them | +| acquire.jl | Add `allocate_vector`, `wrap_array` dispatch points; generalize signatures | +| state.jl | Generalize to `AbstractTypedPool` | +| macros.jl | Add `_get_pool_for_backend(::Val{:cpu})` dispatch (NOT Dict registry) | +| Others | No changes | + +### ext/ New Files (Phase 2) + +| File | Lines | Content | +|------|-------|---------| +| AdaptiveArrayPoolsCUDAExt.jl | ~20 | Module, imports, exports | +| types.jl | ~50 | CuTypedPool (no views field!), CuAdaptiveArrayPool (+ Float16, device_id) | +| acquire.jl | ~30 | GPU-specific `get_view!` (no view caching) | +| dispatch.jl | ~35 | allocate_vector, wrap_array, get_typed_pool! (with checkpoint correction) | +| task_local_pool.jl | ~25 | get_task_local_cuda_pool (multi-device aware, public API) | +| macros.jl | ~25 | @with_cuda_pool | +| **Total** | **~155** | | + +--- + +## Changelog + +| Version | Date | Changes | +|---------|------|---------| +| 0.1 | 2024-12-10 | Initial draft | +| 0.2 | 2024-12-10 | Redesigned with parametric abstract types for maximum code reuse | +| 0.3 | 2024-12-10 | Unified macro design (`@with_pool :cuda`), comprehensive usage examples | +| 0.3.1 | 2024-12-10 | Added open questions: macro style preference, multi-backend single macro | +| 0.4 | 2024-12-10 | AI Review: Added Float16, device_id, multi-device pool getter, stream sync warning | +| 0.5 | 2024-12-10 | Merged AI feedback with restored full documentation | +| 0.6 | 2024-12-14 | **Post-Review Revision**: (1) Fixed GPU view type—`view(CuVector,1:n)` returns `CuVector` via GPUArrays `derive()`, not `SubArray`; simplified pool design by removing view caching. (2) Replaced Dict registry with `Val{:backend}` dispatch for zero-overhead backend selection. (3) Added checkpoint correction to GPU `get_typed_pool!` fallback. (4) Fixed `device_id` to use public API `CUDA.deviceid()`. (5) Clarified `empty!` semantics (reference removal ≠ VRAM release). (6) Documented `resize!` cost on GPU. (7) Expanded stream synchronization safety documentation. (8) Fixed backend symbol literal requirement (removed invalid `GPU_BACKEND` variable example). | diff --git a/docs/design/fixed_slots_codegen_design.md b/docs/design/fixed_slots_codegen_design.md new file mode 100644 index 0000000..c141e6b --- /dev/null +++ b/docs/design/fixed_slots_codegen_design.md @@ -0,0 +1,362 @@ +# Fixed Slots Iteration Automation Design Document + +## 1. Problem Definition + +### Current State +Iteration over fixed slot types was **manually repeated** across multiple functions: + +```julia +# Inside checkpoint! +_checkpoint_typed_pool!(pool.float64, depth) +_checkpoint_typed_pool!(pool.float32, depth) +_checkpoint_typed_pool!(pool.int64, depth) +_checkpoint_typed_pool!(pool.int32, depth) +_checkpoint_typed_pool!(pool.complexf64, depth) +_checkpoint_typed_pool!(pool.complexf32, depth) +_checkpoint_typed_pool!(pool.bool, depth) + +# Inside rewind! - same pattern repeated +_rewind_typed_pool!(pool.float64, depth) +_rewind_typed_pool!(pool.float32, depth) +... + +# Inside empty! - repeated again +empty!(pool.float64) +empty!(pool.float32) +... +``` + +### Improvement Goals +- **Centralized iteration logic**: Define once, use everywhere +- **Zero allocation**: No runtime overhead +- **IDE support preserved**: Keep struct definitions explicit + +--- + +## 2. 
Design Decision + +### Option B Adopted: const tuple + @generated (automate iteration only) + +**Core principle**: Keep struct definition manual, automate only iteration + +```julia +# 1. Keep struct explicitly defined (full IDE support) +mutable struct AdaptiveArrayPool + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + others::IdDict{DataType, Any} + _current_depth::Int + _untracked_flags::Vector{Bool} +end + +# 2. Define field names as const tuple +const FIXED_SLOT_FIELDS = (:float64, :float32, :int64, :int32, :complexf64, :complexf32, :bool) + +# 3. Use @generated for zero-allocation iteration +@generated function foreach_fixed_slot(f::F, pool::AdaptiveArrayPool) where {F} + exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in FIXED_SLOT_FIELDS] + quote + $(exprs...) + nothing + end +end +``` + +--- + +## 3. Detailed Implementation + +### 3.1 types.jl Changes + +```julia +# ============================================================================== +# Fixed Slot Configuration +# ============================================================================== + +""" + FIXED_SLOT_FIELDS + +Fixed slot field names for iteration. Used by `foreach_fixed_slot`. + +Note: When adding/removing fixed slots, update BOTH: +1. This tuple +2. The AdaptiveArrayPool struct definition below +""" +const FIXED_SLOT_FIELDS = (:float64, :float32, :int64, :int32, :complexf64, :complexf32, :bool) + +# ============================================================================== +# AdaptiveArrayPool (explicit definition - full IDE support) +# ============================================================================== + +mutable struct AdaptiveArrayPool + # Fixed Slots: common types with zero lookup overhead + # NOTE: Keep in sync with FIXED_SLOT_FIELDS above + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + + # Fallback: rare types + others::IdDict{DataType, Any} + + # Untracked acquire detection + _current_depth::Int + _untracked_flags::Vector{Bool} +end + +# ... constructor, get_typed_pool! etc. remain unchanged ... + +# ============================================================================== +# Zero-Allocation Iteration +# ============================================================================== + +""" + foreach_fixed_slot(f, pool::AdaptiveArrayPool) + +Apply function `f` to each fixed slot TypedPool. +Zero allocation via compile-time unrolling. + +## Example +```julia +foreach_fixed_slot(pool) do tp + _checkpoint_typed_pool!(tp, depth) +end +``` +""" +@generated function foreach_fixed_slot(f::F, pool::AdaptiveArrayPool) where {F} + exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in FIXED_SLOT_FIELDS] + quote + $(exprs...) 
+ nothing + end +end +``` + +### 3.2 state.jl Changes + +```julia +function checkpoint!(pool::AdaptiveArrayPool) + pool._current_depth += 1 + push!(pool._untracked_flags, false) + depth = pool._current_depth + + # Fixed slots - zero allocation via @generated + foreach_fixed_slot(pool) do tp + _checkpoint_typed_pool!(tp, depth) + end + + # Others - fallback types + for p in values(pool.others) + _checkpoint_typed_pool!(p, depth) + end + nothing +end + +function rewind!(pool::AdaptiveArrayPool) + depth = pool._current_depth + + # Fixed slots - zero allocation + foreach_fixed_slot(pool) do tp + _rewind_typed_pool!(tp, depth) + end + + # Others + for tp in values(pool.others) + _rewind_typed_pool!(tp, depth) + end + + pop!(pool._untracked_flags) + pool._current_depth -= 1 + nothing +end + +function Base.empty!(pool::AdaptiveArrayPool) + # Fixed slots + foreach_fixed_slot(empty!, pool) + + # Others + for tp in values(pool.others) + empty!(tp) + end + empty!(pool.others) + + pool._current_depth = 0 + empty!(pool._untracked_flags) + pool +end +``` + +--- + +## 4. Type Add/Remove Procedure + +### Adding UInt8 + +**Locations requiring manual update (2 places)**: + +```julia +# 1. Update FIXED_SLOT_FIELDS +const FIXED_SLOT_FIELDS = (:float64, :float32, :int64, :int32, :complexf64, :complexf32, :bool, :uint8) + +# 2. Update AdaptiveArrayPool struct +mutable struct AdaptiveArrayPool + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + uint8::TypedPool{UInt8} # ← Added + ... +end + +# 3. Update constructor +function AdaptiveArrayPool() + AdaptiveArrayPool( + TypedPool{Float64}(), + ... + TypedPool{UInt8}(), # ← Added + ... + ) +end + +# 4. Add get_typed_pool! dispatch +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{UInt8}) = p.uint8 +``` + +**Automatically updated**: +- `checkpoint!` internal iteration +- `rewind!` internal iteration +- `empty!` internal iteration +- All code using `foreach_fixed_slot` + +--- + +## 5. Testing Strategy + +```julia +@testset "Fixed Slot Iteration" begin + pool = AdaptiveArrayPool() + + # Verify FIXED_SLOT_FIELDS and struct synchronization + for field in FIXED_SLOT_FIELDS + @test hasfield(AdaptiveArrayPool, field) + @test getfield(pool, field) isa TypedPool + end + + # Verify foreach_fixed_slot visits all slots + count = Ref(0) + foreach_fixed_slot(pool) do tp + count[] += 1 + end + @test count[] == length(FIXED_SLOT_FIELDS) + + # Zero allocation verification + pool2 = AdaptiveArrayPool() + foreach_fixed_slot(identity, pool2) # warmup + allocs = @allocated foreach_fixed_slot(identity, pool2) + @test allocs == 0 +end +``` + +--- + +## 6. Benefits + +### 6.1 Full IDE Support +- Explicit struct definition → autocomplete, Go to Definition work correctly +- Perfect type inference +- LSP/Language Server compatible + +### 6.2 Simple Implementation +- Single `@generated` function automates iteration +- No `@eval` needed → no precompilation concerns +- Most existing code preserved + +### 6.3 Easy Debugging +- Clear struct definition allows field inspection in debugger +- Compatible with tools like `@infiltrate` + +### 6.4 Zero Runtime Overhead +```julia +# @generated unrolls at compile time: +# foreach_fixed_slot(f, pool) is equivalent to: +f(pool.float64) +f(pool.float32) +f(pool.int64) +f(pool.int32) +f(pool.complexf64) +f(pool.complexf32) +f(pool.bool) +``` + +--- + +## 7. 
Drawbacks and Considerations + +### 7.1 Synchronization Required (2 places) +```julia +# These two locations must always be in sync: +const FIXED_SLOT_FIELDS = (:float64, :float32, ...) # 1 +mutable struct AdaptiveArrayPool # 2 + float64::TypedPool{Float64} + ... +end +``` + +**Mitigation**: Explicit comments + test verification + +### 7.2 @generated First-Call Cost +```julia +# Recompiles for different closures +foreach_fixed_slot(x -> checkpoint!(x, 1), pool) # Compiles +foreach_fixed_slot(x -> rewind!(x, 1), pool) # Compiles again +``` + +**Impact**: Slight effect on TTFX (Time To First X) +**Mitigation**: Warmup (precompile) at package load + +### 7.3 Metaprogramming Knowledge Required +```julia +@generated function foreach_fixed_slot(f::F, pool) where {F} + # Understanding this code requires @generated knowledge + exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in FIXED_SLOT_FIELDS] + ... +end +``` + +**Mitigation**: Thorough comments and docstrings + +--- + +## 8. Option Comparison Summary + +| Aspect | Current (Manual) | Option B (Adopted) | Option C (@eval) | +|--------|------------------|-------------------|------------------| +| Modification locations | 6+ places | 2 places + α | 1 place | +| IDE support | Perfect | Perfect | Partial | +| Complexity | Low | Low | High | +| Debugging | Easy | Easy | Difficult | +| Type addition safety | May miss | Test-verified | Automatic | + +--- + +## 9. Conclusion + +**Reasons for adopting Option B**: + +1. **Practical balance**: Removes repetitive code without the complexity of full automation (Option C) +2. **IDE support preserved**: Maintains the most important developer experience +3. **Low risk**: Uses only `@generated` without `@eval`, ensuring precompilation stability +4. **Incremental improvement**: Improves only iteration while preserving most existing code + +Since type changes are rare (1-2 times during package lifetime), and struct definition synchronization across 2 locations can be sufficiently verified by tests, Option B is the optimal choice. diff --git a/docs/design/hybrid_api_design.md b/docs/design/hybrid_api_design.md new file mode 100644 index 0000000..a407e72 --- /dev/null +++ b/docs/design/hybrid_api_design.md @@ -0,0 +1,441 @@ +# Hybrid API Design: acquire! vs unsafe_acquire! + +## Executive Summary + +Redesigning `AdaptiveArrayPools.jl`'s N-D array acquisition API with a **Two Tools Strategy**: + +| API | Return Type | Use Case | Allocation Characteristics | +|-----|-------------|----------|---------------------------| +| `acquire!` | `ReshapedArray` (fixed) | General use, Static dispatch | No cache needed, relies on compiler optimization | +| `unsafe_acquire!` | `Array` (fixed) | Dynamic dispatch, FFI | Cache hit: 0, miss: 112 bytes | + +**Core Principle**: Return type does not change based on state (Type Stability guaranteed) + +> **Note**: ReshapedArray's "0 allocation" depends on compiler's SROA (Scalar Replacement of Aggregates) and +> escape analysis. Not always guaranteed - heap allocation may occur if the value escapes from the function. + +--- + +## Problem Statement + +### Current State (v0.2.0) + +``` +acquire!(pool, T, dims...) + └─> get_nd_view!() + └─> get_nd_array!() ← uses unsafe_wrap + └─> 112 bytes on cache miss! +``` + +- Both `acquire!` and `unsafe_acquire!` internally use `unsafe_wrap` +- Always 112 bytes allocation on cache miss +- Tried to reduce miss rate with N-way cache, but 100% miss on cyclic patterns + +### v0.1.2 Approach + +``` +acquire!(pool, T, dims...) 
+ └─> get_view!(tp, total_len) ← 1D view (cached) + └─> reshape(view, dims) ← 0 bytes always! +``` + +- `reshape(view, dims)` creates a wrapper object, but heap allocation can be avoided via compiler optimization (SROA/escape analysis) +- Simple and predictable + +--- + +## Why Not Mixed Return Types? + +### Proposed (but rejected) Approach + +```julia +# ❌ BAD: Array on cache hit, View on miss +function acquire!(pool, T, dims...) + if cache_hit + return cached_array::Array{T,N} + else + return reshape(view, dims)::ReshapedArray{...} + end +end +``` + +### Problem: Type Instability + +| Aspect | Impact | +|--------|--------| +| **Compiler inference** | `Union{Array, ReshapedArray}` → Union splitting or dynamic dispatch | +| **Performance** | Execution slowdown while trying to achieve zero-alloc | +| **API semantics** | Same function returning different types → confusion | +| **Module boundaries** | Inference widens when storing result or passing to other modules | + +**AI Feedback Quote**: +> "State-dependent returns become Union{Array, ReshapedArray} from external view, breaking API-level type stability." + +--- + +## Recommended Design: Two Tools Strategy + +### Principles + +1. **Fixed return type**: Each API always returns the same type +2. **Purpose separation**: Users choose API based on situation +3. **Simple implementation**: Minimize complex cache logic + +### API Design + +#### 1. `acquire!` → ReshapedArray (regression to v0.1.2 style) + +```julia +@inline function acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + tp = get_typed_pool!(pool, T) + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # 1D view (cached, 0 alloc) + return reshape(flat_view, dims) # ReshapedArray (0 alloc always!) +end +``` + +**Characteristics**: +- Always returns `ReshapedArray{T, N, SubArray{T, 1, Vector{T}, ...}, ...}` +- No `unsafe_wrap` call → no Array header creation cost (112B) even on cache miss +- N-way cache unnecessary (simple 1D view cache sufficient) + +**Use Cases**: +- General `Flux` layers (`mul!`, `broadcast`) +- Code where static dispatch is guaranteed +- Most use cases + +**Constraints**: +- Escape optimization may fail in type-unspecified call paths, causing wrapper allocation +- Incompatible with APIs requiring strict `Array` type (rare) + +#### 2. `unsafe_acquire!` → Array (maintains v0.2.0 + N-way Cache) + +```julia +@inline function unsafe_acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, dims) # Array with slot-based + N-way cache +end +``` + +**Characteristics**: +- Always returns `Array{T, N}` +- Cache hit: 0 bytes, Cache miss: 112 bytes +- Maintains existing N-way cache (4-way) + +**Use Cases**: +- Type-unspecified call paths (e.g., `TGLFNNmodel._pooled_chain` - no concrete type parameters) +- FFI / ccall +- Special APIs that strictly require `StridedArray` +- Places requiring strict `Array` type + +**Benefits**: +- Array is already a heap-allocated object → no additional allocation when reusing cached instance +- Avoids wrapper object optimization issues in type-unspecified paths + +--- + +## Comparison Matrix + +| Strategy | Return Type | Cache Miss Cost | Type-Unspecified Path* | Type Stable | +|----------|-------------|-----------------|------------------------|-------------| +| **acquire! 
(new)** | `ReshapedArray` | **0 bytes** (no unsafe_wrap) | May allocate wrapper | **✓** | +| **unsafe_acquire!** | `Array` | 112 bytes | **0 bytes** (on cache hit) | **✓** | +| ~~Mixed (rejected)~~ | `Union{...}` | 0 bytes | Unspecified | **✗** | + +*Type-unspecified path: Calls through abstract fields without concrete type parameters, etc. Compiler cannot apply escape optimization, causing wrapper object heap allocation. + +### Recommended API by Situation + +| Situation | acquire! | unsafe_acquire! | Recommendation | +|-----------|----------|-----------------|----------------| +| Type-specified call path | Optimizable | 0 bytes (hit) | `acquire!` | +| Variable dims (cyclic pattern) | Optimizable | cache miss occurs | `acquire!` | +| Type-unspecified path | Wrapper alloc | **0 bytes** (hit) | **`unsafe_acquire!`** | +| FFI / raw pointer | N/A | 0 bytes | `unsafe_acquire!` | + +--- + +## Implementation Plan + +### Phase 1: Simplify `acquire!` N-D Path + +**File**: `src/core.jl` + +**Before**: +```julia +@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + arr = get_nd_array!(tp, dims) # uses unsafe_wrap + idx = tp.n_active + # ... complex caching logic + new_view = view(arr, ntuple(_ -> Colon(), Val(N))...) + return new_view # SubArray{T,N,Array{T,N}} +end +``` + +**After**: +```julia +@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # 1D view (cached) + return reshape(flat_view, dims) # ReshapedArray (0 alloc!) +end +``` + +**Change Summary**: +- Remove `get_nd_array!` call +- Directly return `reshape(1D_view, dims)` +- N-D view cache (`nd_views`) not used (in acquire! path) + +### Phase 2: Maintain `unsafe_acquire!` Cache + +**No changes** - maintain current implementation: +- `get_nd_array!` → `unsafe_wrap` + slot-based cache +- Maintain N-way cache (4-way) +- 112 bytes allocation on cache miss + +### Phase 3: TypedPool Field Cleanup (Optional) + +Since `acquire!` no longer uses N-D cache, redefine field purposes: + +```julia +mutable struct TypedPool{T} + # Storage + vectors::Vector{Vector{T}} + + # 1D Cache (shared by acquire! 1D + acquire! N-D) + views::Vector{SubArray{...}} + view_lengths::Vector{Int} + + # N-D Cache (unsafe_acquire! only) + nd_arrays::Vector{Any} # Array objects for unsafe_acquire! + nd_dims::Vector{Any} # Dimension tuples + nd_ptrs::Vector{UInt} # Pointer validation + + # Note: nd_views can be removed (acquire! uses reshape) + + # State + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end +``` + +### Phase 4: Test Updates + +**Files**: `test/test_nway_cache.jl`, `test/test_zero_allocation.jl` + +```julia +@testset "acquire! returns ReshapedArray" begin + pool = AdaptiveArrayPool() + @with_pool pool begin + m = acquire!(pool, Float64, 10, 10) + @test m isa Base.ReshapedArray + @test size(m) == (10, 10) + end +end + +@testset "acquire! is always zero-allocation" begin + pool = AdaptiveArrayPool() + + # 5-way cycling (exceeds any cache) - still 0 alloc! + function test_5way!(p) + dims_list = ((5, 10), (10, 5), (7, 7), (3, 16), (4, 12)) + for dims in dims_list + checkpoint!(p) + acquire!(p, Float64, dims...) # ReshapedArray + rewind!(p) + end + end + + test_5way!(pool); test_5way!(pool) + allocs = @allocated test_5way!(pool) + @test allocs == 0 # Always zero, regardless of pattern! +end + +@testset "unsafe_acquire! 
returns Array" begin + pool = AdaptiveArrayPool() + @with_pool pool begin + m = unsafe_acquire!(pool, Float64, 10, 10) + @test m isa Array + @test size(m) == (10, 10) + end +end +``` + +### Phase 5: Documentation + +**CHANGELOG.md** (not a breaking change, behavior improvement): +```markdown +## [Unreleased] +### Changed +- `acquire!` N-D path now returns `ReshapedArray` instead of `SubArray{Array}` + - Always zero-allocation, regardless of cache hit/miss + - Simpler implementation, no N-D cache dependency +- `unsafe_acquire!` continues to return `Array` with N-way cache + - Use this when dynamic dispatch or raw Array is needed +``` + +**Docstring Updates**: +```julia +""" + acquire!(pool, Type{T}, dims...) -> ReshapedArray{T,N,...} + +Acquire a view with dimensions `dims` from the pool. + +Returns a `ReshapedArray` backed by pool memory. **Zero creation cost** - no +`unsafe_wrap` call needed. Compiler may optimize away heap allocation via +SROA/escape analysis in type-specified paths. + +For type-unspecified paths (struct fields without concrete type parameters), +use [`unsafe_acquire!`](@ref) instead - cached Array instances can be reused. + +## Example +```julia +@with_pool pool begin + m = acquire!(pool, Float64, 64, 100) # ReshapedArray + m .= 1.0 + result = sum(m) +end +``` +""" +``` + +```julia +""" + unsafe_acquire!(pool, Type{T}, dims...) -> Array{T,N} + +Acquire a raw `Array` backed by pool memory. + +Returns an `Array` object. Since Array is already heap-allocated, the cached +instance can be reused without wrapper allocation overhead. + +## When to use +- Type-unspecified paths (e.g., struct fields without concrete type parameters) +- FFI / ccall requiring raw pointers +- APIs that strictly require `Array` type + +## Allocation behavior +- Cache hit: 0 bytes (cached Array instance reused) +- Cache miss: 112 bytes (Array header creation) + +## Example +```julia +@with_pool pool begin + m = unsafe_acquire!(pool, Float64, 64, 100) # Matrix{Float64} + # Safe for type-unspecified paths + some_abstract_field.process(m) # 0 bytes - cached instance reused +end +``` +""" +``` + +--- + +## TurbulentTransport Integration + +### Changed File: `src/tglf_nn.jl` + +**Already Applied** (line 277): +```julia +@with_pool pool function flux_array!(out_y::AbstractMatrix{T}, fluxmodel::TGLFNNmodel, x::AbstractMatrix{T}; ...) where {T<:Real} + # ... + # NOTE: Use unsafe_acquire! (returns Array) instead of acquire! (returns ReshapedArray) + # because _pooled_chain field lacks concrete type parameters, causing + # escape optimization failure. Array (cached instance) avoids wrapper allocation. + xx = unsafe_acquire!(pool, T, size(x)) + # ... + fluxmodel._pooled_chain(out_y, xx) # 0 bytes - cached Array instance reused +end +``` + +### No Change Needed: `src/pooled_layers.jl` + +`PooledDense`, `PooledActivation` are in **static dispatch** environment: +- Types are known at compile time +- Maintain use of `acquire!` (ReshapedArray) +- ReshapedArray is also 0 bytes in static dispatch + +```julia +@inline function _pooled_dense_forward!(pd::PooledDense, x::AbstractVecOrMat) + pool = get_task_local_pool() + # acquire! 
usage OK - static dispatch environment + out = acquire!(pool, Float64, size(d.weight, 1), size(xT, 2)) + mul!(out, d.weight, xT) # ReshapedArray is StridedArray ✓ + return Flux.NNlib.bias_act!(d.σ, out, d.bias) +end +``` + +--- + +## Summary + +### Before (v0.2.0) + +``` +┌─────────────────────────────────────────────────────────────┐ +│ acquire!() ──┬──> get_nd_view!() ──> get_nd_array!() │ +│ │ │ │ │ +│ │ │ unsafe_wrap (112B miss) │ +│ │ │ ↓ │ +│ │ └──────> SubArray{Array} ←──────────┘ +│ │ │ +│ unsafe_acquire!() ──> get_nd_array!() ──> Array │ +│ │ │ +│ unsafe_wrap (112B miss) │ +└─────────────────────────────────────────────────────────────┘ +``` + +### After (Hybrid) + +``` +┌─────────────────────────────────────────────────────────────┐ +│ acquire!() ──> get_view!() ──> reshape() ──> ReshapedArray │ +│ │ │ │ +│ 1D cache 0 bytes always! │ +│ (0 alloc) │ +│ │ +│ unsafe_acquire!() ──> get_nd_array!() ──> Array │ +│ │ │ +│ unsafe_wrap + N-way cache │ +│ (0B hit, 112B miss) │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Decision Matrix for Users + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Which API to use? │ +├─────────────────────────────────────────────────────────────┤ +│ │ +│ Is the code path type-unspecified? │ +│ (abstract fields without concrete type params, │ +│ runtime-determined function calls) │ +│ │ +│ YES ──────────────> unsafe_acquire!() │ +│ │ │ │ +│ │ Returns Array │ +│ │ (cached instance reused) │ +│ │ │ +│ NO ───────────────> acquire!() │ +│ │ │ +│ Returns ReshapedArray │ +│ (0 bytes creation) │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +--- + +## Open Questions for Review + +1. **N-way cache retention level**: Keep current 4-way? Reduce to 2-way? +2. **nd_views field removal**: Can be removed since `acquire!` no longer uses it? +3. **Backward compatibility**: Cases where existing `acquire!` users check for `SubArray` type? + +--- + +## References + +- [nd_array_approach_comparison.md](./nd_array_approach_comparison.md) - Benchmark results and boxing analysis +- [PR_MESSAGE.md](../PR_MESSAGE.md) - Original PR description diff --git a/docs/design/macro-linenumbernode-improvement.md b/docs/design/macro-linenumbernode-improvement.md new file mode 100644 index 0000000..26eab67 --- /dev/null +++ b/docs/design/macro-linenumbernode-improvement.md @@ -0,0 +1,369 @@ +# @with_pool Macro LineNumberNode Improvement Plan + +## Goal +Utilize `__source__` and `LineNumberNode` to improve coverage, stack trace, and debugging + +## Target File +- `/Users/yoo/.julia/dev/AdaptiveArrayPools/src/macros.jl` + +--- + +## Phase 1: Add Helper Functions + +### 1.1 LineNumberNode Insertion Helper (New) + +**Location**: Add before `_generate_pool_code` function + +```julia +""" + _maybe_add_source_location!(expr, source) + +Insert source location LineNumberNode at the beginning of an Expr block. +No-op if source is nothing or expr is not an Expr(:block, ...). 
+""" +function _maybe_add_source_location!(expr::Expr, source::Union{LineNumberNode,Nothing}) + if source !== nothing && expr.head === :block + pushfirst!(expr.args, LineNumberNode(source.line, source.file)) + end + return expr +end +_maybe_add_source_location!(expr, ::Nothing) = expr +``` + +**Benefits**: Common application across all return paths, reduces risk of omission/drift + +### 1.2 Function Body LineNumberNode Correction Helper (New) + +**Goal**: Correct with `__source__` **only when no LNN exists at the body top level** + +> **Background**: `body` is the **user code AST** obtained from `func_def.args[2]`. +> Existing LNNs point to **user file lines** and must be preserved. +> The problem is **short function forms** like `f(x) = ...` that have no LNN. + +```julia +""" + _has_toplevel_lnn(body) -> Bool + +Check if body has a LineNumberNode at the top level (within first few args). +More robust than checking only args[1], handles Expr(:meta) etc. +""" +function _has_toplevel_lnn(body) + body isa Expr && body.head === :block || return false + # Check first 3 args for LNN (handles :meta, :line annotations, etc.) + for i in 1:min(3, length(body.args)) + body.args[i] isa LineNumberNode && return true + end + return false +end + +""" + _ensure_body_has_toplevel_lnn(body, source) + +Ensure body has a LineNumberNode at the top level. +- If body already has a top-level LNN, preserve it (user file line info) +- If not, prepend source LNN (macro call location as fallback) + +Returns a new Expr to avoid mutating the original AST. +""" +function _ensure_body_has_toplevel_lnn(body, source::Union{LineNumberNode,Nothing}) + source === nothing && return body + + # Check if top-level LNN already exists (robust check) + if _has_toplevel_lnn(body) + return body # Preserve existing user file LNN + end + + # No top-level LNN → add source as fallback (no mutation) + lnn = LineNumberNode(source.line, source.file) + if body isa Expr && body.head === :block + return Expr(:block, lnn, body.args...) 
+ else + return Expr(:block, lnn, body) + end +end +``` + +**Benefits**: +- **User body LNN preserved**: If existing top-level LNN exists, keep it (accurate body line) +- **Short function form handling**: If no LNN, correct with `__source__` +- **Mutation prevention**: Returns new Expr to protect original AST + +--- + +## Phase 2: Modify Helper Function Signatures + +### 2.1 Keyword Argument Approach (Recommended) + +To avoid the risk of fixed `:cpu` default, add `source` as keyword argument: + +| Function | Search Pattern | Change | +|----------|----------------|--------| +| `_generate_pool_code` | `function _generate_pool_code(pool_name, expr, force_enable)` | `(...; source::Union{LineNumberNode,Nothing}=nothing)` | +| `_generate_pool_code_with_backend` | `function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, force_enable::Bool)` | `(...; source::Union{LineNumberNode,Nothing}=nothing)` | +| `_generate_function_pool_code` | `function _generate_function_pool_code(pool_name, func_def, force_enable, disable_pooling, backend::Symbol=:cpu)` | `(...; source::Union{LineNumberNode,Nothing}=nothing)` | +| `_generate_function_pool_code_with_backend` | `function _generate_function_pool_code_with_backend(backend::Symbol, pool_name, func_def, disable_pooling::Bool)` | `(...; source::Union{LineNumberNode,Nothing}=nothing)` | + +**Benefits**: Minimal changes to existing call sites, solves `backend` default value issue + +--- + +## Phase 3: Pass source to Internal Function Calls + +### Inside `_generate_pool_code` (Search: `_generate_function_pool_code(pool_name`) + +```julia +# Before +return _generate_function_pool_code(pool_name, expr, force_enable, true, :cpu) +return _generate_function_pool_code(pool_name, expr, force_enable, false) + +# After (pass as keyword argument) +return _generate_function_pool_code(pool_name, expr, force_enable, true, :cpu; source) +return _generate_function_pool_code(pool_name, expr, force_enable, false; source) +``` + +### Inside `_generate_pool_code_with_backend` (Search: `_generate_function_pool_code_with_backend(backend`) + +```julia +# After +_generate_function_pool_code_with_backend(backend, pool_name, expr, ...; source) +``` + +--- + +## Phase 4: LineNumberNode Insertion + +Call helper before returning each `quote ... end` block: + +```julia +result = quote + # ... generated code ... +end +_maybe_add_source_location!(result, source) +return result +``` + +### Insertion Locations (Based on Search Patterns) + +**`_generate_pool_code`** (Search: `function _generate_pool_code`): +- `return quote ... end` in `!USE_POOLING` branch +- `return quote ... end` in `force_enable` branch +- `return quote ... end` in `else` branch + +**`_generate_pool_code_with_backend`** (Search: `function _generate_pool_code_with_backend`): +- All `return quote ... end` with same pattern + +**`_generate_function_pool_code`** (Search: `function _generate_function_pool_code`): +- After `transformed_body` creation: `transformed_body = _ensure_body_has_toplevel_lnn(transformed_body, source)` +- Also correct `body` in `disable_pooling` path: `body = _ensure_body_has_toplevel_lnn(body, source)` +- Then `new_body = quote ... 
end` +- **(Optional) To make wrapper appear as call-site**: Add `_maybe_add_source_location!(new_body, source)` + +**`_generate_function_pool_code_with_backend`** (Search: `function _generate_function_pool_code_with_backend`): +- Apply `_ensure_body_has_toplevel_lnn(..., source)` to `transformed_body` and `body` before constructing `new_body` +- **(Optional)** Can also apply `_maybe_add_source_location!` to `new_body` + +> **Core Principle**: Preserve if top-level LNN exists (user line), correct with `__source__` if not (short function form) +> **Note**: Wrapper code (checkpoint/try/finally) lines may still point to macros.jl. Inserting LNN to `new_body` improves this but is not required. + +--- + +## Phase 5: Macro Definition Modifications + +**@with_pool** (Search: `macro with_pool`): +```julia +macro with_pool(pool_name, expr) + _generate_pool_code(pool_name, expr, true; source=__source__) +end +# Same pattern for remaining 3 +``` + +**@maybe_with_pool** (Search: `macro maybe_with_pool`): +```julia +macro maybe_with_pool(pool_name, expr) + _generate_pool_code(pool_name, expr, false; source=__source__) +end +# Same pattern for remaining 3 +``` + +--- + +## Phase 6: Testing + +### 6.1 Robust Search Helpers (test/test_macro_expansion.jl) + +```julia +""" + find_linenumbernode_with_line(expr, target_line) -> Union{LineNumberNode, Nothing} + +Recursively search for a LineNumberNode matching target_line. +More robust than checking only the first LNN (handles block forms where +_maybe_add_source_location! may insert LNN before user code LNN). +""" +function find_linenumbernode_with_line(expr, target_line::Int) + if expr isa LineNumberNode && expr.line == target_line + return expr + elseif expr isa Expr + for arg in expr.args + result = find_linenumbernode_with_line(arg, target_line) + result !== nothing && return result + end + end + return nothing +end + +""" + has_valid_linenumbernode(expr) -> Bool + +Check if expr contains any LineNumberNode with valid line info. +""" +function has_valid_linenumbernode(expr) + if expr isa LineNumberNode + return expr.line > 0 && expr.file !== :none + elseif expr isa Expr + for arg in expr.args + has_valid_linenumbernode(arg) && return true + end + end + return false +end + +""" + get_function_body(expr) -> Union{Expr, Nothing} + +Extract function body from a function definition expression. +Handles both `function f() ... end` and `f() = ...` forms. +""" +function get_function_body(expr) + if expr isa Expr + if expr.head === :function && length(expr.args) >= 2 + return expr.args[2] + elseif expr.head === :(=) && expr.args[1] isa Expr && expr.args[1].head === :call + return expr.args[2] + end + # Recurse for wrapped expressions + for arg in expr.args + result = get_function_body(arg) + result !== nothing && return result + end + end + return nothing +end +``` + +### 6.2 Test Cases (Full Coverage) + +> **Test Strategy**: Verify "existence of LNN matching expected line" rather than "first LNN". +> In block forms, `_maybe_add_source_location!` may insert additional LNNs, +> so checking for existence of LNN with specific line is more robust. 
+ +```julia +@testset "Source location preservation" begin + # Test 1: @with_pool block form + @testset "@with_pool block" begin + expected_line = @__LINE__ + 2 + expr = @macroexpand @with_pool pool begin + v = acquire!(pool, Float64, 10) + end + # Check if LNN matching expected line exists + lnn = find_linenumbernode_with_line(expr, expected_line) + @test lnn !== nothing + @test lnn.file !== :none + # At minimum, valid LNN must exist + @test has_valid_linenumbernode(expr) + end + + # Test 2: @with_pool function form + @testset "@with_pool function" begin + expected_line = @__LINE__ + 2 + func_expr = @macroexpand @with_pool pool function test_func(n) + acquire!(pool, Float64, n) + end + body = get_function_body(func_expr) + @test body !== nothing + lnn = find_linenumbernode_with_line(body, expected_line) + @test lnn !== nothing + end + + # Test 3: @maybe_with_pool + @testset "@maybe_with_pool" begin + expected_line = @__LINE__ + 2 + expr = @macroexpand @maybe_with_pool pool begin + v = acquire!(pool, Float64, 10) + end + lnn = find_linenumbernode_with_line(expr, expected_line) + @test lnn !== nothing + end + + # Test 4: Backend variant (@with_pool :cpu) + @testset "@with_pool :cpu backend" begin + expected_line = @__LINE__ + 2 + expr = @macroexpand @with_pool :cpu pool begin + v = acquire!(pool, Float64, 10) + end + lnn = find_linenumbernode_with_line(expr, expected_line) + @test lnn !== nothing + end + + # Test 5: Without pool name (implicit gensym) + @testset "@with_pool without pool name" begin + expected_line = @__LINE__ + 2 + expr = @macroexpand @with_pool begin + inner_function() + end + lnn = find_linenumbernode_with_line(expr, expected_line) + @test lnn !== nothing + end + + # Test 6: Short-form function (f(x) = ...) - Case without LNN, corrected with __source__ + @testset "@with_pool short function" begin + expected_line = @__LINE__ + 1 + func_expr = @macroexpand @with_pool pool test_func(x) = acquire!(pool, Float64, x) + body = get_function_body(func_expr) + @test body !== nothing + # Short function originally has no LNN, so corrected with __source__ + lnn = find_linenumbernode_with_line(body, expected_line) + @test lnn !== nothing + end +end +``` + +### 6.3 Verification Command +```bash +julia --project -e 'using Pkg; Pkg.test()' +``` + +--- + +## Expected Results + +| Item | Before Improvement | After Improvement | +|------|-------------------|-------------------| +| Coverage | signature uncovered | Properly mapped | +| Stack trace | macros.jl:XXX | Original source:line | +| Breakpoint | Inside macros.jl | Improved to inside body | + +--- + +## Considerations + +1. **Use Keyword Arguments**: Add `source` as keyword arg to minimize impact on existing call sites +2. **Use Helper Functions**: Use `_maybe_add_source_location!` for consistent insertion across all paths +3. **Body Line Correction**: Use `_ensure_body_has_toplevel_lnn` to preserve top-level LNN, correct with `__source__` if not present +4. **Robust Tests**: Search-based verification resistant to AST structure changes + line number accuracy verification +5. **esc() Interaction**: `LineNumberNode` is unrelated to hygiene → insert at quote block top +6. **try-finally**: Lines inside wrapper still point to macros.jl (acceptable) +7. 
**CUDA Extension**: Only registers backend dispatch, no macro definitions → no changes needed + +--- + +## Change Summary + +| Phase | Work | Estimated Change | +|-------|------|-----------------| +| 1 | Add helper functions (3: `_maybe_add_source_location!`, `_has_toplevel_lnn`, `_ensure_body_has_toplevel_lnn`) | +35 lines | +| 2 | Modify signatures (4 functions) | 4 lines modified | +| 3 | Modify internal calls | ~5 lines modified | +| 4 | LineNumberNode insertion | ~10 lines added | +| 5 | Modify macro definitions (8) | 8 lines modified | +| 6 | Add tests (3 helpers + 6 tests) | +80 lines | +| **Total** | | ~140 lines | diff --git a/docs/design/nd_array_approach_comparison.md b/docs/design/nd_array_approach_comparison.md new file mode 100644 index 0000000..e01f308 --- /dev/null +++ b/docs/design/nd_array_approach_comparison.md @@ -0,0 +1,432 @@ +# N-D Array Approach Comparison: unsafe_wrap vs ReshapedArray + +## Summary + +This document analyzes two approaches for returning N-dimensional arrays from AdaptiveArrayPools: + +1. **Current (v1.1.x)**: `unsafe_wrap(Array, pointer, dims)` with N-way cache +2. **Proposed (v1.0.2 style)**: `reshape(view(backing, 1:n), dims)` without cache + +**Recommendation**: Switch back to ReshapedArray approach for `acquire!` N-D path. + +--- + +## Key Finding: SubArray Wrapper Allocation + +### The 48-byte Problem + +When using `acquire!` vs `unsafe_acquire!` in real code: + +```julia +# In flux_array! (TurbulentTransport) +xx = unsafe_acquire!(pool, T, size(x)) # 0 bytes - returns Array directly +xx = acquire!(pool, T, size(x)) # 48 bytes - SubArray wrapper! +``` + +**Root Cause**: `acquire!` returns `SubArray`, which allocates its wrapper struct (48 bytes) when it escapes the function scope. + +### Allocation Breakdown + +| API | Return Type | Allocation | +|-----|-------------|------------| +| `unsafe_acquire!(pool, T, m, n)` | `Matrix{T}` | 0 bytes (cache hit), 112 bytes (miss) | +| `acquire!(pool, T, m, n)` | `SubArray{..., Matrix}` | **48 bytes** (wrapper) + 112 bytes (miss) | + +### Why SubArray Allocates + +**Fundamental difference:** + +``` +unsafe_acquire! returns: +┌─────────────────────────────────────────────────────┐ +│ Pool backing Vector │ +│ [████████████████████████████████████] │ +│ ↓ │ +│ Array header lives in pool cache (reused) │ +│ → Returns pointer to EXISTING object (0 alloc) │ +└─────────────────────────────────────────────────────┘ + +acquire! returns: +┌─────────────────────────────────────────────────────┐ +│ Pool backing Vector │ +│ [████████████████████████████████████] │ +│ ↓ │ +│ Array in cache │ +│ ↓ │ +│ NEW SubArray struct (parent, indices, stride...) │ +│ → Creates NEW wrapper object (48 bytes!) │ +└─────────────────────────────────────────────────────┘ +``` + +SubArray is stack-allocated **only when**: +1. Used entirely within a single function +2. Compiler can prove it doesn't escape + +In `flux_array!`, `xx` escapes because: +- Passed to `_pooled_chain(out_y, xx)` +- Used across multiple loop iterations +- Compiler can't optimize away the wrapper + +### Root Cause: Type-Unspecified Path → Escape Optimization Failure + +**Critical Finding**: The core reason for allocation is that **compiler escape optimization fails in type-unspecified call paths**. + +> **Correction**: SubArray is a mutable struct (not immutable/isbits). +> The explanation that "dynamic dispatch causes boxing" is inaccurate. 
+> Precisely: wrapper objects are created at construction time, and the key factor is +> whether the compiler can optimize to stack allocation through escape analysis. + +#### Why it happens in `flux_array!`: + +```julia +# TGLFNNmodel struct (tglf_nn.jl) +struct TGLFNNmodel <: TGLFmodel + fluxmodel::Flux.Chain + # ... + _pooled_chain::PooledChain # ← No concrete type parameter! +end +``` + +The `_pooled_chain` field is declared without concrete type parameters, +so the call `fluxmodel._pooled_chain(out_y, xx)` is **not recompiled** and wrapper object optimization is not applied. + +#### Escape Optimization Failure: + +| Condition | Compiler Behavior | Result | +|-----------|-------------------|--------| +| Type specified + no escape | SROA/escape analysis applied | Stack allocation or elimination possible | +| Type specified + escape | Partial optimization possible | Depends on situation | +| **Type unspecified** | Optimization not applicable | **Wrapper object heap allocation** | + +``` +Type-specified path: +┌─────────────────────────────────────────────────────────────────┐ +│ Compiler knows the type │ +│ → SROA/escape analysis can be applied │ +│ → Wrapper object can be stack-allocated or completely removed │ +└─────────────────────────────────────────────────────────────────┘ + +Type-unspecified path (e.g., call through abstract field): +┌─────────────────────────────────────────────────────────────────┐ +│ Compiler doesn't know concrete type │ +│ → Escape analysis cannot be applied │ +│ → Wrapper object is heap-allocated │ +└─────────────────────────────────────────────────────────────────┘ +``` + +#### Array vs View Types in Type-Unspecified Paths: + +| Type | Characteristic | In Type-Unspecified Path | +|------|----------------|--------------------------| +| `Array` | Already heap-allocated object | Cached instance reuse → **No additional allocation** | +| `SubArray` | Requires wrapper object | Escape optimization failure → **Wrapper allocation** | +| `ReshapedArray` | Requires wrapper object | Escape optimization failure → **Wrapper allocation** | + +**Key insight**: Array is an object that already exists on the heap, so returning the same instance from cache incurs no additional allocation. +In contrast, SubArray/ReshapedArray create new wrapper objects each time, and optimization is not applied when type is unspecified. + +### Solutions for the Wrapper Allocation Problem + +#### 1. Use `unsafe_acquire!` (Recommended for this case) + +`unsafe_acquire!` returns `Array`, which is already a heap-allocated object (cached instance can be reused): + +```julia +# flux_array! in tglf_nn.jl +xx = unsafe_acquire!(pool, T, size(x)) # Returns Matrix{T} → cache hit = 0 alloc +``` + +✅ Zero allocation on cache hit (cached Array instance reused) +✅ No code changes to TGLFNNmodel needed +✅ Safe since `xx` is only used as scratch memory + +#### 2. Parameterize TGLFNNmodel (Fundamental fix) + +```julia +struct TGLFNNmodel{M<:Flux.Chain, P<:PooledChain} <: TGLFmodel + fluxmodel::M + _pooled_chain::P # Now compiler knows exact type +end +``` + +✅ Enables compiler escape optimization +✅ SubArray/ReshapedArray may become zero-alloc (depends on SROA/escape analysis) +❌ Requires significant code changes +❌ Changes serialization behavior + +#### 3. 
Function Barrier + +```julia +# Force type specialization +@inline function _call_pooled_chain(chain::PooledChain{M}, out, x) where M + chain(out, x) +end +``` + +⚠️ May not help if extracting `_pooled_chain` from struct is also dynamic + +### Implications for API Design + +**Key Insight: Wrapper Types Depend on Compiler Optimization** + +> **Note**: Whether wrapper types (SubArray, ReshapedArray) allocate depends on compiler SROA/escape analysis. +> The comparison below assumes **type-specified paths**. In type-unspecified paths, all wrapper types allocate. + +| Approach | Return Type | Creation Cost | Type-Specified Path | Type-Unspecified Path | +|----------|-------------|---------------|---------------------|----------------------| +| `unsafe_acquire!` | `Array` | 112 bytes (miss) | cache hit: 0 | cache hit: 0 | +| `acquire!` (current) | `SubArray{Array}` | 112 bytes (miss) | Optimizable | Wrapper allocation | +| **`acquire!` (reshape)** | `ReshapedArray{View}` | **0 bytes** | **Optimizable** | Wrapper allocation | + +**Advantages of ReshapedArray approach**: + +1. ✅ No creation cost (no unsafe_wrap call) +2. ✅ Compiler optimization possible in type-specified paths +3. ✅ BLAS compatible (StridedArray) +4. ✅ Same operation performance (mul!, broadcast) + +**`unsafe_acquire!` is better in type-unspecified paths**: +- Array already exists on heap → no additional allocation when reusing cached instance + +--- + +## Benchmark Results + +### Test Environment +- Benchmark file: `benchmark/nd_approach_comparison.jl` +- Tests 8-10 specifically compare N-way cache behavior + +### Allocation Comparison + +| Scenario | unsafe_wrap | reshape | Savings | +|----------|-------------|---------|---------| +| Single call (cache miss) | 112 bytes | **0 bytes** | 100% | +| 3-way cycling × 100 | 33,600 bytes | **0 bytes** | 100% | +| 5-way cycling × 100 | 56,000 bytes | **0 bytes** | 100% | +| With 4-way cache (3-way pattern) | 0 bytes | **0 bytes** | - | +| With 4-way cache (5-way pattern) | 56,000 bytes | **0 bytes** | 100% | + +### Performance Comparison + +| Operation | unsafe_wrap | reshape | Winner | +|-----------|-------------|---------|--------| +| mul! (BLAS) | 9.92 μs | 9.96 μs | Tie | +| Broadcast σ.(x) | 24.2 μs | 23.5 μs | Tie | +| Dense layer | 35.3 μs | 33.7 μs | Tie | +| 3-way cycling | 5.87 μs | **0.38 μs** | reshape (15x) | +| 5-way cycling | 10.38 μs | **0.59 μs** | reshape (18x) | + +### Type Information + +All three types are `StridedArray` (BLAS compatible): + +```julia +# unsafe_wrap +Matrix{Float64} # isa StridedArray ✓ + +# SubArray of unsafe_wrap +SubArray{Float64, 2, Matrix{Float64}, ...} # isa StridedArray ✓ + +# ReshapedArray of 1D view +Base.ReshapedArray{Float64, 2, SubArray{Float64, 1, Vector{Float64}, ...}, ...} # isa StridedArray ✓ +``` + +### Memory Layout + +All have identical column-major layout: +``` +Strides: (1, 4) # Same for all three +``` + +--- + +## Current Implementation (unsafe_wrap) + +### Call Path +``` +acquire!(pool, Float64, 64, 100) + └─> get_nd_view!(tp, (64, 100)) + └─> get_nd_array!(tp, (64, 100)) + ├─> get_view!(tp, 6400) # 1D view (0 alloc, cached) + └─> unsafe_wrap(...) # 112 bytes on cache miss! 
+``` + +### Code Location +`src/core.jl:129`: +```julia +arr = unsafe_wrap(Array{T, N}, pointer(flat_view), dims) +``` + +### N-way Cache Structure +```julia +# In TypedPool (src/types.jl) +nd_views::Vector{Any} # Cached SubArray objects +nd_arrays::Vector{Any} # Cached Array objects (from unsafe_wrap) +nd_dims::Vector{Any} # Cached dimension tuples +nd_ptrs::Vector{UInt} # Cached pointers for invalidation +``` + +--- + +## Proposed Implementation (ReshapedArray) + +### Call Path +``` +acquire!(pool, Float64, 64, 100) + └─> get_nd_view!(tp, (64, 100)) + └─> get_view!(tp, 6400) # 1D view (0 alloc, cached) + └─> reshape(view, dims) # 0 alloc always! +``` + +### Proposed Code Change +```julia +# Replace get_nd_view! in src/core.jl +@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # 1D view (cached) + return reshape(flat_view, dims) # Zero-alloc ReshapedArray +end +``` + +--- + +## Pros and Cons + +### ReshapedArray Approach (Proposed) + +#### Pros +1. **Zero allocation always** - No 112-byte allocation regardless of cache hit/miss +2. **No N-D cache needed** - Simpler code, less memory overhead +3. **Faster** - No cache lookup overhead (0.38μs vs 5.87μs for cycling patterns) +4. **BLAS compatible** - ReshapedArray is StridedArray +5. **Same performance** - Identical mul!/broadcast speed +6. **No Bélády's Anomaly** - Works with any access pattern (5-way, 10-way, etc.) +7. **Simpler TypedPool** - Can remove nd_arrays, nd_dims, nd_ptrs fields + +#### Cons +1. **Return type changes** - `SubArray{..., Array{...}}` → `ReshapedArray{..., SubArray{...}}` +2. **Some libraries might check `isa Array`** - Rare, but possible (not BLAS though) +3. **Slightly different printing** - Display shows as ReshapedArray + +### unsafe_wrap Approach (Current) + +#### Pros +1. **Returns actual Array** - Some code might expect `Matrix{Float64}` +2. **Cache hits are zero-alloc** - When pattern fits in N-way cache + +#### Cons +1. **112 bytes per cache miss** - Adds up with varying batch sizes +2. **N-way cache complexity** - Extra fields, cache lookup logic +3. **Bélády's Anomaly** - 5+ patterns = 100% miss with 4-way cache +4. **Slower cycling** - Cache lookup overhead even on hits + +--- + +## Impact Analysis + +### TurbulentTransport Usage + +In `src/pooled_layers.jl`: + +```julia +# PooledDense (line 86) +out = acquire!(pool, Float64, size(d.weight, 1), size(xT, 2)) +mul!(out, d.weight, xT) # Works with ReshapedArray ✓ + +# PooledActivation (line 54) +out = acquire!(pool, Float64, size(x)) +out .= pa.σ.(x) # Works with ReshapedArray ✓ +``` + +Both use cases are compatible with ReshapedArray: +- `mul!` accepts any `StridedMatrix` +- Broadcasting works on any `AbstractArray` + +### Flux.NNlib.bias_act! + +```julia +Flux.NNlib.bias_act!(d.σ, out, d.bias) +``` + +This function accepts `AbstractArray` - ReshapedArray is compatible. + +### unsafe_acquire! Unchanged + +For code that explicitly needs raw `Array` (FFI, specific BLAS paths): +```julia +unsafe_acquire!(pool, Float64, 64, 100) # Still returns Matrix{Float64} +``` + +This API remains unchanged and still uses `unsafe_wrap` with caching. + +--- + +## Migration Path + +### Phase 1: Modify acquire! N-D path +```julia +# src/core.jl - Replace get_nd_view! 
+@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) + return reshape(flat_view, dims) +end +``` + +### Phase 2: Simplify TypedPool (optional) +Remove N-D cache fields if `unsafe_acquire!` usage is rare: +- `nd_views`, `nd_arrays`, `nd_dims`, `nd_ptrs` + +### Phase 3: Update documentation +- Note return type change in CHANGELOG +- Update docstrings for `acquire!` + +--- + +## Conclusion + +### Two Separate Problems, Two Solutions + +This investigation revealed **two distinct allocation issues**: + +#### Problem 1: N-D Array Creation (unsafe_wrap vs reshape) + +| Metric | unsafe_wrap + cache | reshape | +|--------|---------------------|---------| +| Allocation (miss) | 112 bytes | **0 bytes** | +| Allocation (hit) | 0 bytes | **0 bytes** | +| Speed (cycling) | 5-10 μs | **0.3-0.6 μs** | +| BLAS compat | ✓ | ✓ | +| Code complexity | High (cache) | **Low** | +| Works with any pattern | ✗ (≤4 ways) | **✓ (any)** | + +**Solution**: Switch `acquire!` N-D path to use `reshape(view, dims)` instead of `unsafe_wrap`. + +#### Problem 2: Type-Unspecified Path Wrapper Allocation + +> **Correction**: SubArray and ReshapedArray are both **mutable structs**. +> Whether allocation occurs depends on whether compiler escape optimization can be applied. + +| Type | Type-Specified Path | Type-Unspecified Path | +|------|---------------------|----------------------| +| `Array` | cache hit: 0 bytes | cache hit: **0 bytes** ✓ | +| `SubArray` | Optimizable | **Wrapper allocation** ✗ | +| `ReshapedArray` | Optimizable | **Wrapper allocation** ✗ | + +**Key difference**: +- `Array`: Object already exists on heap → no additional allocation when reusing cached instance +- Wrapper types: New object created each time → heap allocation when optimization fails in type-unspecified paths + +**Solution for TurbulentTransport**: Use `unsafe_acquire!` which returns `Array` (cached instance reusable). + +### Summary + +| Context | Recommended API | Reason | +|---------|-----------------|--------| +| Type-specified path (general use) | `acquire!` → ReshapedArray | 0 bytes creation, compiler optimization possible | +| Type-unspecified path | `unsafe_acquire!` → Array | Cached instance can be reused | +| FFI / raw pointer needs | `unsafe_acquire!` → Array | Direct memory access | + +**The N-way cache was solving the wrong problem** - caching `Array` objects when `reshape` is already zero-cost for creation. + +For type-unspecified paths (like `TGLFNNmodel._pooled_chain` without concrete type parameters), `unsafe_acquire!` returning `Array` is the correct choice because cached Array instances can be reused without additional allocation. diff --git a/docs/design/new_hybrid_api_design.md b/docs/design/new_hybrid_api_design.md new file mode 100644 index 0000000..14cd235 --- /dev/null +++ b/docs/design/new_hybrid_api_design.md @@ -0,0 +1,140 @@ +# Design Spec: Hybrid N-way Cache & ReshapedArray Strategy + +> **Note**: This document was written as a clear, concrete specification that another AI can +> implement mechanically without needing to think through the design. + +--- + +## 1. Objective +Refactor AdaptiveArrayPools.jl to implement a **Hybrid Allocation Strategy**: +1. **`acquire!` (Default)**: Return `ReshapedArray` (Zero-Allocation, Stack-allocated). Remove N-D caching logic for this path. +2. **`unsafe_acquire!` (Special)**: Return `Array` (via `unsafe_wrap`). 
Implement **N-way Set Associative Cache** to minimize `unsafe_wrap` overhead (112 bytes) and support interleaved access patterns. + +## 2. Data Structure Changes (types.jl) + +### Constants +Define the cache associativity level. +```julia +const CACHE_WAYS = 4 +``` + +### `TypedPool{T}` Struct +Modify fields to support N-way caching for Arrays, while removing unused View caching. + +* **Remove**: `nd_views` (No longer needed as `acquire!` returns `ReshapedArray`). +* **Update**: `nd_arrays`, `nd_dims`, `nd_ptrs`. These vectors must store `CACHE_WAYS` items per active slot. +* **Add**: `nd_next_way::Vector{Int}` (To track Round-Robin replacement index for each slot). + +**Updated Layout:** +```julia +mutable struct TypedPool{T} + # --- Backing Storage --- + vectors::Vector{Vector{T}} + + # --- 1D Cache (Simple 1-way or Direct) --- + views::Vector{SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int}}, true}} + view_lengths::Vector{Int} + + # --- N-D Array Cache (N-way Set Associative) --- + # Layout: Flat Vector. Index = (slot_idx - 1) * CACHE_WAYS + way_idx + nd_arrays::Vector{Any} # Stores Array{T, N} + nd_dims::Vector{Any} # Stores NTuple{N, Int} + nd_ptrs::Vector{UInt} # Stores objectid/pointer for validation + nd_next_way::Vector{Int} # Round-Robin counter per slot (1 per slot) + + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end +``` + +### Initialization +Ensure `nd_arrays`, `nd_dims`, `nd_ptrs` are initialized with `nothing` or empty values, and `nd_next_way` with `0` or `1`. + +## 3. Logic Implementation (core.jl) + +### A. `acquire!` (The Fast Path) +**Goal**: Always return `ReshapedArray`. No N-D cache lookup. + +**Implementation**: +Modify `get_nd_view!` to: +1. Calculate total length (`prod(dims)`). +2. Call `get_view!(tp, len)` to get a 1D `SubArray`. +3. Return `reshape(flat_view, dims)`. + +```julia +@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + len = safe_prod(dims) + flat_view = get_view!(tp, len) + return reshape(flat_view, dims) +end +``` + +### B. `unsafe_acquire!` (The N-way Path) +**Goal**: Return `Array`. Use N-way cache to avoid `unsafe_wrap`. + +**Implementation**: +Modify `get_nd_array!` to use **Linear Search + Round-Robin Replacement**. + +**Algorithm**: +1. Get 1D view: `flat_view = get_view!(tp, prod(dims))`. +2. Get current pointer: `current_ptr = UInt(pointer(flat_view))`. +3. Calculate Base Index: `base = (tp.n_active - 1) * CACHE_WAYS`. +4. **Search (Hit Check)**: + * Loop `k` from `1` to `CACHE_WAYS`. + * Check if `nd_dims[base + k] == dims` **AND** `nd_ptrs[base + k] == current_ptr`. + * If match: Return `nd_arrays[base + k]`. +5. **Miss (Replacement)**: + * Get victim way from `nd_next_way[tp.n_active]`. + * Target Index: `target = base + victim_way + 1`. + * Create Array: `arr = unsafe_wrap(Array{T, N}, pointer(flat_view), dims)`. + * **Update Cache**: + * `nd_arrays[target] = arr` + * `nd_dims[target] = dims` + * `nd_ptrs[target] = current_ptr` + * **Update Round-Robin**: Increment `nd_next_way` (modulo `CACHE_WAYS`). + * Return `arr`. + +## 4. API & Aliases (AdaptiveArrayPools.jl) + +Add explicit aliases for clarity. + +```julia +# Main APIs +export acquire!, unsafe_acquire! + +# Explicit Aliases +export acquire_view!, acquire_array! + +"""Alias for [`acquire!`](@ref). Returns a ReshapedArray (View).""" +const acquire_view! = acquire! + +"""Alias for [`unsafe_acquire!`](@ref). Returns an Array (via unsafe_wrap).""" +const acquire_array! = unsafe_acquire! +``` + +## 5. 
Client Integration (`TurbulentTransport.jl`) + +Update tglf_nn.jl to use the Array-returning API to avoid dynamic dispatch boxing. + +**File**: tglf_nn.jl +**Function**: `flux_array!` +**Change**: +```julia +# Before +xx = acquire!(pool, T, size(x)) + +# After +xx = unsafe_acquire!(pool, T, size(x)) +# OR +xx = acquire_array!(pool, T, size(x)) +``` + +## 6. Verification Checklist + +1. **Type Check**: `acquire!` must return `ReshapedArray`. `unsafe_acquire!` must return `Array`. +2. **Allocation Check**: + * `acquire!`: 0 allocations always. + * `unsafe_acquire!`: 0 allocations on cache hit. + * `unsafe_acquire!`: 0 allocations on interleaved access (e.g., alternating 10x10 and 20x20) thanks to N-way cache. +3. **Safety**: Ensure `unsafe_acquire!` validates pointers (re-wraps if the backing vector was resized). diff --git a/docs/design/untracked_acquire_design.md b/docs/design/untracked_acquire_design.md new file mode 100644 index 0000000..7358d0e --- /dev/null +++ b/docs/design/untracked_acquire_design.md @@ -0,0 +1,598 @@ +# AdaptiveArrayPools: Untracked Acquire Handling Design + +## 1. Background + +### 1.1 Current System +Julia array pool library. The `@with_pool` macro manages temporary array allocation through checkpoint/rewind. + +```julia +@with_pool p begin + v = acquire!(p, Float64, 100) # Acquire array from pool + # ... use ... +end # Automatically returned +``` + +### 1.2 Optimization: Typed Checkpoint/Rewind +The macro extracts types from acquire! calls at compile time, checkpointing/rewinding only those types (77% performance improvement). + +```julia +# Code generated by macro +checkpoint!(pool, Float64) # Only checkpoint Float64 stack +try + v = acquire!(pool, Float64, 100) +finally + rewind!(pool, Float64) # Only rewind Float64 stack +end +``` + +### 1.3 Current Data Structures +```julia +mutable struct TypedPool{T} + vectors::Vector{Vector{T}} # Reusable arrays + n_active::Int # Number of currently active arrays + _checkpoint_n_active::Vector{Int} # Checkpointed n_active values + _checkpoint_depths::Vector{Int} # Depth at which each checkpoint was saved +end + +mutable struct AdaptiveArrayPool + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + others::IdDict{DataType, Any} + _current_depth::Int + _untracked_flags::Vector{Bool} +end +``` + +--- + +## 2. Problem: Untracked Acquire + +### 2.1 Scenario +```julia +function inner_helper(pool) + acquire!(pool, Float32, 50) # Macro can't see this! +end + +@with_pool p begin + v = acquire!(p, Float64, 100) # Macro extracts: Float64 + inner_helper(p) # Float32 acquire is "untracked" +end +``` + +**Problem:** Macro only knows about Float64, generating `checkpoint!(pool, Float64)`, `rewind!(pool, Float64)`. +Float32's n_active is not restored → memory leak or corruption. + +### 2.2 Nested Case +```julia +@with_pool p begin # L1: checkpoint Float64 + acquire!(p, Float64, 10) + + @with_pool p begin # L2: checkpoint Int64 + acquire!(p, Int64, 5) + inner_untracked(p) # Float32 acquire (untracked) + end # L2 rewind + +end # L1 rewind +``` + +--- + +## 3. Proposed Solution + +### 3.1 Core Idea: Shared Implementation Function + Marking Wrapper + +**Previous Problem:** Runtime detection in acquire! → overhead on every call + +**New Approach:** +- `_acquire_impl!` → actual implementation (existing acquire! 
logic as-is) +- `acquire!` → marking + `_acquire_impl!` call (1-line wrapper) +- Macro transforms `acquire!` calls to `_acquire_impl!` + +```julia +# User code +@with_pool p begin + v = acquire!(p, Float64, 100) +end + +# After macro transformation +checkpoint!(p, Float64) +try + v = _acquire_impl!(p, Float64, 100) # ← Transformed! (no marking) +finally + # ... +end +``` + +### 3.2 Benefits +1. **Zero overhead:** Inside @with_pool, impl is called directly +2. **Automatic detection:** acquire! outside @with_pool passes through marking wrapper → untracked +3. **No code duplication:** Implementation logic in `_acquire_impl!` only once + +### 3.3 Data Structure Changes + +```julia +mutable struct TypedPool{T} + # ... existing fields ... + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} # NEW: Tracks which depth each checkpoint was saved at +end + +mutable struct AdaptiveArrayPool + # ... existing TypedPools ... + + # NEW: Depth tracking (pool level) + _current_depth::Int # Current checkpoint depth + _untracked_flags::Vector{Bool} # Whether untracked occurred per depth (typed vs full rewind decision) +end +``` + +**Key Changes:** +- `_checkpoint_depths` added to `TypedPool` → tracks which depth each checkpoint occurred at +- `_untracked_flags` maintained → for typed rewind vs full rewind decision +- `_full_rewind_with_types!` removed → `_checkpoint_depths` makes regular `rewind!(pool)` sufficient + +--- + +## 4. Implementation Details + +### 4.1 Function Structure + +```julia +# Implementation function (existing acquire! logic moved as-is) +@inline function _acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where T + tp = get_typed_pool!(pool, T) + return get_view!(tp, n) +end + +@inline function _acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} + tp = get_typed_pool!(pool, T) + return get_nd_view!(tp, dims) +end + +# User-facing: marking wrapper (1 line) +@inline function acquire!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where T + _mark_untracked!(pool) + _acquire_impl!(pool, T, n) +end + +@inline function acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} + _mark_untracked!(pool) + _acquire_impl!(pool, T, dims...) +end + +# Untracked marking (still needed - for typed rewind vs full rewind decision) +@inline function _mark_untracked!(pool::AdaptiveArrayPool) + if pool._current_depth > 0 + @inbounds pool._untracked_flags[pool._current_depth] = true + end +end +``` + +**Macro transformation:** `acquire!(p, T, n)` → `_acquire_impl!(p, T, n)` (only function name substitution) + +**Note:** `_mark_untracked!` is maintained - for triggering full rewind when untracked occurs + +### 4.2 unsafe_acquire! and Aliases + +```julia +# Implementation functions +@inline function _unsafe_acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where T + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, (n,)) +end + +@inline function _unsafe_acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, dims) +end + +# User-facing: marking wrapper +@inline function unsafe_acquire!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where T + _mark_untracked!(pool) + _unsafe_acquire_impl!(pool, T, n) +end + +@inline function unsafe_acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} + _mark_untracked!(pool) + _unsafe_acquire_impl!(pool, T, dims...) +end + +# Aliases (user API) +const acquire_view! 
= acquire! +const acquire_array! = unsafe_acquire! + +# Aliases (internal impl) +const _acquire_view_impl! = _acquire_impl! +const _acquire_array_impl! = _unsafe_acquire_impl! +``` + +### 4.3 Macro Transformation + +```julia +# Macro transforms acquire! calls in AST to _*_impl! +function _transform_acquire_calls(expr, pool_name) + if expr isa Expr + if expr.head == :call && length(expr.args) >= 2 + fn = expr.args[1] + pool_arg = expr.args[2] + + # Only transform when matching target pool + if pool_arg == pool_name + if fn == :acquire! || fn == :acquire_view! + expr.args[1] = :_acquire_impl! + elseif fn == :unsafe_acquire! || fn == :acquire_array! + expr.args[1] = :_unsafe_acquire_impl! + end + end + end + # Recursively transform + for i in eachindex(expr.args) + expr.args[i] = _transform_acquire_calls(expr.args[i], pool_name) + end + end + return expr +end +``` + +**Transformation Rules:** +| Original | Transformed | +|----------|-------------| +| `acquire!(p, ...)` | `_acquire_impl!(p, ...)` | +| `acquire_view!(p, ...)` | `_acquire_impl!(p, ...)` | +| `unsafe_acquire!(p, ...)` | `_unsafe_acquire_impl!(p, ...)` | +| `acquire_array!(p, ...)` | `_unsafe_acquire_impl!(p, ...)`| + +### 4.4 Checkpoint/Rewind + +```julia +# Typed checkpoint +function checkpoint!(pool::AdaptiveArrayPool, types::Type...) + pool._current_depth += 1 + push!(pool._untracked_flags, false) + depth = pool._current_depth + for T in types + tp = get_typed_pool!(pool, T) + push!(tp._checkpoint_n_active, tp.n_active) + push!(tp._checkpoint_depths, depth) # NEW: record depth + end +end + +# Typed rewind (fast path - when no untracked) +function rewind!(pool::AdaptiveArrayPool, types::Type...) + for T in types + tp = get_typed_pool!(pool, T) + pop!(tp._checkpoint_depths) # NEW: remove depth + tp.n_active = pop!(tp._checkpoint_n_active) + end + pop!(pool._untracked_flags) + pool._current_depth -= 1 +end + +# Full rewind (untracked fallback - simplified with _checkpoint_depths!) +function rewind!(pool::AdaptiveArrayPool) + depth = pool._current_depth + for tp in all_type_stacks(pool) + if !isempty(tp._checkpoint_depths) && tp._checkpoint_depths[end] == depth + # Checkpointed at current depth → pop + pop!(tp._checkpoint_depths) + tp.n_active = pop!(tp._checkpoint_n_active) + elseif !isempty(tp._checkpoint_n_active) + # Checkpointed at previous depth → restore without pop + tp.n_active = tp._checkpoint_n_active[end] + elseif tp.n_active > 0 + # ⚠️ CRITICAL ERROR: Would destroy arrays outside @with_pool + T = eltype(tp) + error(""" + [AdaptiveArrayPools] Cannot rewind type $T: no checkpoint exists. + Found $(tp.n_active) active array(s) that were never checkpointed. + + Fix: Wrap the scope where $T was first acquired in @with_pool. + """) + end + # else: _checkpoint_n_active empty and n_active == 0 → normal, do nothing + end + pop!(pool._untracked_flags) + pool._current_depth -= 1 +end +``` + +**Key Improvement:** `_checkpoint_depths[end] == depth` comparison enables accurate pop/restore decision → `_full_rewind_with_types!` not needed! 
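+
+A minimal sketch of the bookkeeping above (assumes the fields and functions proposed in this design — `checkpoint!`, `rewind!`, `_checkpoint_depths` — not the currently released API; `AdaptiveArrayPool()` is used as in the earlier test examples):
+
+```julia
+pool = AdaptiveArrayPool()
+
+checkpoint!(pool, Float64)   # depth 1 → Float64._checkpoint_depths == [1]
+checkpoint!(pool, Int64)     # depth 2 → Int64._checkpoint_depths == [2]
+
+# A full rewind at depth 2 treats the two types differently:
+#   Int64:   depths[end] == 2 == current depth → pop! its checkpoint
+#   Float64: depths[end] == 1 <  current depth → restore n_active, keep checkpoint
+rewind!(pool)                # depth 2 → 1 (only Int64's checkpoint is popped)
+rewind!(pool, Float64)       # typed fast path closes the outer scope (depth 1 → 0)
+```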
+ +### 4.5 all_type_stacks Implementation + +```julia +# Generator to iterate all TypedPools (fixed slots + others) +function all_type_stacks(pool::AdaptiveArrayPool) + return Iterators.flatten(( + # Fixed slots (7) + (pool.float64, pool.float32, pool.int64, pool.int32, pool.complexf64, pool.complexf32, pool.bool), + # Others (IdDict values) + values(pool.others) + )) +end + +# Or callback pattern (more efficient, no allocation) +@inline function foreach_type_stack(f, pool::AdaptiveArrayPool) + f(pool.float64) + f(pool.float32) + f(pool.int64) + f(pool.int32) + f(pool.complexf64) + f(pool.complexf32) + f(pool.bool) + for tp in values(pool.others) + f(tp) + end +end +``` + +### 4.6 Macro Generated Code + +```julia +@with_pool p begin + v = acquire!(p, Float64, 100) + inner_function(p) # untracked acquire possible here +end + +# Generated code: +local p = get_task_local_pool() +checkpoint!(p, Float64) +try + local _result = begin + v = _acquire_impl!(p, Float64, 100) # ← Transformed! (no marking) + inner_function(p) # acquire! inside not transformed → passes through marking wrapper → untracked + end + _result +finally + if p._untracked_flags[p._current_depth] + rewind!(p) # Full rewind (handled accurately with _checkpoint_depths) + else + rewind!(p, Float64) # Typed rewind (fast) + end +end +``` + +**Change:** `_full_rewind_with_types!(p, Float64)` → `rewind!(p)` (simplified!) + +--- + +## 5. Behavior Verification + +### 5.1 Simple Case (no untracked) + +```julia +@with_pool p begin + v = acquire!(p, Float64, 100) +end +``` + +**Flow:** +1. `checkpoint!(p, Float64)` → _current_depth=1, _untracked_flags=[false] +2. `_acquire_impl!(p, Float64, 100)` → no marking (macro transformed) +3. `_untracked_flags[1] == false` → `rewind!(p, Float64)` (fast path) + +### 5.2 Untracked Case + +```julia +function helper(p) + acquire!(p, Float32, 50) # Defined outside @with_pool → not transformed! +end + +@with_pool p begin + v = acquire!(p, Float64, 100) + helper(p) +end +``` + +**Flow:** +1. `checkpoint!(p, Float64)` → _current_depth=1, _untracked_flags=[false], Float64._checkpoint_depths=[1] +2. `_acquire_impl!(p, Float64, 100)` → no marking (macro transformed) +3. `acquire!(p, Float32, 50)` → marking wrapper → `_mark_untracked!` → _untracked_flags=[true] +4. `_untracked_flags[1] == true` → `rewind!(p)` (full rewind) + - Float64: _checkpoint_depths[end]=1 == depth=1 → pop + - Float32: _checkpoint_depths=[] → if n_active=0 do nothing, if >0 restore + +### 5.3 Nested Case + +``` +L1: @with_pool (Float64) + L2: @with_pool (Int64) + untracked Float32 + L3: @with_pool (Bool) +``` + +| Timing | depth | Float64 saved/depths | Int64 saved/depths | Float32 saved/depths | Bool saved/depths | +|--------|-------|---------------------|-------------------|---------------------|------------------| +| Initial | 0 | []/[] | []/[] | []/[] | []/[] | +| L1 checkpoint(F64) | 1 | [0]/[1] | []/[] | []/[] | []/[] | +| L2 checkpoint(I64) | 2 | [0]/[1] | [0]/[2] | []/[] | []/[] | +| Float32 untracked | 2 | [0]/[1] | [0]/[2] | []/[] (n=1) | []/[] | +| L3 checkpoint(Bool) | 3 | [0]/[1] | [0]/[2] | []/[] | [0]/[3] | +| L3 rewind(Bool) | 2 | [0]/[1] | [0]/[2] | []/[] | []/[] | +| L2 rewind() full | 1 | [0]/[1] | []/[] | []/[] (n=0) | []/[] | +| L1 rewind(F64) | 0 | []/[] | []/[] | []/[] | []/[] | + +**L2 Full Rewind Detail (depth=2):** +- Float64: depths[end]=1 ≠ 2 → no pop, restore only ✓ +- Int64: depths[end]=2 == 2 → **pop!** ✓ (now works correctly!) 
+- Float32: depths=[] → n_active to 0 (untracked cleanup) ✓ +- Bool: depths=[] → n_active=0, do nothing ✓ + +**Key:** Thanks to `_checkpoint_depths`, we can accurately know Int64 was checkpointed at depth=2! + +--- + +## 6. API Summary + +### 6.1 User API (unchanged) +```julia +acquire!(pool, T, dims...) # Acquire array (returns view) +unsafe_acquire!(pool, T, dims...) # Acquire array (returns Array) +acquire_view!(pool, T, dims...) # alias for acquire! +acquire_array!(pool, T, dims...) # alias for unsafe_acquire! +``` + +### 6.2 Internal API (newly added) +```julia +# Implementation functions (macro transforms to direct call) +_acquire_impl!(pool, T, dims...) +_unsafe_acquire_impl!(pool, T, dims...) + +# Aliases (const) +_acquire_view_impl! = _acquire_impl! +_acquire_array_impl! = _unsafe_acquire_impl! + +# Untracked marking +_mark_untracked!(pool) + +# TypedPool iteration +all_type_stacks(pool) # Generator +foreach_type_stack(f, pool) # Callback pattern +``` + +### 6.3 Struct Changes +```julia +mutable struct TypedPool{T} + # existing fields... + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} # NEW: checkpoint depth tracking +end + +mutable struct AdaptiveArrayPool + # existing fields... + _current_depth::Int # NEW + _untracked_flags::Vector{Bool} # NEW +end +``` + +--- + +## 7. Performance Characteristics + +| Path | Condition | Behavior | Overhead | +|------|-----------|----------|----------| +| Fast path | no untracked | typed rewind | Minimal (same as existing) | +| Fallback | untracked exists | full rewind | Iterate all types (accurate handling with _checkpoint_depths) | + +### 7.1 acquire! Overhead +```julia +# Implementation function (existing logic as-is) +@inline function _acquire_impl!(pool, T, n) + tp = get_typed_pool!(pool, T) + get_view!(tp, n) +end + +# User wrapper (marking added) +@inline function acquire!(pool, T, n) + _mark_untracked!(pool) # Added: if + array access + _acquire_impl!(pool, T, n) +end + +@inline function _mark_untracked!(pool) + if pool._current_depth > 0 # branch (usually false) + @inbounds pool._untracked_flags[pool._current_depth] = true + end +end +``` + +**Overhead Analysis:** +- Outside @with_pool: `_current_depth == 0` → branch only (very light) +- Inside @with_pool: macro calls `_acquire_impl!` directly → **zero overhead** + +--- + +## 8. Edge Case: Parent Scope Corruption Prevention + +### 8.1 Problem Scenario +```julia +# acquire outside @with_pool +v_parent = acquire!(p, Int64, 10) # Int64 n_active = 1 + +@with_pool p begin # checkpoint(Float64) - Int64 not checkpointed + v = acquire!(p, Float64, 100) + + # Untracked Int64 acquire! + v_child = acquire!(p, Int64, 5) # Int64 n_active = 2 +end +# Full rewind occurs +# Int64: _checkpoint_n_active=[], n_active=2 +# If reset n_active=0, v_parent would be destroyed! +``` + +### 8.2 Solution: Clear Error +```julia +elseif tp.n_active > 0 + # _checkpoint_n_active empty but n_active > 0 + # = attempting rewind without checkpoint + T = eltype(tp) + error(""" + [AdaptiveArrayPools] Cannot rewind type $T: no checkpoint exists. + Found $(tp.n_active) active array(s) that were never checkpointed. + + Fix: Wrap the scope where $T was first acquired in @with_pool. 
+ """) +end +``` + +### 8.3 Error Conditions Summary +| _checkpoint_n_active | n_active | Situation | Action | +|----------------------|----------|-----------|--------| +| len == depth | any | Checkpointed at current depth | pop | +| len > 0, < depth | any | Checkpointed at previous depth | restore to [end] | +| empty | 0 | Not used | do nothing | +| empty | > 0 | **Rewind attempted without checkpoint** | **ERROR** | + +### 8.4 User Resolution +When error occurs, two choices: +1. **Wrap parent in @with_pool too:** + ```julia + @with_pool p begin + v_parent = acquire!(p, Int64, 10) + @with_pool p begin + # inner block + end + end + ``` + +2. **Avoid untracked acquire in inner block:** + ```julia + v_parent = acquire!(p, Int64, 10) + @with_pool p begin + # Don't acquire Int64 + # Or use @with_pool in inner helper too + end + ``` + +--- + +## 9. Feedback Requests + +1. **Function separation approach:** Is the `acquire!` wrapper + `_acquire_impl!` implementation separation appropriate? +2. **length vs _current_depth comparison:** Is it correct in all nested cases? +3. **Macro transformation:** Is AST function name substitution safe? (qualified names, macros, etc.) +4. **_untracked_flags Vector{Bool}:** Are there more efficient alternatives? +5. **_full_rewind_with_types!:** Performance of checking types with Set? +6. **Simpler alternative:** Is there a way to simplify the entire design? + +--- + +## 10. Alternative Review + +### 10.1 Option: Always Full Checkpoint/Rewind +- Pros: Simple implementation +- Cons: Loses 77% performance improvement + +### 10.2 Option: Don't Support Untracked +- Documentation: "Use @with_pool in helper functions too" +- Pros: No implementation changes +- Cons: Silent corruption on user error + +### 10.3 Option: Current Proposal (impl function separation) +- Pros: Maintains performance, safe fallback, no code duplication +- Cons: Requires macro transformation diff --git a/docs/design/vector_resize_memory_behavior.md b/docs/design/vector_resize_memory_behavior.md new file mode 100644 index 0000000..e0194a4 --- /dev/null +++ b/docs/design/vector_resize_memory_behavior.md @@ -0,0 +1,247 @@ +# Vector Resize Memory Behavior: CPU vs GPU + +## Context +AdaptiveArrayPools uses backing vectors that may need to grow when larger arrays are requested. +Current implementation only grows vectors, never shrinks them. + +**Question**: Should we shrink vectors when smaller sizes are requested? What are the memory implications? + +--- + +## CPU Julia Vector Behavior + +```julia +v = Vector{Float64}(undef, 1000) +resize!(v, 100) # Shrink to 100 elements +resize!(v, 500) # Grow back to 500 +``` + +### Key Facts (needs verification): +1. **Capacity vs Length**: Does Julia Vector maintain separate capacity? +2. **Shrink behavior**: Does `resize!(v, smaller)` release memory immediately? +3. **Regrow cost**: If we shrink then grow again, is there reallocation? + +### My Understanding: +- Julia's `Vector` uses a growth strategy (typically 2x) +- `resize!` to smaller size may NOT release memory (keeps capacity) +- Growing back within capacity is O(1), no allocation +- Memory is only released when Vector is GC'd + +**Question for review**: Is this accurate? Does Julia guarantee capacity preservation on shrink? + +--- + +## GPU CuVector Behavior + +```julia +using CUDA +v = CUDA.zeros(Float64, 1000) +resize!(v, 100) # Shrink - what happens to GPU memory? +resize!(v, 500) # Grow back - allocation? +``` + +### VERIFIED: CUDA.jl resize! 
Implementation (src/array.jl:889) + +**CuVector has capacity tracking via `A.maxsize` field.** + +```julia +# CUDA.jl constants +const RESIZE_THRESHOLD = 100 * 1024^2 # 100 MiB +const RESIZE_INCREMENT = 32 * 1024^2 # 32 MiB + +function Base.resize!(A::CuVector{T}, n::Integer) where T + n == length(A) && return A + + # only resize when the new length exceeds the capacity or is much smaller + cap = A.maxsize ÷ aligned_sizeof(T) + if n > cap || n < cap ÷ 4 # ← SHRINK THRESHOLD: 25% + len = if n < cap + # shrink to fit (allocates EXACT new size, no over-allocation) + n + elseif A.maxsize > RESIZE_THRESHOLD + # large arrays (>100MB): grow by fixed +32 MiB increments + max(n, cap + RESIZE_INCREMENT ÷ aligned_sizeof(T)) + else + # small arrays (<100MB): double in size + max(n, 2 * length(A)) + end + # ... allocates new buffer, copies data ... + end + # If within capacity: just update length, no reallocation +end +``` + +### Key Findings: + +| Aspect | CUDA.jl CuVector | +|--------|------------------| +| **Capacity tracking** | Yes, via `A.maxsize` | +| **Shrink threshold** | `n < cap ÷ 4` (25%) | +| **Shrink behavior** | Reallocates to EXACT new size | +| **Growth (small <100MB)** | 2x doubling | +| **Growth (large ≥100MB)** | +32 MiB increments | + +### CUDA.jl Memory Management: +- CUDA.jl uses a memory pool (stream-ordered or binned allocator) +- Released memory goes back to pool, not immediately to OS/driver +- `CUDA.reclaim()` forces return to driver +- Pool may return same block on regrow (observed in verification tests) + +--- + +## Current Pool Design Trade-offs + +### Current Approach: Never Shrink +```julia +# In get_view!: +if length(vec) < total_len + resize!(vec, total_len) # Only grow, never shrink +end +new_view = view(vec, 1:total_len) # View handles size +``` + +**Pros**: +- Simple implementation +- Avoids any potential reallocation costs +- Views already handle returning correct size + +**Cons**: +- One large allocation permanently increases memory footprint +- GPU memory is precious and limited +- No way to recover memory without `empty!(pool)` + +### Alternative: Shrink When Significantly Smaller +```julia +if length(vec) < total_len + resize!(vec, total_len) +elseif length(vec) > total_len * 4 # Example: 4x threshold + resize!(vec, total_len) # Shrink to save memory +end +``` + +**Pros**: +- Recovers memory from outlier large allocations +- Better memory efficiency over time + +**Cons**: +- May cause reallocations +- Added complexity +- Need to invalidate cached views on shrink too + +--- + +## Specific Questions for Review + +1. **Julia Vector capacity**: + - Does `resize!(v, smaller)` preserve capacity? + - Is this behavior documented/guaranteed? + - Is there a way to query capacity vs length? + +2. **CuVector resize behavior**: + - Does CUDA.jl's CuVector follow same capacity model? + - What happens to GPU memory on shrink? + - Does CUDA memory pool make shrink "free" anyway? + +3. **Design recommendation**: + - Should pools shrink vectors at some threshold? + - What threshold makes sense? (2x? 4x? 10x?) + - Should CPU and GPU have different policies? + +4. **Memory pressure handling**: + - Should pool respond to memory pressure signals? + - Is there a way to detect "memory is tight"? + +--- + +## Test Code to Verify Behavior + +```julia +# CPU Test +function test_cpu_resize_behavior() + v = Vector{Float64}(undef, 10_000_000) # ~80MB + @show Base.summarysize(v) + + resize!(v, 100) + @show Base.summarysize(v) # Does this shrink? 
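+    # (Caveat: summarysize scales with the array's length, so it likely won't
+    #  reveal any spare capacity the underlying buffer may still retain here.)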
+ + resize!(v, 5_000_000) + @show Base.summarysize(v) # Reallocation needed? + + # Is there a way to check capacity? +end + +# GPU Test +function test_gpu_resize_behavior() + CUDA.reclaim() # Start clean + + v = CUDA.zeros(Float64, 10_000_000) # ~80MB GPU + @show CUDA.memory_status() + + resize!(v, 100) + @show CUDA.memory_status() # Memory returned to pool? + + resize!(v, 5_000_000) + @show CUDA.memory_status() # New allocation? +end +``` + +--- + +## Related: View Cache Invalidation + +Currently, when `resize!` is called (grow only), we invalidate all cached views: + +```julia +if length(vec) < total_len + resize!(vec, total_len) + # Invalidate all N-way cache entries for this slot + for k in 1:CUDA_CACHE_WAYS + @inbounds tp.views[base + k] = nothing + @inbounds tp.view_dims[base + k] = nothing + end +end +``` + +If we add shrinking, same invalidation would be needed since shrink can also reallocate. + +--- + +## Summary + +### VERIFIED Results + +| Aspect | CPU Vector | GPU CuVector | +|--------|------------|--------------| +| **Capacity tracking** | Yes (implicit) | Yes (`A.maxsize`) | +| **Capacity preservation on shrink** | Yes (pointer unchanged) | No (reallocates at 25%) | +| **Memory returned on shrink** | No (until GC) | To pool (can be reclaimed) | +| **Regrow cost after shrink** | O(1) within capacity | May realloc (pool often returns same block) | +| **CUDA.jl shrink threshold** | N/A | `n < cap ÷ 4` (25%) | + +### Design Recommendation for AdaptiveArrayPools + +**Current "never shrink" is suboptimal for GPU.** CUDA.jl already implements a 25% threshold, meaning: + +1. **Our explicit `resize!(vec, smaller)` calls would trigger CUDA.jl's internal shrink anyway** if below 25% +2. **We're just deferring the inevitable reallocation** when usage drops significantly +3. **GPU memory is precious** - holding 4x+ more than needed is wasteful + +**Recommendation**: Add lazy shrink for GPU at 25% threshold (matching CUDA.jl): + +```julia +# In get_view! for CuTypedPool: +cap = length(vec) +if total_len > cap + resize!(vec, total_len) # Grow + # invalidate cache... +elseif total_len < cap ÷ 4 + resize!(vec, total_len) # Shrink when using <25% capacity + # invalidate cache... +end +``` + +**Why 25%?** +- Matches CUDA.jl's internal threshold +- Consistent behavior - calling resize! directly would shrink at same point +- Allows 4x variation without reallocation (handles typical size fluctuations) +- Recovers memory from outlier large allocations From 7384f84d512ab0fb05db4c671f0436eea58ffd4f Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 10:40:03 -0800 Subject: [PATCH 3/8] docs: merge hybrid API design docs into single file - Merge new_hybrid_api_design.md into hybrid_api_design.md - Add Implementation Specification section (struct layout, N-way algorithm) - Add API Aliases section (acquire_view!, acquire_array!) 
- Add Verification Checklist - Remove redundant new_hybrid_api_design.md --- docs/design/hybrid_api_design.md | 140 +++++++++++++++++++++------ docs/design/new_hybrid_api_design.md | 140 --------------------------- 2 files changed, 109 insertions(+), 171 deletions(-) delete mode 100644 docs/design/new_hybrid_api_design.md diff --git a/docs/design/hybrid_api_design.md b/docs/design/hybrid_api_design.md index a407e72..23fa851 100644 --- a/docs/design/hybrid_api_design.md +++ b/docs/design/hybrid_api_design.md @@ -132,6 +132,24 @@ end - Array is already a heap-allocated object → no additional allocation when reusing cached instance - Avoids wrapper object optimization issues in type-unspecified paths +### API Aliases + +For clarity, explicit aliases are provided: + +```julia +# Main APIs +export acquire!, unsafe_acquire! + +# Explicit Aliases +export acquire_view!, acquire_array! + +"""Alias for [`acquire!`](@ref). Returns a ReshapedArray (View).""" +const acquire_view! = acquire! + +"""Alias for [`unsafe_acquire!`](@ref). Returns an Array (via unsafe_wrap).""" +const acquire_array! = unsafe_acquire! +``` + --- ## Comparison Matrix @@ -155,6 +173,84 @@ end --- +## Implementation Specification + +### Data Structure Changes (types.jl) + +#### Constants +```julia +const CACHE_WAYS = 4 +``` + +#### TypedPool Struct Layout + +```julia +mutable struct TypedPool{T} + # --- Backing Storage --- + vectors::Vector{Vector{T}} + + # --- 1D Cache (Simple 1-way or Direct) --- + views::Vector{SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int}}, true}} + view_lengths::Vector{Int} + + # --- N-D Array Cache (N-way Set Associative) --- + # Layout: Flat Vector. Index = (slot_idx - 1) * CACHE_WAYS + way_idx + nd_arrays::Vector{Any} # Stores Array{T, N} + nd_dims::Vector{Any} # Stores NTuple{N, Int} + nd_ptrs::Vector{UInt} # Stores objectid/pointer for validation + nd_next_way::Vector{Int} # Round-Robin counter per slot (1 per slot) + + # --- State --- + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end +``` + +**Key Changes**: +- **Remove**: `nd_views` (No longer needed as `acquire!` returns `ReshapedArray`) +- **Update**: `nd_arrays`, `nd_dims`, `nd_ptrs` store `CACHE_WAYS` items per active slot +- **Add**: `nd_next_way::Vector{Int}` for Round-Robin replacement index per slot + +### Logic Implementation (core.jl) + +#### `acquire!` (The Fast Path) + +**Goal**: Always return `ReshapedArray`. No N-D cache lookup. + +```julia +@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + len = safe_prod(dims) + flat_view = get_view!(tp, len) + return reshape(flat_view, dims) +end +``` + +#### `unsafe_acquire!` (The N-way Cache Path) + +**Goal**: Return `Array`. Use N-way cache with Linear Search + Round-Robin Replacement. + +**Algorithm**: +1. Get 1D view: `flat_view = get_view!(tp, prod(dims))` +2. Get current pointer: `current_ptr = UInt(pointer(flat_view))` +3. Calculate Base Index: `base = (tp.n_active - 1) * CACHE_WAYS` +4. **Search (Hit Check)**: + - Loop `k` from `1` to `CACHE_WAYS` + - Check if `nd_dims[base + k] == dims` **AND** `nd_ptrs[base + k] == current_ptr` + - If match: Return `nd_arrays[base + k]` +5. 
**Miss (Replacement)**: + - Get victim way from `nd_next_way[tp.n_active]` + - Target Index: `target = base + victim_way + 1` + - Create Array: `arr = unsafe_wrap(Array{T, N}, pointer(flat_view), dims)` + - **Update Cache**: + - `nd_arrays[target] = arr` + - `nd_dims[target] = dims` + - `nd_ptrs[target] = current_ptr` + - **Update Round-Robin**: Increment `nd_next_way` (modulo `CACHE_WAYS`) + - Return `arr` + +--- + ## Implementation Plan ### Phase 1: Simplify `acquire!` N-D Path @@ -181,11 +277,6 @@ end end ``` -**Change Summary**: -- Remove `get_nd_array!` call -- Directly return `reshape(1D_view, dims)` -- N-D view cache (`nd_views`) not used (in acquire! path) - ### Phase 2: Maintain `unsafe_acquire!` Cache **No changes** - maintain current implementation: @@ -193,32 +284,9 @@ end - Maintain N-way cache (4-way) - 112 bytes allocation on cache miss -### Phase 3: TypedPool Field Cleanup (Optional) - -Since `acquire!` no longer uses N-D cache, redefine field purposes: +### Phase 3: TypedPool Field Updates -```julia -mutable struct TypedPool{T} - # Storage - vectors::Vector{Vector{T}} - - # 1D Cache (shared by acquire! 1D + acquire! N-D) - views::Vector{SubArray{...}} - view_lengths::Vector{Int} - - # N-D Cache (unsafe_acquire! only) - nd_arrays::Vector{Any} # Array objects for unsafe_acquire! - nd_dims::Vector{Any} # Dimension tuples - nd_ptrs::Vector{UInt} # Pointer validation - - # Note: nd_views can be removed (acquire! uses reshape) - - # State - n_active::Int - _checkpoint_n_active::Vector{Int} - _checkpoint_depths::Vector{Int} -end -``` +Update struct as specified in Implementation Specification above. ### Phase 4: Test Updates @@ -331,6 +399,17 @@ end --- +## Verification Checklist + +1. **Type Check**: `acquire!` must return `ReshapedArray`. `unsafe_acquire!` must return `Array`. +2. **Allocation Check**: + - `acquire!`: 0 allocations always + - `unsafe_acquire!`: 0 allocations on cache hit + - `unsafe_acquire!`: 0 allocations on interleaved access (e.g., alternating 10x10 and 20x20) thanks to N-way cache +3. **Safety**: Ensure `unsafe_acquire!` validates pointers (re-wraps if the backing vector was resized) + +--- + ## TurbulentTransport Integration ### Changed File: `src/tglf_nn.jl` @@ -438,4 +517,3 @@ end ## References - [nd_array_approach_comparison.md](./nd_array_approach_comparison.md) - Benchmark results and boxing analysis -- [PR_MESSAGE.md](../PR_MESSAGE.md) - Original PR description diff --git a/docs/design/new_hybrid_api_design.md b/docs/design/new_hybrid_api_design.md deleted file mode 100644 index 14cd235..0000000 --- a/docs/design/new_hybrid_api_design.md +++ /dev/null @@ -1,140 +0,0 @@ -# Design Spec: Hybrid N-way Cache & ReshapedArray Strategy - -> **Note**: This document was written as a clear, concrete specification that another AI can -> implement mechanically without needing to think through the design. - ---- - -## 1. Objective -Refactor AdaptiveArrayPools.jl to implement a **Hybrid Allocation Strategy**: -1. **`acquire!` (Default)**: Return `ReshapedArray` (Zero-Allocation, Stack-allocated). Remove N-D caching logic for this path. -2. **`unsafe_acquire!` (Special)**: Return `Array` (via `unsafe_wrap`). Implement **N-way Set Associative Cache** to minimize `unsafe_wrap` overhead (112 bytes) and support interleaved access patterns. - -## 2. Data Structure Changes (types.jl) - -### Constants -Define the cache associativity level. 
-```julia -const CACHE_WAYS = 4 -``` - -### `TypedPool{T}` Struct -Modify fields to support N-way caching for Arrays, while removing unused View caching. - -* **Remove**: `nd_views` (No longer needed as `acquire!` returns `ReshapedArray`). -* **Update**: `nd_arrays`, `nd_dims`, `nd_ptrs`. These vectors must store `CACHE_WAYS` items per active slot. -* **Add**: `nd_next_way::Vector{Int}` (To track Round-Robin replacement index for each slot). - -**Updated Layout:** -```julia -mutable struct TypedPool{T} - # --- Backing Storage --- - vectors::Vector{Vector{T}} - - # --- 1D Cache (Simple 1-way or Direct) --- - views::Vector{SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int}}, true}} - view_lengths::Vector{Int} - - # --- N-D Array Cache (N-way Set Associative) --- - # Layout: Flat Vector. Index = (slot_idx - 1) * CACHE_WAYS + way_idx - nd_arrays::Vector{Any} # Stores Array{T, N} - nd_dims::Vector{Any} # Stores NTuple{N, Int} - nd_ptrs::Vector{UInt} # Stores objectid/pointer for validation - nd_next_way::Vector{Int} # Round-Robin counter per slot (1 per slot) - - n_active::Int - _checkpoint_n_active::Vector{Int} - _checkpoint_depths::Vector{Int} -end -``` - -### Initialization -Ensure `nd_arrays`, `nd_dims`, `nd_ptrs` are initialized with `nothing` or empty values, and `nd_next_way` with `0` or `1`. - -## 3. Logic Implementation (core.jl) - -### A. `acquire!` (The Fast Path) -**Goal**: Always return `ReshapedArray`. No N-D cache lookup. - -**Implementation**: -Modify `get_nd_view!` to: -1. Calculate total length (`prod(dims)`). -2. Call `get_view!(tp, len)` to get a 1D `SubArray`. -3. Return `reshape(flat_view, dims)`. - -```julia -@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} - len = safe_prod(dims) - flat_view = get_view!(tp, len) - return reshape(flat_view, dims) -end -``` - -### B. `unsafe_acquire!` (The N-way Path) -**Goal**: Return `Array`. Use N-way cache to avoid `unsafe_wrap`. - -**Implementation**: -Modify `get_nd_array!` to use **Linear Search + Round-Robin Replacement**. - -**Algorithm**: -1. Get 1D view: `flat_view = get_view!(tp, prod(dims))`. -2. Get current pointer: `current_ptr = UInt(pointer(flat_view))`. -3. Calculate Base Index: `base = (tp.n_active - 1) * CACHE_WAYS`. -4. **Search (Hit Check)**: - * Loop `k` from `1` to `CACHE_WAYS`. - * Check if `nd_dims[base + k] == dims` **AND** `nd_ptrs[base + k] == current_ptr`. - * If match: Return `nd_arrays[base + k]`. -5. **Miss (Replacement)**: - * Get victim way from `nd_next_way[tp.n_active]`. - * Target Index: `target = base + victim_way + 1`. - * Create Array: `arr = unsafe_wrap(Array{T, N}, pointer(flat_view), dims)`. - * **Update Cache**: - * `nd_arrays[target] = arr` - * `nd_dims[target] = dims` - * `nd_ptrs[target] = current_ptr` - * **Update Round-Robin**: Increment `nd_next_way` (modulo `CACHE_WAYS`). - * Return `arr`. - -## 4. API & Aliases (AdaptiveArrayPools.jl) - -Add explicit aliases for clarity. - -```julia -# Main APIs -export acquire!, unsafe_acquire! - -# Explicit Aliases -export acquire_view!, acquire_array! - -"""Alias for [`acquire!`](@ref). Returns a ReshapedArray (View).""" -const acquire_view! = acquire! - -"""Alias for [`unsafe_acquire!`](@ref). Returns an Array (via unsafe_wrap).""" -const acquire_array! = unsafe_acquire! -``` - -## 5. Client Integration (`TurbulentTransport.jl`) - -Update tglf_nn.jl to use the Array-returning API to avoid dynamic dispatch boxing. 
- -**File**: tglf_nn.jl -**Function**: `flux_array!` -**Change**: -```julia -# Before -xx = acquire!(pool, T, size(x)) - -# After -xx = unsafe_acquire!(pool, T, size(x)) -# OR -xx = acquire_array!(pool, T, size(x)) -``` - -## 6. Verification Checklist - -1. **Type Check**: `acquire!` must return `ReshapedArray`. `unsafe_acquire!` must return `Array`. -2. **Allocation Check**: - * `acquire!`: 0 allocations always. - * `unsafe_acquire!`: 0 allocations on cache hit. - * `unsafe_acquire!`: 0 allocations on interleaved access (e.g., alternating 10x10 and 20x20) thanks to N-way cache. -3. **Safety**: Ensure `unsafe_acquire!` validates pointers (re-wraps if the backing vector was resized). From 27ef39a1ec6ad0bff45320d0a0c8d6d6c288d3a2 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 10:46:22 -0800 Subject: [PATCH 4/8] docs: update design documents to match current codebase - fixed_slots_codegen_design.md: Update empty!() to 1-based sentinel pattern - hybrid_api_design.md: Mark changes as implemented, resolve open questions - nd_array_approach_comparison.md: Update N-way cache structure (nd_views removed) - untracked_acquire_design.md: Update _mark_untracked!(), rewind!(), replace all_type_stacks() with foreach_fixed_slot() --- docs/design/fixed_slots_codegen_design.md | 4 +- docs/design/hybrid_api_design.md | 16 +-- docs/design/nd_array_approach_comparison.md | 7 +- docs/design/untracked_acquire_design.md | 108 +++++++++++--------- 4 files changed, 77 insertions(+), 58 deletions(-) diff --git a/docs/design/fixed_slots_codegen_design.md b/docs/design/fixed_slots_codegen_design.md index c141e6b..7b57ec9 100644 --- a/docs/design/fixed_slots_codegen_design.md +++ b/docs/design/fixed_slots_codegen_design.md @@ -188,8 +188,10 @@ function Base.empty!(pool::AdaptiveArrayPool) end empty!(pool.others) - pool._current_depth = 0 + # Reset untracked detection state (1-based sentinel pattern) + pool._current_depth = 1 # 1 = global scope (sentinel) empty!(pool._untracked_flags) + push!(pool._untracked_flags, false) # Sentinel: global scope starts with false pool end ``` diff --git a/docs/design/hybrid_api_design.md b/docs/design/hybrid_api_design.md index 23fa851..3d8c84a 100644 --- a/docs/design/hybrid_api_design.md +++ b/docs/design/hybrid_api_design.md @@ -207,10 +207,10 @@ mutable struct TypedPool{T} end ``` -**Key Changes**: -- **Remove**: `nd_views` (No longer needed as `acquire!` returns `ReshapedArray`) -- **Update**: `nd_arrays`, `nd_dims`, `nd_ptrs` store `CACHE_WAYS` items per active slot -- **Add**: `nd_next_way::Vector{Int}` for Round-Robin replacement index per slot +**Implemented Changes** (compared to initial design): +- **Removed**: `nd_views` (No longer needed as `acquire!` returns `ReshapedArray`) ✓ +- **Updated**: `nd_arrays`, `nd_dims`, `nd_ptrs` store `CACHE_WAYS` items per active slot ✓ +- **Added**: `nd_next_way::Vector{Int}` for Round-Robin replacement index per slot ✓ ### Logic Implementation (core.jl) @@ -506,11 +506,11 @@ end --- -## Open Questions for Review +## Resolved Questions -1. **N-way cache retention level**: Keep current 4-way? Reduce to 2-way? -2. **nd_views field removal**: Can be removed since `acquire!` no longer uses it? -3. **Backward compatibility**: Cases where existing `acquire!` users check for `SubArray` type? +1. **N-way cache retention level**: Configurable via `CACHE_WAYS` preference (default: 4-way). ✅ +2. **nd_views field removal**: Removed. `acquire!` now returns `ReshapedArray` via `reshape()`. ✅ +3. 
**Backward compatibility**: `acquire!` returns `ReshapedArray` (a type of `AbstractArray`), maintaining API compatibility. ✅ --- diff --git a/docs/design/nd_array_approach_comparison.md b/docs/design/nd_array_approach_comparison.md index e01f308..d4a531f 100644 --- a/docs/design/nd_array_approach_comparison.md +++ b/docs/design/nd_array_approach_comparison.md @@ -257,13 +257,14 @@ acquire!(pool, Float64, 64, 100) arr = unsafe_wrap(Array{T, N}, pointer(flat_view), dims) ``` -### N-way Cache Structure +### N-way Cache Structure (Current Implementation) ```julia -# In TypedPool (src/types.jl) -nd_views::Vector{Any} # Cached SubArray objects +# In TypedPool (src/types.jl) - used by unsafe_acquire! only +# Note: nd_views was removed since acquire! now uses reshape() nd_arrays::Vector{Any} # Cached Array objects (from unsafe_wrap) nd_dims::Vector{Any} # Cached dimension tuples nd_ptrs::Vector{UInt} # Cached pointers for invalidation +nd_next_way::Vector{Int} # Round-robin counter per slot ``` --- diff --git a/docs/design/untracked_acquire_design.md b/docs/design/untracked_acquire_design.md index 7358d0e..c1568fb 100644 --- a/docs/design/untracked_acquire_design.md +++ b/docs/design/untracked_acquire_design.md @@ -167,10 +167,10 @@ end end # Untracked marking (still needed - for typed rewind vs full rewind decision) -@inline function _mark_untracked!(pool::AdaptiveArrayPool) - if pool._current_depth > 0 - @inbounds pool._untracked_flags[pool._current_depth] = true - end +# Note: 1-based sentinel pattern guarantees _current_depth >= 1, so no check needed +@inline function _mark_untracked!(pool::AbstractArrayPool) + # Always mark (_current_depth >= 1 guaranteed by sentinel) + @inbounds pool._untracked_flags[pool._current_depth] = true end ``` @@ -275,63 +275,79 @@ function rewind!(pool::AdaptiveArrayPool, types::Type...) end # Full rewind (untracked fallback - simplified with _checkpoint_depths!) +# Note: Uses 1-based sentinel pattern. At global scope (depth=1), delegates to reset!() function rewind!(pool::AdaptiveArrayPool) - depth = pool._current_depth - for tp in all_type_stacks(pool) - if !isempty(tp._checkpoint_depths) && tp._checkpoint_depths[end] == depth - # Checkpointed at current depth → pop - pop!(tp._checkpoint_depths) - tp.n_active = pop!(tp._checkpoint_n_active) - elseif !isempty(tp._checkpoint_n_active) - # Checkpointed at previous depth → restore without pop - tp.n_active = tp._checkpoint_n_active[end] - elseif tp.n_active > 0 - # ⚠️ CRITICAL ERROR: Would destroy arrays outside @with_pool - T = eltype(tp) - error(""" - [AdaptiveArrayPools] Cannot rewind type $T: no checkpoint exists. - Found $(tp.n_active) active array(s) that were never checkpointed. - - Fix: Wrap the scope where $T was first acquired in @with_pool. - """) - end - # else: _checkpoint_n_active empty and n_active == 0 → normal, do nothing + cur_depth = pool._current_depth + + # Safety guard: at global scope (depth=1), no checkpoint to rewind to + # Delegate to reset! 
which safely clears all n_active counters + if cur_depth == 1 + reset!(pool) + return nothing end + + # Fixed slots - zero allocation via @generated iteration + foreach_fixed_slot(pool) do tp + _rewind_typed_pool!(tp, cur_depth) + end + + # Process fallback types + for tp in values(pool.others) + _rewind_typed_pool!(tp, cur_depth) + end + pop!(pool._untracked_flags) pool._current_depth -= 1 + return nothing +end + +# Internal helper for rewind with orphan cleanup +# Uses 1-based sentinel pattern: no isempty checks needed +@inline function _rewind_typed_pool!(tp::AbstractTypedPool, current_depth::Int) + # 1. Orphaned Checkpoints Cleanup + while @inbounds tp._checkpoint_depths[end] > current_depth + pop!(tp._checkpoint_depths) + pop!(tp._checkpoint_n_active) + end + + # 2. Normal Rewind Logic (Sentinel Pattern) + if @inbounds tp._checkpoint_depths[end] == current_depth + pop!(tp._checkpoint_depths) + tp.n_active = pop!(tp._checkpoint_n_active) + else + # No checkpoint at current depth - restore from parent + tp.n_active = @inbounds tp._checkpoint_n_active[end] + end + nothing end ``` **Key Improvement:** `_checkpoint_depths[end] == depth` comparison enables accurate pop/restore decision → `_full_rewind_with_types!` not needed! -### 4.5 all_type_stacks Implementation +### 4.5 Zero-Allocation Iteration (Current Implementation) + +> **Note**: The design originally proposed `all_type_stacks()` generator, but was replaced with +> `foreach_fixed_slot()` @generated function for zero allocation via compile-time unrolling. ```julia -# Generator to iterate all TypedPools (fixed slots + others) -function all_type_stacks(pool::AdaptiveArrayPool) - return Iterators.flatten(( - # Fixed slots (7) - (pool.float64, pool.float32, pool.int64, pool.int32, pool.complexf64, pool.complexf32, pool.bool), - # Others (IdDict values) - values(pool.others) - )) -end - -# Or callback pattern (more efficient, no allocation) -@inline function foreach_type_stack(f, pool::AdaptiveArrayPool) - f(pool.float64) - f(pool.float32) - f(pool.int64) - f(pool.int32) - f(pool.complexf64) - f(pool.complexf32) - f(pool.bool) - for tp in values(pool.others) - f(tp) +# Current implementation uses @generated for zero allocation +const FIXED_SLOT_FIELDS = (:float64, :float32, :int64, :int32, :complexf64, :complexf32, :bool) + +@generated function foreach_fixed_slot(f::F, pool::AdaptiveArrayPool) where {F} + exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in FIXED_SLOT_FIELDS] + quote + Base.@_inline_meta + $(exprs...) + nothing end end ``` +**Benefits over generator approach**: +- Zero allocation via compile-time unrolling +- Full inlining for hot paths +- No runtime iteration overhead + ### 4.6 Macro Generated Code ```julia From 5049263af14c4e034f066891244bbc54eeb10bee Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 10:53:09 -0800 Subject: [PATCH 5/8] docs: update Further Reading with GitHub links and English descriptions --- docs/src/advanced/internals.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/src/advanced/internals.md b/docs/src/advanced/internals.md index 3722203..a31ec81 100644 --- a/docs/src/advanced/internals.md +++ b/docs/src/advanced/internals.md @@ -1,6 +1,6 @@ # Internals -This page provides an overview of the internal architecture of AdaptiveArrayPools.jl. For detailed design documents (in Korean), see the `design/` folder in the repository. +This page provides an overview of the internal architecture of AdaptiveArrayPools.jl. 
For detailed design documents, see the [`docs/design/`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/tree/master/docs/design) folder in the repository. ## Checkpoint/Rewind Lifecycle @@ -128,9 +128,10 @@ This pattern reduces branching in hot paths where every nanosecond counts. ## Further Reading -For detailed design documents (in Korean): -- `design/hybrid_api_design.md` — Two-API strategy rationale -- `design/cuda_extension_design.md` — CUDA backend architecture -- `design/untracked_acquire_design.md` — Untracked acquire detection -- `design/fixed_slots_codegen_design.md` — Code generation for fixed slots -- `design/nd_array_approach_comparison.md` — N-way cache design comparison +For detailed design documents: + +- [`hybrid_api_design.md`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/hybrid_api_design.md) — Two-API strategy (`acquire!` vs `unsafe_acquire!`) and type stability analysis +- [`nd_array_approach_comparison.md`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/nd_array_approach_comparison.md) — N-way cache design, boxing analysis, and ReshapedArray benchmarks +- [`untracked_acquire_design.md`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/untracked_acquire_design.md) — Macro-based untracked acquire detection and 1-based sentinel pattern +- [`fixed_slots_codegen_design.md`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/fixed_slots_codegen_design.md) — Zero-allocation iteration via `@generated` functions +- [`cuda_extension_design.md`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/cuda_extension_design.md) — CUDA backend architecture and extension loading From d807d05b45d740975aa0a6bb08c98553be6261f2 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 11:00:14 -0800 Subject: [PATCH 6/8] refactor: move docs to docs/src/, update README links to GitHub Pages - Remove copy logic from make.jl (files now live in docs/src/) - Delete redundant docs/*.md files (content is in docs/src/**) - Update README links to point to GitHub Pages stable URLs - Simplify documentation build process --- README.md | 20 +-- docs/api.md | 111 ---------------- docs/configuration.md | 102 --------------- docs/cuda.md | 123 ----------------- docs/make.jl | 49 +------ docs/maybe_with_pool.md | 53 -------- docs/multi-threading.md | 284 ---------------------------------------- docs/safety.md | 110 ---------------- 8 files changed, 17 insertions(+), 835 deletions(-) delete mode 100644 docs/api.md delete mode 100644 docs/configuration.md delete mode 100644 docs/cuda.md delete mode 100644 docs/maybe_with_pool.md delete mode 100644 docs/multi-threading.md delete mode 100644 docs/safety.md diff --git a/README.md b/README.md index a904175..248854a 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ end | Allocations | ⚠️ 90,000 (2.75 GiB) | ✅ **0** | 100% eliminated | | GC Time | ⚠️ 31% | ✅ **0%** | No GC pauses | -> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](docs/cuda.md). +> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/cuda). ## How It Works @@ -76,11 +76,11 @@ end This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors. 
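A minimal sketch of that pattern (illustrative only; the function name and sizes are placeholders):

```julia
@with_pool pool function mean_of_ones(n)
    v = acquire!(pool, Float64, n)   # instead of Vector{Float64}(undef, n)
    fill!(v, 1.0)
    return sum(v) / n                # return a computed value, not the pooled array
end

mean_of_ones(1_000)   # repeated calls reuse the same pool memory (zero allocation after warmup)
```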
-`acquire!` returns lightweight views (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK. If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`—see [API Reference](docs/api.md). +`acquire!` returns lightweight views (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK. If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`—see [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api). -> **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](docs/safety.md). +> **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/guide/safety). -**Thread-safe by design**: Each Julia Task gets its own independent pool—no locks needed. See [Multi-Threading](docs/multi-threading.md) for patterns. +**Thread-safe by design**: Each Julia Task gets its own independent pool—no locks needed. See [Multi-Threading](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/advanced/multi-threading) for patterns. ### Convenience Functions @@ -92,7 +92,7 @@ Common initialization patterns have convenience functions: | `ones!(pool, Float32, 3, 3)` | `acquire!` + `fill!(1)` | | `similar!(pool, A)` | `acquire!` matching `eltype(A)`, `size(A)` | -These return views like `acquire!`. For raw `Array` types, use `unsafe_acquire!` or its convenience variants (`unsafe_zeros!`, `unsafe_ones!`, `unsafe_similar!`). See [API Reference](docs/api.md#convenience-functions). +These return views like `acquire!`. For raw `Array` types, use `unsafe_acquire!` or its convenience variants (`unsafe_zeros!`, `unsafe_ones!`, `unsafe_similar!`). See [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api#convenience-functions). ## Installation @@ -106,11 +106,11 @@ Pkg.add("AdaptiveArrayPools") | Guide | Description | |-------|-------------| -| [API Reference](docs/api.md) | Complete function and macro reference | -| [CUDA Backend](docs/cuda.md) | GPU-specific usage and examples | -| [Safety Guide](docs/safety.md) | Scope rules and best practices | -| [Multi-Threading](docs/multi-threading.md) | Task/thread safety patterns | -| [Configuration](docs/configuration.md) | Preferences and cache tuning | +| [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api) | Complete function and macro reference | +| [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/cuda) | GPU-specific usage and examples | +| [Safety Guide](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/guide/safety) | Scope rules and best practices | +| [Multi-Threading](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/advanced/multi-threading) | Task/thread safety patterns | +| [Configuration](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/configuration) | Preferences and cache tuning | ## License diff --git a/docs/api.md b/docs/api.md deleted file mode 100644 index 4d3eb8c..0000000 --- a/docs/api.md +++ /dev/null @@ -1,111 +0,0 @@ -# API Reference - -## Macros - -| Macro | Description | -|-------|-------------| -| `@with_pool name expr` | **Recommended.** Injects a global, task-local pool named `name`. 
Automatically checkpoints and rewinds. | -| `@maybe_with_pool name expr` | Same as `@with_pool`, but can be toggled on/off at runtime via `MAYBE_POOLING_ENABLED[]`. | - -## Functions - -| Function | Description | -|----------|-------------| -| `acquire!(pool, T, dims...)` | Returns a view: `SubArray{T,1}` for 1D, `ReshapedArray{T,N}` for N-D. Always 0 bytes. | -| `acquire!(pool, T, dims::Tuple)` | Tuple overload for `acquire!` (e.g., `acquire!(pool, T, size(x))`). | -| `acquire!(pool, x::AbstractArray)` | Similar-style: acquires array matching `eltype(x)` and `size(x)`. | -| `unsafe_acquire!(pool, T, dims...)` | Returns native `Array`/`CuArray` (CPU: `Vector{T}` for 1D, `Array{T,N}` for N-D). Only for FFI/type constraints. | -| `unsafe_acquire!(pool, T, dims::Tuple)` | Tuple overload for `unsafe_acquire!`. | -| `unsafe_acquire!(pool, x::AbstractArray)` | Similar-style: acquires raw array matching `eltype(x)` and `size(x)`. | -| `acquire_view!(pool, T, dims...)` | Alias for `acquire!`. Returns view types. | -| `acquire_array!(pool, T, dims...)` | Alias for `unsafe_acquire!`. Returns Array for N-D. | -| `checkpoint!(pool)` | Saves the current pool state (stack pointer). | -| `checkpoint!(pool, T...)` | Type-specific checkpoint for optimized performance. | -| `rewind!(pool)` | Restores the pool to the last checkpoint, freeing all arrays acquired since then. | -| `rewind!(pool, T...)` | Type-specific rewind for optimized performance. | -| `pool_stats(pool)` | Prints detailed statistics about pool usage. | -| `get_task_local_pool()` | Returns the task-local pool instance. | -| `empty!(pool)` | Clears all internal storage, releasing all memory. | - -## Convenience Functions - -Shortcuts for common `acquire!` + initialization patterns. Default element type is `Float64` (CPU) or `Float32` (CUDA). - -### View-returning (like `acquire!`) - -| Function | Description | -|----------|-------------| -| `zeros!(pool, [T,] dims...)` | Zero-initialized view. Equivalent to `acquire!` + `fill!(0)`. | -| `ones!(pool, [T,] dims...)` | One-initialized view. Equivalent to `acquire!` + `fill!(1)`. | -| `similar!(pool, A)` | View matching `eltype(A)` and `size(A)`. | -| `similar!(pool, A, T)` | View with type `T`, size from `A`. | -| `similar!(pool, A, dims...)` | View with `eltype(A)`, specified dimensions. | -| `similar!(pool, A, T, dims...)` | View with type `T`, specified dimensions. | - -### Array-returning (like `unsafe_acquire!`) - -| Function | Description | -|----------|-------------| -| `unsafe_zeros!(pool, [T,] dims...)` | Zero-initialized raw `Array`. | -| `unsafe_ones!(pool, [T,] dims...)` | One-initialized raw `Array`. | -| `unsafe_similar!(pool, A, ...)` | Raw `Array` with same signatures as `similar!`. | - -All convenience functions support tuple dimensions: `zeros!(pool, (3, 4))`. - -**CUDA note**: Default type is `Float32` to match `CUDA.zeros()` behavior. - -## Types - -| Type | Description | -|------|-------------| -| `AdaptiveArrayPool` | The main pool type. Create with `AdaptiveArrayPool()`. | -| `DisabledPool{Backend}` | Sentinel type when pooling is disabled. Preserves backend context (`:cpu` or `:cuda`). | - -## Utility Functions - -| Function | Description | -|----------|-------------| -| `pooling_enabled(pool)` | Returns `true` if pool is active, `false` if `DisabledPool`. Use instead of `pool === nothing`. | -| `default_eltype(pool)` | Returns default element type: `Float64` (CPU) or `Float32` (CUDA). 
| - -## Constants - -| Constant | Description | -|----------|-------------| -| `USE_POOLING` | Compile-time constant. Set via `Preferences.jl` to disable all pooling. | -| `MAYBE_POOLING_ENABLED` | Runtime `Ref{Bool}`. Only affects `@maybe_with_pool`. | -| `POOL_DEBUG` | Runtime `Ref{Bool}`. Enable safety validation for debugging. | -| `CACHE_WAYS` | Compile-time constant. N-way cache size for `unsafe_acquire!` (default: 4, range: 1-16). | - -## Configuration Functions - -| Function | Description | -|----------|-------------| -| `set_cache_ways!(n)` | Set N-way cache size. Requires Julia restart. | - -## Safety Notes - -Arrays acquired from a pool are **only valid within the `@with_pool` scope**. Do not: -- Return pool-backed arrays from functions -- Store them in global variables -- Capture them in closures that outlive the scope -- Call `resize!`, `push!`, or `append!` on arrays from `unsafe_acquire!` - -Use `POOL_DEBUG[] = true` during development to catch direct returns of pool-backed arrays. - -## `acquire!` vs `unsafe_acquire!` - -| Function | 1D Return | N-D Return | Allocation | -|----------|-----------|------------|------------| -| `acquire!` | `SubArray{T,1}` | `ReshapedArray{T,N}` | Always 0 bytes (stack-based views) | -| `unsafe_acquire!` | `Vector{T}` | `Array{T,N}` | 0 bytes (hit) / ~100 bytes header (miss) | - -Both share the same underlying pool memory. Even on cache miss, only the `Array` header is allocated—**data memory is always reused from the pool**. **Use `acquire!` by default**—BLAS/LAPACK are fully optimized for `StridedArray`, so there's no performance difference. - -Use `unsafe_acquire!` only when you need a concrete `Array` type (FFI, type signatures, runtime dispatch). - -**Caching**: -- `acquire!` 1D uses simple 1:1 cache (reuses `SubArray` if same length) -- `unsafe_acquire!` (all dimensions) uses N-way cache (up to `CACHE_WAYS`, default: 4) per slot; exceeding this causes eviction - -> **Header size by dimensionality**: The `~100 bytes` is an average. Actual `Array` header allocation varies: 1D → 80 bytes, 2D-3D → 112 bytes, 4D-5D → 144 bytes. This is Julia's internal `Array` metadata; actual data memory is always reused from the pool. diff --git a/docs/configuration.md b/docs/configuration.md deleted file mode 100644 index fdc1d16..0000000 --- a/docs/configuration.md +++ /dev/null @@ -1,102 +0,0 @@ -# Configuration - -AdaptiveArrayPools can be configured via `LocalPreferences.toml`: - -```toml -[AdaptiveArrayPools] -use_pooling = false # ⭐ Primary: Disable pooling entirely -cache_ways = 8 # Advanced: N-way cache size (default: 4) -``` - -## Compile-time: USE_POOLING (⭐ Primary) - -**The most important configuration.** Completely disable pooling to make `acquire!` behave like standard allocation. 
- -```toml -# LocalPreferences.toml -[AdaptiveArrayPools] -use_pooling = false -``` - -Or programmatically: - -```julia -using Preferences -Preferences.set_preferences!(AdaptiveArrayPools, "use_pooling" => false) -# Restart Julia for changes to take effect -``` - -When `USE_POOLING = false`: -- `pool` becomes `DisabledPool{backend}()` instead of an active pool -- All pool functions fall back to standard allocation -- Backend context is preserved: `:cuda` still returns `CuArray` - -```julia -# These become equivalent: -@with_pool pool acquire!(pool, Float64, n, n) → Matrix{Float64}(undef, n, n) -@with_pool pool acquire!(pool, Float64, n) → Vector{Float64}(undef, n) - -# With CUDA backend: -@with_pool :cuda pool zeros!(pool, 100) → CUDA.zeros(Float32, 100) -``` - -Use `pooling_enabled(pool)` to check if pooling is active. - -**Use cases:** -- **Debugging**: Compare behavior with/without pooling -- **Benchmarking**: Measure pooling overhead vs direct allocation -- **Gradual adoption**: Add `@with_pool` annotations now, enable pooling later -- **CI/Testing**: Run tests without pooling to isolate issues - -All pooling code is **completely eliminated at compile time** (zero overhead). - -## Runtime: MAYBE_POOLING_ENABLED - -Only affects `@maybe_with_pool`. Toggle without restart. - -```julia -MAYBE_POOLING_ENABLED[] = false # Disable -MAYBE_POOLING_ENABLED[] = true # Enable (default) -``` - -## Runtime: POOL_DEBUG - -Enable safety validation to catch direct returns of pool-backed arrays. - -```julia -POOL_DEBUG[] = true # Enable safety checks (development) -POOL_DEBUG[] = false # Disable (default, production) -``` - -When enabled, returning a pool-backed array from a `@with_pool` block will throw an error. - -## Compile-time: CACHE_WAYS - -Configure the N-way cache size for `unsafe_acquire!`. Higher values reduce cache eviction but increase memory per slot. - -```toml -# LocalPreferences.toml -[AdaptiveArrayPools] -cache_ways = 8 # Default: 4, Range: 1-16 -``` - -Or programmatically: - -```julia -using AdaptiveArrayPools -set_cache_ways!(8) -# Restart Julia for changes to take effect -``` - -**When to increase**: If your code alternates between more than 4 dimension patterns per pool slot, increase `cache_ways` to avoid cache eviction (~100 bytes header per miss). - -> **Scope**: `cache_ways` affects **all `unsafe_acquire!`** calls (including 1D). Only `acquire!` 1D uses simple 1:1 caching. - -## Summary - -| Setting | Scope | Restart? | Priority | Affects | -|---------|-------|----------|----------|---------| -| `use_pooling` | Compile-time | Yes | ⭐ Primary | All macros, `acquire!` behavior | -| `cache_ways` | Compile-time | Yes | Advanced | `unsafe_acquire!` N-D caching | -| `MAYBE_POOLING_ENABLED` | Runtime | No | Optional | `@maybe_with_pool` only | -| `POOL_DEBUG` | Runtime | No | Debug | Safety validation | diff --git a/docs/cuda.md b/docs/cuda.md deleted file mode 100644 index c5778c8..0000000 --- a/docs/cuda.md +++ /dev/null @@ -1,123 +0,0 @@ -# CUDA Backend - -AdaptiveArrayPools provides native CUDA support through a package extension that loads automatically when CUDA.jl is available. 
- -## Quick Start - -```julia -using AdaptiveArrayPools, CUDA - -# Use :cuda backend for GPU arrays -@with_pool :cuda pool function gpu_computation(n) - A = acquire!(pool, Float64, n, n) # CuArray view - B = acquire!(pool, Float64, n, n) # CuArray view - - fill!(A, 1.0) - fill!(B, 2.0) - - return sum(A .+ B) -end - -# Zero GPU allocation in hot loops -for i in 1:1000 - gpu_computation(100) # GPU memory reused from pool -end -``` - -## API - -The CUDA backend uses the same API as CPU, with `:cuda` backend specifier: - -| Macro/Function | Description | -|----------------|-------------| -| `@with_pool :cuda pool expr` | GPU pool with automatic checkpoint/rewind | -| `acquire!(pool, T, dims...)` | Returns `CuArray` view (always 0 bytes GPU alloc) | -| `unsafe_acquire!(pool, T, dims...)` | Returns raw `CuArray` (for FFI/type constraints) | -| `get_task_local_cuda_pool()` | Returns the task-local CUDA pool | -| `pool_stats(:cuda)` | Print CUDA pool statistics | - -## Return Types - -| Function | 1D Return | N-D Return | -|----------|-----------|------------| -| `acquire!` | `CuArray{T,1}` (view) | `CuArray{T,N}` (view) | -| `unsafe_acquire!` | `CuArray{T,1}` | `CuArray{T,N}` | - -## Allocation Behavior - -**GPU Memory**: Always 0 bytes allocation after warmup. The underlying `CuVector` is resized as needed and reused. - -**CPU Memory**: -- Cache hit (≤4 dimension patterns per slot): 0 bytes -- Cache miss (>4 patterns): ~100 bytes for wrapper metadata - -```julia -# Example: 4 patterns fit in 4-way cache → zero CPU allocation -dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) -for dims in dims_list - @with_pool :cuda p begin - A = acquire!(p, Float64, dims...) - # Use A... - end -end -``` - -## Fixed Slot Types - -Optimized types with pre-allocated slots (same as CPU): - -| Type | Field | -|------|-------| -| `Float64` | `.float64` | -| `Float32` | `.float32` | -| `Float16` | `.float16` | -| `Int64` | `.int64` | -| `Int32` | `.int32` | -| `ComplexF64` | `.complexf64` | -| `ComplexF32` | `.complexf32` | -| `Bool` | `.bool` | - -Other types use the fallback dictionary (`.others`). - -## Limitations - -- **No `@maybe_with_pool :cuda`**: Runtime toggle not supported for CUDA backend -- **Task-local only**: Each Task gets its own CUDA pool, same as CPU -- **Same device**: All arrays in a pool use the same CUDA device - -## Example: Matrix Multiplication - -```julia -using AdaptiveArrayPools, CUDA, LinearAlgebra - -@with_pool :cuda pool function gpu_matmul(n) - A = acquire!(pool, Float64, n, n) - B = acquire!(pool, Float64, n, n) - C = acquire!(pool, Float64, n, n) - - rand!(A); rand!(B) - mul!(C, A, B) - - return sum(C) -end - -# Warmup -gpu_matmul(100) - -# Benchmark - zero GPU allocation -using BenchmarkTools -@benchmark gpu_matmul(1000) -``` - -## Debugging - -```julia -# Check pool state -pool_stats(:cuda) - -# Output: -# CuAdaptiveArrayPool (device 0) -# Float64 (fixed) [GPU] -# slots: 3 (active: 0) -# elements: 30000 (234.375 KiB) -``` diff --git a/docs/make.jl b/docs/make.jl index 7517dbd..2147369 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,24 +15,17 @@ function write_if_changed(path::String, content::String) write(path, content) end -""" -Copy file only if content changed (prevents mtime update triggering rebuild). 
-""" -function cp_if_changed(src::String, dst::String) - if isfile(dst) && read(src) == read(dst) - return # Content unchanged, skip copy - end - cp(src, dst; force=true) -end - # ============================================ # Helper: Rewrite relative paths in README # ============================================ +const GITHUB_PAGES_BASE = "https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable" +const REPO_URL = "https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl" + """ Rewrite relative paths in README.md for Documenter structure. -Converts: +Converts GitHub repo links to internal Documenter links: - `docs/api.md` → `usage/api.md` - `docs/cuda.md` → `usage/cuda.md` - `docs/safety.md` → `guide/safety.md` @@ -43,8 +36,6 @@ Converts: Also handles anchor links (e.g., `docs/api.md#convenience-functions`). """ function rewrite_readme_paths(content::String) - repo_url = "https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl" - # Usage docs (with optional anchors) content = replace(content, r"\(docs/api\.md(#[^)]+)?\)" => s"(usage/api.md\1)") content = replace(content, r"\(docs/cuda\.md(#[^)]+)?\)" => s"(usage/cuda.md\1)") @@ -58,50 +49,24 @@ function rewrite_readme_paths(content::String) content = replace(content, r"\(docs/multi-threading\.md(#[^)]+)?\)" => s"(advanced/multi-threading.md\1)") # LICENSE link → GitHub - content = replace(content, "(LICENSE)" => "($(repo_url)/blob/master/LICENSE)") + content = replace(content, "(LICENSE)" => "($(REPO_URL)/blob/master/LICENSE)") return content end # ============================================ -# Step 1: Setup directories +# Generate index.md from README # ============================================ const DOCS_DIR = @__DIR__ const DOCS_SRC = joinpath(DOCS_DIR, "src") -# Create directory structure -mkpath(DOCS_SRC) -mkpath(joinpath(DOCS_SRC, "guide")) -mkpath(joinpath(DOCS_SRC, "usage")) -mkpath(joinpath(DOCS_SRC, "advanced")) - -# ============================================ -# Step 2: Copy and transform content -# ============================================ - # README.md → index.md (with path rewriting) readme_content = read(joinpath(DOCS_DIR, "../README.md"), String) write_if_changed(joinpath(DOCS_SRC, "index.md"), rewrite_readme_paths(readme_content)) -# Copy existing docs to their new locations (with path fixes) - -# Guide section - fix relative links -safety_content = read(joinpath(DOCS_DIR, "safety.md"), String) -safety_content = replace(safety_content, "(multi-threading.md)" => "(../advanced/multi-threading.md)") -write_if_changed(joinpath(DOCS_SRC, "guide/safety.md"), safety_content) - -# Usage section -cp_if_changed(joinpath(DOCS_DIR, "api.md"), joinpath(DOCS_SRC, "usage/api.md")) -cp_if_changed(joinpath(DOCS_DIR, "configuration.md"), joinpath(DOCS_SRC, "usage/configuration.md")) -cp_if_changed(joinpath(DOCS_DIR, "maybe_with_pool.md"), joinpath(DOCS_SRC, "usage/maybe_with_pool.md")) -cp_if_changed(joinpath(DOCS_DIR, "cuda.md"), joinpath(DOCS_SRC, "usage/cuda.md")) - -# Advanced section -cp_if_changed(joinpath(DOCS_DIR, "multi-threading.md"), joinpath(DOCS_SRC, "advanced/multi-threading.md")) - # ============================================ -# Step 3: Build documentation +# Build documentation # ============================================ makedocs( diff --git a/docs/maybe_with_pool.md b/docs/maybe_with_pool.md deleted file mode 100644 index 39c31b7..0000000 --- a/docs/maybe_with_pool.md +++ /dev/null @@ -1,53 +0,0 @@ -# @maybe_with_pool - -Runtime-toggleable pooling. 
Users can enable/disable via `MAYBE_POOLING_ENABLED[]`. - -## Usage - -```julia -@maybe_with_pool pool function compute(n) - v = acquire!(pool, Float64, n) - v .= 1.0 - sum(v) -end - -# Toggle at runtime -MAYBE_POOLING_ENABLED[] = false # Normal allocation -MAYBE_POOLING_ENABLED[] = true # Uses pool -``` - -## When to Use - -- Library code where end-users should control pooling behavior -- Debugging: disable pooling to isolate memory issues -- Benchmarking: compare pooled vs non-pooled performance - -## How It Works - -When `MAYBE_POOLING_ENABLED[] == false`: -- `pool` becomes `DisabledPool{backend}()` (e.g., `DisabledPool{:cpu}()` or `DisabledPool{:cuda}()`) -- All pool functions (`acquire!`, `zeros!`, etc.) fall back to standard allocation -- Backend context is preserved: `:cuda` → `CuArray`, `:cpu` → `Array` - -Use `pooling_enabled(pool)` to check if pooling is active: -```julia -@maybe_with_pool pool begin - if pooling_enabled(pool) - # Using pooled memory - else - # Using standard allocation (DisabledPool) - end -end -``` - -## vs @with_pool - -| | `@with_pool` | `@maybe_with_pool` | -|---|---|---| -| Runtime toggle | No | Yes | -| Overhead when disabled | None | Branch check | -| Use case | Application code | Library code | - -## Safety - -Same rules as `@with_pool`: arrays are only valid within the scope. Do not return or store them externally. diff --git a/docs/multi-threading.md b/docs/multi-threading.md deleted file mode 100644 index 4135019..0000000 --- a/docs/multi-threading.md +++ /dev/null @@ -1,284 +0,0 @@ -# Multi-Threading Guide - -AdaptiveArrayPools uses `task_local_storage()` for **task-local isolation**: each Julia Task gets its own independent pool. This design ensures thread safety when used correctly. - -## Table of Contents - -- [Understanding Julia's Task/Thread Model](#understanding-julias-taskthread-model) -- [How Pools Work with @threads](#how-pools-work-with-threads) -- [Safe Patterns](#safe-patterns) -- [Unsafe Patterns](#unsafe-patterns) -- [Why Task-Local (Not Thread-Local)?](#why-task-local-not-thread-local) -- [User Responsibility](#user-responsibility) - ---- - -## Understanding Julia's Task/Thread Model - -Julia uses an **M:N threading model** where multiple Tasks (lightweight coroutines) can run on multiple OS threads. - -``` -┌─────────────────────────────────────────────────────────────┐ -│ Julia Process │ -│ │ -│ Thread 1 Thread 2 Thread 3 │ -│ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ -│ │ Task A │ │ Task C │ │ Task E │ │ -│ │ (TLS-A) │ │ (TLS-C) │ │ (TLS-E) │ │ -│ └─────────┘ └─────────┘ └─────────┘ │ -│ ┌─────────┐ ┌─────────┐ │ -│ │ Task B │ │ Task D │ │ -│ │ (TLS-B) │ │ (TLS-D) │ │ -│ └─────────┘ └─────────┘ │ -└─────────────────────────────────────────────────────────────┘ -``` - -Key concepts: - -| Concept | Description | -|---------|-------------| -| **Thread** | OS-level execution unit. Fixed count at Julia startup. | -| **Task** | Julia's lightweight coroutine (Green Thread). Created dynamically. | -| **task_local_storage()** | Per-Task storage. Each Task has its own isolated TLS. | - -### Important: One Thread Can Run Multiple Tasks - -A single thread can execute multiple Tasks by switching between them at **yield points** (I/O, `sleep()`, `yield()`, etc.): - -```julia -# Both tasks run on Thread 1, interleaved! 
-task_a = @spawn begin - println("A start") - sleep(0.1) # yield point - switch to Task B - println("A end") -end - -task_b = @spawn begin - println("B start") - sleep(0.1) # yield point - switch back to Task A - println("B end") -end - -# Output (single thread): -# A start -# B start -# A end -# B end -``` - ---- - -## How Pools Work with @threads - -When you use `Threads.@threads`, Julia distributes iterations across threads. Each thread gets **one Task** that processes its assigned iterations. - -``` -Threads.@threads for i in 1:100_000 (4 threads) -│ -├─ Thread 1: Task-1 → Pool-1 -│ └─ Processes i = 1..25,000 (same pool reused for all!) -│ -├─ Thread 2: Task-2 → Pool-2 -│ └─ Processes i = 25,001..50,000 -│ -├─ Thread 3: Task-3 → Pool-3 -│ └─ Processes i = 50,001..75,000 -│ -└─ Thread 4: Task-4 → Pool-4 - └─ Processes i = 75,001..100,000 - -Total: 4 pools created, each reused ~25,000 times -``` - -### Key Insight - -- `@threads` creates **one Task per thread** (not one per iteration!) -- Each Task has its own `task_local_storage()` → its own pool -- Within one `@threads` block, pools are efficiently reused -- Calling `@threads` **multiple times** creates new Tasks → new pools each time - ---- - -## Safe Patterns - -### Pattern 1: `@with_pool` Inside `@threads` - -```julia -Threads.@threads for i in 1:N - @with_pool pool begin - a = acquire!(pool, Float64, 100) - # ... computation ... - end # pool automatically rewinds -end -``` - -Each thread's Task gets its own pool. Safe and efficient. - -### Pattern 2: Function Defined with `@with_pool` - -```julia -# Define function with @with_pool -@with_pool pool function inner_work(x) - tmp = acquire!(pool, Float64, length(x)) - tmp .= x - return sum(tmp) -end - -# Call from @threads - each thread gets its own pool -Threads.@threads for i in 1:N - result = inner_work(data[i]) -end -``` - -The pool is created per-Task when the function is called, not when defined. - -### Pattern 3: Nested Functions - -```julia -@with_pool outer_pool function outer_work(data) - # outer_pool belongs to Main Task - tmp = acquire!(outer_pool, Float64, 100) - - Threads.@threads for i in 1:length(data) - # inner_work creates its own pool per thread - inner_work(data[i]) # Inner pool ≠ outer_pool (safe!) - end -end -``` - -Outer and inner pools are completely independent. - ---- - -## Unsafe Patterns - -### Pattern 1: `@with_pool` Outside `@threads` - -```julia -# ❌ DANGER: Race condition! -@with_pool pool Threads.@threads for i in 1:N - a = acquire!(pool, Float64, 100) # All threads share ONE pool! -end -``` - -**Why it fails**: `pool` is created in the Main Task's TLS. All threads access the same pool simultaneously. - -### Pattern 2: Sharing Pool Reference - -```julia -# ❌ DANGER: Race condition! -pool = get_task_local_pool() # Main Task's pool -Threads.@threads for i in 1:N - a = acquire!(pool, Float64, 100) # Shared access! -end -``` - -### Pattern 3: Passing Pool to `@spawn` - -```julia -# ❌ DANGER: Race condition! -@with_pool pool begin - tasks = [Threads.@spawn begin - a = acquire!(pool, Float64, 100) # Multiple tasks, one pool! - end for _ in 1:4] - wait.(tasks) -end -``` - ---- - -## Why Task-Local (Not Thread-Local)? - -You might wonder: "Why not use thread-local pools? They persist across `@threads` calls!" - -### The Stack Discipline Problem - -AdaptiveArrayPools uses `checkpoint!` and `rewind!` - a **stack-based** allocation system: - -```julia -@with_pool pool begin - checkpoint!(pool) # Push current state - a = acquire!(pool, ...) 
- b = acquire!(pool, ...) - # ... - rewind!(pool) # Pop and restore state (LIFO!) -end -``` - -This requires **strict LIFO ordering**: the Task that checkpoints first must rewind last. - -### Why Thread-Local Fails with `@spawn` - -With `@spawn`, multiple Tasks can interleave on the same thread: - -``` -Thread 1 (with Thread-Local Pool): - -Time → -Task A: checkpoint! ──── acquire! ──── sleep ────────────── rewind! -Task B: checkpoint! ──── acquire! ──── sleep ──── rewind! - ↑ - A finishes first! -``` - -**Stack corruption occurs:** - -1. Task A: `checkpoint!` → stack = `[0]` -2. Task B: `checkpoint!` → stack = `[0, 1]` -3. Task A: `rewind!` → pops `1` (B's checkpoint!) → stack = `[0]` -4. Task B: `rewind!` → pops `0` (A's checkpoint!) → **WRONG!** - -**Result**: B's arrays may be reused while B is still using them → memory corruption. - -### Locks Don't Help - -Adding locks only prevents **simultaneous access**, not **LIFO violations**. The stack still gets corrupted because Tasks finish in unpredictable order. - -### Task-Local: The Only Safe Solution - -With Task-local pools: -- Each Task has its own pool -- Each pool has its own stack -- No interleaving possible → LIFO always preserved - ---- - -## User Responsibility - -### The Core Rule - -> **Pool objects must not be shared across Tasks.** - -This library prioritizes **zero-overhead performance** over runtime safety checks. No locks are added because: - -1. Locks would defeat the purpose of zero-allocation pooling -2. Even with locks, stack corruption would occur (LIFO violations) - -### Quick Reference - -| Pattern | Safety | Reason | -|---------|--------|--------| -| `@with_pool` inside `@threads` | ✅ Safe | Each Task gets own pool | -| `@with_pool` outside `@threads` | ❌ Unsafe | All threads share one pool | -| Function with `@with_pool` called from `@threads` | ✅ Safe | Pool created per-Task at call time | -| Passing pool to `@spawn` | ❌ Unsafe | Multiple Tasks access same pool | -| Nested `@with_pool` (outer/inner) | ✅ Safe | Each level has independent pool | - -### Debugging Tips - -If you encounter unexpected behavior: - -1. **Check pool placement**: Is `@with_pool` inside or outside `@threads`? -2. **Check pool sharing**: Is the same pool variable accessed from multiple Tasks? -3. **Enable POOL_DEBUG**: `POOL_DEBUG[] = true` catches some (not all) misuse patterns - ---- - -## Summary - -- AdaptiveArrayPools uses **Task-local isolation** for thread safety -- Each Julia Task gets its own independent pool via `task_local_storage()` -- `@threads` creates one Task per thread → pools are reused within the block -- **Always place `@with_pool` inside `@threads`**, not outside -- Thread-local pools are **not an alternative** due to stack discipline requirements -- Correct usage is the user's responsibility (no runtime checks for performance) diff --git a/docs/safety.md b/docs/safety.md deleted file mode 100644 index 0016d5a..0000000 --- a/docs/safety.md +++ /dev/null @@ -1,110 +0,0 @@ -# Safety Guide - -AdaptiveArrayPools achieves zero allocation by reusing memory across calls. This requires one simple rule: **acquired arrays are only valid within their `@with_pool` scope**. - -## The Scope Rule - -When `@with_pool` ends, all arrays acquired within that scope are recycled. Using them after the scope ends leads to undefined behavior. 
- -```julia -@with_pool pool begin - v = acquire!(pool, Float64, 100) - - result = sum(v) # ✅ compute and return values - copied = copy(v) # ✅ copy if you need data outside -end -# v is no longer valid here -``` - -## What NOT to Do - -### Don't return pool-backed arrays - -```julia -# ❌ Wrong: returning the array itself -@with_pool pool function bad_example() - v = acquire!(pool, Float64, 100) - return v # v will be recycled after this returns! -end - -# ✅ Correct: return computed values or copies -@with_pool pool function good_example() - v = acquire!(pool, Float64, 100) - return sum(v) # scalar result -end -``` - -### Don't store in globals or closures - -```julia -# ❌ Wrong: storing in global -global_ref = nothing -@with_pool pool begin - global_ref = acquire!(pool, Float64, 100) -end -# global_ref now points to recycled memory - -# ❌ Wrong: capturing in closure -@with_pool pool begin - v = acquire!(pool, Float64, 100) - callback = () -> sum(v) # v captured but will be invalid -end -``` - -### Don't resize or push! to unsafe_acquire! arrays - -```julia -@with_pool pool begin - v = unsafe_acquire!(pool, Float64, 100) - # ❌ These break pool memory management: - # resize!(v, 200) - # push!(v, 1.0) - # append!(v, [1.0, 2.0]) -end -``` - -## Debugging with POOL_DEBUG - -Enable runtime safety checks during development: - -```julia -using AdaptiveArrayPools -AdaptiveArrayPools.POOL_DEBUG[] = true - -@with_pool pool function test() - v = acquire!(pool, Float64, 100) - return v # Will warn about returning pool-backed array -end -``` - -## acquire! vs unsafe_acquire! - -| Function | Returns | Best For | -|----------|---------|----------| -| `acquire!` | View types (`SubArray`, `ReshapedArray`) | General use, BLAS/LAPACK | -| `unsafe_acquire!` | Native `Array`/`CuArray` | FFI, type constraints | - -Both follow the same scope rules. Use `acquire!` by default—views work with all standard Julia linear algebra operations. - -## Thread Safety - -Pools are task-local, so each thread automatically gets its own pool: - -```julia -# ✅ Safe: each task has independent pool -Threads.@threads for i in 1:N - @with_pool pool begin - a = acquire!(pool, Float64, 100) - # work with a... - end -end - -# ❌ Unsafe: pool created outside threaded region -@with_pool pool begin - Threads.@threads for i in 1:N - a = acquire!(pool, Float64, 100) # race condition! - end -end -``` - -See [Multi-Threading](multi-threading.md) for more patterns. From 6c13552e6accdeaa597757331a616f036b95c389 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 11:29:22 -0800 Subject: [PATCH 7/8] docs: restructure to 4-stage learning journey Reorganize documentation from Guide/Usage/Advanced to: - Basics: quick-start, @with_pool patterns, essential API, safety rules - Features: @maybe_with_pool, CUDA, multi-threading, configuration - Reference: full API - Architecture: how-it-works, type dispatch, macro internals, design docs New content: - basics/with-pool-patterns.md: function decorator vs block wrapper - basics/api-essentials.md: core API with reset! and fill! 
patterns - architecture/type-dispatch.md: N-way cache + View vs Array guide - architecture/design-docs.md: links to design documents Improvements: - safety-rules.md: visual emphasis with tables - how-it-works.md: zero-alloc diagram + try...finally explanation - make.jl: mapping table approach for path rewriting --- docs/make.jl | 69 +++++---- docs/src/architecture/design-docs.md | 44 ++++++ .../how-it-works.md} | 65 ++++++-- .../macro-internals.md | 6 +- docs/src/architecture/type-dispatch.md | 140 ++++++++++++++++++ docs/src/basics/api-essentials.md | 120 +++++++++++++++ .../quick-start.md} | 14 +- .../safety.md => basics/safety-rules.md} | 41 ++++- docs/src/basics/with-pool-patterns.md | 111 ++++++++++++++ docs/src/{usage => features}/configuration.md | 0 .../cuda.md => features/cuda-support.md} | 0 .../maybe-with-pool.md} | 2 +- .../{advanced => features}/multi-threading.md | 0 docs/src/index.md | 20 +-- docs/src/{usage => reference}/api.md | 0 15 files changed, 564 insertions(+), 68 deletions(-) create mode 100644 docs/src/architecture/design-docs.md rename docs/src/{advanced/internals.md => architecture/how-it-works.md} (72%) rename docs/src/{advanced => architecture}/macro-internals.md (97%) create mode 100644 docs/src/architecture/type-dispatch.md create mode 100644 docs/src/basics/api-essentials.md rename docs/src/{guide/getting-started.md => basics/quick-start.md} (85%) rename docs/src/{guide/safety.md => basics/safety-rules.md} (66%) create mode 100644 docs/src/basics/with-pool-patterns.md rename docs/src/{usage => features}/configuration.md (100%) rename docs/src/{usage/cuda.md => features/cuda-support.md} (100%) rename docs/src/{usage/maybe_with_pool.md => features/maybe-with-pool.md} (98%) rename docs/src/{advanced => features}/multi-threading.md (100%) rename docs/src/{usage => reference}/api.md (100%) diff --git a/docs/make.jl b/docs/make.jl index 2147369..f385e83 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -22,31 +22,32 @@ end const GITHUB_PAGES_BASE = "https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable" const REPO_URL = "https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl" +# Path mapping table: (pattern, replacement) +# Order matters for overlapping patterns +const README_PATH_MAPPINGS = [ + # Reference + (r"\(docs/api\.md(#[^)]+)?\)", s"(reference/api.md\1)"), + + # Features + (r"\(docs/cuda\.md(#[^)]+)?\)", s"(features/cuda-support.md\1)"), + (r"\(docs/configuration\.md(#[^)]+)?\)", s"(features/configuration.md\1)"), + (r"\(docs/maybe_with_pool\.md(#[^)]+)?\)", s"(features/maybe-with-pool.md\1)"), + (r"\(docs/multi-threading\.md(#[^)]+)?\)", s"(features/multi-threading.md\1)"), + + # Basics + (r"\(docs/safety\.md(#[^)]+)?\)", s"(basics/safety-rules.md\1)"), +] + """ Rewrite relative paths in README.md for Documenter structure. -Converts GitHub repo links to internal Documenter links: -- `docs/api.md` → `usage/api.md` -- `docs/cuda.md` → `usage/cuda.md` -- `docs/safety.md` → `guide/safety.md` -- `docs/multi-threading.md` → `advanced/multi-threading.md` -- `docs/configuration.md` → `usage/configuration.md` -- `docs/maybe_with_pool.md` → `usage/maybe_with_pool.md` - +Uses mapping table to convert GitHub repo links to internal Documenter links. Also handles anchor links (e.g., `docs/api.md#convenience-functions`). 
""" function rewrite_readme_paths(content::String) - # Usage docs (with optional anchors) - content = replace(content, r"\(docs/api\.md(#[^)]+)?\)" => s"(usage/api.md\1)") - content = replace(content, r"\(docs/cuda\.md(#[^)]+)?\)" => s"(usage/cuda.md\1)") - content = replace(content, r"\(docs/configuration\.md(#[^)]+)?\)" => s"(usage/configuration.md\1)") - content = replace(content, r"\(docs/maybe_with_pool\.md(#[^)]+)?\)" => s"(usage/maybe_with_pool.md\1)") - - # Guide docs - content = replace(content, r"\(docs/safety\.md(#[^)]+)?\)" => s"(guide/safety.md\1)") - - # Advanced docs - content = replace(content, r"\(docs/multi-threading\.md(#[^)]+)?\)" => s"(advanced/multi-threading.md\1)") + for (pattern, replacement) in README_PATH_MAPPINGS + content = replace(content, pattern => replacement) + end # LICENSE link → GitHub content = replace(content, "(LICENSE)" => "($(REPO_URL)/blob/master/LICENSE)") @@ -80,20 +81,26 @@ makedocs( ), pages = [ "Home" => "index.md", - "Guide" => [ - "Getting Started" => "guide/getting-started.md", - "Safety Rules" => "guide/safety.md", + "Basics" => [ + "Quick Start" => "basics/quick-start.md", + "@with_pool Patterns" => "basics/with-pool-patterns.md", + "Essential API" => "basics/api-essentials.md", + "Safety Rules" => "basics/safety-rules.md", + ], + "Features" => [ + "@maybe_with_pool" => "features/maybe-with-pool.md", + "CUDA Support" => "features/cuda-support.md", + "Multi-threading" => "features/multi-threading.md", + "Configuration" => "features/configuration.md", ], - "Usage" => [ - "API Reference" => "usage/api.md", - "Configuration" => "usage/configuration.md", - "@maybe_with_pool" => "usage/maybe_with_pool.md", - "CUDA Support" => "usage/cuda.md", + "Reference" => [ + "Full API" => "reference/api.md", ], - "Advanced" => [ - "Multi-threading" => "advanced/multi-threading.md", - "How @with_pool Works" => "advanced/macro-internals.md", - "Internals" => "advanced/internals.md", + "Architecture" => [ + "How It Works" => "architecture/how-it-works.md", + "Type Dispatch & Cache" => "architecture/type-dispatch.md", + "@with_pool Internals" => "architecture/macro-internals.md", + "Design Documents" => "architecture/design-docs.md", ], ], doctest = false, # Doctests not set up in existing docs diff --git a/docs/src/architecture/design-docs.md b/docs/src/architecture/design-docs.md new file mode 100644 index 0000000..4b5e051 --- /dev/null +++ b/docs/src/architecture/design-docs.md @@ -0,0 +1,44 @@ +# Design Documents + +For in-depth analysis of design decisions, implementation tradeoffs, and architectural choices, see the design documents in the repository: + +## API Design + +- **[hybrid_api_design.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/hybrid_api_design.md)** + Two-API strategy (`acquire!` vs `unsafe_acquire!`) and type stability analysis + +## Caching & Performance + +- **[nd_array_approach_comparison.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/nd_array_approach_comparison.md)** + N-way cache design, boxing analysis, and ReshapedArray benchmarks + +- **[fixed_slots_codegen_design.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/fixed_slots_codegen_design.md)** + Zero-allocation iteration via `@generated` functions and fixed-slot type dispatch + +## Macro Internals + +- **[untracked_acquire_design.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/untracked_acquire_design.md)** + Macro-based 
untracked acquire detection and 1-based sentinel pattern + +## Backend Extensions + +- **[cuda_extension_design.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/cuda_extension_design.md)** + CUDA backend architecture and package extension loading + +--- + +## Document Overview + +| Document | Focus Area | Key Insights | +|----------|------------|--------------| +| hybrid_api_design | API strategy | View types for zero-alloc, Array for FFI | +| nd_array_approach_comparison | Caching | N-way associative cache reduces header allocation | +| fixed_slots_codegen_design | Codegen | @generated functions enable type-stable iteration | +| untracked_acquire_design | Macro safety | Sentinel pattern ensures correct cleanup | +| cuda_extension_design | GPU support | Seamless CPU/CUDA API parity | + +## See Also + +- [How It Works](how-it-works.md) - High-level architecture overview +- [Type Dispatch & Cache](type-dispatch.md) - Technical deep-dive +- [@with_pool Macro Internals](macro-internals.md) - Macro transformation details diff --git a/docs/src/advanced/internals.md b/docs/src/architecture/how-it-works.md similarity index 72% rename from docs/src/advanced/internals.md rename to docs/src/architecture/how-it-works.md index a31ec81..39840a2 100644 --- a/docs/src/advanced/internals.md +++ b/docs/src/architecture/how-it-works.md @@ -1,27 +1,68 @@ -# Internals +# How It Works -This page provides an overview of the internal architecture of AdaptiveArrayPools.jl. For detailed design documents, see the [`docs/design/`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/tree/master/docs/design) folder in the repository. +This page explains the core mechanisms that enable zero-allocation array reuse. + +## The Zero-Allocation Promise + +``` ++-------------------------------------------------------------+ +| Call 1 (warmup): | +| checkpoint! --> acquire! x 3 --> rewind! | +| | | +| +-- backing memory allocated | +| | +| Call 2+ (zero-alloc): | +| checkpoint! --> acquire! x 3 --> rewind! | +| | | +| +-- same memory reused, 0 bytes allocated | ++-------------------------------------------------------------+ +``` ## Checkpoint/Rewind Lifecycle -The core mechanism that enables zero-allocation reuse: +The core mechanism that enables memory reuse: ``` @with_pool pool function foo() - │ - ├─► checkpoint!(pool) # Save current state (n_active counters) - │ - │ A = acquire!(pool, ...) # n_active += 1 - │ B = acquire!(pool, ...) # n_active += 1 - │ C = acquire!(pool, ...) # n_active += 1 - │ ... compute ... - │ - └─► rewind!(pool) # Restore n_active → all arrays recycled + | + +---> checkpoint!(pool) # Save current state (n_active counters) + | + | A = acquire!(pool, ...) # n_active += 1 + | B = acquire!(pool, ...) # n_active += 1 + | C = acquire!(pool, ...) # n_active += 1 + | ... compute ... + | + +---> rewind!(pool) # Restore n_active, arrays recycled end ``` On repeated calls, the same memory is reused without any allocation. 
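A quick way to check the reuse behavior is to time a repeat call with Base's `@allocated`. The following is a minimal sketch, not part of the package: the function name `poolsum` is illustrative, and it assumes only the `@with_pool`/`acquire!` API described above.

```julia
using AdaptiveArrayPools

@with_pool pool function poolsum(n)
    A = acquire!(pool, Float64, n)   # view backed by pooled memory
    fill!(A, 1.0)
    return sum(A)                    # return a value, not the array
end

poolsum(10_000)              # first call: backing memory is allocated and cached
@allocated poolsum(10_000)   # subsequent calls are expected to report 0 bytes
```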
+## Exception Safety: try...finally + +The `@with_pool` macro generates code with exception-safe cleanup: + +```julia +# What you write: +@with_pool pool begin + A = acquire!(pool, Float64, 100) + result = compute(A) +end + +# What the macro generates: +let pool = get_task_local_pool() + checkpoint!(pool) + try + A = acquire!(pool, Float64, 100) + result = compute(A) + finally + rewind!(pool) # Always executes, even on exception + end +end +``` + +**Key guarantee**: The `finally` block ensures `rewind!` is called even if an exception occurs, preventing memory leaks and state corruption. + ## Fixed-Slot Type Dispatch To achieve zero-lookup overhead, common types have dedicated struct fields: diff --git a/docs/src/advanced/macro-internals.md b/docs/src/architecture/macro-internals.md similarity index 97% rename from docs/src/advanced/macro-internals.md rename to docs/src/architecture/macro-internals.md index e7aa69d..d8e5f93 100644 --- a/docs/src/advanced/macro-internals.md +++ b/docs/src/architecture/macro-internals.md @@ -251,6 +251,6 @@ end ## See Also -- [Internals](internals.md) — Overview of pool architecture -- [Safety Rules](../guide/safety.md) — Scope rules and best practices -- [Configuration](../usage/configuration.md) — Performance tuning options +- [How It Works](how-it-works.md) — Overview of pool architecture +- [Safety Rules](../basics/safety-rules.md) — Scope rules and best practices +- [Configuration](../features/configuration.md) — Performance tuning options diff --git a/docs/src/architecture/type-dispatch.md b/docs/src/architecture/type-dispatch.md new file mode 100644 index 0000000..896d17f --- /dev/null +++ b/docs/src/architecture/type-dispatch.md @@ -0,0 +1,140 @@ +# Type Dispatch & Caching + +This page explains the internal mechanisms that enable zero-allocation performance. + +## Fixed-Slot Type Dispatch + +To achieve zero-lookup overhead, common types have dedicated struct fields: + +```julia +struct AdaptiveArrayPool + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + others::IdDict{DataType, Any} # Fallback for rare types +end +``` + +When you call `acquire!(pool, Float64, n)`, the compiler inlines directly to `pool.float64` - no dictionary lookup, no type instability. + +## N-Way Set Associative Cache + +For `unsafe_acquire!` (which returns native `Array` types), we use an N-way cache to reduce header allocation: + +``` + CACHE_WAYS = 4 (default) + +----+----+----+----+ +Slot 0 (Float64): |way0|way1|way2|way3| <-- round-robin eviction + +----+----+----+----+ + +----+----+----+----+ +Slot 1 (Float32): |way0|way1|way2|way3| + +----+----+----+----+ + ... +``` + +### Cache Lookup Logic + +```julia +function unsafe_acquire!(pool, T, dims...) + typed_pool = get_typed_pool!(pool, T) + slot = n_active + 1 + base = (slot - 1) * CACHE_WAYS + + # Search all ways for matching dimensions + for k in 1:CACHE_WAYS + idx = base + k + if dims == typed_pool.nd_dims[idx] + # Cache hit! Check if underlying vector was resized + if pointer matches + return typed_pool.nd_arrays[idx] + end + end + end + + # Cache miss: create new Array header, store in next way (round-robin) + way = typed_pool.nd_next_way[slot] + typed_pool.nd_next_way[slot] = (way % CACHE_WAYS) + 1 + # ... create and cache Array ... +end +``` + +**Key insight**: Even on cache miss, only the `Array` header (~80-144 bytes) is allocated. 
The actual data memory is always reused from the pool. + +--- + +## View vs Array: When to Use What? + +| API | Return Type | Allocation | Recommended For | +|-----|-------------|------------|-----------------| +| `acquire!` | `SubArray` / `ReshapedArray` | **Always 0 bytes** | 99% of cases | +| `unsafe_acquire!` | `Vector` / `Array` | 0-144 bytes | FFI, type constraints | + +### Why View is the Default + +1. **Zero-allocation guarantee**: Compiler eliminates view wrappers via SROA (Scalar Replacement of Aggregates) +2. **BLAS/LAPACK compatible**: Processed as `StridedArray`, no performance difference +3. **Type stable**: Always returns the same wrapper types + +### When to Use unsafe_acquire! + +1. **C FFI**: When `ccall` requires `Ptr{T}` from contiguous memory + +```julia +arr = unsafe_acquire!(pool, Float64, 100) +ccall(:c_function, Cvoid, (Ptr{Float64}, Cint), arr, 100) +``` + +2. **Type signature constraints**: Function explicitly requires `Array{T,N}` + +```julia +function process(data::Array{Float64,2}) + # Only accepts Array, not AbstractArray +end + +m = unsafe_acquire!(pool, Float64, 10, 10) +process(m) # Works +``` + +3. **Runtime dispatch avoidance**: When types are determined at runtime + +```julia +# Polymorphic code where type stability matters +function dispatch_heavy(pool, T) + arr = unsafe_acquire!(pool, T, 100) # Concrete Array type + # ... operations that would trigger dispatch with views +end +``` + +### Performance Comparison + +| Operation | acquire! (View) | unsafe_acquire! (Array) | +|-----------|-----------------|-------------------------| +| Allocation (cached) | 0 bytes | 0 bytes | +| Allocation (miss) | 0 bytes | 80-144 bytes | +| BLAS operations | Identical | Identical | +| Type stability | Guaranteed | Guaranteed | +| FFI compatibility | Requires conversion | Direct | + +### Header Size by Dimensionality + +When `unsafe_acquire!` has a cache miss: + +| Dimensions | Header Size | +|------------|-------------| +| 1D (Vector) | 80 bytes | +| 2D-3D | 112 bytes | +| 4D-5D | 144 bytes | + +This is Julia's internal `Array` metadata; actual data memory is always reused from the pool. + +--- + +## See Also + +- [How It Works](how-it-works.md) - Checkpoint/Rewind mechanism +- [Design Documents](design-docs.md) - Detailed design analysis +- [Configuration](../features/configuration.md) - Cache tuning options diff --git a/docs/src/basics/api-essentials.md b/docs/src/basics/api-essentials.md new file mode 100644 index 0000000..1c2e63e --- /dev/null +++ b/docs/src/basics/api-essentials.md @@ -0,0 +1,120 @@ +# Essential API + +This page covers the core functions you'll use 99% of the time. For the complete API reference, see [Full API](../reference/api.md). + +## Array Acquisition + +### `acquire!(pool, T, dims...)` + +The primary function. Returns a view (`SubArray` for 1D, `ReshapedArray` for N-D). + +```julia +@with_pool pool begin + v = acquire!(pool, Float64, 100) # 1D: SubArray{Float64,1} + m = acquire!(pool, Float64, 10, 10) # 2D: ReshapedArray{Float64,2} + t = acquire!(pool, Int64, 2, 3, 4) # 3D: ReshapedArray{Int64,3} +end +``` + +**Always use `acquire!` by default.** Views are zero-allocation and work with all BLAS/LAPACK operations. + +### `unsafe_acquire!(pool, T, dims...)` + +Returns a native `Array` type. 
Only use when you specifically need `Array{T,N}`: + +```julia +@with_pool pool begin + # Use when you need Array for: + arr = unsafe_acquire!(pool, Float64, 100) + + # - FFI/ccall requiring Ptr{T} + ccall(:some_c_function, Cvoid, (Ptr{Float64}, Cint), arr, length(arr)) + + # - Functions with strict Array{T,N} type signatures +end +``` + +## Convenience Functions + +Zero-initialized arrays: + +```julia +@with_pool pool begin + z = zeros!(pool, Float64, 10, 10) # All zeros + o = ones!(pool, Float64, 100) # All ones +end +``` + +Match existing array properties: + +```julia +@with_pool pool begin + A = acquire!(pool, Float64, 50, 50) + B = similar!(pool, A) # Same type and size as A + C = similar!(pool, A, ComplexF64) # Same size, different type +end +``` + +### Custom Initialization with `fill!` + +For values other than 0 or 1, use Julia's built-in `fill!`: + +```julia +@with_pool pool begin + v = acquire!(pool, Float64, 100) + fill!(v, 3.14) # Fill with pi + + m = acquire!(pool, Int64, 10, 10) + fill!(m, -1) # Fill with sentinel value +end +``` + +This pattern works because pool arrays are mutable views into the underlying storage. + +## Pool Management + +### `reset!(pool)` + +Releases all memory held by the pool. Useful for long-running processes: + +```julia +# After processing a large batch +@with_pool pool begin + # ... large computation ... +end + +# Optionally release memory if pool grew too large +reset!(get_task_local_pool()) +``` + +### `pooling_enabled(pool)` + +Check if pooling is active (returns `false` for `DisabledPool`): + +```julia +@maybe_with_pool pool begin + if pooling_enabled(pool) + println("Using pool") + else + println("Pooling disabled") + end +end +``` + +## Quick Reference + +| Function | Returns | Allocation | Use Case | +|----------|---------|------------|----------| +| `acquire!(pool, T, dims...)` | View type | 0 bytes | Default choice | +| `unsafe_acquire!(pool, T, dims...)` | `Array{T,N}` | 0-144 bytes | FFI, type constraints | +| `zeros!(pool, [T,] dims...)` | View type | 0 bytes | Zero-initialized | +| `ones!(pool, [T,] dims...)` | View type | 0 bytes | One-initialized | +| `similar!(pool, A)` | View type | 0 bytes | Match existing array | +| `reset!(pool)` | `nothing` | - | Release all memory | +| `pooling_enabled(pool)` | `Bool` | - | Check pool status | + +## See Also + +- [Full API Reference](../reference/api.md) - Complete function list +- [@with_pool Patterns](with-pool-patterns.md) - Usage patterns +- [Safety Rules](safety-rules.md) - Scope rules diff --git a/docs/src/guide/getting-started.md b/docs/src/basics/quick-start.md similarity index 85% rename from docs/src/guide/getting-started.md rename to docs/src/basics/quick-start.md index 76bb0c3..2c4f89a 100644 --- a/docs/src/guide/getting-started.md +++ b/docs/src/basics/quick-start.md @@ -1,4 +1,4 @@ -# Getting Started +# Quick Start This guide will help you get up and running with AdaptiveArrayPools.jl in minutes. @@ -105,12 +105,12 @@ end end ``` -For complete safety guidelines, see [Safety Rules](safety.md). +For complete safety guidelines, see [Safety Rules](safety-rules.md). 
## Next Steps -- [Safety Rules](safety.md) - Complete scope rules and anti-patterns -- [API Reference](../usage/api.md) - Full function and macro reference -- [Configuration](../usage/configuration.md) - Preferences and cache tuning -- [Multi-threading](../advanced/multi-threading.md) - Task/thread safety patterns -- [CUDA Support](../usage/cuda.md) - GPU backend usage +- [Safety Rules](safety-rules.md) - Complete scope rules and anti-patterns +- [Full API Reference](../reference/api.md) - Complete function and macro reference +- [Configuration](../features/configuration.md) - Preferences and cache tuning +- [Multi-threading](../features/multi-threading.md) - Task/thread safety patterns +- [CUDA Support](../features/cuda-support.md) - GPU backend usage diff --git a/docs/src/guide/safety.md b/docs/src/basics/safety-rules.md similarity index 66% rename from docs/src/guide/safety.md rename to docs/src/basics/safety-rules.md index e4eedc1..ab4685c 100644 --- a/docs/src/guide/safety.md +++ b/docs/src/basics/safety-rules.md @@ -1,8 +1,41 @@ -# Safety Guide +# Safety Rules -AdaptiveArrayPools achieves zero allocation by reusing memory across calls. This requires one simple rule: **acquired arrays are only valid within their `@with_pool` scope**. +AdaptiveArrayPools achieves zero allocation by reusing memory across calls. This requires understanding one critical rule. -## The Scope Rule +--- + +## The One Rule + +``` ++-------------------------------------------------------------+ +| | +| Pool arrays are ONLY valid within their @with_pool scope | +| | +| When the scope ends, the memory is recycled. | +| Using arrays after scope ends = UNDEFINED BEHAVIOR | +| | ++-------------------------------------------------------------+ +``` + +### What's Safe + +| Pattern | Example | Why It Works | +|---------|---------|--------------| +| Return computed values | `return sum(v)` | Scalar escapes, not the array | +| Return copies | `return copy(v)` | New allocation, independent data | +| Use within scope | `result = A * B` | Arrays valid during computation | + +### What's Dangerous + +| Pattern | Example | Why It Fails | +|---------|---------|--------------| +| Return array | `return v` | Array recycled after return | +| Store in global | `global_ref = v` | Points to recycled memory | +| Capture in closure | `() -> sum(v)` | v invalid when closure runs | + +--- + +## The Scope Rule in Detail When `@with_pool` ends, all arrays acquired within that scope are recycled. Using them after the scope ends leads to undefined behavior. @@ -107,4 +140,4 @@ end end ``` -See [Multi-Threading](../advanced/multi-threading.md) for more patterns. +See [Multi-Threading](../features/multi-threading.md) for more patterns. diff --git a/docs/src/basics/with-pool-patterns.md b/docs/src/basics/with-pool-patterns.md new file mode 100644 index 0000000..83906dd --- /dev/null +++ b/docs/src/basics/with-pool-patterns.md @@ -0,0 +1,111 @@ +# @with_pool Patterns + +The `@with_pool` macro provides automatic memory lifecycle management. It supports two usage patterns depending on your needs. + +## Pattern 1: Function Decorator + +Wraps an entire function with pool management. The pool is active for the function's full duration. + +```julia +@with_pool pool function compute(n) + A = acquire!(pool, Float64, n, n) + B = zeros!(pool, Float64, n) + + # ... compute with A and B ... 
+ + return sum(A) + sum(B) # Return computed values, not arrays +end + +# Usage +result = compute(100) # Zero-allocation after warmup +``` + +**Best for:** +- Functions that exclusively use pooled arrays +- Hot-path functions called repeatedly +- Clear ownership semantics + +## Pattern 2: Block Wrapper + +Wraps only a portion of a function. Useful when you need pool arrays for part of the computation. + +```julia +function process_data(data) + # Pre-processing (no pool needed) + n = length(data) + + @with_pool pool begin + # Pool is only active inside this block + temp = acquire!(pool, Float64, n) + temp .= data .* 2 + result = sum(temp) + end # Pool arrays recycled here + + # Post-processing + return result * 1.5 +end +``` + +**Best for:** +- Functions with mixed allocation needs +- Gradual adoption in existing code +- Fine-grained scope control + +## Pattern Comparison + +| Aspect | Function Decorator | Block Wrapper | +|--------|-------------------|---------------| +| Scope | Entire function | begin...end block | +| Syntax | `@with_pool pool function ...` | `@with_pool pool begin ... end` | +| Pool lifetime | Function start to return | Block entry to exit | +| Nesting | Functions can call each other | Blocks can be nested | + +## Nested Pools + +Both patterns support nesting. Each scope maintains independent checkpoint state: + +```julia +@with_pool pool function outer(n) + A = acquire!(pool, Float64, n) + + @with_pool pool begin + # Inner scope - new checkpoint + B = acquire!(pool, Float64, n * 2) + inner_result = sum(B) + end # B recycled here + + # A still valid here + return sum(A) + inner_result +end +``` + +## Common Mistakes + +### Returning pool arrays (wrong) + +```julia +@with_pool pool function bad() + v = acquire!(pool, Float64, 100) + return v # v is recycled after return! +end +``` + +### Correct: return computed values + +```julia +@with_pool pool function good() + v = acquire!(pool, Float64, 100) + return sum(v) # Scalar result is safe +end + +# Or copy if you need the array +@with_pool pool function also_good() + v = acquire!(pool, Float64, 100) + return copy(v) # Explicit copy is safe +end +``` + +## See Also + +- [Essential API](api-essentials.md) - Core functions for pool operations +- [Safety Rules](safety-rules.md) - Important scope rules diff --git a/docs/src/usage/configuration.md b/docs/src/features/configuration.md similarity index 100% rename from docs/src/usage/configuration.md rename to docs/src/features/configuration.md diff --git a/docs/src/usage/cuda.md b/docs/src/features/cuda-support.md similarity index 100% rename from docs/src/usage/cuda.md rename to docs/src/features/cuda-support.md diff --git a/docs/src/usage/maybe_with_pool.md b/docs/src/features/maybe-with-pool.md similarity index 98% rename from docs/src/usage/maybe_with_pool.md rename to docs/src/features/maybe-with-pool.md index 39c31b7..bb0ec30 100644 --- a/docs/src/usage/maybe_with_pool.md +++ b/docs/src/features/maybe-with-pool.md @@ -1,4 +1,4 @@ -# @maybe_with_pool +# `@maybe_with_pool` Runtime-toggleable pooling. Users can enable/disable via `MAYBE_POOLING_ENABLED[]`. 
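A rough usage sketch follows; the helper `scaled_sum` is illustrative, the toggle is assumed to be reachable by qualifying it with the module name, and it assumes that the same `acquire!` call works whether pooling is on or off, as this page describes.

```julia
using AdaptiveArrayPools

function scaled_sum(x)
    @maybe_with_pool pool begin
        buf = acquire!(pool, Float64, length(x))
        buf .= x .* 2
        sum(buf)                     # value of the block is returned
    end
end

AdaptiveArrayPools.MAYBE_POOLING_ENABLED[] = true
scaled_sum(rand(100))                # pooled path

AdaptiveArrayPools.MAYBE_POOLING_ENABLED[] = false
scaled_sum(rand(100))                # same code, pooling bypassed at runtime
```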
diff --git a/docs/src/advanced/multi-threading.md b/docs/src/features/multi-threading.md similarity index 100% rename from docs/src/advanced/multi-threading.md rename to docs/src/features/multi-threading.md diff --git a/docs/src/index.md b/docs/src/index.md index c89a222..82d9275 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -64,7 +64,7 @@ end | Allocations | ⚠️ 90,000 (2.75 GiB) | ✅ **0** | 100% eliminated | | GC Time | ⚠️ 31% | ✅ **0%** | No GC pauses | -> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](usage/cuda.md). +> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/cuda). ## How It Works @@ -76,11 +76,11 @@ end This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors. -`acquire!` returns lightweight views (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK. If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`—see [API Reference](usage/api.md). +`acquire!` returns lightweight views (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK. If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`—see [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api). -> **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](guide/safety.md). +> **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/guide/safety). -**Thread-safe by design**: Each Julia Task gets its own independent pool—no locks needed. See [Multi-Threading](advanced/multi-threading.md) for patterns. +**Thread-safe by design**: Each Julia Task gets its own independent pool—no locks needed. See [Multi-Threading](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/advanced/multi-threading) for patterns. ### Convenience Functions @@ -92,7 +92,7 @@ Common initialization patterns have convenience functions: | `ones!(pool, Float32, 3, 3)` | `acquire!` + `fill!(1)` | | `similar!(pool, A)` | `acquire!` matching `eltype(A)`, `size(A)` | -These return views like `acquire!`. For raw `Array` types, use `unsafe_acquire!` or its convenience variants (`unsafe_zeros!`, `unsafe_ones!`, `unsafe_similar!`). See [API Reference](usage/api.md#convenience-functions). +These return views like `acquire!`. For raw `Array` types, use `unsafe_acquire!` or its convenience variants (`unsafe_zeros!`, `unsafe_ones!`, `unsafe_similar!`). See [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api#convenience-functions). 
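For instance, a single pooled scope can mix these helpers; the snippet below is a minimal sketch using only the functions listed above.

```julia
using AdaptiveArrayPools

@with_pool pool begin
    Z = zeros!(pool, Float64, 4, 4)       # view, zero-filled
    O = ones!(pool, Float32, 3, 3)        # view, one-filled
    S = similar!(pool, Z)                 # view matching eltype and size of Z
    fill!(S, 2.0)
    W = unsafe_zeros!(pool, Float64, 8)   # native Vector{Float64}, zero-filled
    sum(Z) + sum(O) + sum(S) + sum(W)
end
```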
## Installation @@ -106,11 +106,11 @@ Pkg.add("AdaptiveArrayPools") | Guide | Description | |-------|-------------| -| [API Reference](usage/api.md) | Complete function and macro reference | -| [CUDA Backend](usage/cuda.md) | GPU-specific usage and examples | -| [Safety Guide](guide/safety.md) | Scope rules and best practices | -| [Multi-Threading](advanced/multi-threading.md) | Task/thread safety patterns | -| [Configuration](usage/configuration.md) | Preferences and cache tuning | +| [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api) | Complete function and macro reference | +| [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/cuda) | GPU-specific usage and examples | +| [Safety Guide](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/guide/safety) | Scope rules and best practices | +| [Multi-Threading](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/advanced/multi-threading) | Task/thread safety patterns | +| [Configuration](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/configuration) | Preferences and cache tuning | ## License diff --git a/docs/src/usage/api.md b/docs/src/reference/api.md similarity index 100% rename from docs/src/usage/api.md rename to docs/src/reference/api.md From b7cb5bf5c65386c45cbcc3c2016c87744c6fe4e3 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 12:29:35 -0800 Subject: [PATCH 8/8] docs: improve technical accuracy and add advanced pool patterns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add advanced/pool-patterns.md with direct pool access pattern - Fix memory semantics: "recycled" → "marked available for reuse" - Add warning admonition explaining undefined behavior after scope - Clarify unsafe_acquire! is zero-allocation on cache hit - Add tip admonition for cache behavior documentation - Separate basics and advanced pool patterns into distinct sections --- README.md | 2 +- docs/make.jl | 11 +- docs/src/advanced/pool-patterns.md | 199 ++++++++++++++++++++++++++ docs/src/architecture/how-it-works.md | 2 +- docs/src/basics/api-essentials.md | 7 +- docs/src/basics/quick-start.md | 2 +- docs/src/basics/safety-rules.md | 27 ++-- docs/src/basics/with-pool-patterns.md | 74 ++++------ docs/src/index.md | 2 +- 9 files changed, 258 insertions(+), 68 deletions(-) create mode 100644 docs/src/advanced/pool-patterns.md diff --git a/README.md b/README.md index 248854a..0d572ab 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ end 1. **Checkpoint** — Saves current pool state when entering the block 2. **Acquire** — `acquire!` returns arrays backed by pooled memory -3. **Rewind** — When the block ends, all acquired arrays are recycled for reuse +3. **Rewind** — When the block ends, all acquired arrays are marked available for reuse This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors. 
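As a minimal before/after sketch (the function names are illustrative, not part of the package), the swap usually amounts to replacing the constructor with the pooled equivalent:

```julia
using AdaptiveArrayPools

# Allocating version: a fresh scratch buffer on every call
function rowsums(A)
    buf = zeros(size(A, 1))
    for j in axes(A, 2)
        buf .+= @view A[:, j]
    end
    return sum(buf)
end

# Pooled version: same logic, scratch buffer comes from the task-local pool
@with_pool pool function rowsums_pooled(A)
    buf = zeros!(pool, Float64, size(A, 1))
    for j in axes(A, 2)
        buf .+= @view A[:, j]
    end
    return sum(buf)
end
```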
diff --git a/docs/make.jl b/docs/make.jl
index f385e83..d441c21 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -83,14 +83,17 @@ makedocs(
         "Home" => "index.md",
         "Basics" => [
             "Quick Start" => "basics/quick-start.md",
-            "@with_pool Patterns" => "basics/with-pool-patterns.md",
+            "`@with_pool` Patterns" => "basics/with-pool-patterns.md",
             "Essential API" => "basics/api-essentials.md",
             "Safety Rules" => "basics/safety-rules.md",
         ],
+        "Advanced" => [
+            "Pool Patterns" => "advanced/pool-patterns.md",
+            "Multi-threading" => "features/multi-threading.md",
+        ],
         "Features" => [
-            "@maybe_with_pool" => "features/maybe-with-pool.md",
+            "`@maybe_with_pool`" => "features/maybe-with-pool.md",
            "CUDA Support" => "features/cuda-support.md",
-            "Multi-threading" => "features/multi-threading.md",
             "Configuration" => "features/configuration.md",
         ],
         "Reference" => [
@@ -99,7 +102,7 @@ makedocs(
         "Architecture" => [
             "How It Works" => "architecture/how-it-works.md",
             "Type Dispatch & Cache" => "architecture/type-dispatch.md",
-            "@with_pool Internals" => "architecture/macro-internals.md",
+            "`@with_pool` Internals" => "architecture/macro-internals.md",
             "Design Documents" => "architecture/design-docs.md",
         ],
     ],
diff --git a/docs/src/advanced/pool-patterns.md b/docs/src/advanced/pool-patterns.md
new file mode 100644
index 0000000..8aedc3e
--- /dev/null
+++ b/docs/src/advanced/pool-patterns.md
@@ -0,0 +1,199 @@
+# Advanced Pool Patterns
+
+This page covers advanced usage patterns for experienced users.
+
+## Calling Other `@with_pool` Functions
+
+Each `@with_pool` function manages its own checkpoint. They can call each other freely:
+
+```julia
+@with_pool pool function step1(n)
+    A = zeros!(pool, Float64, n)
+    fill!(A, 1.0)
+    return sum(A)
+end
+
+@with_pool pool function step2(n)
+    B = zeros!(pool, Float64, n)
+    fill!(B, 2.0)
+    return sum(B)
+end
+
+@with_pool pool function pipeline(n)
+    a = step1(n)   # step1's arrays marked for reuse when it returns
+    b = step2(n)   # step2's arrays marked for reuse when it returns
+    C = acquire!(pool, Float64, n)
+    fill!(C, a + b)
+    return sum(C)
+end
+```
+
+## Passing Pool as Argument
+
+For complex call hierarchies, use `@with_pool` only at the top level and pass the pool through function arguments:
+
+```julia
+# Inner functions receive pool as argument - no @with_pool needed
+function compute_step!(pool, data, result)
+    temp = acquire!(pool, Float64, length(data))
+    temp .= data .* 2
+    result[] += sum(temp)
+end
+
+function process_chunk!(pool, chunk, result)
+    temp = zeros!(pool, Float64, length(chunk))
+    temp .= chunk                        # stage the chunk in pooled memory
+    compute_step!(pool, temp, result)    # forward the pool and the Ref accumulator
+end
+
+# Only the entry point uses @with_pool
+@with_pool pool function main_computation(chunks)
+    result = Ref(0.0)
+    for chunk in chunks
+        process_chunk!(pool, chunk, result)
+    end
+    return result[]
+end
+```
+
+**Benefits:**
+- Single checkpoint/rewind at top level
+- Inner functions are simpler (no macro needed)
+- Pool lifetime is explicit and controlled
+
+## Direct Pool Access in Inner Functions
+
+An alternative to passing the pool as an argument: inner functions call `get_task_local_pool()` directly, while a top-level `@with_pool` function controls the lifecycle.
+```julia
+# Inner functions access pool directly - no argument needed
+function compute_step!(data, result)
+    pool = get_task_local_pool()   # Direct access
+    temp = acquire!(pool, Float64, length(data))
+    temp .= data .* 2
+    result[] += sum(temp)
+    # temp NOT released here - stays active
+end
+
+function process_chunk!(chunk, accumulator)
+    pool = get_task_local_pool()   # Direct access
+    buffer = zeros!(pool, Float64, length(chunk))
+    buffer .= chunk                       # stage the chunk in pooled memory
+    compute_step!(buffer, accumulator)    # accumulator is the Ref from the caller
+    # buffer NOT released here - stays active
+end
+
+# Top-level controls lifecycle with @with_pool
+@with_pool pool function main_pipeline(chunks)
+    # checkpoint!() ─────────────────────────────────┐
+    accumulator = Ref(0.0)                          # │
+    for chunk in chunks                             # │
+        process_chunk!(chunk, accumulator)          # │ All arrays from
+        # └─ compute_step! allocates temp           # │ inner functions
+        # └─ process_chunk! allocates buffer        # │ accumulate here
+    end                                             # │
+    return accumulator[]                            # │
+    # rewind!() ─────────────────────────────────────┘
+    # └─ ALL arrays (temp, buffer, ...) marked for reuse
+end
+```
+
+### Memory Flow Visualization
+
+```
+main_pipeline(chunks)                    Inner Functions
+  │
+  checkpoint!()
+  │
+  ├──► process_chunk!()
+  │        │
+  │        ├──► get_task_local_pool() ──► buffer allocated
+  │        │
+  │        └──► compute_step!()
+  │                 │
+  │                 └──► get_task_local_pool() ──► temp allocated
+  │
+  ├──► process_chunk!()  (next iteration)
+  │        └──► ... more allocations ...
+  │
+  ▼
+  rewind!() ◄─────── ALL arrays marked for reuse
+```
+
+### ⚠️ User Responsibility Warning
+
+This pattern requires **you** to guarantee that inner functions are **always** called through a `@with_pool` entry point:
+
+```julia
+# SAFE: Called through main_pipeline
+main_pipeline(my_chunks)   # ✓ Lifecycle managed
+
+# DANGEROUS: Direct call without @with_pool wrapper
+compute_step!(some_data, some_ref)   # ✗ No checkpoint/rewind!
+# └─ Arrays allocated but NEVER marked for reuse → the pool grows without bound
+```
+
+**When to use this pattern:**
+- Deep call hierarchies where threading the pool through every function is tedious
+- Performance-critical code where you want to avoid argument passing overhead
+- You can enforce that all entry points use `@with_pool`
+
+**When to prefer "Passing Pool as Argument":**
+- Functions may be called from various contexts (some pooled, some not)
+- Library code where you can't control the caller
+- You want explicit documentation of the pool dependency in function signatures
+
+## Manual Checkpoint/Rewind
+
+For fine-grained control, use `checkpoint!` and `rewind!` directly:
+
+```julia
+function manual_control()
+    pool = get_task_local_pool()
+
+    checkpoint!(pool)
+    try
+        A = acquire!(pool, Float64, 100)
+        B = acquire!(pool, Float64, 100)
+        # ... compute ...
+        return sum(A) + sum(B)
+    finally
+        rewind!(pool)
+    end
+end
+```
+
+This is what `@with_pool` generates internally. Use manual control when:
+- Integrating with existing try/catch blocks
+- Conditional checkpoint/rewind logic is needed
+- Building custom pool management abstractions
+
+## Scope-Only `@with_pool`
+
+You can omit the pool name when inner functions handle their own `acquire!` calls:
+
+```julia
+@with_pool p function step1()
+    v = acquire!(p, Float64, 100)
+    sum(v)
+end
+
+@with_pool p function step2()
+    v = acquire!(p, Float64, 200)
+    sum(v)
+end
+
+# Outer function just provides scope management
+@with_pool function orchestrate()
+    a = step1()
+    b = step2()
+    return a + b
+end
+```
+
+The name-less `@with_pool` still performs checkpoint/rewind but doesn't expose the pool variable.
This is useful when you're orchestrating other `@with_pool` functions. + +## See Also + +- [`@with_pool` Patterns](../basics/with-pool-patterns.md) - Basic usage patterns +- [Safety Rules](../basics/safety-rules.md) - Scope rules diff --git a/docs/src/architecture/how-it-works.md b/docs/src/architecture/how-it-works.md index 39840a2..94ac084 100644 --- a/docs/src/architecture/how-it-works.md +++ b/docs/src/architecture/how-it-works.md @@ -32,7 +32,7 @@ The core mechanism that enables memory reuse: | C = acquire!(pool, ...) # n_active += 1 | ... compute ... | - +---> rewind!(pool) # Restore n_active, arrays recycled + +---> rewind!(pool) # Restore n_active, arrays available for reuse end ``` diff --git a/docs/src/basics/api-essentials.md b/docs/src/basics/api-essentials.md index 1c2e63e..b0c5e42 100644 --- a/docs/src/basics/api-essentials.md +++ b/docs/src/basics/api-essentials.md @@ -20,7 +20,7 @@ end ### `unsafe_acquire!(pool, T, dims...)` -Returns a native `Array` type. Only use when you specifically need `Array{T,N}`: +Returns a native `Array` type. **Zero-allocation on cache hit**—only allocates a small header (~80-144 bytes) on cache miss. Use when you specifically need `Array{T,N}`: ```julia @with_pool pool begin @@ -34,6 +34,9 @@ Returns a native `Array` type. Only use when you specifically need `Array{T,N}`: end ``` +!!! tip "Cache behavior" + Same dimension pattern → **0 bytes**. Different pattern → 80-144 bytes header only (data memory always reused). See [N-Way Cache](../architecture/type-dispatch.md#n-way-set-associative-cache) for details. + ## Convenience Functions Zero-initialized arrays: @@ -106,7 +109,7 @@ end | Function | Returns | Allocation | Use Case | |----------|---------|------------|----------| | `acquire!(pool, T, dims...)` | View type | 0 bytes | Default choice | -| `unsafe_acquire!(pool, T, dims...)` | `Array{T,N}` | 0-144 bytes | FFI, type constraints | +| `unsafe_acquire!(pool, T, dims...)` | `Array{T,N}` | 0 (hit) / 80-144 (miss) | FFI, type constraints | | `zeros!(pool, [T,] dims...)` | View type | 0 bytes | Zero-initialized | | `ones!(pool, [T,] dims...)` | View type | 0 bytes | One-initialized | | `similar!(pool, A)` | View type | 0 bytes | Match existing array | diff --git a/docs/src/basics/quick-start.md b/docs/src/basics/quick-start.md index 2c4f89a..87eaa5b 100644 --- a/docs/src/basics/quick-start.md +++ b/docs/src/basics/quick-start.md @@ -95,7 +95,7 @@ Arrays from the pool are **only valid within the `@with_pool` scope**: # DO NOT return pool-backed arrays @with_pool pool function bad_example() A = acquire!(pool, Float64, 10) - return A # WRONG - A will be recycled after this scope! + return A # WRONG - A marked for reuse, data may be overwritten! end # Return computed values instead diff --git a/docs/src/basics/safety-rules.md b/docs/src/basics/safety-rules.md index ab4685c..0faec74 100644 --- a/docs/src/basics/safety-rules.md +++ b/docs/src/basics/safety-rules.md @@ -11,7 +11,7 @@ AdaptiveArrayPools achieves zero allocation by reusing memory across calls. This | | | Pool arrays are ONLY valid within their @with_pool scope | | | -| When the scope ends, the memory is recycled. | +| When the scope ends, arrays are marked for reuse. | | Using arrays after scope ends = UNDEFINED BEHAVIOR | | | +-------------------------------------------------------------+ @@ -29,15 +29,15 @@ AdaptiveArrayPools achieves zero allocation by reusing memory across calls. 
This | Pattern | Example | Why It Fails | |---------|---------|--------------| -| Return array | `return v` | Array recycled after return | -| Store in global | `global_ref = v` | Points to recycled memory | -| Capture in closure | `() -> sum(v)` | v invalid when closure runs | +| Return array | `return v` | Array marked for reuse after return | +| Store in global | `global_ref = v` | Points to reusable memory | +| Capture in closure | `() -> sum(v)` | v may be overwritten when closure runs | --- ## The Scope Rule in Detail -When `@with_pool` ends, all arrays acquired within that scope are recycled. Using them after the scope ends leads to undefined behavior. +When `@with_pool` ends, all arrays acquired within that scope are **marked available for reuse**—not immediately freed. This is what makes zero-allocation possible on subsequent calls. ```julia @with_pool pool begin @@ -46,9 +46,18 @@ When `@with_pool` ends, all arrays acquired within that scope are recycled. Usin result = sum(v) # ✅ compute and return values copied = copy(v) # ✅ copy if you need data outside end -# v is no longer valid here +# v is no longer valid here - it's marked for reuse ``` +!!! warning "Why Undefined Behavior?" + After scope ends, using `v` is undefined because: + + - **Subsequent `acquire!` calls may overwrite the data** — the memory is available for reuse + - **Task termination may trigger GC** — the pool itself could be garbage collected + - **It might "work" by luck** — data unchanged until next acquire, but don't rely on this + + The worst case is **silent data corruption**: your code appears to work but produces wrong results intermittently. + ## What NOT to Do ### Don't return pool-backed arrays @@ -57,7 +66,7 @@ end # ❌ Wrong: returning the array itself @with_pool pool function bad_example() v = acquire!(pool, Float64, 100) - return v # v will be recycled after this returns! + return v # v marked for reuse after return! end # ✅ Correct: return computed values or copies @@ -75,12 +84,12 @@ global_ref = nothing @with_pool pool begin global_ref = acquire!(pool, Float64, 100) end -# global_ref now points to recycled memory +# global_ref now points to reusable memory - data may be overwritten # ❌ Wrong: capturing in closure @with_pool pool begin v = acquire!(pool, Float64, 100) - callback = () -> sum(v) # v captured but will be invalid + callback = () -> sum(v) # v captured but may be overwritten later end ``` diff --git a/docs/src/basics/with-pool-patterns.md b/docs/src/basics/with-pool-patterns.md index 83906dd..e77aa88 100644 --- a/docs/src/basics/with-pool-patterns.md +++ b/docs/src/basics/with-pool-patterns.md @@ -1,55 +1,55 @@ -# @with_pool Patterns +# `@with_pool` Patterns The `@with_pool` macro provides automatic memory lifecycle management. It supports two usage patterns depending on your needs. +## Pool Name: Choose Any Identifier + +The first argument to `@with_pool` is a **variable name** you choose - it doesn't have to be `pool`: + +```julia +@with_pool p function foo() ... end +@with_pool mypool function bar() ... end +@with_pool scratch function baz() ... end +``` + +Use whatever name makes your code clearest. + ## Pattern 1: Function Decorator -Wraps an entire function with pool management. The pool is active for the function's full duration. +Wraps an entire function with pool management: ```julia @with_pool pool function compute(n) A = acquire!(pool, Float64, n, n) B = zeros!(pool, Float64, n) - - # ... compute with A and B ... 
- - return sum(A) + sum(B) # Return computed values, not arrays + # ... compute ... + return sum(A) + sum(B) end -# Usage result = compute(100) # Zero-allocation after warmup ``` -**Best for:** -- Functions that exclusively use pooled arrays -- Hot-path functions called repeatedly -- Clear ownership semantics +**Best for:** Functions that exclusively use pooled arrays, hot-path functions. ## Pattern 2: Block Wrapper -Wraps only a portion of a function. Useful when you need pool arrays for part of the computation. +Wraps only a portion of a function: ```julia function process_data(data) - # Pre-processing (no pool needed) n = length(data) @with_pool pool begin - # Pool is only active inside this block temp = acquire!(pool, Float64, n) temp .= data .* 2 result = sum(temp) - end # Pool arrays recycled here + end # temp marked for reuse here - # Post-processing return result * 1.5 end ``` -**Best for:** -- Functions with mixed allocation needs -- Gradual adoption in existing code -- Fine-grained scope control +**Best for:** Functions with mixed allocation needs, gradual adoption. ## Pattern Comparison @@ -58,50 +58,26 @@ end | Scope | Entire function | begin...end block | | Syntax | `@with_pool pool function ...` | `@with_pool pool begin ... end` | | Pool lifetime | Function start to return | Block entry to exit | -| Nesting | Functions can call each other | Blocks can be nested | - -## Nested Pools - -Both patterns support nesting. Each scope maintains independent checkpoint state: - -```julia -@with_pool pool function outer(n) - A = acquire!(pool, Float64, n) - - @with_pool pool begin - # Inner scope - new checkpoint - B = acquire!(pool, Float64, n * 2) - inner_result = sum(B) - end # B recycled here - - # A still valid here - return sum(A) + inner_result -end -``` ## Common Mistakes -### Returning pool arrays (wrong) - ```julia +# WRONG: returning the array itself @with_pool pool function bad() v = acquire!(pool, Float64, 100) - return v # v is recycled after return! + return v # v marked for reuse after return! end -``` -### Correct: return computed values - -```julia +# CORRECT: return computed values @with_pool pool function good() v = acquire!(pool, Float64, 100) return sum(v) # Scalar result is safe end -# Or copy if you need the array +# CORRECT: return a copy if you need the data @with_pool pool function also_good() v = acquire!(pool, Float64, 100) - return copy(v) # Explicit copy is safe + return copy(v) end ``` diff --git a/docs/src/index.md b/docs/src/index.md index 82d9275..8b95f77 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -72,7 +72,7 @@ end 1. **Checkpoint** — Saves current pool state when entering the block 2. **Acquire** — `acquire!` returns arrays backed by pooled memory -3. **Rewind** — When the block ends, all acquired arrays are recycled for reuse +3. **Rewind** — When the block ends, all acquired arrays are marked available for reuse This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors.