From 5de5de39c4e01e9cecf62a7cc60d8d6a8417cd38 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 10:14:28 -0800 Subject: [PATCH 1/8] docs: add GitHub Pages documentation with Documenter.jl - Add .github/workflows/Documentation.yml for automated deployment - Add docs/Project.toml with Documenter.jl dependencies - Add docs/make.jl with path rewriting and page organization - Add new documentation pages: - guide/getting-started.md: Quick start guide - advanced/macro-internals.md: How @with_pool works - advanced/internals.md: Pool architecture overview - Organize existing docs into structured hierarchy: - Guide: Getting Started, Safety Rules - Usage: API Reference, Configuration, @maybe_with_pool, CUDA - Advanced: Multi-threading, Macro Internals, Internals --- .github/workflows/Documentation.yml | 53 +++++ docs/Project.toml | 7 + docs/make.jl | 143 ++++++++++++++ docs/src/advanced/internals.md | 136 +++++++++++++ docs/src/advanced/macro-internals.md | 256 ++++++++++++++++++++++++ docs/src/advanced/multi-threading.md | 284 +++++++++++++++++++++++++++ docs/src/guide/getting-started.md | 116 +++++++++++ docs/src/guide/safety.md | 110 +++++++++++ docs/src/index.md | 117 +++++++++++ docs/src/usage/api.md | 111 +++++++++++ docs/src/usage/configuration.md | 102 ++++++++++ docs/src/usage/cuda.md | 123 ++++++++++++ docs/src/usage/maybe_with_pool.md | 53 +++++ 13 files changed, 1611 insertions(+) create mode 100644 .github/workflows/Documentation.yml create mode 100644 docs/Project.toml create mode 100644 docs/make.jl create mode 100644 docs/src/advanced/internals.md create mode 100644 docs/src/advanced/macro-internals.md create mode 100644 docs/src/advanced/multi-threading.md create mode 100644 docs/src/guide/getting-started.md create mode 100644 docs/src/guide/safety.md create mode 100644 docs/src/index.md create mode 100644 docs/src/usage/api.md create mode 100644 docs/src/usage/configuration.md create mode 100644 docs/src/usage/cuda.md create mode 100644 docs/src/usage/maybe_with_pool.md diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml new file mode 100644 index 0000000..5e1668d --- /dev/null +++ b/.github/workflows/Documentation.yml @@ -0,0 +1,53 @@ +name: Documentation + +on: + push: + branches: + - master + tags: '*' + pull_request: + branches: + - master + workflow_dispatch: + +concurrency: + group: ${{ github.ref }}-docs + cancel-in-progress: true + +jobs: + build: + permissions: + contents: write + statuses: write + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: julia-actions/setup-julia@latest + + - uses: julia-actions/cache@v2 + + - name: Add FuseRegistry + run: | + rm -rf ~/.julia/registries/FuseRegistry + julia -e 'using Pkg; Pkg.Registry.add(RegistrySpec(url="https://github.com/ProjectTorreyPines/FuseRegistry.jl.git")); Pkg.Registry.add("General"); Pkg.Registry.update()' + + - name: Replace git@github.com with https in Package.toml files + run: | + find ~/.julia/registries/FuseRegistry -type f -name 'Package.toml' -exec sed -i 's|git@github.com:|https://project-torrey-pines:${{secrets.PTP_READ_TOKEN}}@github.com/|g' {} + + + - name: Install dependencies + run: | + julia --project=docs -e ' + using Pkg + Pkg.activate("docs") + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate() + ' + + - name: Build and deploy + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} + run: julia --project=docs/ docs/make.jl diff --git a/docs/Project.toml 
b/docs/Project.toml new file mode 100644 index 0000000..d21f8f7 --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,7 @@ +[deps] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +AdaptiveArrayPools = "5768322a-0810-4546-8322-123456789abc" +LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" + +[compat] +Documenter = "1" diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..7517dbd --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,143 @@ +using Documenter +using AdaptiveArrayPools + +# ============================================ +# Helper: Conditional write (for LiveServer compatibility) +# ============================================ + +""" +Write file only if content changed (prevents LiveServer infinite loop). +""" +function write_if_changed(path::String, content::String) + if isfile(path) && read(path, String) == content + return # Content unchanged, skip write + end + write(path, content) +end + +""" +Copy file only if content changed (prevents mtime update triggering rebuild). +""" +function cp_if_changed(src::String, dst::String) + if isfile(dst) && read(src) == read(dst) + return # Content unchanged, skip copy + end + cp(src, dst; force=true) +end + +# ============================================ +# Helper: Rewrite relative paths in README +# ============================================ + +""" +Rewrite relative paths in README.md for Documenter structure. + +Converts: +- `docs/api.md` → `usage/api.md` +- `docs/cuda.md` → `usage/cuda.md` +- `docs/safety.md` → `guide/safety.md` +- `docs/multi-threading.md` → `advanced/multi-threading.md` +- `docs/configuration.md` → `usage/configuration.md` +- `docs/maybe_with_pool.md` → `usage/maybe_with_pool.md` + +Also handles anchor links (e.g., `docs/api.md#convenience-functions`). +""" +function rewrite_readme_paths(content::String) + repo_url = "https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl" + + # Usage docs (with optional anchors) + content = replace(content, r"\(docs/api\.md(#[^)]+)?\)" => s"(usage/api.md\1)") + content = replace(content, r"\(docs/cuda\.md(#[^)]+)?\)" => s"(usage/cuda.md\1)") + content = replace(content, r"\(docs/configuration\.md(#[^)]+)?\)" => s"(usage/configuration.md\1)") + content = replace(content, r"\(docs/maybe_with_pool\.md(#[^)]+)?\)" => s"(usage/maybe_with_pool.md\1)") + + # Guide docs + content = replace(content, r"\(docs/safety\.md(#[^)]+)?\)" => s"(guide/safety.md\1)") + + # Advanced docs + content = replace(content, r"\(docs/multi-threading\.md(#[^)]+)?\)" => s"(advanced/multi-threading.md\1)") + + # LICENSE link → GitHub + content = replace(content, "(LICENSE)" => "($(repo_url)/blob/master/LICENSE)") + + return content +end + +# ============================================ +# Step 1: Setup directories +# ============================================ + +const DOCS_DIR = @__DIR__ +const DOCS_SRC = joinpath(DOCS_DIR, "src") + +# Create directory structure +mkpath(DOCS_SRC) +mkpath(joinpath(DOCS_SRC, "guide")) +mkpath(joinpath(DOCS_SRC, "usage")) +mkpath(joinpath(DOCS_SRC, "advanced")) + +# ============================================ +# Step 2: Copy and transform content +# ============================================ + +# README.md → index.md (with path rewriting) +readme_content = read(joinpath(DOCS_DIR, "../README.md"), String) +write_if_changed(joinpath(DOCS_SRC, "index.md"), rewrite_readme_paths(readme_content)) + +# Copy existing docs to their new locations (with path fixes) + +# Guide section - fix relative links +safety_content = read(joinpath(DOCS_DIR, "safety.md"), 
String) +safety_content = replace(safety_content, "(multi-threading.md)" => "(../advanced/multi-threading.md)") +write_if_changed(joinpath(DOCS_SRC, "guide/safety.md"), safety_content) + +# Usage section +cp_if_changed(joinpath(DOCS_DIR, "api.md"), joinpath(DOCS_SRC, "usage/api.md")) +cp_if_changed(joinpath(DOCS_DIR, "configuration.md"), joinpath(DOCS_SRC, "usage/configuration.md")) +cp_if_changed(joinpath(DOCS_DIR, "maybe_with_pool.md"), joinpath(DOCS_SRC, "usage/maybe_with_pool.md")) +cp_if_changed(joinpath(DOCS_DIR, "cuda.md"), joinpath(DOCS_SRC, "usage/cuda.md")) + +# Advanced section +cp_if_changed(joinpath(DOCS_DIR, "multi-threading.md"), joinpath(DOCS_SRC, "advanced/multi-threading.md")) + +# ============================================ +# Step 3: Build documentation +# ============================================ + +makedocs( + sitename = "AdaptiveArrayPools.jl", + authors = "Min-Gu Yoo", + modules = [AdaptiveArrayPools], + format = Documenter.HTML( + prettyurls = get(ENV, "CI", nothing) == "true", + canonical = "https://projecttorreypines.github.io/AdaptiveArrayPools.jl", + assets = String[], + ), + pages = [ + "Home" => "index.md", + "Guide" => [ + "Getting Started" => "guide/getting-started.md", + "Safety Rules" => "guide/safety.md", + ], + "Usage" => [ + "API Reference" => "usage/api.md", + "Configuration" => "usage/configuration.md", + "@maybe_with_pool" => "usage/maybe_with_pool.md", + "CUDA Support" => "usage/cuda.md", + ], + "Advanced" => [ + "Multi-threading" => "advanced/multi-threading.md", + "How @with_pool Works" => "advanced/macro-internals.md", + "Internals" => "advanced/internals.md", + ], + ], + doctest = false, # Doctests not set up in existing docs + checkdocs = :none, # Using manual API tables, not @autodocs + warnonly = [:cross_references, :missing_docs], +) + +deploydocs( + repo = "github.com/ProjectTorreyPines/AdaptiveArrayPools.jl.git", + devbranch = "master", + push_preview = false, # Deploy only on master/tag, not on PR +) diff --git a/docs/src/advanced/internals.md b/docs/src/advanced/internals.md new file mode 100644 index 0000000..3722203 --- /dev/null +++ b/docs/src/advanced/internals.md @@ -0,0 +1,136 @@ +# Internals + +This page provides an overview of the internal architecture of AdaptiveArrayPools.jl. For detailed design documents (in Korean), see the `design/` folder in the repository. + +## Checkpoint/Rewind Lifecycle + +The core mechanism that enables zero-allocation reuse: + +``` +@with_pool pool function foo() + │ + ├─► checkpoint!(pool) # Save current state (n_active counters) + │ + │ A = acquire!(pool, ...) # n_active += 1 + │ B = acquire!(pool, ...) # n_active += 1 + │ C = acquire!(pool, ...) # n_active += 1 + │ ... compute ... + │ + └─► rewind!(pool) # Restore n_active → all arrays recycled +end +``` + +On repeated calls, the same memory is reused without any allocation. + +## Fixed-Slot Type Dispatch + +To achieve zero-lookup overhead, common types have dedicated struct fields: + +```julia +struct AdaptiveArrayPool + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + others::IdDict{DataType, Any} # Fallback for rare types +end +``` + +When you call `acquire!(pool, Float64, n)`, the compiler inlines directly to `pool.float64` — no dictionary lookup, no type instability. 
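The sketch below shows how this kind of dispatch can be expressed. It is a simplified illustration of the idea rather than the package's exact implementation; the real `get_typed_pool!` covers every fixed slot, and the no-argument `TypedPool{T}()` constructor is assumed here.

```julia
# Sketch: one specialized method per fixed slot, so a call like
# get_typed_pool!(pool, Float64) compiles down to a plain field access.
@inline get_typed_pool!(pool::AdaptiveArrayPool, ::Type{Float64}) = pool.float64
@inline get_typed_pool!(pool::AdaptiveArrayPool, ::Type{Float32}) = pool.float32
# ...one method per fixed slot...

# Rare element types fall back to the IdDict (a lookup, but only off the hot path).
@inline function get_typed_pool!(pool::AdaptiveArrayPool, ::Type{T}) where {T}
    tp = get!(() -> TypedPool{T}(), pool.others, T)  # assumed no-arg constructor
    return tp::TypedPool{T}  # concrete assertion keeps callers type-stable
end
```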
+ +## N-Way Set Associative Cache + +For `unsafe_acquire!` (which returns native `Array` types), we use an N-way cache to reduce header allocation: + +``` + CACHE_WAYS = 4 (default) + ┌────┬────┬────┬────┐ +Slot 0 (Float64): │way0│way1│way2│way3│ ← round-robin eviction + └────┴────┴────┴────┘ + ┌────┬────┬────┬────┐ +Slot 1 (Float32): │way0│way1│way2│way3│ + └────┴────┴────┴────┘ + ... +``` + +### Cache Lookup Pseudocode + +```julia +function unsafe_acquire!(pool, T, dims...) + typed_pool = get_typed_pool!(pool, T) + slot = n_active + 1 + base = (slot - 1) * CACHE_WAYS + + # Search all ways for matching dimensions + for k in 1:CACHE_WAYS + idx = base + k + if dims == typed_pool.nd_dims[idx] + # Cache hit! Check if underlying vector was resized + if pointer matches + return typed_pool.nd_arrays[idx] + end + end + end + + # Cache miss: create new Array header, store in next way (round-robin) + way = typed_pool.nd_next_way[slot] + typed_pool.nd_next_way[slot] = (way + 1) % CACHE_WAYS + # ... create and cache Array ... +end +``` + +**Key insight**: Even on cache miss, only the `Array` header (~80-144 bytes) is allocated. The actual data memory is always reused from the pool. + +## View vs Array Return Types + +Type stability is critical for performance. AdaptiveArrayPools provides two APIs: + +| API | 1D Return | N-D Return | Allocation | +|-----|-----------|------------|------------| +| `acquire!` | `SubArray{T,1}` | `ReshapedArray{T,N}` | Always 0 bytes | +| `unsafe_acquire!` | `Vector{T}` | `Array{T,N}` | 0 bytes (hit) / ~100 bytes (miss) | + +### Why Two APIs? + +**`acquire!` (views)** — The compiler can eliminate view wrappers entirely through SROA (Scalar Replacement of Aggregates) and escape analysis. This is why 1D `SubArray` and N-D `ReshapedArray` achieve true zero allocation. + +**`unsafe_acquire!` (arrays)** — Sometimes you need a concrete `Array` type: +- FFI/C interop requiring `Ptr{T}` from contiguous memory +- Type signatures that explicitly require `Array{T,N}` +- Avoiding runtime dispatch in polymorphic code + +## Typed Checkpoint/Rewind Optimization + +When the `@with_pool` macro can statically determine which types are used, it generates optimized code: + +```julia +# If only Float64 is used in the block: +checkpoint!(pool, Float64) # ~77% faster than full checkpoint +# ... compute ... +rewind!(pool, Float64) +``` + +This avoids iterating over all type slots and the `others` IdDict. + +## 1-Based Sentinel Pattern + +Internal state vectors use a sentinel at index 0 to eliminate `isempty()` checks: + +```julia +_checkpoint_n_active = [0] # Sentinel at depth=0 +_checkpoint_depths = [0] # Global scope marker +``` + +This pattern reduces branching in hot paths where every nanosecond counts. 
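The practical payoff is that the hot-path read of the most recent checkpoint never needs an emptiness guard. A minimal illustration of the idea, using the field names sketched above (the actual `checkpoint!`/`rewind!` code does more bookkeeping):

```julia
# Without the sentinel, reading the last checkpoint needs a branch:
#   isempty(tp._checkpoint_n_active) ? 0 : tp._checkpoint_n_active[end]
# With the sentinel entry always present, the read is branch-free:
@inline last_checkpoint(tp::TypedPool) = @inbounds tp._checkpoint_n_active[end]
```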
+ +## Further Reading + +For detailed design documents (in Korean): +- `design/hybrid_api_design.md` — Two-API strategy rationale +- `design/cuda_extension_design.md` — CUDA backend architecture +- `design/untracked_acquire_design.md` — Untracked acquire detection +- `design/fixed_slots_codegen_design.md` — Code generation for fixed slots +- `design/nd_array_approach_comparison.md` — N-way cache design comparison diff --git a/docs/src/advanced/macro-internals.md b/docs/src/advanced/macro-internals.md new file mode 100644 index 0000000..e7aa69d --- /dev/null +++ b/docs/src/advanced/macro-internals.md @@ -0,0 +1,256 @@ +# How `@with_pool` Works + +This page explains the internal mechanics of the `@with_pool` macro for advanced users and contributors who want to understand the optimization strategies. + +## Overview + +The `@with_pool` macro provides automatic lifecycle management with three key optimizations: + +1. **Try-Finally Safety** — Guarantees cleanup even on exceptions +2. **Typed Checkpoint/Rewind** — Only saves/restores used types (~77% faster) +3. **Untracked Acquire Detection** — Safely handles `acquire!` calls outside macro visibility + +## Basic Lifecycle Flow + +``` +┌─────────────────────────────────────────────────────────────┐ +│ @with_pool pool function foo(x) │ +│ A = acquire!(pool, Float64, 100) │ +│ B = similar!(pool, A) │ +│ return sum(A) + sum(B) │ +│ end │ +└─────────────────────────────────────────────────────────────┘ + ↓ + ┌───────────────────────────────────┐ + │ Macro Transformation │ + └───────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ function foo(x) │ +│ pool = get_task_local_pool() │ +│ checkpoint!(pool, Float64) # ← Type-specific │ +│ try │ +│ A = _acquire_impl!(pool, Float64, 100) │ +│ B = _similar_impl!(pool, A) │ +│ return sum(A) + sum(B) │ +│ finally │ +│ rewind!(pool, Float64) # ← Type-specific │ +│ end │ +│ end │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Key Points + +- **`try-finally`** ensures `rewind!` executes even if an exception occurs +- `acquire!` → `_acquire_impl!` transformation bypasses untracked marking overhead +- Type-specific `checkpoint!(pool, Float64)` is ~77% faster than full checkpoint + +## Type Extraction: Static Analysis at Compile Time + +The macro analyzes the AST to extract types used in `acquire!` calls: + +```julia +# Macro sees these acquire! calls: +@with_pool pool begin + A = acquire!(pool, Float64, 10, 10) # → extracts Float64 + B = zeros!(pool, ComplexF64, 100) # → extracts ComplexF64 + C = similar!(pool, A) # → extracts eltype(A) → Float64 +end + +# Generated code uses typed checkpoint/rewind: +checkpoint!(pool, Float64, ComplexF64) +try + ... +finally + rewind!(pool, Float64, ComplexF64) +end +``` + +### Type Extraction Rules + +| Call Pattern | Extracted Type | +|--------------|----------------| +| `acquire!(pool, Float64, dims...)` | `Float64` | +| `acquire!(pool, x)` | `eltype(x)` (if x is external) | +| `zeros!(pool, dims...)` | `default_eltype(pool)` | +| `zeros!(pool, Float32, dims...)` | `Float32` | +| `similar!(pool, x)` | `eltype(x)` | +| `similar!(pool, x, Int64, ...)` | `Int64` | + +### When Type Extraction Fails → Full Checkpoint + +The macro falls back to full `checkpoint!(pool)` when: + +```julia +@with_pool pool begin + T = eltype(data) # T defined locally AFTER checkpoint + A = acquire!(pool, T, 100) # Can't use T at checkpoint time! 
+end +# → Falls back to checkpoint!(pool) / rewind!(pool) + +@with_pool pool begin + local_arr = compute() # local_arr defined AFTER checkpoint + B = similar!(pool, local_arr) # eltype(local_arr) unavailable +end +# → Falls back to checkpoint!(pool) / rewind!(pool) +``` + +## Untracked Acquire Detection + +### The Problem + +The macro can only see `acquire!` calls **directly in its AST**. Calls inside helper functions are invisible: + +```julia +function helper!(pool) + return zeros!(pool, Float64, 100) # Macro can't see this! +end + +@with_pool pool begin + A = acquire!(pool, Int64, 10) # ← Macro sees this (Int64) + B = helper!(pool) # ← Macro can't see Float64 inside! +end + +# If only checkpoint!(pool, Int64), Float64 arrays won't be rewound! +``` + +### The Solution: `_untracked_flags` + +Every `acquire!` call (and convenience functions) marks itself as "untracked": + +```julia +# Public API (called from user code outside macro) +@inline function acquire!(pool, ::Type{T}, n::Int) where {T} + _mark_untracked!(pool) # ← Sets flag! + _acquire_impl!(pool, T, n) +end + +# Macro-transformed calls skip the marking +# (because macro already knows about them) +_acquire_impl!(pool, T, n) # ← No flag +``` + +### Flow Diagram + +``` +@with_pool pool begin State of pool._untracked_flags + │ ───────────────────────────────── + ├─► checkpoint!(pool, Int64) depth=2, flag[2]=false + │ + │ A = _acquire_impl!(...) (macro-transformed, no flag set) + │ B = helper!(pool) + │ └─► zeros!(pool, Float64, N) + │ └─► _mark_untracked!(pool) flag[2]=TRUE ←──┐ + │ │ + │ ... more code ... │ + │ │ + └─► rewind! check: │ + if pool._untracked_flags[2] ─────────────────────────┘ + rewind!(pool) # Full rewind (safe) + else + rewind!(pool, Int64) # Typed rewind (fast) + end +end +``` + +### Why This Works + +1. **Macro-tracked calls**: Transformed to `_acquire_impl!` → no flag → typed rewind +2. **Untracked calls**: Use public API → sets flag → triggers full rewind +3. **Result**: Always safe, with optimization when possible + +## Nested `@with_pool` Handling + +Each `@with_pool` maintains its own checkpoint depth: + +``` +@with_pool p1 begin depth: 1 → 2 + v1 = acquire!(p1, Float64, 10) + │ + ├─► @with_pool p2 begin depth: 2 → 3 + │ v2 = acquire!(p2, Int64, 5) + │ helper!(p2) # sets flag[3]=true + │ sum(v2) + │ end depth: 3 → 2, flag[3] checked + │ + │ # v1 still valid here! + sum(v1) +end depth: 2 → 1, flag[2] checked +``` + +### Depth Tracking Data Structures + +```julia +struct AdaptiveArrayPool + # ... type pools ... + _current_depth::Int # Current scope depth (1 = global) + _untracked_flags::Vector{Bool} # Per-depth flag array +end + +# Initialized with sentinel: +_current_depth = 1 # Global scope +_untracked_flags = [false] # Sentinel for depth=1 +``` + +## Performance Impact + +| Scenario | Checkpoint Method | Relative Speed | +|----------|-------------------|----------------| +| 1 type, no untracked | `checkpoint!(pool, T)` | **~77% faster** | +| Multiple types, no untracked | `checkpoint!(pool, T1, T2, ...)` | **~50% faster** | +| Any untracked acquire | `checkpoint!(pool)` | Baseline | + +The optimization matters most in tight loops with many iterations. 
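One practical consequence of the table above, shown as a hedged sketch (it assumes the helper can accept a pre-acquired buffer): keeping every `acquire!` call in the macro body keeps it visible to the macro, so tight loops stay on the typed fast path.

```julia
# The helper receives a buffer, so the only acquire! is inside the macro body.
# The macro sees Float64 statically → checkpoint!(pool, Float64) / rewind!(pool, Float64).
helper_fill!(buf, x) = (buf .= x; sum(buf))

@with_pool pool function tight_loop(data)
    buf = acquire!(pool, Float64, length(data))
    return helper_fill!(buf, data)
end

# By contrast, a helper that calls zeros!(pool, ...) internally would set the
# untracked flag and force the full (slower, but still safe) checkpoint/rewind.
```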
+ +## Code Generation Summary + +```julia +# INPUT +@with_pool pool function compute(data) + A = acquire!(pool, Float64, length(data)) + result = helper!(pool, A) # May have untracked acquires + return result +end + +# OUTPUT (simplified) +function compute(data) + pool = get_task_local_pool() + + # Check if parent scope had untracked (for nested pools) + if pool._untracked_flags[pool._current_depth] + checkpoint!(pool) # Full checkpoint + else + checkpoint!(pool, Float64) # Typed checkpoint + end + + try + A = _acquire_impl!(pool, Float64, length(data)) + result = helper!(pool, A) + return result + finally + # Check if untracked acquires occurred in this scope + if pool._untracked_flags[pool._current_depth] + rewind!(pool) # Full rewind + else + rewind!(pool, Float64) # Typed rewind + end + end +end +``` + +## Key Internal Functions + +| Function | Purpose | +|----------|---------| +| `_extract_acquire_types(expr, pool_name)` | AST walk to find types | +| `_filter_static_types(types, local_vars)` | Filter out locally-defined types | +| `_transform_acquire_calls(expr, pool_name)` | Replace `acquire!` → `_acquire_impl!` | +| `_mark_untracked!(pool)` | Set untracked flag for current depth | +| `_generate_typed_checkpoint_call(pool, types)` | Generate `checkpoint!(pool, T...)` | + +## See Also + +- [Internals](internals.md) — Overview of pool architecture +- [Safety Rules](../guide/safety.md) — Scope rules and best practices +- [Configuration](../usage/configuration.md) — Performance tuning options diff --git a/docs/src/advanced/multi-threading.md b/docs/src/advanced/multi-threading.md new file mode 100644 index 0000000..4135019 --- /dev/null +++ b/docs/src/advanced/multi-threading.md @@ -0,0 +1,284 @@ +# Multi-Threading Guide + +AdaptiveArrayPools uses `task_local_storage()` for **task-local isolation**: each Julia Task gets its own independent pool. This design ensures thread safety when used correctly. + +## Table of Contents + +- [Understanding Julia's Task/Thread Model](#understanding-julias-taskthread-model) +- [How Pools Work with @threads](#how-pools-work-with-threads) +- [Safe Patterns](#safe-patterns) +- [Unsafe Patterns](#unsafe-patterns) +- [Why Task-Local (Not Thread-Local)?](#why-task-local-not-thread-local) +- [User Responsibility](#user-responsibility) + +--- + +## Understanding Julia's Task/Thread Model + +Julia uses an **M:N threading model** where multiple Tasks (lightweight coroutines) can run on multiple OS threads. + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Julia Process │ +│ │ +│ Thread 1 Thread 2 Thread 3 │ +│ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │ Task A │ │ Task C │ │ Task E │ │ +│ │ (TLS-A) │ │ (TLS-C) │ │ (TLS-E) │ │ +│ └─────────┘ └─────────┘ └─────────┘ │ +│ ┌─────────┐ ┌─────────┐ │ +│ │ Task B │ │ Task D │ │ +│ │ (TLS-B) │ │ (TLS-D) │ │ +│ └─────────┘ └─────────┘ │ +└─────────────────────────────────────────────────────────────┘ +``` + +Key concepts: + +| Concept | Description | +|---------|-------------| +| **Thread** | OS-level execution unit. Fixed count at Julia startup. | +| **Task** | Julia's lightweight coroutine (Green Thread). Created dynamically. | +| **task_local_storage()** | Per-Task storage. Each Task has its own isolated TLS. | + +### Important: One Thread Can Run Multiple Tasks + +A single thread can execute multiple Tasks by switching between them at **yield points** (I/O, `sleep()`, `yield()`, etc.): + +```julia +# Both tasks run on Thread 1, interleaved! 
+task_a = @spawn begin + println("A start") + sleep(0.1) # yield point - switch to Task B + println("A end") +end + +task_b = @spawn begin + println("B start") + sleep(0.1) # yield point - switch back to Task A + println("B end") +end + +# Output (single thread): +# A start +# B start +# A end +# B end +``` + +--- + +## How Pools Work with @threads + +When you use `Threads.@threads`, Julia distributes iterations across threads. Each thread gets **one Task** that processes its assigned iterations. + +``` +Threads.@threads for i in 1:100_000 (4 threads) +│ +├─ Thread 1: Task-1 → Pool-1 +│ └─ Processes i = 1..25,000 (same pool reused for all!) +│ +├─ Thread 2: Task-2 → Pool-2 +│ └─ Processes i = 25,001..50,000 +│ +├─ Thread 3: Task-3 → Pool-3 +│ └─ Processes i = 50,001..75,000 +│ +└─ Thread 4: Task-4 → Pool-4 + └─ Processes i = 75,001..100,000 + +Total: 4 pools created, each reused ~25,000 times +``` + +### Key Insight + +- `@threads` creates **one Task per thread** (not one per iteration!) +- Each Task has its own `task_local_storage()` → its own pool +- Within one `@threads` block, pools are efficiently reused +- Calling `@threads` **multiple times** creates new Tasks → new pools each time + +--- + +## Safe Patterns + +### Pattern 1: `@with_pool` Inside `@threads` + +```julia +Threads.@threads for i in 1:N + @with_pool pool begin + a = acquire!(pool, Float64, 100) + # ... computation ... + end # pool automatically rewinds +end +``` + +Each thread's Task gets its own pool. Safe and efficient. + +### Pattern 2: Function Defined with `@with_pool` + +```julia +# Define function with @with_pool +@with_pool pool function inner_work(x) + tmp = acquire!(pool, Float64, length(x)) + tmp .= x + return sum(tmp) +end + +# Call from @threads - each thread gets its own pool +Threads.@threads for i in 1:N + result = inner_work(data[i]) +end +``` + +The pool is created per-Task when the function is called, not when defined. + +### Pattern 3: Nested Functions + +```julia +@with_pool outer_pool function outer_work(data) + # outer_pool belongs to Main Task + tmp = acquire!(outer_pool, Float64, 100) + + Threads.@threads for i in 1:length(data) + # inner_work creates its own pool per thread + inner_work(data[i]) # Inner pool ≠ outer_pool (safe!) + end +end +``` + +Outer and inner pools are completely independent. + +--- + +## Unsafe Patterns + +### Pattern 1: `@with_pool` Outside `@threads` + +```julia +# ❌ DANGER: Race condition! +@with_pool pool Threads.@threads for i in 1:N + a = acquire!(pool, Float64, 100) # All threads share ONE pool! +end +``` + +**Why it fails**: `pool` is created in the Main Task's TLS. All threads access the same pool simultaneously. + +### Pattern 2: Sharing Pool Reference + +```julia +# ❌ DANGER: Race condition! +pool = get_task_local_pool() # Main Task's pool +Threads.@threads for i in 1:N + a = acquire!(pool, Float64, 100) # Shared access! +end +``` + +### Pattern 3: Passing Pool to `@spawn` + +```julia +# ❌ DANGER: Race condition! +@with_pool pool begin + tasks = [Threads.@spawn begin + a = acquire!(pool, Float64, 100) # Multiple tasks, one pool! + end for _ in 1:4] + wait.(tasks) +end +``` + +--- + +## Why Task-Local (Not Thread-Local)? + +You might wonder: "Why not use thread-local pools? They persist across `@threads` calls!" + +### The Stack Discipline Problem + +AdaptiveArrayPools uses `checkpoint!` and `rewind!` - a **stack-based** allocation system: + +```julia +@with_pool pool begin + checkpoint!(pool) # Push current state + a = acquire!(pool, ...) 
+ b = acquire!(pool, ...) + # ... + rewind!(pool) # Pop and restore state (LIFO!) +end +``` + +This requires **strict LIFO ordering**: the Task that checkpoints first must rewind last. + +### Why Thread-Local Fails with `@spawn` + +With `@spawn`, multiple Tasks can interleave on the same thread: + +``` +Thread 1 (with Thread-Local Pool): + +Time → +Task A: checkpoint! ──── acquire! ──── sleep ────────────── rewind! +Task B: checkpoint! ──── acquire! ──── sleep ──── rewind! + ↑ + A finishes first! +``` + +**Stack corruption occurs:** + +1. Task A: `checkpoint!` → stack = `[0]` +2. Task B: `checkpoint!` → stack = `[0, 1]` +3. Task A: `rewind!` → pops `1` (B's checkpoint!) → stack = `[0]` +4. Task B: `rewind!` → pops `0` (A's checkpoint!) → **WRONG!** + +**Result**: B's arrays may be reused while B is still using them → memory corruption. + +### Locks Don't Help + +Adding locks only prevents **simultaneous access**, not **LIFO violations**. The stack still gets corrupted because Tasks finish in unpredictable order. + +### Task-Local: The Only Safe Solution + +With Task-local pools: +- Each Task has its own pool +- Each pool has its own stack +- No interleaving possible → LIFO always preserved + +--- + +## User Responsibility + +### The Core Rule + +> **Pool objects must not be shared across Tasks.** + +This library prioritizes **zero-overhead performance** over runtime safety checks. No locks are added because: + +1. Locks would defeat the purpose of zero-allocation pooling +2. Even with locks, stack corruption would occur (LIFO violations) + +### Quick Reference + +| Pattern | Safety | Reason | +|---------|--------|--------| +| `@with_pool` inside `@threads` | ✅ Safe | Each Task gets own pool | +| `@with_pool` outside `@threads` | ❌ Unsafe | All threads share one pool | +| Function with `@with_pool` called from `@threads` | ✅ Safe | Pool created per-Task at call time | +| Passing pool to `@spawn` | ❌ Unsafe | Multiple Tasks access same pool | +| Nested `@with_pool` (outer/inner) | ✅ Safe | Each level has independent pool | + +### Debugging Tips + +If you encounter unexpected behavior: + +1. **Check pool placement**: Is `@with_pool` inside or outside `@threads`? +2. **Check pool sharing**: Is the same pool variable accessed from multiple Tasks? +3. **Enable POOL_DEBUG**: `POOL_DEBUG[] = true` catches some (not all) misuse patterns + +--- + +## Summary + +- AdaptiveArrayPools uses **Task-local isolation** for thread safety +- Each Julia Task gets its own independent pool via `task_local_storage()` +- `@threads` creates one Task per thread → pools are reused within the block +- **Always place `@with_pool` inside `@threads`**, not outside +- Thread-local pools are **not an alternative** due to stack discipline requirements +- Correct usage is the user's responsibility (no runtime checks for performance) diff --git a/docs/src/guide/getting-started.md b/docs/src/guide/getting-started.md new file mode 100644 index 0000000..76bb0c3 --- /dev/null +++ b/docs/src/guide/getting-started.md @@ -0,0 +1,116 @@ +# Getting Started + +This guide will help you get up and running with AdaptiveArrayPools.jl in minutes. + +## Installation + +```julia +using Pkg +Pkg.Registry.add(Pkg.RegistrySpec(url="https://github.com/ProjectTorreyPines/FuseRegistry.jl.git")) +Pkg.add("AdaptiveArrayPools") +``` + +## Basic Usage + +The core workflow is simple: +1. Wrap your function with `@with_pool` +2. Replace allocations with `acquire!` or convenience functions +3. 
Return computed values (scalars, copies), not the arrays themselves + +### Before (Standard Julia) + +```julia +function compute(n) + A = rand(n, n) # allocates + B = rand(n, n) # allocates + C = A * B # allocates + return sum(C) +end + +for i in 1:10_000 + compute(100) # 90k allocations, 2.75 GiB, 31% GC time +end +``` + +### After (With Pooling) + +```julia +using AdaptiveArrayPools, LinearAlgebra, Random + +@with_pool pool function compute_pooled(n) + A = acquire!(pool, Float64, n, n) # reuses memory + B = similar!(pool, A) + C = similar!(pool, A) + + rand!(A); rand!(B) + mul!(C, A, B) + return sum(C) +end + +compute_pooled(100) # warmup (first call allocates) +for i in 1:10_000 + compute_pooled(100) # zero allocations, 0% GC +end +``` + +## Convenience Functions + +Common initialization patterns have shortcuts: + +| Function | Equivalent to | +|----------|---------------| +| `zeros!(pool, 10)` | `acquire!` + `fill!(0)` | +| `ones!(pool, Float32, 3, 3)` | `acquire!` + `fill!(1)` | +| `similar!(pool, A)` | `acquire!` matching `eltype(A)`, `size(A)` | + +```julia +@with_pool pool function example(n) + A = zeros!(pool, n, n) # zero-initialized + B = ones!(pool, Float32, n) # Float32 ones + C = similar!(pool, A) # same type and size as A + # ... +end +``` + +## Return Types + +`acquire!` and convenience functions return **view types** (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK: + +```julia +A = acquire!(pool, Float64, 10, 10) # ReshapedArray{Float64,2} +mul!(C, A, B) # works perfectly with BLAS +``` + +If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`: + +```julia +A = unsafe_acquire!(pool, Float64, 10, 10) # Array{Float64,2} +``` + +## Important Safety Rules + +Arrays from the pool are **only valid within the `@with_pool` scope**: + +```julia +# DO NOT return pool-backed arrays +@with_pool pool function bad_example() + A = acquire!(pool, Float64, 10) + return A # WRONG - A will be recycled after this scope! +end + +# Return computed values instead +@with_pool pool function good_example() + A = acquire!(pool, Float64, 10) + return sum(A) # OK - returning a scalar +end +``` + +For complete safety guidelines, see [Safety Rules](safety.md). + +## Next Steps + +- [Safety Rules](safety.md) - Complete scope rules and anti-patterns +- [API Reference](../usage/api.md) - Full function and macro reference +- [Configuration](../usage/configuration.md) - Preferences and cache tuning +- [Multi-threading](../advanced/multi-threading.md) - Task/thread safety patterns +- [CUDA Support](../usage/cuda.md) - GPU backend usage diff --git a/docs/src/guide/safety.md b/docs/src/guide/safety.md new file mode 100644 index 0000000..e4eedc1 --- /dev/null +++ b/docs/src/guide/safety.md @@ -0,0 +1,110 @@ +# Safety Guide + +AdaptiveArrayPools achieves zero allocation by reusing memory across calls. This requires one simple rule: **acquired arrays are only valid within their `@with_pool` scope**. + +## The Scope Rule + +When `@with_pool` ends, all arrays acquired within that scope are recycled. Using them after the scope ends leads to undefined behavior. 
+ +```julia +@with_pool pool begin + v = acquire!(pool, Float64, 100) + + result = sum(v) # ✅ compute and return values + copied = copy(v) # ✅ copy if you need data outside +end +# v is no longer valid here +``` + +## What NOT to Do + +### Don't return pool-backed arrays + +```julia +# ❌ Wrong: returning the array itself +@with_pool pool function bad_example() + v = acquire!(pool, Float64, 100) + return v # v will be recycled after this returns! +end + +# ✅ Correct: return computed values or copies +@with_pool pool function good_example() + v = acquire!(pool, Float64, 100) + return sum(v) # scalar result +end +``` + +### Don't store in globals or closures + +```julia +# ❌ Wrong: storing in global +global_ref = nothing +@with_pool pool begin + global_ref = acquire!(pool, Float64, 100) +end +# global_ref now points to recycled memory + +# ❌ Wrong: capturing in closure +@with_pool pool begin + v = acquire!(pool, Float64, 100) + callback = () -> sum(v) # v captured but will be invalid +end +``` + +### Don't resize or push! to unsafe_acquire! arrays + +```julia +@with_pool pool begin + v = unsafe_acquire!(pool, Float64, 100) + # ❌ These break pool memory management: + # resize!(v, 200) + # push!(v, 1.0) + # append!(v, [1.0, 2.0]) +end +``` + +## Debugging with POOL_DEBUG + +Enable runtime safety checks during development: + +```julia +using AdaptiveArrayPools +AdaptiveArrayPools.POOL_DEBUG[] = true + +@with_pool pool function test() + v = acquire!(pool, Float64, 100) + return v # Will warn about returning pool-backed array +end +``` + +## acquire! vs unsafe_acquire! + +| Function | Returns | Best For | +|----------|---------|----------| +| `acquire!` | View types (`SubArray`, `ReshapedArray`) | General use, BLAS/LAPACK | +| `unsafe_acquire!` | Native `Array`/`CuArray` | FFI, type constraints | + +Both follow the same scope rules. Use `acquire!` by default—views work with all standard Julia linear algebra operations. + +## Thread Safety + +Pools are task-local, so each thread automatically gets its own pool: + +```julia +# ✅ Safe: each task has independent pool +Threads.@threads for i in 1:N + @with_pool pool begin + a = acquire!(pool, Float64, 100) + # work with a... + end +end + +# ❌ Unsafe: pool created outside threaded region +@with_pool pool begin + Threads.@threads for i in 1:N + a = acquire!(pool, Float64, 100) # race condition! + end +end +``` + +See [Multi-Threading](../advanced/multi-threading.md) for more patterns. diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..c89a222 --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,117 @@ +[![CI](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/actions/workflows/CI.yml/badge.svg)](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/actions/workflows/CI.yml) +[![codecov](https://codecov.io/github/projecttorreypines/adaptivearraypools.jl/graph/badge.svg?token=ZL0U0OvnL2)](https://codecov.io/github/projecttorreypines/adaptivearraypools.jl) + +# AdaptiveArrayPools.jl + +**Zero-allocation temporary arrays for Julia.** + +A lightweight library that lets you write natural, allocation-style code while automatically reusing memory behind the scenes. Eliminates GC pressure in hot loops without the complexity of manual buffer management. 
+ +**Supported backends:** +- **CPU** — `Array`, works out of the box +- **CUDA** — `CuArray`, loads automatically when [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) is available + +## The Problem + +In performance-critical code, temporary array allocations inside loops create massive GC pressure: + +```julia +function compute(n) + A = rand(n, n) # allocates + B = rand(n, n) # allocates + C = A * B # allocates + return sum(C) +end + +for i in 1:10_000 + compute(100) # ⚠️ 90k allocations, 2.75 GiB, 31% GC time +end +``` + +The traditional fix—passing pre-allocated buffers—works for simple cases but quickly becomes impractical: + +- **API pollution**: Every function needs extra buffer arguments, breaking clean interfaces +- **Nested calls**: Buffers must be threaded through entire call stacks, even third-party code +- **Dynamic shapes**: Hard to pre-allocate when array sizes depend on runtime values +- **Package boundaries**: You can't easily pass buffers into library functions you don't control + +## The Solution + +Wrap your function with `@with_pool` and replace allocations with `acquire!` or convenience functions: + +```julia +using AdaptiveArrayPools, LinearAlgebra, Random + +@with_pool pool function compute_pooled(n) + A = acquire!(pool, Float64, n, n) # reuses memory from pool + B = similar!(pool, A) + C = similar!(pool, A) + + rand!(A); rand!(B) + mul!(C, A, B) + return sum(C) +end + +compute_pooled(100) # warmup +for i in 1:10_000 + compute_pooled(100) # ✅ Zero allocations, 0% GC +end +``` + +| Metric | Standard | **AdaptiveArrayPools** | Improvement | +|--------|----------|------------------------|-------------| +| Time | 787 ms | **525 ms** | 1.5× faster | +| Allocations | ⚠️ 90,000 (2.75 GiB) | ✅ **0** | 100% eliminated | +| GC Time | ⚠️ 31% | ✅ **0%** | No GC pauses | + +> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](usage/cuda.md). + +## How It Works + +`@with_pool` automatically manages memory lifecycle for you: + +1. **Checkpoint** — Saves current pool state when entering the block +2. **Acquire** — `acquire!` returns arrays backed by pooled memory +3. **Rewind** — When the block ends, all acquired arrays are recycled for reuse + +This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors. + +`acquire!` returns lightweight views (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK. If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`—see [API Reference](usage/api.md). + +> **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](guide/safety.md). + +**Thread-safe by design**: Each Julia Task gets its own independent pool—no locks needed. See [Multi-Threading](advanced/multi-threading.md) for patterns. + +### Convenience Functions + +Common initialization patterns have convenience functions: + +| Function | Equivalent to | +|----------|---------------| +| `zeros!(pool, 10)` | `acquire!` + `fill!(0)` | +| `ones!(pool, Float32, 3, 3)` | `acquire!` + `fill!(1)` | +| `similar!(pool, A)` | `acquire!` matching `eltype(A)`, `size(A)` | + +These return views like `acquire!`. For raw `Array` types, use `unsafe_acquire!` or its convenience variants (`unsafe_zeros!`, `unsafe_ones!`, `unsafe_similar!`). See [API Reference](usage/api.md#convenience-functions). 
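For example (a brief illustrative sketch; as with `acquire!`, these arrays are only valid inside the block):

```julia
@with_pool pool begin
    A = zeros!(pool, 100)           # Float64 zeros (pool-backed view)
    B = ones!(pool, Float32, 8, 8)  # Float32 ones
    C = similar!(pool, B)           # uninitialized, same eltype and size as B
    C .= B .* 2
    sum(A) + sum(C)
end
```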
+ +## Installation + +```julia +using Pkg +Pkg.Registry.add(Pkg.RegistrySpec(url="https://github.com/ProjectTorreyPines/FuseRegistry.jl.git")) +Pkg.add("AdaptiveArrayPools") +``` + +## Documentation + +| Guide | Description | +|-------|-------------| +| [API Reference](usage/api.md) | Complete function and macro reference | +| [CUDA Backend](usage/cuda.md) | GPU-specific usage and examples | +| [Safety Guide](guide/safety.md) | Scope rules and best practices | +| [Multi-Threading](advanced/multi-threading.md) | Task/thread safety patterns | +| [Configuration](usage/configuration.md) | Preferences and cache tuning | + +## License + +[Apache 2.0](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/LICENSE) diff --git a/docs/src/usage/api.md b/docs/src/usage/api.md new file mode 100644 index 0000000..4d3eb8c --- /dev/null +++ b/docs/src/usage/api.md @@ -0,0 +1,111 @@ +# API Reference + +## Macros + +| Macro | Description | +|-------|-------------| +| `@with_pool name expr` | **Recommended.** Injects a global, task-local pool named `name`. Automatically checkpoints and rewinds. | +| `@maybe_with_pool name expr` | Same as `@with_pool`, but can be toggled on/off at runtime via `MAYBE_POOLING_ENABLED[]`. | + +## Functions + +| Function | Description | +|----------|-------------| +| `acquire!(pool, T, dims...)` | Returns a view: `SubArray{T,1}` for 1D, `ReshapedArray{T,N}` for N-D. Always 0 bytes. | +| `acquire!(pool, T, dims::Tuple)` | Tuple overload for `acquire!` (e.g., `acquire!(pool, T, size(x))`). | +| `acquire!(pool, x::AbstractArray)` | Similar-style: acquires array matching `eltype(x)` and `size(x)`. | +| `unsafe_acquire!(pool, T, dims...)` | Returns native `Array`/`CuArray` (CPU: `Vector{T}` for 1D, `Array{T,N}` for N-D). Only for FFI/type constraints. | +| `unsafe_acquire!(pool, T, dims::Tuple)` | Tuple overload for `unsafe_acquire!`. | +| `unsafe_acquire!(pool, x::AbstractArray)` | Similar-style: acquires raw array matching `eltype(x)` and `size(x)`. | +| `acquire_view!(pool, T, dims...)` | Alias for `acquire!`. Returns view types. | +| `acquire_array!(pool, T, dims...)` | Alias for `unsafe_acquire!`. Returns Array for N-D. | +| `checkpoint!(pool)` | Saves the current pool state (stack pointer). | +| `checkpoint!(pool, T...)` | Type-specific checkpoint for optimized performance. | +| `rewind!(pool)` | Restores the pool to the last checkpoint, freeing all arrays acquired since then. | +| `rewind!(pool, T...)` | Type-specific rewind for optimized performance. | +| `pool_stats(pool)` | Prints detailed statistics about pool usage. | +| `get_task_local_pool()` | Returns the task-local pool instance. | +| `empty!(pool)` | Clears all internal storage, releasing all memory. | + +## Convenience Functions + +Shortcuts for common `acquire!` + initialization patterns. Default element type is `Float64` (CPU) or `Float32` (CUDA). + +### View-returning (like `acquire!`) + +| Function | Description | +|----------|-------------| +| `zeros!(pool, [T,] dims...)` | Zero-initialized view. Equivalent to `acquire!` + `fill!(0)`. | +| `ones!(pool, [T,] dims...)` | One-initialized view. Equivalent to `acquire!` + `fill!(1)`. | +| `similar!(pool, A)` | View matching `eltype(A)` and `size(A)`. | +| `similar!(pool, A, T)` | View with type `T`, size from `A`. | +| `similar!(pool, A, dims...)` | View with `eltype(A)`, specified dimensions. | +| `similar!(pool, A, T, dims...)` | View with type `T`, specified dimensions. 
| + +### Array-returning (like `unsafe_acquire!`) + +| Function | Description | +|----------|-------------| +| `unsafe_zeros!(pool, [T,] dims...)` | Zero-initialized raw `Array`. | +| `unsafe_ones!(pool, [T,] dims...)` | One-initialized raw `Array`. | +| `unsafe_similar!(pool, A, ...)` | Raw `Array` with same signatures as `similar!`. | + +All convenience functions support tuple dimensions: `zeros!(pool, (3, 4))`. + +**CUDA note**: Default type is `Float32` to match `CUDA.zeros()` behavior. + +## Types + +| Type | Description | +|------|-------------| +| `AdaptiveArrayPool` | The main pool type. Create with `AdaptiveArrayPool()`. | +| `DisabledPool{Backend}` | Sentinel type when pooling is disabled. Preserves backend context (`:cpu` or `:cuda`). | + +## Utility Functions + +| Function | Description | +|----------|-------------| +| `pooling_enabled(pool)` | Returns `true` if pool is active, `false` if `DisabledPool`. Use instead of `pool === nothing`. | +| `default_eltype(pool)` | Returns default element type: `Float64` (CPU) or `Float32` (CUDA). | + +## Constants + +| Constant | Description | +|----------|-------------| +| `USE_POOLING` | Compile-time constant. Set via `Preferences.jl` to disable all pooling. | +| `MAYBE_POOLING_ENABLED` | Runtime `Ref{Bool}`. Only affects `@maybe_with_pool`. | +| `POOL_DEBUG` | Runtime `Ref{Bool}`. Enable safety validation for debugging. | +| `CACHE_WAYS` | Compile-time constant. N-way cache size for `unsafe_acquire!` (default: 4, range: 1-16). | + +## Configuration Functions + +| Function | Description | +|----------|-------------| +| `set_cache_ways!(n)` | Set N-way cache size. Requires Julia restart. | + +## Safety Notes + +Arrays acquired from a pool are **only valid within the `@with_pool` scope**. Do not: +- Return pool-backed arrays from functions +- Store them in global variables +- Capture them in closures that outlive the scope +- Call `resize!`, `push!`, or `append!` on arrays from `unsafe_acquire!` + +Use `POOL_DEBUG[] = true` during development to catch direct returns of pool-backed arrays. + +## `acquire!` vs `unsafe_acquire!` + +| Function | 1D Return | N-D Return | Allocation | +|----------|-----------|------------|------------| +| `acquire!` | `SubArray{T,1}` | `ReshapedArray{T,N}` | Always 0 bytes (stack-based views) | +| `unsafe_acquire!` | `Vector{T}` | `Array{T,N}` | 0 bytes (hit) / ~100 bytes header (miss) | + +Both share the same underlying pool memory. Even on cache miss, only the `Array` header is allocated—**data memory is always reused from the pool**. **Use `acquire!` by default**—BLAS/LAPACK are fully optimized for `StridedArray`, so there's no performance difference. + +Use `unsafe_acquire!` only when you need a concrete `Array` type (FFI, type signatures, runtime dispatch). + +**Caching**: +- `acquire!` 1D uses simple 1:1 cache (reuses `SubArray` if same length) +- `unsafe_acquire!` (all dimensions) uses N-way cache (up to `CACHE_WAYS`, default: 4) per slot; exceeding this causes eviction + +> **Header size by dimensionality**: The `~100 bytes` is an average. Actual `Array` header allocation varies: 1D → 80 bytes, 2D-3D → 112 bytes, 4D-5D → 144 bytes. This is Julia's internal `Array` metadata; actual data memory is always reused from the pool. 
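A short illustration of the two APIs side by side (a sketch; return types as listed in the tables above):

```julia
using AdaptiveArrayPools

@with_pool pool begin
    v = acquire!(pool, Float64, 16)           # 1D view (SubArray{Float64,1,...})
    M = acquire!(pool, Float64, 4, 4)         # N-D view (ReshapedArray{Float64,2,...})
    A = unsafe_acquire!(pool, Float64, 4, 4)  # native Array{Float64,2}; header may allocate on a cache miss
    fill!(v, 1.0); fill!(M, 2.0); fill!(A, 3.0)
    # All three are backed by pooled memory and valid only inside this block.
    sum(v) + sum(M) + sum(A)
end
```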
diff --git a/docs/src/usage/configuration.md b/docs/src/usage/configuration.md new file mode 100644 index 0000000..fdc1d16 --- /dev/null +++ b/docs/src/usage/configuration.md @@ -0,0 +1,102 @@ +# Configuration + +AdaptiveArrayPools can be configured via `LocalPreferences.toml`: + +```toml +[AdaptiveArrayPools] +use_pooling = false # ⭐ Primary: Disable pooling entirely +cache_ways = 8 # Advanced: N-way cache size (default: 4) +``` + +## Compile-time: USE_POOLING (⭐ Primary) + +**The most important configuration.** Completely disable pooling to make `acquire!` behave like standard allocation. + +```toml +# LocalPreferences.toml +[AdaptiveArrayPools] +use_pooling = false +``` + +Or programmatically: + +```julia +using Preferences +Preferences.set_preferences!(AdaptiveArrayPools, "use_pooling" => false) +# Restart Julia for changes to take effect +``` + +When `USE_POOLING = false`: +- `pool` becomes `DisabledPool{backend}()` instead of an active pool +- All pool functions fall back to standard allocation +- Backend context is preserved: `:cuda` still returns `CuArray` + +```julia +# These become equivalent: +@with_pool pool acquire!(pool, Float64, n, n) → Matrix{Float64}(undef, n, n) +@with_pool pool acquire!(pool, Float64, n) → Vector{Float64}(undef, n) + +# With CUDA backend: +@with_pool :cuda pool zeros!(pool, 100) → CUDA.zeros(Float32, 100) +``` + +Use `pooling_enabled(pool)` to check if pooling is active. + +**Use cases:** +- **Debugging**: Compare behavior with/without pooling +- **Benchmarking**: Measure pooling overhead vs direct allocation +- **Gradual adoption**: Add `@with_pool` annotations now, enable pooling later +- **CI/Testing**: Run tests without pooling to isolate issues + +All pooling code is **completely eliminated at compile time** (zero overhead). + +## Runtime: MAYBE_POOLING_ENABLED + +Only affects `@maybe_with_pool`. Toggle without restart. + +```julia +MAYBE_POOLING_ENABLED[] = false # Disable +MAYBE_POOLING_ENABLED[] = true # Enable (default) +``` + +## Runtime: POOL_DEBUG + +Enable safety validation to catch direct returns of pool-backed arrays. + +```julia +POOL_DEBUG[] = true # Enable safety checks (development) +POOL_DEBUG[] = false # Disable (default, production) +``` + +When enabled, returning a pool-backed array from a `@with_pool` block will throw an error. + +## Compile-time: CACHE_WAYS + +Configure the N-way cache size for `unsafe_acquire!`. Higher values reduce cache eviction but increase memory per slot. + +```toml +# LocalPreferences.toml +[AdaptiveArrayPools] +cache_ways = 8 # Default: 4, Range: 1-16 +``` + +Or programmatically: + +```julia +using AdaptiveArrayPools +set_cache_ways!(8) +# Restart Julia for changes to take effect +``` + +**When to increase**: If your code alternates between more than 4 dimension patterns per pool slot, increase `cache_ways` to avoid cache eviction (~100 bytes header per miss). + +> **Scope**: `cache_ways` affects **all `unsafe_acquire!`** calls (including 1D). Only `acquire!` 1D uses simple 1:1 caching. + +## Summary + +| Setting | Scope | Restart? 
| Priority | Affects | +|---------|-------|----------|----------|---------| +| `use_pooling` | Compile-time | Yes | ⭐ Primary | All macros, `acquire!` behavior | +| `cache_ways` | Compile-time | Yes | Advanced | `unsafe_acquire!` N-D caching | +| `MAYBE_POOLING_ENABLED` | Runtime | No | Optional | `@maybe_with_pool` only | +| `POOL_DEBUG` | Runtime | No | Debug | Safety validation | diff --git a/docs/src/usage/cuda.md b/docs/src/usage/cuda.md new file mode 100644 index 0000000..c5778c8 --- /dev/null +++ b/docs/src/usage/cuda.md @@ -0,0 +1,123 @@ +# CUDA Backend + +AdaptiveArrayPools provides native CUDA support through a package extension that loads automatically when CUDA.jl is available. + +## Quick Start + +```julia +using AdaptiveArrayPools, CUDA + +# Use :cuda backend for GPU arrays +@with_pool :cuda pool function gpu_computation(n) + A = acquire!(pool, Float64, n, n) # CuArray view + B = acquire!(pool, Float64, n, n) # CuArray view + + fill!(A, 1.0) + fill!(B, 2.0) + + return sum(A .+ B) +end + +# Zero GPU allocation in hot loops +for i in 1:1000 + gpu_computation(100) # GPU memory reused from pool +end +``` + +## API + +The CUDA backend uses the same API as CPU, with `:cuda` backend specifier: + +| Macro/Function | Description | +|----------------|-------------| +| `@with_pool :cuda pool expr` | GPU pool with automatic checkpoint/rewind | +| `acquire!(pool, T, dims...)` | Returns `CuArray` view (always 0 bytes GPU alloc) | +| `unsafe_acquire!(pool, T, dims...)` | Returns raw `CuArray` (for FFI/type constraints) | +| `get_task_local_cuda_pool()` | Returns the task-local CUDA pool | +| `pool_stats(:cuda)` | Print CUDA pool statistics | + +## Return Types + +| Function | 1D Return | N-D Return | +|----------|-----------|------------| +| `acquire!` | `CuArray{T,1}` (view) | `CuArray{T,N}` (view) | +| `unsafe_acquire!` | `CuArray{T,1}` | `CuArray{T,N}` | + +## Allocation Behavior + +**GPU Memory**: Always 0 bytes allocation after warmup. The underlying `CuVector` is resized as needed and reused. + +**CPU Memory**: +- Cache hit (≤4 dimension patterns per slot): 0 bytes +- Cache miss (>4 patterns): ~100 bytes for wrapper metadata + +```julia +# Example: 4 patterns fit in 4-way cache → zero CPU allocation +dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) +for dims in dims_list + @with_pool :cuda p begin + A = acquire!(p, Float64, dims...) + # Use A... + end +end +``` + +## Fixed Slot Types + +Optimized types with pre-allocated slots (same as CPU): + +| Type | Field | +|------|-------| +| `Float64` | `.float64` | +| `Float32` | `.float32` | +| `Float16` | `.float16` | +| `Int64` | `.int64` | +| `Int32` | `.int32` | +| `ComplexF64` | `.complexf64` | +| `ComplexF32` | `.complexf32` | +| `Bool` | `.bool` | + +Other types use the fallback dictionary (`.others`). 
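For example (an illustrative sketch; requires a CUDA-capable device):

```julia
using AdaptiveArrayPools, CUDA

@with_pool :cuda pool begin
    x = zeros!(pool, 1024)            # default eltype on the CUDA backend is Float32
    h = acquire!(pool, Float16, 256)  # served by the dedicated .float16 slot
    u = acquire!(pool, UInt8, 64)     # uncommon type, falls back to .others
    fill!(h, Float16(1)); fill!(u, 0x01)
    sum(x)
end
```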
+ +## Limitations + +- **No `@maybe_with_pool :cuda`**: Runtime toggle not supported for CUDA backend +- **Task-local only**: Each Task gets its own CUDA pool, same as CPU +- **Same device**: All arrays in a pool use the same CUDA device + +## Example: Matrix Multiplication + +```julia +using AdaptiveArrayPools, CUDA, LinearAlgebra + +@with_pool :cuda pool function gpu_matmul(n) + A = acquire!(pool, Float64, n, n) + B = acquire!(pool, Float64, n, n) + C = acquire!(pool, Float64, n, n) + + rand!(A); rand!(B) + mul!(C, A, B) + + return sum(C) +end + +# Warmup +gpu_matmul(100) + +# Benchmark - zero GPU allocation +using BenchmarkTools +@benchmark gpu_matmul(1000) +``` + +## Debugging + +```julia +# Check pool state +pool_stats(:cuda) + +# Output: +# CuAdaptiveArrayPool (device 0) +# Float64 (fixed) [GPU] +# slots: 3 (active: 0) +# elements: 30000 (234.375 KiB) +``` diff --git a/docs/src/usage/maybe_with_pool.md b/docs/src/usage/maybe_with_pool.md new file mode 100644 index 0000000..39c31b7 --- /dev/null +++ b/docs/src/usage/maybe_with_pool.md @@ -0,0 +1,53 @@ +# @maybe_with_pool + +Runtime-toggleable pooling. Users can enable/disable via `MAYBE_POOLING_ENABLED[]`. + +## Usage + +```julia +@maybe_with_pool pool function compute(n) + v = acquire!(pool, Float64, n) + v .= 1.0 + sum(v) +end + +# Toggle at runtime +MAYBE_POOLING_ENABLED[] = false # Normal allocation +MAYBE_POOLING_ENABLED[] = true # Uses pool +``` + +## When to Use + +- Library code where end-users should control pooling behavior +- Debugging: disable pooling to isolate memory issues +- Benchmarking: compare pooled vs non-pooled performance + +## How It Works + +When `MAYBE_POOLING_ENABLED[] == false`: +- `pool` becomes `DisabledPool{backend}()` (e.g., `DisabledPool{:cpu}()` or `DisabledPool{:cuda}()`) +- All pool functions (`acquire!`, `zeros!`, etc.) fall back to standard allocation +- Backend context is preserved: `:cuda` → `CuArray`, `:cpu` → `Array` + +Use `pooling_enabled(pool)` to check if pooling is active: +```julia +@maybe_with_pool pool begin + if pooling_enabled(pool) + # Using pooled memory + else + # Using standard allocation (DisabledPool) + end +end +``` + +## vs @with_pool + +| | `@with_pool` | `@maybe_with_pool` | +|---|---|---| +| Runtime toggle | No | Yes | +| Overhead when disabled | None | Branch check | +| Use case | Application code | Library code | + +## Safety + +Same rules as `@with_pool`: arrays are only valid within the scope. Do not return or store them externally. 
From e98413423fe69c8d15bd8124593046106d735149 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 10:36:01 -0800 Subject: [PATCH 2/8] docs: add English design documents to docs/design - Translate 6 Korean/mixed documents to English: - fixed_slots_codegen_design.md (updated for 7 fixed slots) - hybrid_api_design.md - nd_array_approach_comparison.md - new_hybrid_api_design.md - untracked_acquire_design.md - macro-linenumbernode-improvement.md - Copy 2 existing English documents: - cuda_extension_design.md - vector_resize_memory_behavior.md - Verify codebase consistency with design specs --- docs/design/cuda_extension_design.md | 1266 +++++++++++++++++ docs/design/fixed_slots_codegen_design.md | 362 +++++ docs/design/hybrid_api_design.md | 441 ++++++ .../macro-linenumbernode-improvement.md | 369 +++++ docs/design/nd_array_approach_comparison.md | 432 ++++++ docs/design/new_hybrid_api_design.md | 140 ++ docs/design/untracked_acquire_design.md | 598 ++++++++ docs/design/vector_resize_memory_behavior.md | 247 ++++ 8 files changed, 3855 insertions(+) create mode 100644 docs/design/cuda_extension_design.md create mode 100644 docs/design/fixed_slots_codegen_design.md create mode 100644 docs/design/hybrid_api_design.md create mode 100644 docs/design/macro-linenumbernode-improvement.md create mode 100644 docs/design/nd_array_approach_comparison.md create mode 100644 docs/design/new_hybrid_api_design.md create mode 100644 docs/design/untracked_acquire_design.md create mode 100644 docs/design/vector_resize_memory_behavior.md diff --git a/docs/design/cuda_extension_design.md b/docs/design/cuda_extension_design.md new file mode 100644 index 0000000..5b31c2d --- /dev/null +++ b/docs/design/cuda_extension_design.md @@ -0,0 +1,1266 @@ +# AdaptiveArrayPools.jl CUDA Extension Design + +> **Status**: Draft v0.6 (Post-Review Revision) +> **Version**: 0.6 +> **Date**: 2024-12-14 +> **Authors**: Design discussion with AI assistance + +## 1. Executive Summary + +This document outlines the design for extending AdaptiveArrayPools.jl to support GPU arrays via CUDA.jl. The design prioritizes: + +1. **Zero overhead when CUDA not loaded** - Pure CPU code path unchanged +2. **Maximum code reuse** - Generic functions with minimal dispatch points +3. **Extensibility** - Abstract type hierarchy for future GPU backends +4. **Separate namespaces** - Independent CPU and GPU pools per task + +### Key Design Decision: Parametric Abstract Types + +Instead of duplicating code in the extension, we use parametric abstract types and generic functions. The extension only needs to define: +- Type definitions (~50 lines) +- One allocation method (~3 lines) +- Task-local getter (~15 lines) + +**Total extension code: ~70 lines** (vs ~300 lines with full duplication) + +--- + +## 2. Current Architecture Analysis + +### 2.1 Core Type Structure + +```julia +# Current: Concrete types only +mutable struct TypedPool{T} + vectors::Vector{Vector{T}} + views::Vector{SubArray{T,1,Vector{T},...}} + view_lengths::Vector{Int} + nd_arrays::Vector{Any} + nd_dims::Vector{Any} + nd_ptrs::Vector{UInt} + nd_next_way::Vector{Int} + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end +``` + +### 2.2 Code Reuse Analysis + +| Component | Operates On | GPU-Specific? 
| +|-----------|-------------|---------------| +| `get_view!` | vectors, n_active, cache | Only allocation | +| `get_nd_view!` | calls get_view!, reshape | **No** | +| `get_nd_array!` | calls get_view!, unsafe_wrap | Only wrap call | +| `checkpoint!` | n_active, checkpoint stacks | **No** | +| `rewind!` | n_active, checkpoint stacks | **No** | +| `reset!` | n_active, checkpoint stacks | **No** | +| `empty!` | all fields | Clear vectors only | + +**Key insight**: 95%+ of logic is type-agnostic. Only allocation/wrapping differs. + +--- + +## 3. Proposed Architecture + +### 3.1 Type Hierarchy + +``` + AbstractTypedPool{T, V<:AbstractVector{T}} + │ + ┌──────────────┼──────────────┐ + │ │ │ + TypedPool{T} CuTypedPool{T} [Future: ROCTypedPool{T}] + V = Vector{T} V = CuVector{T} V = ROCArray{T,1} + + + AbstractArrayPool + │ + ┌──────────────┼──────────────┐ + │ │ │ + AdaptiveArrayPool CuAdaptiveArrayPool [Future: ROCArrayPool] +``` + +### 3.2 Abstract Type Definitions + +```julia +# src/types.jl - New additions + +""" + AbstractTypedPool{T, V<:AbstractVector{T}} + +Abstract base for type-specific memory pools. +`T` is the element type, `V` is the storage vector type. + +Concrete subtypes must have these fields: +- vectors::Vector{V} +- views, view_lengths, nd_* fields +- n_active::Int +- _checkpoint_n_active::Vector{Int} +- _checkpoint_depths::Vector{Int} +""" +abstract type AbstractTypedPool{T, V<:AbstractVector{T}} end + +""" + AbstractArrayPool + +Abstract base for multi-type array pools. + +Concrete subtypes must have these fields: +- Fixed slot fields (type-specific TypedPools) +- others::IdDict{DataType,Any} +- _current_depth::Int +- _untracked_flags::Vector{Bool} +""" +abstract type AbstractArrayPool end + +# Storage type accessor (for generic code) +storage_type(::AbstractTypedPool{T,V}) where {T,V} = V +storage_type(::Type{<:AbstractTypedPool{T,V}}) where {T,V} = V +``` + +### 3.3 Concrete Types + +#### CPU (existing, modified to inherit) + +```julia +# src/types.jl + +mutable struct TypedPool{T} <: AbstractTypedPool{T, Vector{T}} + # Storage + vectors::Vector{Vector{T}} + views::Vector{SubArray{T,1,Vector{T},Tuple{UnitRange{Int64}},true}} + view_lengths::Vector{Int} + + # N-D cache + nd_arrays::Vector{Any} + nd_dims::Vector{Any} + nd_ptrs::Vector{UInt} + nd_next_way::Vector{Int} + + # State + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end + +mutable struct AdaptiveArrayPool <: AbstractArrayPool + # Fixed slots (CPU types) + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + + others::IdDict{DataType,Any} + _current_depth::Int + _untracked_flags::Vector{Bool} +end +``` + +#### GPU (extension - minimal definitions) + +> **[AI Review: Float16 & Device Safety]** +> 1. **Float16 Support**: Added `Float16` to fixed slots. This is critical for modern AI/ML workloads on GPU. +> 2. **Device Awareness**: Added `device_id::Int` to `CuAdaptiveArrayPool`. This is crucial for multi-GPU setups. A pool created on Device 0 cannot be safely used on Device 1. We must track which device owns the memory. + +> **[Post-Review v0.6: Critical Type Correction]** +> **`view(CuVector, 1:n)` returns `CuVector`, NOT `SubArray`!** +> +> GPUArrays.jl handles contiguous views via `derive()` which returns a new GPU array +> sharing the same memory buffer (see `~/.julia/packages/GPUArrays/.../src/host/base.jl:302`). 
+> This is fundamentally different from CPU where `view()` returns `SubArray`. +> +> **Implications for pool design**: +> 1. We **cannot cache views separately** from backing vectors on GPU +> 2. Instead, we store `CuVector{T}` directly and return slices via `view()` on each call +> 3. View creation is cheap (no allocation, just metadata), so no caching benefit +> 4. This simplifies the GPU pool: no `views` or `view_lengths` fields needed + +```julia +# ext/AdaptiveArrayPoolsCUDAExt/types.jl + +using CUDA + +# IMPORTANT: Unlike CPU, GPU views are derived CuArrays, not SubArrays. +# view(::CuVector{T}, ::UnitRange) -> CuVector{T} (shared memory, different offset/length) +# This means: +# 1. "views" vector would just hold more CuVectors (no savings) +# 2. We skip view caching entirely - just return view(vec, 1:n) each time +# 3. View creation is O(1) metadata operation, no GPU memory allocation + +mutable struct CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} + # Storage (GPU vectors) + vectors::Vector{CuVector{T}} + + # View length cache (for resize decision, but no view object cache) + # The actual view is created fresh each time since it's just metadata + view_lengths::Vector{Int} + + # N-D cache (same structure as CPU) + nd_arrays::Vector{Any} + nd_dims::Vector{Any} + nd_ptrs::Vector{UInt} + nd_next_way::Vector{Int} + + # State (identical to CPU) + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end + +# Constructor with sentinel pattern +function CuTypedPool{T}() where T + CuTypedPool{T}( + CuVector{T}[], Int[], # No views vector! + Any[], Any[], UInt[], Int[], + 0, [0], [0] + ) +end + +# GPU-optimized fixed slots (different from CPU!) +const GPU_FIXED_SLOT_FIELDS = ( + :float32, # Primary (GPU-optimized) + :float64, # Precision when needed + :float16, # ML inference (added per AI review) + :int32, # Indexing (GPU-preferred) + :int64, # Large indices + :complexf32, # FFT, signal processing + :complexf64, # High-precision complex + :bool, # Masks +) + +mutable struct CuAdaptiveArrayPool <: AbstractArrayPool + # Fixed slots (GPU-optimized order: Float32 first) + float32::CuTypedPool{Float32} + float64::CuTypedPool{Float64} + float16::CuTypedPool{Float16} # Added per AI review + int32::CuTypedPool{Int32} + int64::CuTypedPool{Int64} + complexf32::CuTypedPool{ComplexF32} + complexf64::CuTypedPool{ComplexF64} + bool::CuTypedPool{Bool} + + others::IdDict{DataType,Any} + _current_depth::Int + _untracked_flags::Vector{Bool} + + # Safety: Track which device this pool belongs to (use public API!) + device_id::Int +end + +function CuAdaptiveArrayPool() + dev = CUDA.device() + CuAdaptiveArrayPool( + CuTypedPool{Float32}(), CuTypedPool{Float64}(), CuTypedPool{Float16}(), + CuTypedPool{Int32}(), CuTypedPool{Int64}(), + CuTypedPool{ComplexF32}(), CuTypedPool{ComplexF64}(), + CuTypedPool{Bool}(), + IdDict{DataType,Any}(), 1, [false], + CUDA.deviceid(dev) # Use public API, not internal .handle + ) +end +``` + +--- + +## 4. Generic Functions with Minimal Dispatch + +### 4.1 Allocation Dispatch Point + +The **only** type-specific function needed: + +```julia +# src/acquire.jl - CPU default +""" + allocate_vector(tp::AbstractTypedPool{T}, n::Int) -> V + +Allocate a new vector of type V with n elements. +This is the single dispatch point for storage-specific allocation. +""" +@inline allocate_vector(::AbstractTypedPool{T,Vector{T}}, n::Int) where T = + Vector{T}(undef, n) + +# ext/ - GPU override (THE ONLY METHOD EXTENSION NEEDS TO ADD!) 
+@inline allocate_vector(::AbstractTypedPool{T,CuVector{T}}, n::Int) where T = + CuVector{T}(undef, n) +``` + +> **[AI Review: Interaction with CUDA.jl Allocator]** +> It is important to note that `CuVector{T}(undef, n)` uses `CUDA.jl`'s own internal memory pool. +> **Why do we need another pool?** +> 1. **Overhead Reduction**: Even cached CUDA allocations have Julia-side overhead (struct creation, finalizer registration). `AdaptiveArrayPools` reuses the *Julia objects* (`CuArray` structs) and views, reducing GC pressure and allocation latency further. +> 2. **Logical Grouping**: It allows "rewinding" a whole block of temporary allocations in one go, which `CUDA.jl`'s allocator doesn't support (it's `malloc`/`free` style). + +### 4.2 get_view! Implementation + +> **[Post-Review v0.6: CPU vs GPU Differences]** +> Due to type differences (`view(Vector, 1:n) → SubArray` vs `view(CuVector, 1:n) → CuVector`), +> the CPU and GPU implementations differ slightly. CPU caches view objects; GPU creates them fresh. + +#### CPU Version (existing, unchanged) + +```julia +# src/acquire.jl - CPU implementation (caches SubArray views) + +function get_view!(tp::AbstractTypedPool{T,Vector{T}}, n::Int) where {T} + tp.n_active += 1 + idx = tp.n_active + + # 1. Expand pool if needed + if idx > length(tp.vectors) + push!(tp.vectors, allocate_vector(tp, n)) + new_view = view(tp.vectors[idx], 1:n) + push!(tp.views, new_view) # Cache the SubArray + push!(tp.view_lengths, n) + # ... growth warning ... + return new_view + end + + # 2. Cache hit (return cached SubArray - ZERO ALLOC) + @inbounds cached_len = tp.view_lengths[idx] + if cached_len == n + return @inbounds tp.views[idx] + end + + # 3. Cache miss - resize and update cached view + @inbounds vec = tp.vectors[idx] + if length(vec) < n + resize!(vec, n) + end + new_view = view(vec, 1:n) + @inbounds tp.views[idx] = new_view + @inbounds tp.view_lengths[idx] = n + return new_view +end +``` + +#### GPU Version (extension) + +> **[Post-Review v0.6: resize! Cost Warning]** +> `resize!(::CuVector, n)` with capacity increase triggers: +> 1. New GPU buffer allocation +> 2. Async copy of existing elements (even if we don't need them!) +> +> For pools, we typically don't need old data. Consider using `CUDA.unsafe_free!` + fresh +> allocation instead, or just allocating oversized initially. This is a **performance +> optimization opportunity** for v1.1+. + +```julia +# ext/AdaptiveArrayPoolsCUDAExt/acquire.jl + +# GPU version: no view caching (view() returns CuVector, not SubArray) +function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, n::Int) where {T} + tp.n_active += 1 + idx = tp.n_active + + # 1. Expand pool if needed + if idx > length(tp.vectors) + push!(tp.vectors, allocate_vector(tp, n)) + push!(tp.view_lengths, n) + # Return fresh view (no caching - view creates CuVector metadata) + return view(tp.vectors[idx], 1:n) + end + + # 2. Check if resize needed + @inbounds cached_len = tp.view_lengths[idx] + @inbounds vec = tp.vectors[idx] + + if length(vec) < n + # WARNING: resize! on CuVector copies old data (wasteful for pools) + # TODO v1.1: Consider CUDA.unsafe_free! + fresh alloc instead + resize!(vec, n) + end + + @inbounds tp.view_lengths[idx] = n + + # Always create fresh view (O(1) metadata, no GPU allocation) + return view(vec, 1:n) +end +``` + +### 4.3 get_nd_view! Implementation + +> **[Post-Review v0.6: reshape Behavior on GPU]** +> `reshape(::CuVector, dims)` also uses GPUArrays' `derive()` mechanism, returning a +> `CuArray{T,N}` (not `ReshapedArray`). 
This is actually simpler - we get a proper +> GPU array that CUDA kernels can use directly. + +```julia +# src/acquire.jl - Works for both, but return types differ: +# - CPU: ReshapedArray{T,N,SubArray{...}} +# - GPU: CuArray{T,N} (via derive) + +@inline function get_nd_view!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) where {T,N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) + return reshape(flat_view, dims) # CPU: ReshapedArray, GPU: CuArray +end +``` + +### 4.4 Generic get_nd_array! (minimal dispatch) + +```julia +# src/acquire.jl + +# CPU version uses unsafe_wrap +@inline function wrap_array(::AbstractTypedPool{T,Vector{T}}, + flat_view, dims::NTuple{N,Int}) where {T,N} + unsafe_wrap(Array{T,N}, pointer(flat_view), dims) +end + +# ext/ - GPU version +@inline function wrap_array(::AbstractTypedPool{T,CuVector{T}}, + flat_view, dims::NTuple{N,Int}) where {T,N} + # Use reshape - returns CuArray{T,N} via GPUArrays derive() + reshape(flat_view, dims) +end + +# Generic implementation +@inline function get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) where {T,N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) + slot = tp.n_active + + # ... cache lookup logic (identical) ... + + # DISPATCH POINT for array wrapping + arr = wrap_array(tp, flat_view, dims) + + # ... cache update logic (identical) ... + + return arr +end +``` + +> **[Post-Review v0.6: GPU reshape Clarification]** +> `reshape(::CuArray, dims)` returns a `CuArray{T,N}` (via GPUArrays `derive()`), **NOT** +> `ReshapedArray`. This is actually better for GPU kernels - they work directly with +> `CuArray` without any wrapper overhead. The `derive()` mechanism shares the underlying +> GPU memory buffer with different offset/strides metadata. + +--- + +## 5. State Management (100% Reusable) + +### 5.1 Generic State Functions + +All state functions operate only on `n_active` and checkpoint vectors - pure CPU operations. + +```julia +# src/state.jl - These work for ANY AbstractTypedPool! + +@inline function _checkpoint_typed_pool!(tp::AbstractTypedPool, depth::Int) + push!(tp._checkpoint_n_active, tp.n_active) + push!(tp._checkpoint_depths, depth) + nothing +end + +@inline function _rewind_typed_pool!(tp::AbstractTypedPool, current_depth::Int) + # Orphan cleanup + while @inbounds tp._checkpoint_depths[end] > current_depth + pop!(tp._checkpoint_depths) + pop!(tp._checkpoint_n_active) + end + + # Restore + if @inbounds tp._checkpoint_depths[end] == current_depth + pop!(tp._checkpoint_depths) + tp.n_active = pop!(tp._checkpoint_n_active) + else + tp.n_active = @inbounds tp._checkpoint_n_active[end] + end + nothing +end + +function _reset_typed_pool!(tp::AbstractTypedPool) + tp.n_active = 0 + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) + tp +end + +# Concrete dispatches (trivial wrappers) +reset!(tp::TypedPool) = _reset_typed_pool!(tp) +reset!(tp::CuTypedPool) = _reset_typed_pool!(tp) # ext/ adds this +``` + +### 5.2 empty! 
(Type-Specific) + +`empty!` needs to clear storage, but the logic is identical: + +```julia +# src/state.jl - Generic implementation + +function Base.empty!(tp::AbstractTypedPool) + empty!(tp.vectors) + empty!(tp.views) # CPU only (GPU CuTypedPool has no views field) + empty!(tp.view_lengths) + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + _reset_typed_pool!(tp) + tp +end + +# GPU-specific version (no views field) +function Base.empty!(tp::CuTypedPool) + empty!(tp.vectors) + empty!(tp.view_lengths) + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + _reset_typed_pool!(tp) + tp +end +``` + +> **[Post-Review v0.6: GPU Memory Release Clarification]** +> `empty!(tp.vectors)` **removes Julia references** to `CuVector` objects. This does NOT +> guarantee immediate VRAM release! The actual GPU memory lifecycle is: +> +> 1. **Reference removed** → CuArray becomes GC-eligible +> 2. **GC runs** → CuArray finalizer queued +> 3. **Finalizer runs** → Returns memory to CUDA.jl's internal pool +> 4. **CUDA.jl pool decision** → May or may not release to driver +> +> For **immediate VRAM release**, use `CUDA.reclaim()` after `empty!()`: +> ```julia +> empty!(get_task_local_cuda_pool()) +> GC.gc() # Force finalizers to run +> CUDA.reclaim() # Request CUDA.jl to release cached memory +> ``` + +--- + +## 6. Task-Local Pool Design + +> **[AI Review: Multi-Device Safety]** +> The original design for `get_task_local_cuda_pool` was unsafe for multi-GPU workflows. If a task switches devices (e.g., `CUDA.device!(1)`), it must not use the pool created for Device 0. +> **Revised Design**: We use a `Dict{Int, CuAdaptiveArrayPool}` in task local storage to manage one pool per device per task. + +### 6.1 Separate Keys & Device Awareness + +```julia +# src/task_local_pool.jl +const _POOL_KEY = :ADAPTIVE_ARRAY_POOL + +@inline function get_task_local_pool() + pool = get(task_local_storage(), _POOL_KEY, nothing) + if pool === nothing + pool = AdaptiveArrayPool() + task_local_storage(_POOL_KEY, pool) + end + return pool::AdaptiveArrayPool +end + +# ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl +const _CU_POOL_KEY = :ADAPTIVE_ARRAY_POOL_CUDA + +@inline function get_task_local_cuda_pool() + # Get the dictionary of pools (one per device) + pools = get(task_local_storage(), _CU_POOL_KEY, nothing) + if pools === nothing + pools = Dict{Int, CuAdaptiveArrayPool}() + task_local_storage(_CU_POOL_KEY, pools) + end + + # Get current device ID using public API + dev_id = CUDA.deviceid(CUDA.device()) + + # Get or create pool for this device + if !haskey(pools, dev_id) + pools[dev_id] = CuAdaptiveArrayPool() # Constructor captures device_id + end + + return pools[dev_id] +end +``` + +> **[Post-Review v0.6: Public API for Device ID]** +> Always use `CUDA.deviceid(dev)` instead of `dev.handle`. The `.handle` field is internal +> and may change between CUDA.jl versions. `deviceid()` is the stable public API. + +### 6.2 Rationale for Separation + +| Scenario | Benefit | +|----------|---------| +| Mixed CPU/GPU workflow | Use both pools independently | +| GPU memory pressure | `empty!(cuda_pool)` without affecting CPU | +| Different lifecycles | CPU warm, GPU cleared per batch | +| **Multi-GPU** | **Safety**: Prevents cross-device access errors | +| Debugging | Clear distinction in profiling | + +--- + +## 7. 
Macro Design + +### 7.1 Recommended: Unified Macro with Backend Symbol + +```julia +# Unified API - single macro with optional backend symbol +@with_pool pool begin ... end # CPU (default, :cpu implied) +@with_pool :cuda pool begin ... end # GPU via CUDA +@with_pool :metal pool begin ... end # GPU via Metal (future) +@with_pool :cpu pool begin ... end # Explicit CPU + +# Without pool name (auto-generated) +@with_pool begin ... end # CPU default +@with_pool :cuda begin ... end # GPU +``` + +**Advantages:** +- Single macro to learn +- Easy backend switching (`:cuda` → `:metal`) +- Future-proof (just add new symbols in extensions) +- Clean, consistent API + +### 7.2 Implementation + +> **[Post-Review v0.6: Zero-Overhead Backend Selection]** +> The original `Dict{Symbol, Function}` registry has a critical flaw: runtime dictionary +> lookup weakens type inference, preventing the compiler from inlining the pool getter. +> This conflicts with our "zero overhead for CPU path" goal. +> +> **Solution**: Use `Val{:backend}` dispatch instead. Extensions add methods at load time, +> and the compiler can fully inline the call chain. + +```julia +# src/macros.jl - Val-based dispatch for zero overhead + +""" + _get_pool_for_backend(::Val{:cpu}) -> AdaptiveArrayPool + +Get task-local pool for the specified backend. Extensions add methods for their backends. +Using Val{Symbol} enables compile-time dispatch and full inlining. +""" +@inline _get_pool_for_backend(::Val{:cpu}) = get_task_local_pool() + +# Fallback with helpful error message +@noinline function _get_pool_for_backend(::Val{B}) where B + error("Pool backend :$B not found. Did you forget to load the extension (e.g., `using CUDA`)?") +end + +# Macro signatures +macro with_pool(backend::QuoteNode, pool_name, expr) + _generate_pool_code_with_backend(backend.value, pool_name, expr) +end + +macro with_pool(backend::QuoteNode, expr) + # Backend symbol without pool name + pool_name = gensym(:pool) + _generate_pool_code_with_backend(backend.value, pool_name, expr) +end + +macro with_pool(pool_name, expr) + # No backend = CPU default + _generate_pool_code_with_backend(:cpu, pool_name, expr) +end + +macro with_pool(expr) + # No backend, no pool name + pool_name = gensym(:pool) + _generate_pool_code_with_backend(:cpu, pool_name, expr) +end + +function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr) + transformed_expr = _transform_acquire_calls(expr, pool_name) + + # Use Val{backend} for compile-time dispatch - fully inlinable! + quote + local $(esc(pool_name)) = $_get_pool_for_backend($(Val{backend}())) + checkpoint!($(esc(pool_name))) + try + $(esc(transformed_expr)) + finally + rewind!($(esc(pool_name))) + end + end +end +``` + +> **Why Val{:backend} instead of Dict?** +> +> | Approach | Lookup Cost | Type Inference | Inlining | +> |----------|-------------|----------------|----------| +> | `Dict{Symbol,Function}` | O(1) hash | ❌ Returns `Function` | ❌ Dynamic call | +> | `Val{:cpu}` dispatch | O(0) compiled | ✅ Concrete type | ✅ Full inlining | +> +> With Val dispatch, `@with_pool :cpu` compiles to exactly the same code as the +> original non-backend version—zero overhead. + +### 7.3 Extension Registration + +```julia +# ext/AdaptiveArrayPoolsCUDAExt/macros.jl + +# Add method for :cuda backend via Val dispatch (no __init__ needed!) 
+@inline AdaptiveArrayPools._get_pool_for_backend(::Val{:cuda}) = get_task_local_cuda_pool() + +# Optional: Explicit macro alias for users who prefer it +macro with_cuda_pool(pool_name, expr) + esc(:(@with_pool :cuda $pool_name $expr)) +end + +macro with_cuda_pool(expr) + esc(:(@with_pool :cuda $expr)) +end + +export @with_cuda_pool # Optional explicit alias +``` + +> **Note**: With Val dispatch, no `__init__` registration is needed. The method is added +> when the extension module loads, and Julia's method dispatch handles the rest. + +### 7.4 Design Trade-offs + +| Approach | Pros | Cons | +|----------|------|------| +| **Unified** (`@with_pool :cuda`) | Single API, easy switching, extensible | Symbol must be literal | +| **Explicit** (`@with_cuda_pool`) | Clear intent, better autocomplete | Multiple macros to learn | +| **Hybrid** (both available) | User choice | Slight API redundancy | + +**Recommendation: Hybrid approach** - unified macro as primary API, explicit aliases optional. + +--- + +## 8. Package Extension Structure + +### 8.1 Project.toml Changes + +```toml +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + +[extensions] +AdaptiveArrayPoolsCUDAExt = "CUDA" +``` + +### 8.2 File Structure + +``` +AdaptiveArrayPools/ +├── src/ +│ ├── AdaptiveArrayPools.jl +│ ├── types.jl # + AbstractTypedPool{T,V}, AbstractArrayPool +│ ├── acquire.jl # + allocate_vector, wrap_array dispatch points +│ ├── state.jl # Generic _checkpoint/_rewind/_reset (unchanged logic) +│ ├── task_local_pool.jl # (unchanged) +│ ├── macros.jl # + _get_pool_for_backend(::Val{:cpu}) dispatch +│ └── utils.jl # (unchanged) +└── ext/ + └── AdaptiveArrayPoolsCUDAExt/ + ├── AdaptiveArrayPoolsCUDAExt.jl # ~25 lines + ├── types.jl # ~50 lines (no views field!) + ├── acquire.jl # ~30 lines (GPU-specific get_view!) + ├── dispatch.jl # ~35 lines (+ checkpoint correction) + ├── task_local_pool.jl # ~25 lines (multi-device, public API) + └── macros.jl # ~15 lines (@with_cuda_pool) +``` + +**Total extension: ~180 lines** (slightly more due to GPU-specific get_view!) + +### 8.3 Extension Entry Point + +```julia +# ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl + +module AdaptiveArrayPoolsCUDAExt + +using AdaptiveArrayPools +using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool, + allocate_vector, wrap_array, get_view!, + _checkpoint_typed_pool!, _rewind_typed_pool!, + _reset_typed_pool!, _get_pool_for_backend, + CACHE_WAYS, checkpoint!, rewind!, reset! +using CUDA + +include("types.jl") +include("acquire.jl") # GPU-specific get_view! +include("dispatch.jl") +include("task_local_pool.jl") +include("macros.jl") + +# Exports +export CuAdaptiveArrayPool, CuTypedPool +export get_task_local_cuda_pool +export @with_cuda_pool + +end # module +``` + +### 8.4 dispatch.jl + +```julia +# ext/AdaptiveArrayPoolsCUDAExt/dispatch.jl + +# THE KEY DISPATCH METHODS + +@inline AdaptiveArrayPools.allocate_vector( + ::AbstractTypedPool{T,CuVector{T}}, n::Int +) where T = CuVector{T}(undef, n) + +@inline AdaptiveArrayPools.wrap_array( + ::AbstractTypedPool{T,CuVector{T}}, flat_view, dims::NTuple{N,Int} +) where {T,N} = reshape(flat_view, dims) + +# get_typed_pool! 
dispatches for CuAdaptiveArrayPool +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Float32}) = p.float32 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Float64}) = p.float64 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Float16}) = p.float16 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Int32}) = p.int32 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Int64}) = p.int64 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{ComplexF32}) = p.complexf32 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{ComplexF64}) = p.complexf64 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Bool}) = p.bool + +# Fallback for other types (with checkpoint correction!) +@inline function AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{T}) where T + get!(p.others, T) do + tp = CuTypedPool{T}() + # CRITICAL: Match CPU behavior - auto-checkpoint new pool if inside @with_pool scope + # Without this, rewind! would corrupt state for dynamically-created pools + if p._current_depth > 1 + push!(tp._checkpoint_n_active, 0) # n_active starts at 0 + push!(tp._checkpoint_depths, p._current_depth) + end + tp + end::CuTypedPool{T} +end +``` + +> **[Post-Review v0.6: Checkpoint Correction for Dynamic Pools]** +> When a new `CuTypedPool{T}` is created inside a `@with_pool` scope (i.e., when +> `_current_depth > 1`), we must initialize its checkpoint state to match the current +> depth. Otherwise, `rewind!` would pop from an incorrect checkpoint stack state. +> +> This mirrors the CPU implementation in `src/types.jl:230-238`. + +--- + +## 9. Memory Layout Clarification + +### 9.1 Why `Vector{CuVector{T}}` (not `CuVector{CuVector{T}}`) + +``` +✅ Correct: Vector{CuVector{T}} + + CPU RAM GPU VRAM + ┌─────────────────┐ ┌─────────────────┐ + │ Vector │ │ │ + │ ├─ CuVec meta1 ─┼──────────┼─► data1 [...] │ + │ ├─ CuVec meta2 ─┼──────────┼─► data2 [...] │ + │ └─ CuVec meta3 ─┼──────────┼─► data3 [...] │ + └─────────────────┘ └─────────────────┘ + + Pool management: CPU Computation: GPU +``` + +### 9.2 What Lives Where + +| Component | Location | Reason | +|-----------|----------|--------| +| Pool struct | CPU | Julia runtime | +| `vectors::Vector{...}` | CPU | Pool indexing | +| CuVector metadata | CPU | Julia object wrapper | +| CuVector data | **GPU** | Actual computation | +| n_active, checkpoints | CPU | State management | + +--- + +## 10. Migration Path + +### 10.1 Phase 1: Abstract Types (Non-Breaking) + +**Changes to src/:** +```julia +# types.jl ++ abstract type AbstractTypedPool{T, V<:AbstractVector{T}} end ++ abstract type AbstractArrayPool end +- mutable struct TypedPool{T} ++ mutable struct TypedPool{T} <: AbstractTypedPool{T, Vector{T}} +- mutable struct AdaptiveArrayPool ++ mutable struct AdaptiveArrayPool <: AbstractArrayPool + +# acquire.jl ++ allocate_vector(::AbstractTypedPool{T,Vector{T}}, n) where T = Vector{T}(undef, n) ++ wrap_array(::AbstractTypedPool{T,Vector{T}}, view, dims) where {T,N} = unsafe_wrap(...) +# Change get_view!, get_nd_array! signatures to use AbstractTypedPool + +# state.jl +# Change _checkpoint_typed_pool!, _rewind_typed_pool! 
to use AbstractTypedPool +``` + +**Breaking potential**: None (only adding supertypes and using more general signatures) + +### 10.2 Phase 2: CUDA Extension + +**New files in ext/:** +- Minimal implementation as described above + +**Breaking potential**: None (purely additive) + +### 10.3 Phase 3: Macro Enhancement (Optional) + +- Consider Option B unified macro +- Add `@with_cuda_pool` first, evaluate need for unification + +--- + +## 11. Example Usage (Target API) + +### 11.1 Basic Usage - Unified Macro + +```julia +using AdaptiveArrayPools +using CUDA # Triggers extension loading, registers :cuda backend + +# CPU workflow (default, unchanged) +function cpu_compute(data) + @with_pool pool begin + tmp = acquire!(pool, Float64, length(data)) + tmp .= data + sum(tmp) + end +end + +# GPU workflow - using :cuda backend symbol +function gpu_compute(data::CuVector) + @with_pool :cuda pool begin + A = acquire!(pool, Float32, 1000, 1000) # Returns CuMatrix{Float32} + B = acquire!(pool, Float32, 1000, 1000) + + A .= CUDA.rand(1000, 1000) + B .= A .* 2 + + sum(B) + end +end + +# Explicit CPU backend (equivalent to default) +function explicit_cpu_compute(data) + @with_pool :cpu pool begin + tmp = acquire!(pool, Float64, length(data)) + tmp .= data + sum(tmp) + end +end +``` + +### 11.2 Mixed CPU/GPU Workflow + +```julia +function mixed_compute(host_data::Vector{Float32}) + # CPU pool for staging + @with_pool cpu_pool begin + staging = acquire!(cpu_pool, Float32, length(host_data)) + staging .= host_data + + # Nested GPU pool + @with_pool :cuda gpu_pool begin + device_data = acquire!(gpu_pool, Float32, length(staging)) + copyto!(device_data, staging) # CPU → GPU + device_data .= device_data .^ 2 + copyto!(staging, device_data) # GPU → CPU + end # GPU pool rewinds here + + sum(staging) + end # CPU pool rewinds here +end +``` + +### 11.3 Without Pool Name (Auto-generated) + +```julia +# When you don't need to reference the pool directly +function simple_gpu_compute() + @with_pool :cuda begin + # pool name auto-generated, use get_task_local_cuda_pool() if needed + A = acquire!(get_task_local_cuda_pool(), Float32, 100, 100) + sum(A) + end +end + +# Or use the explicit getter within the block +function gpu_with_getter() + @with_pool :cuda begin + pool = get_task_local_cuda_pool() + A = acquire!(pool, Float32, 100, 100) + B = acquire!(pool, Float32, 100, 100) + A .+ B + end +end +``` + +### 11.4 Backend Switching (Same Code, Different Backend) + +```julia +# Parameterized backend - useful for testing/benchmarking +function compute_on_backend(data, backend::Symbol) + if backend == :cpu + @with_pool pool begin + tmp = acquire!(pool, Float32, length(data)) + tmp .= data + sum(tmp) + end + elseif backend == :cuda + @with_pool :cuda pool begin + tmp = acquire!(pool, Float32, length(data)) + tmp .= data + sum(tmp) + end + end +end + +# Note: Backend symbol must be literal in macro (compile-time) +# For runtime dispatch, use explicit pool getters: +function runtime_backend_dispatch(data, use_gpu::Bool) + pool = use_gpu ? 
get_task_local_cuda_pool() : get_task_local_pool() + checkpoint!(pool) + try + tmp = acquire!(pool, Float32, length(data)) + tmp .= data + sum(tmp) + finally + rewind!(pool) + end +end +``` + +### 11.5 Explicit Pool Management (Advanced) + +```julia +# Manual checkpoint/rewind for fine-grained control +function explicit_pool_management() + cpu = get_task_local_pool() + gpu = get_task_local_cuda_pool() + + # Checkpoint both pools + checkpoint!(cpu) + checkpoint!(gpu) + try + cpu_buf = acquire!(cpu, Float64, 1000) + gpu_buf = acquire!(gpu, Float32, 1000) + + # ... computation ... + + finally + # Rewind in reverse order (LIFO) + rewind!(gpu) + rewind!(cpu) + end +end + +# Clear GPU memory when under pressure +function memory_sensitive_workflow() + @with_pool :cuda pool begin + # Heavy GPU computation + A = acquire!(pool, Float32, 10000, 10000) + # ... + end + + # Explicitly free GPU memory if needed + empty!(get_task_local_cuda_pool()) + + # Continue with CPU work + @with_pool pool begin + # CPU pool unaffected + end +end +``` + +### 11.6 Future: Multiple GPU Backends + +```julia +# When Metal.jl extension is added (future) +using Metal # Registers :metal backend + +function apple_silicon_compute() + @with_pool :metal pool begin + A = acquire!(pool, Float32, 1000, 1000) # MtlMatrix{Float32} + # Metal-specific computation + end +end +``` + +> **[Post-Review v0.6: Backend Symbol Must Be Literal]** +> The macro `@with_pool :backend` requires a **literal symbol** (`:cuda`, `:metal`), +> not a variable containing a symbol. This is a Julia macro limitation—the backend +> is resolved at macro expansion time (compile time), not runtime. +> +> **This does NOT work:** +> ```julia +> const GPU_BACKEND = Sys.isapple() ? :metal : :cuda +> @with_pool GPU_BACKEND pool begin ... end # ERROR: GPU_BACKEND is not a QuoteNode +> ``` +> +> **For runtime backend selection, use explicit pool getters:** +> ```julia +> function portable_gpu_compute(use_metal::Bool) +> pool = use_metal ? get_task_local_metal_pool() : get_task_local_cuda_pool() +> checkpoint!(pool) +> try +> A = acquire!(pool, Float32, 1000, 1000) +> # ... computation ... +> finally +> rewind!(pool) +> end +> end +> ``` +> +> **Or use `@static` for compile-time platform selection:** +> ```julia +> function portable_gpu_compute() +> @static if Sys.isapple() +> @with_pool :metal pool begin +> # Metal path +> end +> else +> @with_pool :cuda pool begin +> # CUDA path +> end +> end +> end +> ``` + +--- + +## 12. Open Questions + +### 12.1 Resolved + +1. **Code duplication in extension** → Solved with parametric abstract types +2. **Macro approach** → Hybrid: unified `@with_pool :cuda` + optional `@with_cuda_pool` +3. **Memory layout** → `Vector{CuVector{T}}` is correct +4. **Float16 support** → **Added** to GPU fixed slots (per AI review) +5. **Multi-Device Safety** → **Solved** with `Dict{Int, Pool}` in task local storage (per AI review) +6. **unsafe_wrap for GPU** → Use `reshape` instead (per AI review) +7. **[v0.6] GPU view type** → `view(CuVector, 1:n)` returns `CuVector`, not `SubArray`. Pool design simplified. +8. **[v0.6] Zero-overhead backend selection** → `Val{:backend}` dispatch instead of Dict registry +9. **[v0.6] GPU checkpoint correction** → Added to `get_typed_pool!` fallback for `others` dict +10. **[v0.6] Device ID API** → Use `CUDA.deviceid(dev)` instead of internal `.handle` +11. 
**[v0.6] Backend symbol literal requirement** → Documented; `@static if` for platform selection + +### 12.2 Stream Synchronization (Critical Safety Documentation) + +> **[Post-Review v0.6: Expanded Safety Documentation]** + +**The Problem**: `rewind!` logically "frees" pooled memory. If a GPU kernel is still +running asynchronously using that memory, and the pool re-issues it for a new allocation, +**data corruption** or **use-after-free** occurs. + +**When It's Safe** (no synchronization needed): +- Single Task, default stream: Julia tasks typically use CUDA's default stream, which + serializes operations. `rewind!` happens after all prior operations complete. +- `CUDA.@sync` inside the block: Explicit synchronization before rewind. + +**When It's DANGEROUS** (must synchronize): + +1. **Passing arrays to other Tasks**: + ```julia + @with_pool :cuda pool begin + A = acquire!(pool, Float32, 1000) + @spawn begin + # DANGER: This task may still be using A after rewind! + expensive_computation!(A) + end + end # rewind! happens here - A is now invalid! + ``` + **Fix**: Wait for spawned task before exiting scope. + +2. **Explicit async streams**: + ```julia + @with_pool :cuda pool begin + A = acquire!(pool, Float32, 1000) + stream = CUDA.stream() + CUDA.@sync stream begin + # Kernel launched on non-default stream + my_kernel!(A; stream) + end + # If no @sync: kernel may still be running when rewind! executes + end + ``` + **Fix**: `CUDA.synchronize(stream)` or use `CUDA.@sync` before scope ends. + +3. **Kernel launch then immediate exit**: + ```julia + @with_pool :cuda pool begin + A = acquire!(pool, Float32, 1000) + @cuda threads=1024 my_kernel!(A) + # Kernel is async! May still be running... + end # rewind! immediately follows! + ``` + **Fix**: `CUDA.synchronize()` or `CUDA.@sync @cuda ...` + +**Recommendation for Documentation**: +```julia +# GPU POOLING SAFETY RULES +# +# 1. DO NOT pass pooled arrays to other Tasks without synchronization +# 2. DO synchronize before @with_pool block ends if using async streams +# 3. PREFER `CUDA.@sync` around kernel launches in pooled scopes +# 4. WHEN IN DOUBT: `CUDA.synchronize()` before the block ends +``` + +### 12.3 Still Open + +1. **Typed checkpoint for GPU**: Reuse existing macro logic? + - Should work with minimal changes + - Need to export `_transform_acquire_calls` etc. + +2. **resize! optimization for GPU** (v1.1+): + - Current: `resize!(CuVector, n)` copies old data (wasteful for pools) + - Consider: `CUDA.unsafe_free!` + fresh allocation, or pre-allocate oversized + +3. **Multi-backend single macro**: Support multiple pools in one call? + - Tuple syntax: `@with_pool (:cpu, cpu_pool) (:cuda, cuda_pool) begin ... end` + - Pro: Cleaner for mixed workflows, guaranteed proper rewind order + - Con: More complex macro implementation, less common use case + - Alternative: Nested `@with_pool` blocks (current approach) + - > **[AI Review]**: The tuple syntax is elegant but maybe over-engineering for V1. + +--- + +## 13. 
Summary: What Changes Where + +### src/ Changes (Phase 1) + +| File | Changes | +|------|---------| +| types.jl | Add abstract types, inherit from them | +| acquire.jl | Add `allocate_vector`, `wrap_array` dispatch points; generalize signatures | +| state.jl | Generalize to `AbstractTypedPool` | +| macros.jl | Add `_get_pool_for_backend(::Val{:cpu})` dispatch (NOT Dict registry) | +| Others | No changes | + +### ext/ New Files (Phase 2) + +| File | Lines | Content | +|------|-------|---------| +| AdaptiveArrayPoolsCUDAExt.jl | ~20 | Module, imports, exports | +| types.jl | ~50 | CuTypedPool (no views field!), CuAdaptiveArrayPool (+ Float16, device_id) | +| acquire.jl | ~30 | GPU-specific `get_view!` (no view caching) | +| dispatch.jl | ~35 | allocate_vector, wrap_array, get_typed_pool! (with checkpoint correction) | +| task_local_pool.jl | ~25 | get_task_local_cuda_pool (multi-device aware, public API) | +| macros.jl | ~25 | @with_cuda_pool | +| **Total** | **~155** | | + +--- + +## Changelog + +| Version | Date | Changes | +|---------|------|---------| +| 0.1 | 2024-12-10 | Initial draft | +| 0.2 | 2024-12-10 | Redesigned with parametric abstract types for maximum code reuse | +| 0.3 | 2024-12-10 | Unified macro design (`@with_pool :cuda`), comprehensive usage examples | +| 0.3.1 | 2024-12-10 | Added open questions: macro style preference, multi-backend single macro | +| 0.4 | 2024-12-10 | AI Review: Added Float16, device_id, multi-device pool getter, stream sync warning | +| 0.5 | 2024-12-10 | Merged AI feedback with restored full documentation | +| 0.6 | 2024-12-14 | **Post-Review Revision**: (1) Fixed GPU view type—`view(CuVector,1:n)` returns `CuVector` via GPUArrays `derive()`, not `SubArray`; simplified pool design by removing view caching. (2) Replaced Dict registry with `Val{:backend}` dispatch for zero-overhead backend selection. (3) Added checkpoint correction to GPU `get_typed_pool!` fallback. (4) Fixed `device_id` to use public API `CUDA.deviceid()`. (5) Clarified `empty!` semantics (reference removal ≠ VRAM release). (6) Documented `resize!` cost on GPU. (7) Expanded stream synchronization safety documentation. (8) Fixed backend symbol literal requirement (removed invalid `GPU_BACKEND` variable example). | diff --git a/docs/design/fixed_slots_codegen_design.md b/docs/design/fixed_slots_codegen_design.md new file mode 100644 index 0000000..c141e6b --- /dev/null +++ b/docs/design/fixed_slots_codegen_design.md @@ -0,0 +1,362 @@ +# Fixed Slots Iteration Automation Design Document + +## 1. Problem Definition + +### Current State +Iteration over fixed slot types was **manually repeated** across multiple functions: + +```julia +# Inside checkpoint! +_checkpoint_typed_pool!(pool.float64, depth) +_checkpoint_typed_pool!(pool.float32, depth) +_checkpoint_typed_pool!(pool.int64, depth) +_checkpoint_typed_pool!(pool.int32, depth) +_checkpoint_typed_pool!(pool.complexf64, depth) +_checkpoint_typed_pool!(pool.complexf32, depth) +_checkpoint_typed_pool!(pool.bool, depth) + +# Inside rewind! - same pattern repeated +_rewind_typed_pool!(pool.float64, depth) +_rewind_typed_pool!(pool.float32, depth) +... + +# Inside empty! - repeated again +empty!(pool.float64) +empty!(pool.float32) +... +``` + +### Improvement Goals +- **Centralized iteration logic**: Define once, use everywhere +- **Zero allocation**: No runtime overhead +- **IDE support preserved**: Keep struct definitions explicit + +--- + +## 2. 
Design Decision + +### Option B Adopted: const tuple + @generated (automate iteration only) + +**Core principle**: Keep struct definition manual, automate only iteration + +```julia +# 1. Keep struct explicitly defined (full IDE support) +mutable struct AdaptiveArrayPool + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + others::IdDict{DataType, Any} + _current_depth::Int + _untracked_flags::Vector{Bool} +end + +# 2. Define field names as const tuple +const FIXED_SLOT_FIELDS = (:float64, :float32, :int64, :int32, :complexf64, :complexf32, :bool) + +# 3. Use @generated for zero-allocation iteration +@generated function foreach_fixed_slot(f::F, pool::AdaptiveArrayPool) where {F} + exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in FIXED_SLOT_FIELDS] + quote + $(exprs...) + nothing + end +end +``` + +--- + +## 3. Detailed Implementation + +### 3.1 types.jl Changes + +```julia +# ============================================================================== +# Fixed Slot Configuration +# ============================================================================== + +""" + FIXED_SLOT_FIELDS + +Fixed slot field names for iteration. Used by `foreach_fixed_slot`. + +Note: When adding/removing fixed slots, update BOTH: +1. This tuple +2. The AdaptiveArrayPool struct definition below +""" +const FIXED_SLOT_FIELDS = (:float64, :float32, :int64, :int32, :complexf64, :complexf32, :bool) + +# ============================================================================== +# AdaptiveArrayPool (explicit definition - full IDE support) +# ============================================================================== + +mutable struct AdaptiveArrayPool + # Fixed Slots: common types with zero lookup overhead + # NOTE: Keep in sync with FIXED_SLOT_FIELDS above + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + + # Fallback: rare types + others::IdDict{DataType, Any} + + # Untracked acquire detection + _current_depth::Int + _untracked_flags::Vector{Bool} +end + +# ... constructor, get_typed_pool! etc. remain unchanged ... + +# ============================================================================== +# Zero-Allocation Iteration +# ============================================================================== + +""" + foreach_fixed_slot(f, pool::AdaptiveArrayPool) + +Apply function `f` to each fixed slot TypedPool. +Zero allocation via compile-time unrolling. + +## Example +```julia +foreach_fixed_slot(pool) do tp + _checkpoint_typed_pool!(tp, depth) +end +``` +""" +@generated function foreach_fixed_slot(f::F, pool::AdaptiveArrayPool) where {F} + exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in FIXED_SLOT_FIELDS] + quote + $(exprs...) 
+ nothing + end +end +``` + +### 3.2 state.jl Changes + +```julia +function checkpoint!(pool::AdaptiveArrayPool) + pool._current_depth += 1 + push!(pool._untracked_flags, false) + depth = pool._current_depth + + # Fixed slots - zero allocation via @generated + foreach_fixed_slot(pool) do tp + _checkpoint_typed_pool!(tp, depth) + end + + # Others - fallback types + for p in values(pool.others) + _checkpoint_typed_pool!(p, depth) + end + nothing +end + +function rewind!(pool::AdaptiveArrayPool) + depth = pool._current_depth + + # Fixed slots - zero allocation + foreach_fixed_slot(pool) do tp + _rewind_typed_pool!(tp, depth) + end + + # Others + for tp in values(pool.others) + _rewind_typed_pool!(tp, depth) + end + + pop!(pool._untracked_flags) + pool._current_depth -= 1 + nothing +end + +function Base.empty!(pool::AdaptiveArrayPool) + # Fixed slots + foreach_fixed_slot(empty!, pool) + + # Others + for tp in values(pool.others) + empty!(tp) + end + empty!(pool.others) + + pool._current_depth = 0 + empty!(pool._untracked_flags) + pool +end +``` + +--- + +## 4. Type Add/Remove Procedure + +### Adding UInt8 + +**Locations requiring manual update (2 places)**: + +```julia +# 1. Update FIXED_SLOT_FIELDS +const FIXED_SLOT_FIELDS = (:float64, :float32, :int64, :int32, :complexf64, :complexf32, :bool, :uint8) + +# 2. Update AdaptiveArrayPool struct +mutable struct AdaptiveArrayPool + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + uint8::TypedPool{UInt8} # ← Added + ... +end + +# 3. Update constructor +function AdaptiveArrayPool() + AdaptiveArrayPool( + TypedPool{Float64}(), + ... + TypedPool{UInt8}(), # ← Added + ... + ) +end + +# 4. Add get_typed_pool! dispatch +@inline get_typed_pool!(p::AdaptiveArrayPool, ::Type{UInt8}) = p.uint8 +``` + +**Automatically updated**: +- `checkpoint!` internal iteration +- `rewind!` internal iteration +- `empty!` internal iteration +- All code using `foreach_fixed_slot` + +--- + +## 5. Testing Strategy + +```julia +@testset "Fixed Slot Iteration" begin + pool = AdaptiveArrayPool() + + # Verify FIXED_SLOT_FIELDS and struct synchronization + for field in FIXED_SLOT_FIELDS + @test hasfield(AdaptiveArrayPool, field) + @test getfield(pool, field) isa TypedPool + end + + # Verify foreach_fixed_slot visits all slots + count = Ref(0) + foreach_fixed_slot(pool) do tp + count[] += 1 + end + @test count[] == length(FIXED_SLOT_FIELDS) + + # Zero allocation verification + pool2 = AdaptiveArrayPool() + foreach_fixed_slot(identity, pool2) # warmup + allocs = @allocated foreach_fixed_slot(identity, pool2) + @test allocs == 0 +end +``` + +--- + +## 6. Benefits + +### 6.1 Full IDE Support +- Explicit struct definition → autocomplete, Go to Definition work correctly +- Perfect type inference +- LSP/Language Server compatible + +### 6.2 Simple Implementation +- Single `@generated` function automates iteration +- No `@eval` needed → no precompilation concerns +- Most existing code preserved + +### 6.3 Easy Debugging +- Clear struct definition allows field inspection in debugger +- Compatible with tools like `@infiltrate` + +### 6.4 Zero Runtime Overhead +```julia +# @generated unrolls at compile time: +# foreach_fixed_slot(f, pool) is equivalent to: +f(pool.float64) +f(pool.float32) +f(pool.int64) +f(pool.int32) +f(pool.complexf64) +f(pool.complexf32) +f(pool.bool) +``` + +--- + +## 7. 
Drawbacks and Considerations + +### 7.1 Synchronization Required (2 places) +```julia +# These two locations must always be in sync: +const FIXED_SLOT_FIELDS = (:float64, :float32, ...) # 1 +mutable struct AdaptiveArrayPool # 2 + float64::TypedPool{Float64} + ... +end +``` + +**Mitigation**: Explicit comments + test verification + +### 7.2 @generated First-Call Cost +```julia +# Recompiles for different closures +foreach_fixed_slot(x -> checkpoint!(x, 1), pool) # Compiles +foreach_fixed_slot(x -> rewind!(x, 1), pool) # Compiles again +``` + +**Impact**: Slight effect on TTFX (Time To First X) +**Mitigation**: Warmup (precompile) at package load + +### 7.3 Metaprogramming Knowledge Required +```julia +@generated function foreach_fixed_slot(f::F, pool) where {F} + # Understanding this code requires @generated knowledge + exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in FIXED_SLOT_FIELDS] + ... +end +``` + +**Mitigation**: Thorough comments and docstrings + +--- + +## 8. Option Comparison Summary + +| Aspect | Current (Manual) | Option B (Adopted) | Option C (@eval) | +|--------|------------------|-------------------|------------------| +| Modification locations | 6+ places | 2 places + α | 1 place | +| IDE support | Perfect | Perfect | Partial | +| Complexity | Low | Low | High | +| Debugging | Easy | Easy | Difficult | +| Type addition safety | May miss | Test-verified | Automatic | + +--- + +## 9. Conclusion + +**Reasons for adopting Option B**: + +1. **Practical balance**: Removes repetitive code without the complexity of full automation (Option C) +2. **IDE support preserved**: Maintains the most important developer experience +3. **Low risk**: Uses only `@generated` without `@eval`, ensuring precompilation stability +4. **Incremental improvement**: Improves only iteration while preserving most existing code + +Since type changes are rare (1-2 times during package lifetime), and struct definition synchronization across 2 locations can be sufficiently verified by tests, Option B is the optimal choice. diff --git a/docs/design/hybrid_api_design.md b/docs/design/hybrid_api_design.md new file mode 100644 index 0000000..a407e72 --- /dev/null +++ b/docs/design/hybrid_api_design.md @@ -0,0 +1,441 @@ +# Hybrid API Design: acquire! vs unsafe_acquire! + +## Executive Summary + +Redesigning `AdaptiveArrayPools.jl`'s N-D array acquisition API with a **Two Tools Strategy**: + +| API | Return Type | Use Case | Allocation Characteristics | +|-----|-------------|----------|---------------------------| +| `acquire!` | `ReshapedArray` (fixed) | General use, Static dispatch | No cache needed, relies on compiler optimization | +| `unsafe_acquire!` | `Array` (fixed) | Dynamic dispatch, FFI | Cache hit: 0, miss: 112 bytes | + +**Core Principle**: Return type does not change based on state (Type Stability guaranteed) + +> **Note**: ReshapedArray's "0 allocation" depends on compiler's SROA (Scalar Replacement of Aggregates) and +> escape analysis. Not always guaranteed - heap allocation may occur if the value escapes from the function. + +--- + +## Problem Statement + +### Current State (v0.2.0) + +``` +acquire!(pool, T, dims...) + └─> get_nd_view!() + └─> get_nd_array!() ← uses unsafe_wrap + └─> 112 bytes on cache miss! +``` + +- Both `acquire!` and `unsafe_acquire!` internally use `unsafe_wrap` +- Always 112 bytes allocation on cache miss +- Tried to reduce miss rate with N-way cache, but 100% miss on cyclic patterns + +### v0.1.2 Approach + +``` +acquire!(pool, T, dims...) 
+ └─> get_view!(tp, total_len) ← 1D view (cached) + └─> reshape(view, dims) ← 0 bytes always! +``` + +- `reshape(view, dims)` creates a wrapper object, but heap allocation can be avoided via compiler optimization (SROA/escape analysis) +- Simple and predictable + +--- + +## Why Not Mixed Return Types? + +### Proposed (but rejected) Approach + +```julia +# ❌ BAD: Array on cache hit, View on miss +function acquire!(pool, T, dims...) + if cache_hit + return cached_array::Array{T,N} + else + return reshape(view, dims)::ReshapedArray{...} + end +end +``` + +### Problem: Type Instability + +| Aspect | Impact | +|--------|--------| +| **Compiler inference** | `Union{Array, ReshapedArray}` → Union splitting or dynamic dispatch | +| **Performance** | Execution slowdown while trying to achieve zero-alloc | +| **API semantics** | Same function returning different types → confusion | +| **Module boundaries** | Inference widens when storing result or passing to other modules | + +**AI Feedback Quote**: +> "State-dependent returns become Union{Array, ReshapedArray} from external view, breaking API-level type stability." + +--- + +## Recommended Design: Two Tools Strategy + +### Principles + +1. **Fixed return type**: Each API always returns the same type +2. **Purpose separation**: Users choose API based on situation +3. **Simple implementation**: Minimize complex cache logic + +### API Design + +#### 1. `acquire!` → ReshapedArray (regression to v0.1.2 style) + +```julia +@inline function acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + tp = get_typed_pool!(pool, T) + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # 1D view (cached, 0 alloc) + return reshape(flat_view, dims) # ReshapedArray (0 alloc always!) +end +``` + +**Characteristics**: +- Always returns `ReshapedArray{T, N, SubArray{T, 1, Vector{T}, ...}, ...}` +- No `unsafe_wrap` call → no Array header creation cost (112B) even on cache miss +- N-way cache unnecessary (simple 1D view cache sufficient) + +**Use Cases**: +- General `Flux` layers (`mul!`, `broadcast`) +- Code where static dispatch is guaranteed +- Most use cases + +**Constraints**: +- Escape optimization may fail in type-unspecified call paths, causing wrapper allocation +- Incompatible with APIs requiring strict `Array` type (rare) + +#### 2. `unsafe_acquire!` → Array (maintains v0.2.0 + N-way Cache) + +```julia +@inline function unsafe_acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, dims) # Array with slot-based + N-way cache +end +``` + +**Characteristics**: +- Always returns `Array{T, N}` +- Cache hit: 0 bytes, Cache miss: 112 bytes +- Maintains existing N-way cache (4-way) + +**Use Cases**: +- Type-unspecified call paths (e.g., `TGLFNNmodel._pooled_chain` - no concrete type parameters) +- FFI / ccall +- Special APIs that strictly require `StridedArray` +- Places requiring strict `Array` type + +**Benefits**: +- Array is already a heap-allocated object → no additional allocation when reusing cached instance +- Avoids wrapper object optimization issues in type-unspecified paths + +--- + +## Comparison Matrix + +| Strategy | Return Type | Cache Miss Cost | Type-Unspecified Path* | Type Stable | +|----------|-------------|-----------------|------------------------|-------------| +| **acquire! 
(new)** | `ReshapedArray` | **0 bytes** (no unsafe_wrap) | May allocate wrapper | **✓** | +| **unsafe_acquire!** | `Array` | 112 bytes | **0 bytes** (on cache hit) | **✓** | +| ~~Mixed (rejected)~~ | `Union{...}` | 0 bytes | Unspecified | **✗** | + +*Type-unspecified path: Calls through abstract fields without concrete type parameters, etc. Compiler cannot apply escape optimization, causing wrapper object heap allocation. + +### Recommended API by Situation + +| Situation | acquire! | unsafe_acquire! | Recommendation | +|-----------|----------|-----------------|----------------| +| Type-specified call path | Optimizable | 0 bytes (hit) | `acquire!` | +| Variable dims (cyclic pattern) | Optimizable | cache miss occurs | `acquire!` | +| Type-unspecified path | Wrapper alloc | **0 bytes** (hit) | **`unsafe_acquire!`** | +| FFI / raw pointer | N/A | 0 bytes | `unsafe_acquire!` | + +--- + +## Implementation Plan + +### Phase 1: Simplify `acquire!` N-D Path + +**File**: `src/core.jl` + +**Before**: +```julia +@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + arr = get_nd_array!(tp, dims) # uses unsafe_wrap + idx = tp.n_active + # ... complex caching logic + new_view = view(arr, ntuple(_ -> Colon(), Val(N))...) + return new_view # SubArray{T,N,Array{T,N}} +end +``` + +**After**: +```julia +@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # 1D view (cached) + return reshape(flat_view, dims) # ReshapedArray (0 alloc!) +end +``` + +**Change Summary**: +- Remove `get_nd_array!` call +- Directly return `reshape(1D_view, dims)` +- N-D view cache (`nd_views`) not used (in acquire! path) + +### Phase 2: Maintain `unsafe_acquire!` Cache + +**No changes** - maintain current implementation: +- `get_nd_array!` → `unsafe_wrap` + slot-based cache +- Maintain N-way cache (4-way) +- 112 bytes allocation on cache miss + +### Phase 3: TypedPool Field Cleanup (Optional) + +Since `acquire!` no longer uses N-D cache, redefine field purposes: + +```julia +mutable struct TypedPool{T} + # Storage + vectors::Vector{Vector{T}} + + # 1D Cache (shared by acquire! 1D + acquire! N-D) + views::Vector{SubArray{...}} + view_lengths::Vector{Int} + + # N-D Cache (unsafe_acquire! only) + nd_arrays::Vector{Any} # Array objects for unsafe_acquire! + nd_dims::Vector{Any} # Dimension tuples + nd_ptrs::Vector{UInt} # Pointer validation + + # Note: nd_views can be removed (acquire! uses reshape) + + # State + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end +``` + +### Phase 4: Test Updates + +**Files**: `test/test_nway_cache.jl`, `test/test_zero_allocation.jl` + +```julia +@testset "acquire! returns ReshapedArray" begin + pool = AdaptiveArrayPool() + @with_pool pool begin + m = acquire!(pool, Float64, 10, 10) + @test m isa Base.ReshapedArray + @test size(m) == (10, 10) + end +end + +@testset "acquire! is always zero-allocation" begin + pool = AdaptiveArrayPool() + + # 5-way cycling (exceeds any cache) - still 0 alloc! + function test_5way!(p) + dims_list = ((5, 10), (10, 5), (7, 7), (3, 16), (4, 12)) + for dims in dims_list + checkpoint!(p) + acquire!(p, Float64, dims...) # ReshapedArray + rewind!(p) + end + end + + test_5way!(pool); test_5way!(pool) + allocs = @allocated test_5way!(pool) + @test allocs == 0 # Always zero, regardless of pattern! +end + +@testset "unsafe_acquire! 
returns Array" begin + pool = AdaptiveArrayPool() + @with_pool pool begin + m = unsafe_acquire!(pool, Float64, 10, 10) + @test m isa Array + @test size(m) == (10, 10) + end +end +``` + +### Phase 5: Documentation + +**CHANGELOG.md** (not a breaking change, behavior improvement): +```markdown +## [Unreleased] +### Changed +- `acquire!` N-D path now returns `ReshapedArray` instead of `SubArray{Array}` + - Always zero-allocation, regardless of cache hit/miss + - Simpler implementation, no N-D cache dependency +- `unsafe_acquire!` continues to return `Array` with N-way cache + - Use this when dynamic dispatch or raw Array is needed +``` + +**Docstring Updates**: +```julia +""" + acquire!(pool, Type{T}, dims...) -> ReshapedArray{T,N,...} + +Acquire a view with dimensions `dims` from the pool. + +Returns a `ReshapedArray` backed by pool memory. **Zero creation cost** - no +`unsafe_wrap` call needed. Compiler may optimize away heap allocation via +SROA/escape analysis in type-specified paths. + +For type-unspecified paths (struct fields without concrete type parameters), +use [`unsafe_acquire!`](@ref) instead - cached Array instances can be reused. + +## Example +```julia +@with_pool pool begin + m = acquire!(pool, Float64, 64, 100) # ReshapedArray + m .= 1.0 + result = sum(m) +end +``` +""" +``` + +```julia +""" + unsafe_acquire!(pool, Type{T}, dims...) -> Array{T,N} + +Acquire a raw `Array` backed by pool memory. + +Returns an `Array` object. Since Array is already heap-allocated, the cached +instance can be reused without wrapper allocation overhead. + +## When to use +- Type-unspecified paths (e.g., struct fields without concrete type parameters) +- FFI / ccall requiring raw pointers +- APIs that strictly require `Array` type + +## Allocation behavior +- Cache hit: 0 bytes (cached Array instance reused) +- Cache miss: 112 bytes (Array header creation) + +## Example +```julia +@with_pool pool begin + m = unsafe_acquire!(pool, Float64, 64, 100) # Matrix{Float64} + # Safe for type-unspecified paths + some_abstract_field.process(m) # 0 bytes - cached instance reused +end +``` +""" +``` + +--- + +## TurbulentTransport Integration + +### Changed File: `src/tglf_nn.jl` + +**Already Applied** (line 277): +```julia +@with_pool pool function flux_array!(out_y::AbstractMatrix{T}, fluxmodel::TGLFNNmodel, x::AbstractMatrix{T}; ...) where {T<:Real} + # ... + # NOTE: Use unsafe_acquire! (returns Array) instead of acquire! (returns ReshapedArray) + # because _pooled_chain field lacks concrete type parameters, causing + # escape optimization failure. Array (cached instance) avoids wrapper allocation. + xx = unsafe_acquire!(pool, T, size(x)) + # ... + fluxmodel._pooled_chain(out_y, xx) # 0 bytes - cached Array instance reused +end +``` + +### No Change Needed: `src/pooled_layers.jl` + +`PooledDense`, `PooledActivation` are in **static dispatch** environment: +- Types are known at compile time +- Maintain use of `acquire!` (ReshapedArray) +- ReshapedArray is also 0 bytes in static dispatch + +```julia +@inline function _pooled_dense_forward!(pd::PooledDense, x::AbstractVecOrMat) + pool = get_task_local_pool() + # acquire! 
usage OK - static dispatch environment + out = acquire!(pool, Float64, size(d.weight, 1), size(xT, 2)) + mul!(out, d.weight, xT) # ReshapedArray is StridedArray ✓ + return Flux.NNlib.bias_act!(d.σ, out, d.bias) +end +``` + +--- + +## Summary + +### Before (v0.2.0) + +``` +┌─────────────────────────────────────────────────────────────┐ +│ acquire!() ──┬──> get_nd_view!() ──> get_nd_array!() │ +│ │ │ │ │ +│ │ │ unsafe_wrap (112B miss) │ +│ │ │ ↓ │ +│ │ └──────> SubArray{Array} ←──────────┘ +│ │ │ +│ unsafe_acquire!() ──> get_nd_array!() ──> Array │ +│ │ │ +│ unsafe_wrap (112B miss) │ +└─────────────────────────────────────────────────────────────┘ +``` + +### After (Hybrid) + +``` +┌─────────────────────────────────────────────────────────────┐ +│ acquire!() ──> get_view!() ──> reshape() ──> ReshapedArray │ +│ │ │ │ +│ 1D cache 0 bytes always! │ +│ (0 alloc) │ +│ │ +│ unsafe_acquire!() ──> get_nd_array!() ──> Array │ +│ │ │ +│ unsafe_wrap + N-way cache │ +│ (0B hit, 112B miss) │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Decision Matrix for Users + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Which API to use? │ +├─────────────────────────────────────────────────────────────┤ +│ │ +│ Is the code path type-unspecified? │ +│ (abstract fields without concrete type params, │ +│ runtime-determined function calls) │ +│ │ +│ YES ──────────────> unsafe_acquire!() │ +│ │ │ │ +│ │ Returns Array │ +│ │ (cached instance reused) │ +│ │ │ +│ NO ───────────────> acquire!() │ +│ │ │ +│ Returns ReshapedArray │ +│ (0 bytes creation) │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +--- + +## Open Questions for Review + +1. **N-way cache retention level**: Keep current 4-way? Reduce to 2-way? +2. **nd_views field removal**: Can be removed since `acquire!` no longer uses it? +3. **Backward compatibility**: Cases where existing `acquire!` users check for `SubArray` type? + +--- + +## References + +- [nd_array_approach_comparison.md](./nd_array_approach_comparison.md) - Benchmark results and boxing analysis +- [PR_MESSAGE.md](../PR_MESSAGE.md) - Original PR description diff --git a/docs/design/macro-linenumbernode-improvement.md b/docs/design/macro-linenumbernode-improvement.md new file mode 100644 index 0000000..26eab67 --- /dev/null +++ b/docs/design/macro-linenumbernode-improvement.md @@ -0,0 +1,369 @@ +# @with_pool Macro LineNumberNode Improvement Plan + +## Goal +Utilize `__source__` and `LineNumberNode` to improve coverage, stack trace, and debugging + +## Target File +- `/Users/yoo/.julia/dev/AdaptiveArrayPools/src/macros.jl` + +--- + +## Phase 1: Add Helper Functions + +### 1.1 LineNumberNode Insertion Helper (New) + +**Location**: Add before `_generate_pool_code` function + +```julia +""" + _maybe_add_source_location!(expr, source) + +Insert source location LineNumberNode at the beginning of an Expr block. +No-op if source is nothing or expr is not an Expr(:block, ...). 
+""" +function _maybe_add_source_location!(expr::Expr, source::Union{LineNumberNode,Nothing}) + if source !== nothing && expr.head === :block + pushfirst!(expr.args, LineNumberNode(source.line, source.file)) + end + return expr +end +_maybe_add_source_location!(expr, ::Nothing) = expr +``` + +**Benefits**: Common application across all return paths, reduces risk of omission/drift + +### 1.2 Function Body LineNumberNode Correction Helper (New) + +**Goal**: Correct with `__source__` **only when no LNN exists at the body top level** + +> **Background**: `body` is the **user code AST** obtained from `func_def.args[2]`. +> Existing LNNs point to **user file lines** and must be preserved. +> The problem is **short function forms** like `f(x) = ...` that have no LNN. + +```julia +""" + _has_toplevel_lnn(body) -> Bool + +Check if body has a LineNumberNode at the top level (within first few args). +More robust than checking only args[1], handles Expr(:meta) etc. +""" +function _has_toplevel_lnn(body) + body isa Expr && body.head === :block || return false + # Check first 3 args for LNN (handles :meta, :line annotations, etc.) + for i in 1:min(3, length(body.args)) + body.args[i] isa LineNumberNode && return true + end + return false +end + +""" + _ensure_body_has_toplevel_lnn(body, source) + +Ensure body has a LineNumberNode at the top level. +- If body already has a top-level LNN, preserve it (user file line info) +- If not, prepend source LNN (macro call location as fallback) + +Returns a new Expr to avoid mutating the original AST. +""" +function _ensure_body_has_toplevel_lnn(body, source::Union{LineNumberNode,Nothing}) + source === nothing && return body + + # Check if top-level LNN already exists (robust check) + if _has_toplevel_lnn(body) + return body # Preserve existing user file LNN + end + + # No top-level LNN → add source as fallback (no mutation) + lnn = LineNumberNode(source.line, source.file) + if body isa Expr && body.head === :block + return Expr(:block, lnn, body.args...) 
+ else + return Expr(:block, lnn, body) + end +end +``` + +**Benefits**: +- **User body LNN preserved**: If existing top-level LNN exists, keep it (accurate body line) +- **Short function form handling**: If no LNN, correct with `__source__` +- **Mutation prevention**: Returns new Expr to protect original AST + +--- + +## Phase 2: Modify Helper Function Signatures + +### 2.1 Keyword Argument Approach (Recommended) + +To avoid the risk of fixed `:cpu` default, add `source` as keyword argument: + +| Function | Search Pattern | Change | +|----------|----------------|--------| +| `_generate_pool_code` | `function _generate_pool_code(pool_name, expr, force_enable)` | `(...; source::Union{LineNumberNode,Nothing}=nothing)` | +| `_generate_pool_code_with_backend` | `function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, force_enable::Bool)` | `(...; source::Union{LineNumberNode,Nothing}=nothing)` | +| `_generate_function_pool_code` | `function _generate_function_pool_code(pool_name, func_def, force_enable, disable_pooling, backend::Symbol=:cpu)` | `(...; source::Union{LineNumberNode,Nothing}=nothing)` | +| `_generate_function_pool_code_with_backend` | `function _generate_function_pool_code_with_backend(backend::Symbol, pool_name, func_def, disable_pooling::Bool)` | `(...; source::Union{LineNumberNode,Nothing}=nothing)` | + +**Benefits**: Minimal changes to existing call sites, solves `backend` default value issue + +--- + +## Phase 3: Pass source to Internal Function Calls + +### Inside `_generate_pool_code` (Search: `_generate_function_pool_code(pool_name`) + +```julia +# Before +return _generate_function_pool_code(pool_name, expr, force_enable, true, :cpu) +return _generate_function_pool_code(pool_name, expr, force_enable, false) + +# After (pass as keyword argument) +return _generate_function_pool_code(pool_name, expr, force_enable, true, :cpu; source) +return _generate_function_pool_code(pool_name, expr, force_enable, false; source) +``` + +### Inside `_generate_pool_code_with_backend` (Search: `_generate_function_pool_code_with_backend(backend`) + +```julia +# After +_generate_function_pool_code_with_backend(backend, pool_name, expr, ...; source) +``` + +--- + +## Phase 4: LineNumberNode Insertion + +Call helper before returning each `quote ... end` block: + +```julia +result = quote + # ... generated code ... +end +_maybe_add_source_location!(result, source) +return result +``` + +### Insertion Locations (Based on Search Patterns) + +**`_generate_pool_code`** (Search: `function _generate_pool_code`): +- `return quote ... end` in `!USE_POOLING` branch +- `return quote ... end` in `force_enable` branch +- `return quote ... end` in `else` branch + +**`_generate_pool_code_with_backend`** (Search: `function _generate_pool_code_with_backend`): +- All `return quote ... end` with same pattern + +**`_generate_function_pool_code`** (Search: `function _generate_function_pool_code`): +- After `transformed_body` creation: `transformed_body = _ensure_body_has_toplevel_lnn(transformed_body, source)` +- Also correct `body` in `disable_pooling` path: `body = _ensure_body_has_toplevel_lnn(body, source)` +- Then `new_body = quote ... 
end` +- **(Optional) To make wrapper appear as call-site**: Add `_maybe_add_source_location!(new_body, source)` + +**`_generate_function_pool_code_with_backend`** (Search: `function _generate_function_pool_code_with_backend`): +- Apply `_ensure_body_has_toplevel_lnn(..., source)` to `transformed_body` and `body` before constructing `new_body` +- **(Optional)** Can also apply `_maybe_add_source_location!` to `new_body` + +> **Core Principle**: Preserve if top-level LNN exists (user line), correct with `__source__` if not (short function form) +> **Note**: Wrapper code (checkpoint/try/finally) lines may still point to macros.jl. Inserting LNN to `new_body` improves this but is not required. + +--- + +## Phase 5: Macro Definition Modifications + +**@with_pool** (Search: `macro with_pool`): +```julia +macro with_pool(pool_name, expr) + _generate_pool_code(pool_name, expr, true; source=__source__) +end +# Same pattern for remaining 3 +``` + +**@maybe_with_pool** (Search: `macro maybe_with_pool`): +```julia +macro maybe_with_pool(pool_name, expr) + _generate_pool_code(pool_name, expr, false; source=__source__) +end +# Same pattern for remaining 3 +``` + +--- + +## Phase 6: Testing + +### 6.1 Robust Search Helpers (test/test_macro_expansion.jl) + +```julia +""" + find_linenumbernode_with_line(expr, target_line) -> Union{LineNumberNode, Nothing} + +Recursively search for a LineNumberNode matching target_line. +More robust than checking only the first LNN (handles block forms where +_maybe_add_source_location! may insert LNN before user code LNN). +""" +function find_linenumbernode_with_line(expr, target_line::Int) + if expr isa LineNumberNode && expr.line == target_line + return expr + elseif expr isa Expr + for arg in expr.args + result = find_linenumbernode_with_line(arg, target_line) + result !== nothing && return result + end + end + return nothing +end + +""" + has_valid_linenumbernode(expr) -> Bool + +Check if expr contains any LineNumberNode with valid line info. +""" +function has_valid_linenumbernode(expr) + if expr isa LineNumberNode + return expr.line > 0 && expr.file !== :none + elseif expr isa Expr + for arg in expr.args + has_valid_linenumbernode(arg) && return true + end + end + return false +end + +""" + get_function_body(expr) -> Union{Expr, Nothing} + +Extract function body from a function definition expression. +Handles both `function f() ... end` and `f() = ...` forms. +""" +function get_function_body(expr) + if expr isa Expr + if expr.head === :function && length(expr.args) >= 2 + return expr.args[2] + elseif expr.head === :(=) && expr.args[1] isa Expr && expr.args[1].head === :call + return expr.args[2] + end + # Recurse for wrapped expressions + for arg in expr.args + result = get_function_body(arg) + result !== nothing && return result + end + end + return nothing +end +``` + +### 6.2 Test Cases (Full Coverage) + +> **Test Strategy**: Verify "existence of LNN matching expected line" rather than "first LNN". +> In block forms, `_maybe_add_source_location!` may insert additional LNNs, +> so checking for existence of LNN with specific line is more robust. 
+ +```julia +@testset "Source location preservation" begin + # Test 1: @with_pool block form + @testset "@with_pool block" begin + expected_line = @__LINE__ + 2 + expr = @macroexpand @with_pool pool begin + v = acquire!(pool, Float64, 10) + end + # Check if LNN matching expected line exists + lnn = find_linenumbernode_with_line(expr, expected_line) + @test lnn !== nothing + @test lnn.file !== :none + # At minimum, valid LNN must exist + @test has_valid_linenumbernode(expr) + end + + # Test 2: @with_pool function form + @testset "@with_pool function" begin + expected_line = @__LINE__ + 2 + func_expr = @macroexpand @with_pool pool function test_func(n) + acquire!(pool, Float64, n) + end + body = get_function_body(func_expr) + @test body !== nothing + lnn = find_linenumbernode_with_line(body, expected_line) + @test lnn !== nothing + end + + # Test 3: @maybe_with_pool + @testset "@maybe_with_pool" begin + expected_line = @__LINE__ + 2 + expr = @macroexpand @maybe_with_pool pool begin + v = acquire!(pool, Float64, 10) + end + lnn = find_linenumbernode_with_line(expr, expected_line) + @test lnn !== nothing + end + + # Test 4: Backend variant (@with_pool :cpu) + @testset "@with_pool :cpu backend" begin + expected_line = @__LINE__ + 2 + expr = @macroexpand @with_pool :cpu pool begin + v = acquire!(pool, Float64, 10) + end + lnn = find_linenumbernode_with_line(expr, expected_line) + @test lnn !== nothing + end + + # Test 5: Without pool name (implicit gensym) + @testset "@with_pool without pool name" begin + expected_line = @__LINE__ + 2 + expr = @macroexpand @with_pool begin + inner_function() + end + lnn = find_linenumbernode_with_line(expr, expected_line) + @test lnn !== nothing + end + + # Test 6: Short-form function (f(x) = ...) - Case without LNN, corrected with __source__ + @testset "@with_pool short function" begin + expected_line = @__LINE__ + 1 + func_expr = @macroexpand @with_pool pool test_func(x) = acquire!(pool, Float64, x) + body = get_function_body(func_expr) + @test body !== nothing + # Short function originally has no LNN, so corrected with __source__ + lnn = find_linenumbernode_with_line(body, expected_line) + @test lnn !== nothing + end +end +``` + +### 6.3 Verification Command +```bash +julia --project -e 'using Pkg; Pkg.test()' +``` + +--- + +## Expected Results + +| Item | Before Improvement | After Improvement | +|------|-------------------|-------------------| +| Coverage | signature uncovered | Properly mapped | +| Stack trace | macros.jl:XXX | Original source:line | +| Breakpoint | Inside macros.jl | Improved to inside body | + +--- + +## Considerations + +1. **Use Keyword Arguments**: Add `source` as keyword arg to minimize impact on existing call sites +2. **Use Helper Functions**: Use `_maybe_add_source_location!` for consistent insertion across all paths +3. **Body Line Correction**: Use `_ensure_body_has_toplevel_lnn` to preserve top-level LNN, correct with `__source__` if not present +4. **Robust Tests**: Search-based verification resistant to AST structure changes + line number accuracy verification +5. **esc() Interaction**: `LineNumberNode` is unrelated to hygiene → insert at quote block top +6. **try-finally**: Lines inside wrapper still point to macros.jl (acceptable) +7. 
**CUDA Extension**: Only registers backend dispatch, no macro definitions → no changes needed + +--- + +## Change Summary + +| Phase | Work | Estimated Change | +|-------|------|-----------------| +| 1 | Add helper functions (3: `_maybe_add_source_location!`, `_has_toplevel_lnn`, `_ensure_body_has_toplevel_lnn`) | +35 lines | +| 2 | Modify signatures (4 functions) | 4 lines modified | +| 3 | Modify internal calls | ~5 lines modified | +| 4 | LineNumberNode insertion | ~10 lines added | +| 5 | Modify macro definitions (8) | 8 lines modified | +| 6 | Add tests (3 helpers + 6 tests) | +80 lines | +| **Total** | | ~140 lines | diff --git a/docs/design/nd_array_approach_comparison.md b/docs/design/nd_array_approach_comparison.md new file mode 100644 index 0000000..e01f308 --- /dev/null +++ b/docs/design/nd_array_approach_comparison.md @@ -0,0 +1,432 @@ +# N-D Array Approach Comparison: unsafe_wrap vs ReshapedArray + +## Summary + +This document analyzes two approaches for returning N-dimensional arrays from AdaptiveArrayPools: + +1. **Current (v1.1.x)**: `unsafe_wrap(Array, pointer, dims)` with N-way cache +2. **Proposed (v1.0.2 style)**: `reshape(view(backing, 1:n), dims)` without cache + +**Recommendation**: Switch back to ReshapedArray approach for `acquire!` N-D path. + +--- + +## Key Finding: SubArray Wrapper Allocation + +### The 48-byte Problem + +When using `acquire!` vs `unsafe_acquire!` in real code: + +```julia +# In flux_array! (TurbulentTransport) +xx = unsafe_acquire!(pool, T, size(x)) # 0 bytes - returns Array directly +xx = acquire!(pool, T, size(x)) # 48 bytes - SubArray wrapper! +``` + +**Root Cause**: `acquire!` returns `SubArray`, which allocates its wrapper struct (48 bytes) when it escapes the function scope. + +### Allocation Breakdown + +| API | Return Type | Allocation | +|-----|-------------|------------| +| `unsafe_acquire!(pool, T, m, n)` | `Matrix{T}` | 0 bytes (cache hit), 112 bytes (miss) | +| `acquire!(pool, T, m, n)` | `SubArray{..., Matrix}` | **48 bytes** (wrapper) + 112 bytes (miss) | + +### Why SubArray Allocates + +**Fundamental difference:** + +``` +unsafe_acquire! returns: +┌─────────────────────────────────────────────────────┐ +│ Pool backing Vector │ +│ [████████████████████████████████████] │ +│ ↓ │ +│ Array header lives in pool cache (reused) │ +│ → Returns pointer to EXISTING object (0 alloc) │ +└─────────────────────────────────────────────────────┘ + +acquire! returns: +┌─────────────────────────────────────────────────────┐ +│ Pool backing Vector │ +│ [████████████████████████████████████] │ +│ ↓ │ +│ Array in cache │ +│ ↓ │ +│ NEW SubArray struct (parent, indices, stride...) │ +│ → Creates NEW wrapper object (48 bytes!) │ +└─────────────────────────────────────────────────────┘ +``` + +SubArray is stack-allocated **only when**: +1. Used entirely within a single function +2. Compiler can prove it doesn't escape + +In `flux_array!`, `xx` escapes because: +- Passed to `_pooled_chain(out_y, xx)` +- Used across multiple loop iterations +- Compiler can't optimize away the wrapper + +### Root Cause: Type-Unspecified Path → Escape Optimization Failure + +**Critical Finding**: The core reason for allocation is that **compiler escape optimization fails in type-unspecified call paths**. + +> **Correction**: SubArray is a mutable struct (not immutable/isbits). +> The explanation that "dynamic dispatch causes boxing" is inaccurate. 
+> Precisely: wrapper objects are created at construction time, and the key factor is +> whether the compiler can optimize to stack allocation through escape analysis. + +#### Why it happens in `flux_array!`: + +```julia +# TGLFNNmodel struct (tglf_nn.jl) +struct TGLFNNmodel <: TGLFmodel + fluxmodel::Flux.Chain + # ... + _pooled_chain::PooledChain # ← No concrete type parameter! +end +``` + +The `_pooled_chain` field is declared without concrete type parameters, +so the call `fluxmodel._pooled_chain(out_y, xx)` is **not recompiled** and wrapper object optimization is not applied. + +#### Escape Optimization Failure: + +| Condition | Compiler Behavior | Result | +|-----------|-------------------|--------| +| Type specified + no escape | SROA/escape analysis applied | Stack allocation or elimination possible | +| Type specified + escape | Partial optimization possible | Depends on situation | +| **Type unspecified** | Optimization not applicable | **Wrapper object heap allocation** | + +``` +Type-specified path: +┌─────────────────────────────────────────────────────────────────┐ +│ Compiler knows the type │ +│ → SROA/escape analysis can be applied │ +│ → Wrapper object can be stack-allocated or completely removed │ +└─────────────────────────────────────────────────────────────────┘ + +Type-unspecified path (e.g., call through abstract field): +┌─────────────────────────────────────────────────────────────────┐ +│ Compiler doesn't know concrete type │ +│ → Escape analysis cannot be applied │ +│ → Wrapper object is heap-allocated │ +└─────────────────────────────────────────────────────────────────┘ +``` + +#### Array vs View Types in Type-Unspecified Paths: + +| Type | Characteristic | In Type-Unspecified Path | +|------|----------------|--------------------------| +| `Array` | Already heap-allocated object | Cached instance reuse → **No additional allocation** | +| `SubArray` | Requires wrapper object | Escape optimization failure → **Wrapper allocation** | +| `ReshapedArray` | Requires wrapper object | Escape optimization failure → **Wrapper allocation** | + +**Key insight**: Array is an object that already exists on the heap, so returning the same instance from cache incurs no additional allocation. +In contrast, SubArray/ReshapedArray create new wrapper objects each time, and optimization is not applied when type is unspecified. + +### Solutions for the Wrapper Allocation Problem + +#### 1. Use `unsafe_acquire!` (Recommended for this case) + +`unsafe_acquire!` returns `Array`, which is already a heap-allocated object (cached instance can be reused): + +```julia +# flux_array! in tglf_nn.jl +xx = unsafe_acquire!(pool, T, size(x)) # Returns Matrix{T} → cache hit = 0 alloc +``` + +✅ Zero allocation on cache hit (cached Array instance reused) +✅ No code changes to TGLFNNmodel needed +✅ Safe since `xx` is only used as scratch memory + +#### 2. Parameterize TGLFNNmodel (Fundamental fix) + +```julia +struct TGLFNNmodel{M<:Flux.Chain, P<:PooledChain} <: TGLFmodel + fluxmodel::M + _pooled_chain::P # Now compiler knows exact type +end +``` + +✅ Enables compiler escape optimization +✅ SubArray/ReshapedArray may become zero-alloc (depends on SROA/escape analysis) +❌ Requires significant code changes +❌ Changes serialization behavior + +#### 3. 
Function Barrier + +```julia +# Force type specialization +@inline function _call_pooled_chain(chain::PooledChain{M}, out, x) where M + chain(out, x) +end +``` + +⚠️ May not help if extracting `_pooled_chain` from struct is also dynamic + +### Implications for API Design + +**Key Insight: Wrapper Types Depend on Compiler Optimization** + +> **Note**: Whether wrapper types (SubArray, ReshapedArray) allocate depends on compiler SROA/escape analysis. +> The comparison below assumes **type-specified paths**. In type-unspecified paths, all wrapper types allocate. + +| Approach | Return Type | Creation Cost | Type-Specified Path | Type-Unspecified Path | +|----------|-------------|---------------|---------------------|----------------------| +| `unsafe_acquire!` | `Array` | 112 bytes (miss) | cache hit: 0 | cache hit: 0 | +| `acquire!` (current) | `SubArray{Array}` | 112 bytes (miss) | Optimizable | Wrapper allocation | +| **`acquire!` (reshape)** | `ReshapedArray{View}` | **0 bytes** | **Optimizable** | Wrapper allocation | + +**Advantages of ReshapedArray approach**: + +1. ✅ No creation cost (no unsafe_wrap call) +2. ✅ Compiler optimization possible in type-specified paths +3. ✅ BLAS compatible (StridedArray) +4. ✅ Same operation performance (mul!, broadcast) + +**`unsafe_acquire!` is better in type-unspecified paths**: +- Array already exists on heap → no additional allocation when reusing cached instance + +--- + +## Benchmark Results + +### Test Environment +- Benchmark file: `benchmark/nd_approach_comparison.jl` +- Tests 8-10 specifically compare N-way cache behavior + +### Allocation Comparison + +| Scenario | unsafe_wrap | reshape | Savings | +|----------|-------------|---------|---------| +| Single call (cache miss) | 112 bytes | **0 bytes** | 100% | +| 3-way cycling × 100 | 33,600 bytes | **0 bytes** | 100% | +| 5-way cycling × 100 | 56,000 bytes | **0 bytes** | 100% | +| With 4-way cache (3-way pattern) | 0 bytes | **0 bytes** | - | +| With 4-way cache (5-way pattern) | 56,000 bytes | **0 bytes** | 100% | + +### Performance Comparison + +| Operation | unsafe_wrap | reshape | Winner | +|-----------|-------------|---------|--------| +| mul! (BLAS) | 9.92 μs | 9.96 μs | Tie | +| Broadcast σ.(x) | 24.2 μs | 23.5 μs | Tie | +| Dense layer | 35.3 μs | 33.7 μs | Tie | +| 3-way cycling | 5.87 μs | **0.38 μs** | reshape (15x) | +| 5-way cycling | 10.38 μs | **0.59 μs** | reshape (18x) | + +### Type Information + +All three types are `StridedArray` (BLAS compatible): + +```julia +# unsafe_wrap +Matrix{Float64} # isa StridedArray ✓ + +# SubArray of unsafe_wrap +SubArray{Float64, 2, Matrix{Float64}, ...} # isa StridedArray ✓ + +# ReshapedArray of 1D view +Base.ReshapedArray{Float64, 2, SubArray{Float64, 1, Vector{Float64}, ...}, ...} # isa StridedArray ✓ +``` + +### Memory Layout + +All have identical column-major layout: +``` +Strides: (1, 4) # Same for all three +``` + +--- + +## Current Implementation (unsafe_wrap) + +### Call Path +``` +acquire!(pool, Float64, 64, 100) + └─> get_nd_view!(tp, (64, 100)) + └─> get_nd_array!(tp, (64, 100)) + ├─> get_view!(tp, 6400) # 1D view (0 alloc, cached) + └─> unsafe_wrap(...) # 112 bytes on cache miss! 
+``` + +### Code Location +`src/core.jl:129`: +```julia +arr = unsafe_wrap(Array{T, N}, pointer(flat_view), dims) +``` + +### N-way Cache Structure +```julia +# In TypedPool (src/types.jl) +nd_views::Vector{Any} # Cached SubArray objects +nd_arrays::Vector{Any} # Cached Array objects (from unsafe_wrap) +nd_dims::Vector{Any} # Cached dimension tuples +nd_ptrs::Vector{UInt} # Cached pointers for invalidation +``` + +--- + +## Proposed Implementation (ReshapedArray) + +### Call Path +``` +acquire!(pool, Float64, 64, 100) + └─> get_nd_view!(tp, (64, 100)) + └─> get_view!(tp, 6400) # 1D view (0 alloc, cached) + └─> reshape(view, dims) # 0 alloc always! +``` + +### Proposed Code Change +```julia +# Replace get_nd_view! in src/core.jl +@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # 1D view (cached) + return reshape(flat_view, dims) # Zero-alloc ReshapedArray +end +``` + +--- + +## Pros and Cons + +### ReshapedArray Approach (Proposed) + +#### Pros +1. **Zero allocation always** - No 112-byte allocation regardless of cache hit/miss +2. **No N-D cache needed** - Simpler code, less memory overhead +3. **Faster** - No cache lookup overhead (0.38μs vs 5.87μs for cycling patterns) +4. **BLAS compatible** - ReshapedArray is StridedArray +5. **Same performance** - Identical mul!/broadcast speed +6. **No Bélády's Anomaly** - Works with any access pattern (5-way, 10-way, etc.) +7. **Simpler TypedPool** - Can remove nd_arrays, nd_dims, nd_ptrs fields + +#### Cons +1. **Return type changes** - `SubArray{..., Array{...}}` → `ReshapedArray{..., SubArray{...}}` +2. **Some libraries might check `isa Array`** - Rare, but possible (not BLAS though) +3. **Slightly different printing** - Display shows as ReshapedArray + +### unsafe_wrap Approach (Current) + +#### Pros +1. **Returns actual Array** - Some code might expect `Matrix{Float64}` +2. **Cache hits are zero-alloc** - When pattern fits in N-way cache + +#### Cons +1. **112 bytes per cache miss** - Adds up with varying batch sizes +2. **N-way cache complexity** - Extra fields, cache lookup logic +3. **Bélády's Anomaly** - 5+ patterns = 100% miss with 4-way cache +4. **Slower cycling** - Cache lookup overhead even on hits + +--- + +## Impact Analysis + +### TurbulentTransport Usage + +In `src/pooled_layers.jl`: + +```julia +# PooledDense (line 86) +out = acquire!(pool, Float64, size(d.weight, 1), size(xT, 2)) +mul!(out, d.weight, xT) # Works with ReshapedArray ✓ + +# PooledActivation (line 54) +out = acquire!(pool, Float64, size(x)) +out .= pa.σ.(x) # Works with ReshapedArray ✓ +``` + +Both use cases are compatible with ReshapedArray: +- `mul!` accepts any `StridedMatrix` +- Broadcasting works on any `AbstractArray` + +### Flux.NNlib.bias_act! + +```julia +Flux.NNlib.bias_act!(d.σ, out, d.bias) +``` + +This function accepts `AbstractArray` - ReshapedArray is compatible. + +### unsafe_acquire! Unchanged + +For code that explicitly needs raw `Array` (FFI, specific BLAS paths): +```julia +unsafe_acquire!(pool, Float64, 64, 100) # Still returns Matrix{Float64} +``` + +This API remains unchanged and still uses `unsafe_wrap` with caching. + +--- + +## Migration Path + +### Phase 1: Modify acquire! N-D path +```julia +# src/core.jl - Replace get_nd_view! 
+@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) + return reshape(flat_view, dims) +end +``` + +### Phase 2: Simplify TypedPool (optional) +Remove N-D cache fields if `unsafe_acquire!` usage is rare: +- `nd_views`, `nd_arrays`, `nd_dims`, `nd_ptrs` + +### Phase 3: Update documentation +- Note return type change in CHANGELOG +- Update docstrings for `acquire!` + +--- + +## Conclusion + +### Two Separate Problems, Two Solutions + +This investigation revealed **two distinct allocation issues**: + +#### Problem 1: N-D Array Creation (unsafe_wrap vs reshape) + +| Metric | unsafe_wrap + cache | reshape | +|--------|---------------------|---------| +| Allocation (miss) | 112 bytes | **0 bytes** | +| Allocation (hit) | 0 bytes | **0 bytes** | +| Speed (cycling) | 5-10 μs | **0.3-0.6 μs** | +| BLAS compat | ✓ | ✓ | +| Code complexity | High (cache) | **Low** | +| Works with any pattern | ✗ (≤4 ways) | **✓ (any)** | + +**Solution**: Switch `acquire!` N-D path to use `reshape(view, dims)` instead of `unsafe_wrap`. + +#### Problem 2: Type-Unspecified Path Wrapper Allocation + +> **Correction**: SubArray and ReshapedArray are both **mutable structs**. +> Whether allocation occurs depends on whether compiler escape optimization can be applied. + +| Type | Type-Specified Path | Type-Unspecified Path | +|------|---------------------|----------------------| +| `Array` | cache hit: 0 bytes | cache hit: **0 bytes** ✓ | +| `SubArray` | Optimizable | **Wrapper allocation** ✗ | +| `ReshapedArray` | Optimizable | **Wrapper allocation** ✗ | + +**Key difference**: +- `Array`: Object already exists on heap → no additional allocation when reusing cached instance +- Wrapper types: New object created each time → heap allocation when optimization fails in type-unspecified paths + +**Solution for TurbulentTransport**: Use `unsafe_acquire!` which returns `Array` (cached instance reusable). + +### Summary + +| Context | Recommended API | Reason | +|---------|-----------------|--------| +| Type-specified path (general use) | `acquire!` → ReshapedArray | 0 bytes creation, compiler optimization possible | +| Type-unspecified path | `unsafe_acquire!` → Array | Cached instance can be reused | +| FFI / raw pointer needs | `unsafe_acquire!` → Array | Direct memory access | + +**The N-way cache was solving the wrong problem** - caching `Array` objects when `reshape` is already zero-cost for creation. + +For type-unspecified paths (like `TGLFNNmodel._pooled_chain` without concrete type parameters), `unsafe_acquire!` returning `Array` is the correct choice because cached Array instances can be reused without additional allocation. diff --git a/docs/design/new_hybrid_api_design.md b/docs/design/new_hybrid_api_design.md new file mode 100644 index 0000000..14cd235 --- /dev/null +++ b/docs/design/new_hybrid_api_design.md @@ -0,0 +1,140 @@ +# Design Spec: Hybrid N-way Cache & ReshapedArray Strategy + +> **Note**: This document was written as a clear, concrete specification that another AI can +> implement mechanically without needing to think through the design. + +--- + +## 1. Objective +Refactor AdaptiveArrayPools.jl to implement a **Hybrid Allocation Strategy**: +1. **`acquire!` (Default)**: Return `ReshapedArray` (Zero-Allocation, Stack-allocated). Remove N-D caching logic for this path. +2. **`unsafe_acquire!` (Special)**: Return `Array` (via `unsafe_wrap`). 
Implement **N-way Set Associative Cache** to minimize `unsafe_wrap` overhead (112 bytes) and support interleaved access patterns. + +## 2. Data Structure Changes (types.jl) + +### Constants +Define the cache associativity level. +```julia +const CACHE_WAYS = 4 +``` + +### `TypedPool{T}` Struct +Modify fields to support N-way caching for Arrays, while removing unused View caching. + +* **Remove**: `nd_views` (No longer needed as `acquire!` returns `ReshapedArray`). +* **Update**: `nd_arrays`, `nd_dims`, `nd_ptrs`. These vectors must store `CACHE_WAYS` items per active slot. +* **Add**: `nd_next_way::Vector{Int}` (To track Round-Robin replacement index for each slot). + +**Updated Layout:** +```julia +mutable struct TypedPool{T} + # --- Backing Storage --- + vectors::Vector{Vector{T}} + + # --- 1D Cache (Simple 1-way or Direct) --- + views::Vector{SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int}}, true}} + view_lengths::Vector{Int} + + # --- N-D Array Cache (N-way Set Associative) --- + # Layout: Flat Vector. Index = (slot_idx - 1) * CACHE_WAYS + way_idx + nd_arrays::Vector{Any} # Stores Array{T, N} + nd_dims::Vector{Any} # Stores NTuple{N, Int} + nd_ptrs::Vector{UInt} # Stores objectid/pointer for validation + nd_next_way::Vector{Int} # Round-Robin counter per slot (1 per slot) + + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end +``` + +### Initialization +Ensure `nd_arrays`, `nd_dims`, `nd_ptrs` are initialized with `nothing` or empty values, and `nd_next_way` with `0` or `1`. + +## 3. Logic Implementation (core.jl) + +### A. `acquire!` (The Fast Path) +**Goal**: Always return `ReshapedArray`. No N-D cache lookup. + +**Implementation**: +Modify `get_nd_view!` to: +1. Calculate total length (`prod(dims)`). +2. Call `get_view!(tp, len)` to get a 1D `SubArray`. +3. Return `reshape(flat_view, dims)`. + +```julia +@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + len = safe_prod(dims) + flat_view = get_view!(tp, len) + return reshape(flat_view, dims) +end +``` + +### B. `unsafe_acquire!` (The N-way Path) +**Goal**: Return `Array`. Use N-way cache to avoid `unsafe_wrap`. + +**Implementation**: +Modify `get_nd_array!` to use **Linear Search + Round-Robin Replacement**. + +**Algorithm**: +1. Get 1D view: `flat_view = get_view!(tp, prod(dims))`. +2. Get current pointer: `current_ptr = UInt(pointer(flat_view))`. +3. Calculate Base Index: `base = (tp.n_active - 1) * CACHE_WAYS`. +4. **Search (Hit Check)**: + * Loop `k` from `1` to `CACHE_WAYS`. + * Check if `nd_dims[base + k] == dims` **AND** `nd_ptrs[base + k] == current_ptr`. + * If match: Return `nd_arrays[base + k]`. +5. **Miss (Replacement)**: + * Get victim way from `nd_next_way[tp.n_active]`. + * Target Index: `target = base + victim_way + 1`. + * Create Array: `arr = unsafe_wrap(Array{T, N}, pointer(flat_view), dims)`. + * **Update Cache**: + * `nd_arrays[target] = arr` + * `nd_dims[target] = dims` + * `nd_ptrs[target] = current_ptr` + * **Update Round-Robin**: Increment `nd_next_way` (modulo `CACHE_WAYS`). + * Return `arr`. + +## 4. API & Aliases (AdaptiveArrayPools.jl) + +Add explicit aliases for clarity. + +```julia +# Main APIs +export acquire!, unsafe_acquire! + +# Explicit Aliases +export acquire_view!, acquire_array! + +"""Alias for [`acquire!`](@ref). Returns a ReshapedArray (View).""" +const acquire_view! = acquire! + +"""Alias for [`unsafe_acquire!`](@ref). Returns an Array (via unsafe_wrap).""" +const acquire_array! = unsafe_acquire! +``` + +## 5. 
Client Integration (`TurbulentTransport.jl`) + +Update tglf_nn.jl to use the Array-returning API to avoid dynamic dispatch boxing. + +**File**: tglf_nn.jl +**Function**: `flux_array!` +**Change**: +```julia +# Before +xx = acquire!(pool, T, size(x)) + +# After +xx = unsafe_acquire!(pool, T, size(x)) +# OR +xx = acquire_array!(pool, T, size(x)) +``` + +## 6. Verification Checklist + +1. **Type Check**: `acquire!` must return `ReshapedArray`. `unsafe_acquire!` must return `Array`. +2. **Allocation Check**: + * `acquire!`: 0 allocations always. + * `unsafe_acquire!`: 0 allocations on cache hit. + * `unsafe_acquire!`: 0 allocations on interleaved access (e.g., alternating 10x10 and 20x20) thanks to N-way cache. +3. **Safety**: Ensure `unsafe_acquire!` validates pointers (re-wraps if the backing vector was resized). diff --git a/docs/design/untracked_acquire_design.md b/docs/design/untracked_acquire_design.md new file mode 100644 index 0000000..7358d0e --- /dev/null +++ b/docs/design/untracked_acquire_design.md @@ -0,0 +1,598 @@ +# AdaptiveArrayPools: Untracked Acquire Handling Design + +## 1. Background + +### 1.1 Current System +Julia array pool library. The `@with_pool` macro manages temporary array allocation through checkpoint/rewind. + +```julia +@with_pool p begin + v = acquire!(p, Float64, 100) # Acquire array from pool + # ... use ... +end # Automatically returned +``` + +### 1.2 Optimization: Typed Checkpoint/Rewind +The macro extracts types from acquire! calls at compile time, checkpointing/rewinding only those types (77% performance improvement). + +```julia +# Code generated by macro +checkpoint!(pool, Float64) # Only checkpoint Float64 stack +try + v = acquire!(pool, Float64, 100) +finally + rewind!(pool, Float64) # Only rewind Float64 stack +end +``` + +### 1.3 Current Data Structures +```julia +mutable struct TypedPool{T} + vectors::Vector{Vector{T}} # Reusable arrays + n_active::Int # Number of currently active arrays + _checkpoint_n_active::Vector{Int} # Checkpointed n_active values + _checkpoint_depths::Vector{Int} # Depth at which each checkpoint was saved +end + +mutable struct AdaptiveArrayPool + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + others::IdDict{DataType, Any} + _current_depth::Int + _untracked_flags::Vector{Bool} +end +``` + +--- + +## 2. Problem: Untracked Acquire + +### 2.1 Scenario +```julia +function inner_helper(pool) + acquire!(pool, Float32, 50) # Macro can't see this! +end + +@with_pool p begin + v = acquire!(p, Float64, 100) # Macro extracts: Float64 + inner_helper(p) # Float32 acquire is "untracked" +end +``` + +**Problem:** Macro only knows about Float64, generating `checkpoint!(pool, Float64)`, `rewind!(pool, Float64)`. +Float32's n_active is not restored → memory leak or corruption. + +### 2.2 Nested Case +```julia +@with_pool p begin # L1: checkpoint Float64 + acquire!(p, Float64, 10) + + @with_pool p begin # L2: checkpoint Int64 + acquire!(p, Int64, 5) + inner_untracked(p) # Float32 acquire (untracked) + end # L2 rewind + +end # L1 rewind +``` + +--- + +## 3. Proposed Solution + +### 3.1 Core Idea: Shared Implementation Function + Marking Wrapper + +**Previous Problem:** Runtime detection in acquire! → overhead on every call + +**New Approach:** +- `_acquire_impl!` → actual implementation (existing acquire! 
logic as-is) +- `acquire!` → marking + `_acquire_impl!` call (1-line wrapper) +- Macro transforms `acquire!` calls to `_acquire_impl!` + +```julia +# User code +@with_pool p begin + v = acquire!(p, Float64, 100) +end + +# After macro transformation +checkpoint!(p, Float64) +try + v = _acquire_impl!(p, Float64, 100) # ← Transformed! (no marking) +finally + # ... +end +``` + +### 3.2 Benefits +1. **Zero overhead:** Inside @with_pool, impl is called directly +2. **Automatic detection:** acquire! outside @with_pool passes through marking wrapper → untracked +3. **No code duplication:** Implementation logic in `_acquire_impl!` only once + +### 3.3 Data Structure Changes + +```julia +mutable struct TypedPool{T} + # ... existing fields ... + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} # NEW: Tracks which depth each checkpoint was saved at +end + +mutable struct AdaptiveArrayPool + # ... existing TypedPools ... + + # NEW: Depth tracking (pool level) + _current_depth::Int # Current checkpoint depth + _untracked_flags::Vector{Bool} # Whether untracked occurred per depth (typed vs full rewind decision) +end +``` + +**Key Changes:** +- `_checkpoint_depths` added to `TypedPool` → tracks which depth each checkpoint occurred at +- `_untracked_flags` maintained → for typed rewind vs full rewind decision +- `_full_rewind_with_types!` removed → `_checkpoint_depths` makes regular `rewind!(pool)` sufficient + +--- + +## 4. Implementation Details + +### 4.1 Function Structure + +```julia +# Implementation function (existing acquire! logic moved as-is) +@inline function _acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where T + tp = get_typed_pool!(pool, T) + return get_view!(tp, n) +end + +@inline function _acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} + tp = get_typed_pool!(pool, T) + return get_nd_view!(tp, dims) +end + +# User-facing: marking wrapper (1 line) +@inline function acquire!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where T + _mark_untracked!(pool) + _acquire_impl!(pool, T, n) +end + +@inline function acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} + _mark_untracked!(pool) + _acquire_impl!(pool, T, dims...) +end + +# Untracked marking (still needed - for typed rewind vs full rewind decision) +@inline function _mark_untracked!(pool::AdaptiveArrayPool) + if pool._current_depth > 0 + @inbounds pool._untracked_flags[pool._current_depth] = true + end +end +``` + +**Macro transformation:** `acquire!(p, T, n)` → `_acquire_impl!(p, T, n)` (only function name substitution) + +**Note:** `_mark_untracked!` is maintained - for triggering full rewind when untracked occurs + +### 4.2 unsafe_acquire! and Aliases + +```julia +# Implementation functions +@inline function _unsafe_acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where T + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, (n,)) +end + +@inline function _unsafe_acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} + tp = get_typed_pool!(pool, T) + return get_nd_array!(tp, dims) +end + +# User-facing: marking wrapper +@inline function unsafe_acquire!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where T + _mark_untracked!(pool) + _unsafe_acquire_impl!(pool, T, n) +end + +@inline function unsafe_acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int,N}) where {T,N} + _mark_untracked!(pool) + _unsafe_acquire_impl!(pool, T, dims...) +end + +# Aliases (user API) +const acquire_view! 
= acquire! +const acquire_array! = unsafe_acquire! + +# Aliases (internal impl) +const _acquire_view_impl! = _acquire_impl! +const _acquire_array_impl! = _unsafe_acquire_impl! +``` + +### 4.3 Macro Transformation + +```julia +# Macro transforms acquire! calls in AST to _*_impl! +function _transform_acquire_calls(expr, pool_name) + if expr isa Expr + if expr.head == :call && length(expr.args) >= 2 + fn = expr.args[1] + pool_arg = expr.args[2] + + # Only transform when matching target pool + if pool_arg == pool_name + if fn == :acquire! || fn == :acquire_view! + expr.args[1] = :_acquire_impl! + elseif fn == :unsafe_acquire! || fn == :acquire_array! + expr.args[1] = :_unsafe_acquire_impl! + end + end + end + # Recursively transform + for i in eachindex(expr.args) + expr.args[i] = _transform_acquire_calls(expr.args[i], pool_name) + end + end + return expr +end +``` + +**Transformation Rules:** +| Original | Transformed | +|----------|-------------| +| `acquire!(p, ...)` | `_acquire_impl!(p, ...)` | +| `acquire_view!(p, ...)` | `_acquire_impl!(p, ...)` | +| `unsafe_acquire!(p, ...)` | `_unsafe_acquire_impl!(p, ...)` | +| `acquire_array!(p, ...)` | `_unsafe_acquire_impl!(p, ...)`| + +### 4.4 Checkpoint/Rewind + +```julia +# Typed checkpoint +function checkpoint!(pool::AdaptiveArrayPool, types::Type...) + pool._current_depth += 1 + push!(pool._untracked_flags, false) + depth = pool._current_depth + for T in types + tp = get_typed_pool!(pool, T) + push!(tp._checkpoint_n_active, tp.n_active) + push!(tp._checkpoint_depths, depth) # NEW: record depth + end +end + +# Typed rewind (fast path - when no untracked) +function rewind!(pool::AdaptiveArrayPool, types::Type...) + for T in types + tp = get_typed_pool!(pool, T) + pop!(tp._checkpoint_depths) # NEW: remove depth + tp.n_active = pop!(tp._checkpoint_n_active) + end + pop!(pool._untracked_flags) + pool._current_depth -= 1 +end + +# Full rewind (untracked fallback - simplified with _checkpoint_depths!) +function rewind!(pool::AdaptiveArrayPool) + depth = pool._current_depth + for tp in all_type_stacks(pool) + if !isempty(tp._checkpoint_depths) && tp._checkpoint_depths[end] == depth + # Checkpointed at current depth → pop + pop!(tp._checkpoint_depths) + tp.n_active = pop!(tp._checkpoint_n_active) + elseif !isempty(tp._checkpoint_n_active) + # Checkpointed at previous depth → restore without pop + tp.n_active = tp._checkpoint_n_active[end] + elseif tp.n_active > 0 + # ⚠️ CRITICAL ERROR: Would destroy arrays outside @with_pool + T = eltype(tp) + error(""" + [AdaptiveArrayPools] Cannot rewind type $T: no checkpoint exists. + Found $(tp.n_active) active array(s) that were never checkpointed. + + Fix: Wrap the scope where $T was first acquired in @with_pool. + """) + end + # else: _checkpoint_n_active empty and n_active == 0 → normal, do nothing + end + pop!(pool._untracked_flags) + pool._current_depth -= 1 +end +``` + +**Key Improvement:** `_checkpoint_depths[end] == depth` comparison enables accurate pop/restore decision → `_full_rewind_with_types!` not needed! 
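+
+A minimal sketch of the bookkeeping above (assumes the fields and functions proposed in this design — `checkpoint!`, `rewind!`, `_checkpoint_depths` — not the currently released API; `AdaptiveArrayPool()` is used as in the earlier test examples):
+
+```julia
+pool = AdaptiveArrayPool()
+
+checkpoint!(pool, Float64)   # depth 1 → Float64._checkpoint_depths == [1]
+checkpoint!(pool, Int64)     # depth 2 → Int64._checkpoint_depths == [2]
+
+# A full rewind at depth 2 treats the two types differently:
+#   Int64:   depths[end] == 2 == current depth → pop! its checkpoint
+#   Float64: depths[end] == 1 <  current depth → restore n_active, keep checkpoint
+rewind!(pool)                # depth 2 → 1 (only Int64's checkpoint is popped)
+rewind!(pool, Float64)       # typed fast path closes the outer scope (depth 1 → 0)
+```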
+ +### 4.5 all_type_stacks Implementation + +```julia +# Generator to iterate all TypedPools (fixed slots + others) +function all_type_stacks(pool::AdaptiveArrayPool) + return Iterators.flatten(( + # Fixed slots (7) + (pool.float64, pool.float32, pool.int64, pool.int32, pool.complexf64, pool.complexf32, pool.bool), + # Others (IdDict values) + values(pool.others) + )) +end + +# Or callback pattern (more efficient, no allocation) +@inline function foreach_type_stack(f, pool::AdaptiveArrayPool) + f(pool.float64) + f(pool.float32) + f(pool.int64) + f(pool.int32) + f(pool.complexf64) + f(pool.complexf32) + f(pool.bool) + for tp in values(pool.others) + f(tp) + end +end +``` + +### 4.6 Macro Generated Code + +```julia +@with_pool p begin + v = acquire!(p, Float64, 100) + inner_function(p) # untracked acquire possible here +end + +# Generated code: +local p = get_task_local_pool() +checkpoint!(p, Float64) +try + local _result = begin + v = _acquire_impl!(p, Float64, 100) # ← Transformed! (no marking) + inner_function(p) # acquire! inside not transformed → passes through marking wrapper → untracked + end + _result +finally + if p._untracked_flags[p._current_depth] + rewind!(p) # Full rewind (handled accurately with _checkpoint_depths) + else + rewind!(p, Float64) # Typed rewind (fast) + end +end +``` + +**Change:** `_full_rewind_with_types!(p, Float64)` → `rewind!(p)` (simplified!) + +--- + +## 5. Behavior Verification + +### 5.1 Simple Case (no untracked) + +```julia +@with_pool p begin + v = acquire!(p, Float64, 100) +end +``` + +**Flow:** +1. `checkpoint!(p, Float64)` → _current_depth=1, _untracked_flags=[false] +2. `_acquire_impl!(p, Float64, 100)` → no marking (macro transformed) +3. `_untracked_flags[1] == false` → `rewind!(p, Float64)` (fast path) + +### 5.2 Untracked Case + +```julia +function helper(p) + acquire!(p, Float32, 50) # Defined outside @with_pool → not transformed! +end + +@with_pool p begin + v = acquire!(p, Float64, 100) + helper(p) +end +``` + +**Flow:** +1. `checkpoint!(p, Float64)` → _current_depth=1, _untracked_flags=[false], Float64._checkpoint_depths=[1] +2. `_acquire_impl!(p, Float64, 100)` → no marking (macro transformed) +3. `acquire!(p, Float32, 50)` → marking wrapper → `_mark_untracked!` → _untracked_flags=[true] +4. `_untracked_flags[1] == true` → `rewind!(p)` (full rewind) + - Float64: _checkpoint_depths[end]=1 == depth=1 → pop + - Float32: _checkpoint_depths=[] → if n_active=0 do nothing, if >0 restore + +### 5.3 Nested Case + +``` +L1: @with_pool (Float64) + L2: @with_pool (Int64) + untracked Float32 + L3: @with_pool (Bool) +``` + +| Timing | depth | Float64 saved/depths | Int64 saved/depths | Float32 saved/depths | Bool saved/depths | +|--------|-------|---------------------|-------------------|---------------------|------------------| +| Initial | 0 | []/[] | []/[] | []/[] | []/[] | +| L1 checkpoint(F64) | 1 | [0]/[1] | []/[] | []/[] | []/[] | +| L2 checkpoint(I64) | 2 | [0]/[1] | [0]/[2] | []/[] | []/[] | +| Float32 untracked | 2 | [0]/[1] | [0]/[2] | []/[] (n=1) | []/[] | +| L3 checkpoint(Bool) | 3 | [0]/[1] | [0]/[2] | []/[] | [0]/[3] | +| L3 rewind(Bool) | 2 | [0]/[1] | [0]/[2] | []/[] | []/[] | +| L2 rewind() full | 1 | [0]/[1] | []/[] | []/[] (n=0) | []/[] | +| L1 rewind(F64) | 0 | []/[] | []/[] | []/[] | []/[] | + +**L2 Full Rewind Detail (depth=2):** +- Float64: depths[end]=1 ≠ 2 → no pop, restore only ✓ +- Int64: depths[end]=2 == 2 → **pop!** ✓ (now works correctly!) 
+- Float32: depths=[] → n_active to 0 (untracked cleanup) ✓ +- Bool: depths=[] → n_active=0, do nothing ✓ + +**Key:** Thanks to `_checkpoint_depths`, we can accurately know Int64 was checkpointed at depth=2! + +--- + +## 6. API Summary + +### 6.1 User API (unchanged) +```julia +acquire!(pool, T, dims...) # Acquire array (returns view) +unsafe_acquire!(pool, T, dims...) # Acquire array (returns Array) +acquire_view!(pool, T, dims...) # alias for acquire! +acquire_array!(pool, T, dims...) # alias for unsafe_acquire! +``` + +### 6.2 Internal API (newly added) +```julia +# Implementation functions (macro transforms to direct call) +_acquire_impl!(pool, T, dims...) +_unsafe_acquire_impl!(pool, T, dims...) + +# Aliases (const) +_acquire_view_impl! = _acquire_impl! +_acquire_array_impl! = _unsafe_acquire_impl! + +# Untracked marking +_mark_untracked!(pool) + +# TypedPool iteration +all_type_stacks(pool) # Generator +foreach_type_stack(f, pool) # Callback pattern +``` + +### 6.3 Struct Changes +```julia +mutable struct TypedPool{T} + # existing fields... + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} # NEW: checkpoint depth tracking +end + +mutable struct AdaptiveArrayPool + # existing fields... + _current_depth::Int # NEW + _untracked_flags::Vector{Bool} # NEW +end +``` + +--- + +## 7. Performance Characteristics + +| Path | Condition | Behavior | Overhead | +|------|-----------|----------|----------| +| Fast path | no untracked | typed rewind | Minimal (same as existing) | +| Fallback | untracked exists | full rewind | Iterate all types (accurate handling with _checkpoint_depths) | + +### 7.1 acquire! Overhead +```julia +# Implementation function (existing logic as-is) +@inline function _acquire_impl!(pool, T, n) + tp = get_typed_pool!(pool, T) + get_view!(tp, n) +end + +# User wrapper (marking added) +@inline function acquire!(pool, T, n) + _mark_untracked!(pool) # Added: if + array access + _acquire_impl!(pool, T, n) +end + +@inline function _mark_untracked!(pool) + if pool._current_depth > 0 # branch (usually false) + @inbounds pool._untracked_flags[pool._current_depth] = true + end +end +``` + +**Overhead Analysis:** +- Outside @with_pool: `_current_depth == 0` → branch only (very light) +- Inside @with_pool: macro calls `_acquire_impl!` directly → **zero overhead** + +--- + +## 8. Edge Case: Parent Scope Corruption Prevention + +### 8.1 Problem Scenario +```julia +# acquire outside @with_pool +v_parent = acquire!(p, Int64, 10) # Int64 n_active = 1 + +@with_pool p begin # checkpoint(Float64) - Int64 not checkpointed + v = acquire!(p, Float64, 100) + + # Untracked Int64 acquire! + v_child = acquire!(p, Int64, 5) # Int64 n_active = 2 +end +# Full rewind occurs +# Int64: _checkpoint_n_active=[], n_active=2 +# If reset n_active=0, v_parent would be destroyed! +``` + +### 8.2 Solution: Clear Error +```julia +elseif tp.n_active > 0 + # _checkpoint_n_active empty but n_active > 0 + # = attempting rewind without checkpoint + T = eltype(tp) + error(""" + [AdaptiveArrayPools] Cannot rewind type $T: no checkpoint exists. + Found $(tp.n_active) active array(s) that were never checkpointed. + + Fix: Wrap the scope where $T was first acquired in @with_pool. 
+ """) +end +``` + +### 8.3 Error Conditions Summary +| _checkpoint_n_active | n_active | Situation | Action | +|----------------------|----------|-----------|--------| +| len == depth | any | Checkpointed at current depth | pop | +| len > 0, < depth | any | Checkpointed at previous depth | restore to [end] | +| empty | 0 | Not used | do nothing | +| empty | > 0 | **Rewind attempted without checkpoint** | **ERROR** | + +### 8.4 User Resolution +When error occurs, two choices: +1. **Wrap parent in @with_pool too:** + ```julia + @with_pool p begin + v_parent = acquire!(p, Int64, 10) + @with_pool p begin + # inner block + end + end + ``` + +2. **Avoid untracked acquire in inner block:** + ```julia + v_parent = acquire!(p, Int64, 10) + @with_pool p begin + # Don't acquire Int64 + # Or use @with_pool in inner helper too + end + ``` + +--- + +## 9. Feedback Requests + +1. **Function separation approach:** Is the `acquire!` wrapper + `_acquire_impl!` implementation separation appropriate? +2. **length vs _current_depth comparison:** Is it correct in all nested cases? +3. **Macro transformation:** Is AST function name substitution safe? (qualified names, macros, etc.) +4. **_untracked_flags Vector{Bool}:** Are there more efficient alternatives? +5. **_full_rewind_with_types!:** Performance of checking types with Set? +6. **Simpler alternative:** Is there a way to simplify the entire design? + +--- + +## 10. Alternative Review + +### 10.1 Option: Always Full Checkpoint/Rewind +- Pros: Simple implementation +- Cons: Loses 77% performance improvement + +### 10.2 Option: Don't Support Untracked +- Documentation: "Use @with_pool in helper functions too" +- Pros: No implementation changes +- Cons: Silent corruption on user error + +### 10.3 Option: Current Proposal (impl function separation) +- Pros: Maintains performance, safe fallback, no code duplication +- Cons: Requires macro transformation diff --git a/docs/design/vector_resize_memory_behavior.md b/docs/design/vector_resize_memory_behavior.md new file mode 100644 index 0000000..e0194a4 --- /dev/null +++ b/docs/design/vector_resize_memory_behavior.md @@ -0,0 +1,247 @@ +# Vector Resize Memory Behavior: CPU vs GPU + +## Context +AdaptiveArrayPools uses backing vectors that may need to grow when larger arrays are requested. +Current implementation only grows vectors, never shrinks them. + +**Question**: Should we shrink vectors when smaller sizes are requested? What are the memory implications? + +--- + +## CPU Julia Vector Behavior + +```julia +v = Vector{Float64}(undef, 1000) +resize!(v, 100) # Shrink to 100 elements +resize!(v, 500) # Grow back to 500 +``` + +### Key Facts (needs verification): +1. **Capacity vs Length**: Does Julia Vector maintain separate capacity? +2. **Shrink behavior**: Does `resize!(v, smaller)` release memory immediately? +3. **Regrow cost**: If we shrink then grow again, is there reallocation? + +### My Understanding: +- Julia's `Vector` uses a growth strategy (typically 2x) +- `resize!` to smaller size may NOT release memory (keeps capacity) +- Growing back within capacity is O(1), no allocation +- Memory is only released when Vector is GC'd + +**Question for review**: Is this accurate? Does Julia guarantee capacity preservation on shrink? + +--- + +## GPU CuVector Behavior + +```julia +using CUDA +v = CUDA.zeros(Float64, 1000) +resize!(v, 100) # Shrink - what happens to GPU memory? +resize!(v, 500) # Grow back - allocation? +``` + +### VERIFIED: CUDA.jl resize! 
Implementation (src/array.jl:889) + +**CuVector has capacity tracking via `A.maxsize` field.** + +```julia +# CUDA.jl constants +const RESIZE_THRESHOLD = 100 * 1024^2 # 100 MiB +const RESIZE_INCREMENT = 32 * 1024^2 # 32 MiB + +function Base.resize!(A::CuVector{T}, n::Integer) where T + n == length(A) && return A + + # only resize when the new length exceeds the capacity or is much smaller + cap = A.maxsize ÷ aligned_sizeof(T) + if n > cap || n < cap ÷ 4 # ← SHRINK THRESHOLD: 25% + len = if n < cap + # shrink to fit (allocates EXACT new size, no over-allocation) + n + elseif A.maxsize > RESIZE_THRESHOLD + # large arrays (>100MB): grow by fixed +32 MiB increments + max(n, cap + RESIZE_INCREMENT ÷ aligned_sizeof(T)) + else + # small arrays (<100MB): double in size + max(n, 2 * length(A)) + end + # ... allocates new buffer, copies data ... + end + # If within capacity: just update length, no reallocation +end +``` + +### Key Findings: + +| Aspect | CUDA.jl CuVector | +|--------|------------------| +| **Capacity tracking** | Yes, via `A.maxsize` | +| **Shrink threshold** | `n < cap ÷ 4` (25%) | +| **Shrink behavior** | Reallocates to EXACT new size | +| **Growth (small <100MB)** | 2x doubling | +| **Growth (large ≥100MB)** | +32 MiB increments | + +### CUDA.jl Memory Management: +- CUDA.jl uses a memory pool (stream-ordered or binned allocator) +- Released memory goes back to pool, not immediately to OS/driver +- `CUDA.reclaim()` forces return to driver +- Pool may return same block on regrow (observed in verification tests) + +--- + +## Current Pool Design Trade-offs + +### Current Approach: Never Shrink +```julia +# In get_view!: +if length(vec) < total_len + resize!(vec, total_len) # Only grow, never shrink +end +new_view = view(vec, 1:total_len) # View handles size +``` + +**Pros**: +- Simple implementation +- Avoids any potential reallocation costs +- Views already handle returning correct size + +**Cons**: +- One large allocation permanently increases memory footprint +- GPU memory is precious and limited +- No way to recover memory without `empty!(pool)` + +### Alternative: Shrink When Significantly Smaller +```julia +if length(vec) < total_len + resize!(vec, total_len) +elseif length(vec) > total_len * 4 # Example: 4x threshold + resize!(vec, total_len) # Shrink to save memory +end +``` + +**Pros**: +- Recovers memory from outlier large allocations +- Better memory efficiency over time + +**Cons**: +- May cause reallocations +- Added complexity +- Need to invalidate cached views on shrink too + +--- + +## Specific Questions for Review + +1. **Julia Vector capacity**: + - Does `resize!(v, smaller)` preserve capacity? + - Is this behavior documented/guaranteed? + - Is there a way to query capacity vs length? + +2. **CuVector resize behavior**: + - Does CUDA.jl's CuVector follow same capacity model? + - What happens to GPU memory on shrink? + - Does CUDA memory pool make shrink "free" anyway? + +3. **Design recommendation**: + - Should pools shrink vectors at some threshold? + - What threshold makes sense? (2x? 4x? 10x?) + - Should CPU and GPU have different policies? + +4. **Memory pressure handling**: + - Should pool respond to memory pressure signals? + - Is there a way to detect "memory is tight"? + +--- + +## Test Code to Verify Behavior + +```julia +# CPU Test +function test_cpu_resize_behavior() + v = Vector{Float64}(undef, 10_000_000) # ~80MB + @show Base.summarysize(v) + + resize!(v, 100) + @show Base.summarysize(v) # Does this shrink? 
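+    # (Caveat: summarysize scales with the array's length, so it likely won't
+    #  reveal any spare capacity the underlying buffer may still retain here.)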
+ + resize!(v, 5_000_000) + @show Base.summarysize(v) # Reallocation needed? + + # Is there a way to check capacity? +end + +# GPU Test +function test_gpu_resize_behavior() + CUDA.reclaim() # Start clean + + v = CUDA.zeros(Float64, 10_000_000) # ~80MB GPU + @show CUDA.memory_status() + + resize!(v, 100) + @show CUDA.memory_status() # Memory returned to pool? + + resize!(v, 5_000_000) + @show CUDA.memory_status() # New allocation? +end +``` + +--- + +## Related: View Cache Invalidation + +Currently, when `resize!` is called (grow only), we invalidate all cached views: + +```julia +if length(vec) < total_len + resize!(vec, total_len) + # Invalidate all N-way cache entries for this slot + for k in 1:CUDA_CACHE_WAYS + @inbounds tp.views[base + k] = nothing + @inbounds tp.view_dims[base + k] = nothing + end +end +``` + +If we add shrinking, same invalidation would be needed since shrink can also reallocate. + +--- + +## Summary + +### VERIFIED Results + +| Aspect | CPU Vector | GPU CuVector | +|--------|------------|--------------| +| **Capacity tracking** | Yes (implicit) | Yes (`A.maxsize`) | +| **Capacity preservation on shrink** | Yes (pointer unchanged) | No (reallocates at 25%) | +| **Memory returned on shrink** | No (until GC) | To pool (can be reclaimed) | +| **Regrow cost after shrink** | O(1) within capacity | May realloc (pool often returns same block) | +| **CUDA.jl shrink threshold** | N/A | `n < cap ÷ 4` (25%) | + +### Design Recommendation for AdaptiveArrayPools + +**Current "never shrink" is suboptimal for GPU.** CUDA.jl already implements a 25% threshold, meaning: + +1. **Our explicit `resize!(vec, smaller)` calls would trigger CUDA.jl's internal shrink anyway** if below 25% +2. **We're just deferring the inevitable reallocation** when usage drops significantly +3. **GPU memory is precious** - holding 4x+ more than needed is wasteful + +**Recommendation**: Add lazy shrink for GPU at 25% threshold (matching CUDA.jl): + +```julia +# In get_view! for CuTypedPool: +cap = length(vec) +if total_len > cap + resize!(vec, total_len) # Grow + # invalidate cache... +elseif total_len < cap ÷ 4 + resize!(vec, total_len) # Shrink when using <25% capacity + # invalidate cache... +end +``` + +**Why 25%?** +- Matches CUDA.jl's internal threshold +- Consistent behavior - calling resize! directly would shrink at same point +- Allows 4x variation without reallocation (handles typical size fluctuations) +- Recovers memory from outlier large allocations From 7384f84d512ab0fb05db4c671f0436eea58ffd4f Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 10:40:03 -0800 Subject: [PATCH 3/8] docs: merge hybrid API design docs into single file - Merge new_hybrid_api_design.md into hybrid_api_design.md - Add Implementation Specification section (struct layout, N-way algorithm) - Add API Aliases section (acquire_view!, acquire_array!) 
- Add Verification Checklist - Remove redundant new_hybrid_api_design.md --- docs/design/hybrid_api_design.md | 140 +++++++++++++++++++++------ docs/design/new_hybrid_api_design.md | 140 --------------------------- 2 files changed, 109 insertions(+), 171 deletions(-) delete mode 100644 docs/design/new_hybrid_api_design.md diff --git a/docs/design/hybrid_api_design.md b/docs/design/hybrid_api_design.md index a407e72..23fa851 100644 --- a/docs/design/hybrid_api_design.md +++ b/docs/design/hybrid_api_design.md @@ -132,6 +132,24 @@ end - Array is already a heap-allocated object → no additional allocation when reusing cached instance - Avoids wrapper object optimization issues in type-unspecified paths +### API Aliases + +For clarity, explicit aliases are provided: + +```julia +# Main APIs +export acquire!, unsafe_acquire! + +# Explicit Aliases +export acquire_view!, acquire_array! + +"""Alias for [`acquire!`](@ref). Returns a ReshapedArray (View).""" +const acquire_view! = acquire! + +"""Alias for [`unsafe_acquire!`](@ref). Returns an Array (via unsafe_wrap).""" +const acquire_array! = unsafe_acquire! +``` + --- ## Comparison Matrix @@ -155,6 +173,84 @@ end --- +## Implementation Specification + +### Data Structure Changes (types.jl) + +#### Constants +```julia +const CACHE_WAYS = 4 +``` + +#### TypedPool Struct Layout + +```julia +mutable struct TypedPool{T} + # --- Backing Storage --- + vectors::Vector{Vector{T}} + + # --- 1D Cache (Simple 1-way or Direct) --- + views::Vector{SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int}}, true}} + view_lengths::Vector{Int} + + # --- N-D Array Cache (N-way Set Associative) --- + # Layout: Flat Vector. Index = (slot_idx - 1) * CACHE_WAYS + way_idx + nd_arrays::Vector{Any} # Stores Array{T, N} + nd_dims::Vector{Any} # Stores NTuple{N, Int} + nd_ptrs::Vector{UInt} # Stores objectid/pointer for validation + nd_next_way::Vector{Int} # Round-Robin counter per slot (1 per slot) + + # --- State --- + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end +``` + +**Key Changes**: +- **Remove**: `nd_views` (No longer needed as `acquire!` returns `ReshapedArray`) +- **Update**: `nd_arrays`, `nd_dims`, `nd_ptrs` store `CACHE_WAYS` items per active slot +- **Add**: `nd_next_way::Vector{Int}` for Round-Robin replacement index per slot + +### Logic Implementation (core.jl) + +#### `acquire!` (The Fast Path) + +**Goal**: Always return `ReshapedArray`. No N-D cache lookup. + +```julia +@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} + len = safe_prod(dims) + flat_view = get_view!(tp, len) + return reshape(flat_view, dims) +end +``` + +#### `unsafe_acquire!` (The N-way Cache Path) + +**Goal**: Return `Array`. Use N-way cache with Linear Search + Round-Robin Replacement. + +**Algorithm**: +1. Get 1D view: `flat_view = get_view!(tp, prod(dims))` +2. Get current pointer: `current_ptr = UInt(pointer(flat_view))` +3. Calculate Base Index: `base = (tp.n_active - 1) * CACHE_WAYS` +4. **Search (Hit Check)**: + - Loop `k` from `1` to `CACHE_WAYS` + - Check if `nd_dims[base + k] == dims` **AND** `nd_ptrs[base + k] == current_ptr` + - If match: Return `nd_arrays[base + k]` +5. 
**Miss (Replacement)**: + - Get victim way from `nd_next_way[tp.n_active]` + - Target Index: `target = base + victim_way + 1` + - Create Array: `arr = unsafe_wrap(Array{T, N}, pointer(flat_view), dims)` + - **Update Cache**: + - `nd_arrays[target] = arr` + - `nd_dims[target] = dims` + - `nd_ptrs[target] = current_ptr` + - **Update Round-Robin**: Increment `nd_next_way` (modulo `CACHE_WAYS`) + - Return `arr` + +--- + ## Implementation Plan ### Phase 1: Simplify `acquire!` N-D Path @@ -181,11 +277,6 @@ end end ``` -**Change Summary**: -- Remove `get_nd_array!` call -- Directly return `reshape(1D_view, dims)` -- N-D view cache (`nd_views`) not used (in acquire! path) - ### Phase 2: Maintain `unsafe_acquire!` Cache **No changes** - maintain current implementation: @@ -193,32 +284,9 @@ end - Maintain N-way cache (4-way) - 112 bytes allocation on cache miss -### Phase 3: TypedPool Field Cleanup (Optional) - -Since `acquire!` no longer uses N-D cache, redefine field purposes: +### Phase 3: TypedPool Field Updates -```julia -mutable struct TypedPool{T} - # Storage - vectors::Vector{Vector{T}} - - # 1D Cache (shared by acquire! 1D + acquire! N-D) - views::Vector{SubArray{...}} - view_lengths::Vector{Int} - - # N-D Cache (unsafe_acquire! only) - nd_arrays::Vector{Any} # Array objects for unsafe_acquire! - nd_dims::Vector{Any} # Dimension tuples - nd_ptrs::Vector{UInt} # Pointer validation - - # Note: nd_views can be removed (acquire! uses reshape) - - # State - n_active::Int - _checkpoint_n_active::Vector{Int} - _checkpoint_depths::Vector{Int} -end -``` +Update struct as specified in Implementation Specification above. ### Phase 4: Test Updates @@ -331,6 +399,17 @@ end --- +## Verification Checklist + +1. **Type Check**: `acquire!` must return `ReshapedArray`. `unsafe_acquire!` must return `Array`. +2. **Allocation Check**: + - `acquire!`: 0 allocations always + - `unsafe_acquire!`: 0 allocations on cache hit + - `unsafe_acquire!`: 0 allocations on interleaved access (e.g., alternating 10x10 and 20x20) thanks to N-way cache +3. **Safety**: Ensure `unsafe_acquire!` validates pointers (re-wraps if the backing vector was resized) + +--- + ## TurbulentTransport Integration ### Changed File: `src/tglf_nn.jl` @@ -438,4 +517,3 @@ end ## References - [nd_array_approach_comparison.md](./nd_array_approach_comparison.md) - Benchmark results and boxing analysis -- [PR_MESSAGE.md](../PR_MESSAGE.md) - Original PR description diff --git a/docs/design/new_hybrid_api_design.md b/docs/design/new_hybrid_api_design.md deleted file mode 100644 index 14cd235..0000000 --- a/docs/design/new_hybrid_api_design.md +++ /dev/null @@ -1,140 +0,0 @@ -# Design Spec: Hybrid N-way Cache & ReshapedArray Strategy - -> **Note**: This document was written as a clear, concrete specification that another AI can -> implement mechanically without needing to think through the design. - ---- - -## 1. Objective -Refactor AdaptiveArrayPools.jl to implement a **Hybrid Allocation Strategy**: -1. **`acquire!` (Default)**: Return `ReshapedArray` (Zero-Allocation, Stack-allocated). Remove N-D caching logic for this path. -2. **`unsafe_acquire!` (Special)**: Return `Array` (via `unsafe_wrap`). Implement **N-way Set Associative Cache** to minimize `unsafe_wrap` overhead (112 bytes) and support interleaved access patterns. - -## 2. Data Structure Changes (types.jl) - -### Constants -Define the cache associativity level. 
-```julia -const CACHE_WAYS = 4 -``` - -### `TypedPool{T}` Struct -Modify fields to support N-way caching for Arrays, while removing unused View caching. - -* **Remove**: `nd_views` (No longer needed as `acquire!` returns `ReshapedArray`). -* **Update**: `nd_arrays`, `nd_dims`, `nd_ptrs`. These vectors must store `CACHE_WAYS` items per active slot. -* **Add**: `nd_next_way::Vector{Int}` (To track Round-Robin replacement index for each slot). - -**Updated Layout:** -```julia -mutable struct TypedPool{T} - # --- Backing Storage --- - vectors::Vector{Vector{T}} - - # --- 1D Cache (Simple 1-way or Direct) --- - views::Vector{SubArray{T, 1, Vector{T}, Tuple{UnitRange{Int}}, true}} - view_lengths::Vector{Int} - - # --- N-D Array Cache (N-way Set Associative) --- - # Layout: Flat Vector. Index = (slot_idx - 1) * CACHE_WAYS + way_idx - nd_arrays::Vector{Any} # Stores Array{T, N} - nd_dims::Vector{Any} # Stores NTuple{N, Int} - nd_ptrs::Vector{UInt} # Stores objectid/pointer for validation - nd_next_way::Vector{Int} # Round-Robin counter per slot (1 per slot) - - n_active::Int - _checkpoint_n_active::Vector{Int} - _checkpoint_depths::Vector{Int} -end -``` - -### Initialization -Ensure `nd_arrays`, `nd_dims`, `nd_ptrs` are initialized with `nothing` or empty values, and `nd_next_way` with `0` or `1`. - -## 3. Logic Implementation (core.jl) - -### A. `acquire!` (The Fast Path) -**Goal**: Always return `ReshapedArray`. No N-D cache lookup. - -**Implementation**: -Modify `get_nd_view!` to: -1. Calculate total length (`prod(dims)`). -2. Call `get_view!(tp, len)` to get a 1D `SubArray`. -3. Return `reshape(flat_view, dims)`. - -```julia -@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} - len = safe_prod(dims) - flat_view = get_view!(tp, len) - return reshape(flat_view, dims) -end -``` - -### B. `unsafe_acquire!` (The N-way Path) -**Goal**: Return `Array`. Use N-way cache to avoid `unsafe_wrap`. - -**Implementation**: -Modify `get_nd_array!` to use **Linear Search + Round-Robin Replacement**. - -**Algorithm**: -1. Get 1D view: `flat_view = get_view!(tp, prod(dims))`. -2. Get current pointer: `current_ptr = UInt(pointer(flat_view))`. -3. Calculate Base Index: `base = (tp.n_active - 1) * CACHE_WAYS`. -4. **Search (Hit Check)**: - * Loop `k` from `1` to `CACHE_WAYS`. - * Check if `nd_dims[base + k] == dims` **AND** `nd_ptrs[base + k] == current_ptr`. - * If match: Return `nd_arrays[base + k]`. -5. **Miss (Replacement)**: - * Get victim way from `nd_next_way[tp.n_active]`. - * Target Index: `target = base + victim_way + 1`. - * Create Array: `arr = unsafe_wrap(Array{T, N}, pointer(flat_view), dims)`. - * **Update Cache**: - * `nd_arrays[target] = arr` - * `nd_dims[target] = dims` - * `nd_ptrs[target] = current_ptr` - * **Update Round-Robin**: Increment `nd_next_way` (modulo `CACHE_WAYS`). - * Return `arr`. - -## 4. API & Aliases (AdaptiveArrayPools.jl) - -Add explicit aliases for clarity. - -```julia -# Main APIs -export acquire!, unsafe_acquire! - -# Explicit Aliases -export acquire_view!, acquire_array! - -"""Alias for [`acquire!`](@ref). Returns a ReshapedArray (View).""" -const acquire_view! = acquire! - -"""Alias for [`unsafe_acquire!`](@ref). Returns an Array (via unsafe_wrap).""" -const acquire_array! = unsafe_acquire! -``` - -## 5. Client Integration (`TurbulentTransport.jl`) - -Update tglf_nn.jl to use the Array-returning API to avoid dynamic dispatch boxing. 
- -**File**: tglf_nn.jl -**Function**: `flux_array!` -**Change**: -```julia -# Before -xx = acquire!(pool, T, size(x)) - -# After -xx = unsafe_acquire!(pool, T, size(x)) -# OR -xx = acquire_array!(pool, T, size(x)) -``` - -## 6. Verification Checklist - -1. **Type Check**: `acquire!` must return `ReshapedArray`. `unsafe_acquire!` must return `Array`. -2. **Allocation Check**: - * `acquire!`: 0 allocations always. - * `unsafe_acquire!`: 0 allocations on cache hit. - * `unsafe_acquire!`: 0 allocations on interleaved access (e.g., alternating 10x10 and 20x20) thanks to N-way cache. -3. **Safety**: Ensure `unsafe_acquire!` validates pointers (re-wraps if the backing vector was resized). From 27ef39a1ec6ad0bff45320d0a0c8d6d6c288d3a2 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 10:46:22 -0800 Subject: [PATCH 4/8] docs: update design documents to match current codebase - fixed_slots_codegen_design.md: Update empty!() to 1-based sentinel pattern - hybrid_api_design.md: Mark changes as implemented, resolve open questions - nd_array_approach_comparison.md: Update N-way cache structure (nd_views removed) - untracked_acquire_design.md: Update _mark_untracked!(), rewind!(), replace all_type_stacks() with foreach_fixed_slot() --- docs/design/fixed_slots_codegen_design.md | 4 +- docs/design/hybrid_api_design.md | 16 +-- docs/design/nd_array_approach_comparison.md | 7 +- docs/design/untracked_acquire_design.md | 108 +++++++++++--------- 4 files changed, 77 insertions(+), 58 deletions(-) diff --git a/docs/design/fixed_slots_codegen_design.md b/docs/design/fixed_slots_codegen_design.md index c141e6b..7b57ec9 100644 --- a/docs/design/fixed_slots_codegen_design.md +++ b/docs/design/fixed_slots_codegen_design.md @@ -188,8 +188,10 @@ function Base.empty!(pool::AdaptiveArrayPool) end empty!(pool.others) - pool._current_depth = 0 + # Reset untracked detection state (1-based sentinel pattern) + pool._current_depth = 1 # 1 = global scope (sentinel) empty!(pool._untracked_flags) + push!(pool._untracked_flags, false) # Sentinel: global scope starts with false pool end ``` diff --git a/docs/design/hybrid_api_design.md b/docs/design/hybrid_api_design.md index 23fa851..3d8c84a 100644 --- a/docs/design/hybrid_api_design.md +++ b/docs/design/hybrid_api_design.md @@ -207,10 +207,10 @@ mutable struct TypedPool{T} end ``` -**Key Changes**: -- **Remove**: `nd_views` (No longer needed as `acquire!` returns `ReshapedArray`) -- **Update**: `nd_arrays`, `nd_dims`, `nd_ptrs` store `CACHE_WAYS` items per active slot -- **Add**: `nd_next_way::Vector{Int}` for Round-Robin replacement index per slot +**Implemented Changes** (compared to initial design): +- **Removed**: `nd_views` (No longer needed as `acquire!` returns `ReshapedArray`) ✓ +- **Updated**: `nd_arrays`, `nd_dims`, `nd_ptrs` store `CACHE_WAYS` items per active slot ✓ +- **Added**: `nd_next_way::Vector{Int}` for Round-Robin replacement index per slot ✓ ### Logic Implementation (core.jl) @@ -506,11 +506,11 @@ end --- -## Open Questions for Review +## Resolved Questions -1. **N-way cache retention level**: Keep current 4-way? Reduce to 2-way? -2. **nd_views field removal**: Can be removed since `acquire!` no longer uses it? -3. **Backward compatibility**: Cases where existing `acquire!` users check for `SubArray` type? +1. **N-way cache retention level**: Configurable via `CACHE_WAYS` preference (default: 4-way). ✅ +2. **nd_views field removal**: Removed. `acquire!` now returns `ReshapedArray` via `reshape()`. ✅ +3. 
**Backward compatibility**: `acquire!` returns `ReshapedArray` (a type of `AbstractArray`), maintaining API compatibility. ✅ --- diff --git a/docs/design/nd_array_approach_comparison.md b/docs/design/nd_array_approach_comparison.md index e01f308..d4a531f 100644 --- a/docs/design/nd_array_approach_comparison.md +++ b/docs/design/nd_array_approach_comparison.md @@ -257,13 +257,14 @@ acquire!(pool, Float64, 64, 100) arr = unsafe_wrap(Array{T, N}, pointer(flat_view), dims) ``` -### N-way Cache Structure +### N-way Cache Structure (Current Implementation) ```julia -# In TypedPool (src/types.jl) -nd_views::Vector{Any} # Cached SubArray objects +# In TypedPool (src/types.jl) - used by unsafe_acquire! only +# Note: nd_views was removed since acquire! now uses reshape() nd_arrays::Vector{Any} # Cached Array objects (from unsafe_wrap) nd_dims::Vector{Any} # Cached dimension tuples nd_ptrs::Vector{UInt} # Cached pointers for invalidation +nd_next_way::Vector{Int} # Round-robin counter per slot ``` --- diff --git a/docs/design/untracked_acquire_design.md b/docs/design/untracked_acquire_design.md index 7358d0e..c1568fb 100644 --- a/docs/design/untracked_acquire_design.md +++ b/docs/design/untracked_acquire_design.md @@ -167,10 +167,10 @@ end end # Untracked marking (still needed - for typed rewind vs full rewind decision) -@inline function _mark_untracked!(pool::AdaptiveArrayPool) - if pool._current_depth > 0 - @inbounds pool._untracked_flags[pool._current_depth] = true - end +# Note: 1-based sentinel pattern guarantees _current_depth >= 1, so no check needed +@inline function _mark_untracked!(pool::AbstractArrayPool) + # Always mark (_current_depth >= 1 guaranteed by sentinel) + @inbounds pool._untracked_flags[pool._current_depth] = true end ``` @@ -275,63 +275,79 @@ function rewind!(pool::AdaptiveArrayPool, types::Type...) end # Full rewind (untracked fallback - simplified with _checkpoint_depths!) +# Note: Uses 1-based sentinel pattern. At global scope (depth=1), delegates to reset!() function rewind!(pool::AdaptiveArrayPool) - depth = pool._current_depth - for tp in all_type_stacks(pool) - if !isempty(tp._checkpoint_depths) && tp._checkpoint_depths[end] == depth - # Checkpointed at current depth → pop - pop!(tp._checkpoint_depths) - tp.n_active = pop!(tp._checkpoint_n_active) - elseif !isempty(tp._checkpoint_n_active) - # Checkpointed at previous depth → restore without pop - tp.n_active = tp._checkpoint_n_active[end] - elseif tp.n_active > 0 - # ⚠️ CRITICAL ERROR: Would destroy arrays outside @with_pool - T = eltype(tp) - error(""" - [AdaptiveArrayPools] Cannot rewind type $T: no checkpoint exists. - Found $(tp.n_active) active array(s) that were never checkpointed. - - Fix: Wrap the scope where $T was first acquired in @with_pool. - """) - end - # else: _checkpoint_n_active empty and n_active == 0 → normal, do nothing + cur_depth = pool._current_depth + + # Safety guard: at global scope (depth=1), no checkpoint to rewind to + # Delegate to reset! 
which safely clears all n_active counters + if cur_depth == 1 + reset!(pool) + return nothing end + + # Fixed slots - zero allocation via @generated iteration + foreach_fixed_slot(pool) do tp + _rewind_typed_pool!(tp, cur_depth) + end + + # Process fallback types + for tp in values(pool.others) + _rewind_typed_pool!(tp, cur_depth) + end + pop!(pool._untracked_flags) pool._current_depth -= 1 + return nothing +end + +# Internal helper for rewind with orphan cleanup +# Uses 1-based sentinel pattern: no isempty checks needed +@inline function _rewind_typed_pool!(tp::AbstractTypedPool, current_depth::Int) + # 1. Orphaned Checkpoints Cleanup + while @inbounds tp._checkpoint_depths[end] > current_depth + pop!(tp._checkpoint_depths) + pop!(tp._checkpoint_n_active) + end + + # 2. Normal Rewind Logic (Sentinel Pattern) + if @inbounds tp._checkpoint_depths[end] == current_depth + pop!(tp._checkpoint_depths) + tp.n_active = pop!(tp._checkpoint_n_active) + else + # No checkpoint at current depth - restore from parent + tp.n_active = @inbounds tp._checkpoint_n_active[end] + end + nothing end ``` **Key Improvement:** `_checkpoint_depths[end] == depth` comparison enables accurate pop/restore decision → `_full_rewind_with_types!` not needed! -### 4.5 all_type_stacks Implementation +### 4.5 Zero-Allocation Iteration (Current Implementation) + +> **Note**: The design originally proposed `all_type_stacks()` generator, but was replaced with +> `foreach_fixed_slot()` @generated function for zero allocation via compile-time unrolling. ```julia -# Generator to iterate all TypedPools (fixed slots + others) -function all_type_stacks(pool::AdaptiveArrayPool) - return Iterators.flatten(( - # Fixed slots (7) - (pool.float64, pool.float32, pool.int64, pool.int32, pool.complexf64, pool.complexf32, pool.bool), - # Others (IdDict values) - values(pool.others) - )) -end - -# Or callback pattern (more efficient, no allocation) -@inline function foreach_type_stack(f, pool::AdaptiveArrayPool) - f(pool.float64) - f(pool.float32) - f(pool.int64) - f(pool.int32) - f(pool.complexf64) - f(pool.complexf32) - f(pool.bool) - for tp in values(pool.others) - f(tp) +# Current implementation uses @generated for zero allocation +const FIXED_SLOT_FIELDS = (:float64, :float32, :int64, :int32, :complexf64, :complexf32, :bool) + +@generated function foreach_fixed_slot(f::F, pool::AdaptiveArrayPool) where {F} + exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in FIXED_SLOT_FIELDS] + quote + Base.@_inline_meta + $(exprs...) + nothing end end ``` +**Benefits over generator approach**: +- Zero allocation via compile-time unrolling +- Full inlining for hot paths +- No runtime iteration overhead + ### 4.6 Macro Generated Code ```julia From 5049263af14c4e034f066891244bbc54eeb10bee Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 10:53:09 -0800 Subject: [PATCH 5/8] docs: update Further Reading with GitHub links and English descriptions --- docs/src/advanced/internals.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/src/advanced/internals.md b/docs/src/advanced/internals.md index 3722203..a31ec81 100644 --- a/docs/src/advanced/internals.md +++ b/docs/src/advanced/internals.md @@ -1,6 +1,6 @@ # Internals -This page provides an overview of the internal architecture of AdaptiveArrayPools.jl. For detailed design documents (in Korean), see the `design/` folder in the repository. +This page provides an overview of the internal architecture of AdaptiveArrayPools.jl. 
For detailed design documents, see the [`docs/design/`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/tree/master/docs/design) folder in the repository. ## Checkpoint/Rewind Lifecycle @@ -128,9 +128,10 @@ This pattern reduces branching in hot paths where every nanosecond counts. ## Further Reading -For detailed design documents (in Korean): -- `design/hybrid_api_design.md` — Two-API strategy rationale -- `design/cuda_extension_design.md` — CUDA backend architecture -- `design/untracked_acquire_design.md` — Untracked acquire detection -- `design/fixed_slots_codegen_design.md` — Code generation for fixed slots -- `design/nd_array_approach_comparison.md` — N-way cache design comparison +For detailed design documents: + +- [`hybrid_api_design.md`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/hybrid_api_design.md) — Two-API strategy (`acquire!` vs `unsafe_acquire!`) and type stability analysis +- [`nd_array_approach_comparison.md`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/nd_array_approach_comparison.md) — N-way cache design, boxing analysis, and ReshapedArray benchmarks +- [`untracked_acquire_design.md`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/untracked_acquire_design.md) — Macro-based untracked acquire detection and 1-based sentinel pattern +- [`fixed_slots_codegen_design.md`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/fixed_slots_codegen_design.md) — Zero-allocation iteration via `@generated` functions +- [`cuda_extension_design.md`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/cuda_extension_design.md) — CUDA backend architecture and extension loading From d807d05b45d740975aa0a6bb08c98553be6261f2 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 11:00:14 -0800 Subject: [PATCH 6/8] refactor: move docs to docs/src/, update README links to GitHub Pages - Remove copy logic from make.jl (files now live in docs/src/) - Delete redundant docs/*.md files (content is in docs/src/**) - Update README links to point to GitHub Pages stable URLs - Simplify documentation build process --- README.md | 20 +-- docs/api.md | 111 ---------------- docs/configuration.md | 102 --------------- docs/cuda.md | 123 ----------------- docs/make.jl | 49 +------ docs/maybe_with_pool.md | 53 -------- docs/multi-threading.md | 284 ---------------------------------------- docs/safety.md | 110 ---------------- 8 files changed, 17 insertions(+), 835 deletions(-) delete mode 100644 docs/api.md delete mode 100644 docs/configuration.md delete mode 100644 docs/cuda.md delete mode 100644 docs/maybe_with_pool.md delete mode 100644 docs/multi-threading.md delete mode 100644 docs/safety.md diff --git a/README.md b/README.md index a904175..248854a 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ end | Allocations | ⚠️ 90,000 (2.75 GiB) | ✅ **0** | 100% eliminated | | GC Time | ⚠️ 31% | ✅ **0%** | No GC pauses | -> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](docs/cuda.md). +> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/cuda). ## How It Works @@ -76,11 +76,11 @@ end This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors. 
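A minimal sketch of that pattern (illustrative only; the function name and sizes are placeholders):

```julia
@with_pool pool function mean_of_ones(n)
    v = acquire!(pool, Float64, n)   # instead of Vector{Float64}(undef, n)
    fill!(v, 1.0)
    return sum(v) / n                # return a computed value, not the pooled array
end

mean_of_ones(1_000)   # repeated calls reuse the same pool memory (zero allocation after warmup)
```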
-`acquire!` returns lightweight views (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK. If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`—see [API Reference](docs/api.md). +`acquire!` returns lightweight views (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK. If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`—see [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api). -> **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](docs/safety.md). +> **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/guide/safety). -**Thread-safe by design**: Each Julia Task gets its own independent pool—no locks needed. See [Multi-Threading](docs/multi-threading.md) for patterns. +**Thread-safe by design**: Each Julia Task gets its own independent pool—no locks needed. See [Multi-Threading](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/advanced/multi-threading) for patterns. ### Convenience Functions @@ -92,7 +92,7 @@ Common initialization patterns have convenience functions: | `ones!(pool, Float32, 3, 3)` | `acquire!` + `fill!(1)` | | `similar!(pool, A)` | `acquire!` matching `eltype(A)`, `size(A)` | -These return views like `acquire!`. For raw `Array` types, use `unsafe_acquire!` or its convenience variants (`unsafe_zeros!`, `unsafe_ones!`, `unsafe_similar!`). See [API Reference](docs/api.md#convenience-functions). +These return views like `acquire!`. For raw `Array` types, use `unsafe_acquire!` or its convenience variants (`unsafe_zeros!`, `unsafe_ones!`, `unsafe_similar!`). See [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api#convenience-functions). ## Installation @@ -106,11 +106,11 @@ Pkg.add("AdaptiveArrayPools") | Guide | Description | |-------|-------------| -| [API Reference](docs/api.md) | Complete function and macro reference | -| [CUDA Backend](docs/cuda.md) | GPU-specific usage and examples | -| [Safety Guide](docs/safety.md) | Scope rules and best practices | -| [Multi-Threading](docs/multi-threading.md) | Task/thread safety patterns | -| [Configuration](docs/configuration.md) | Preferences and cache tuning | +| [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api) | Complete function and macro reference | +| [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/cuda) | GPU-specific usage and examples | +| [Safety Guide](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/guide/safety) | Scope rules and best practices | +| [Multi-Threading](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/advanced/multi-threading) | Task/thread safety patterns | +| [Configuration](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/configuration) | Preferences and cache tuning | ## License diff --git a/docs/api.md b/docs/api.md deleted file mode 100644 index 4d3eb8c..0000000 --- a/docs/api.md +++ /dev/null @@ -1,111 +0,0 @@ -# API Reference - -## Macros - -| Macro | Description | -|-------|-------------| -| `@with_pool name expr` | **Recommended.** Injects a global, task-local pool named `name`. 
Automatically checkpoints and rewinds. | -| `@maybe_with_pool name expr` | Same as `@with_pool`, but can be toggled on/off at runtime via `MAYBE_POOLING_ENABLED[]`. | - -## Functions - -| Function | Description | -|----------|-------------| -| `acquire!(pool, T, dims...)` | Returns a view: `SubArray{T,1}` for 1D, `ReshapedArray{T,N}` for N-D. Always 0 bytes. | -| `acquire!(pool, T, dims::Tuple)` | Tuple overload for `acquire!` (e.g., `acquire!(pool, T, size(x))`). | -| `acquire!(pool, x::AbstractArray)` | Similar-style: acquires array matching `eltype(x)` and `size(x)`. | -| `unsafe_acquire!(pool, T, dims...)` | Returns native `Array`/`CuArray` (CPU: `Vector{T}` for 1D, `Array{T,N}` for N-D). Only for FFI/type constraints. | -| `unsafe_acquire!(pool, T, dims::Tuple)` | Tuple overload for `unsafe_acquire!`. | -| `unsafe_acquire!(pool, x::AbstractArray)` | Similar-style: acquires raw array matching `eltype(x)` and `size(x)`. | -| `acquire_view!(pool, T, dims...)` | Alias for `acquire!`. Returns view types. | -| `acquire_array!(pool, T, dims...)` | Alias for `unsafe_acquire!`. Returns Array for N-D. | -| `checkpoint!(pool)` | Saves the current pool state (stack pointer). | -| `checkpoint!(pool, T...)` | Type-specific checkpoint for optimized performance. | -| `rewind!(pool)` | Restores the pool to the last checkpoint, freeing all arrays acquired since then. | -| `rewind!(pool, T...)` | Type-specific rewind for optimized performance. | -| `pool_stats(pool)` | Prints detailed statistics about pool usage. | -| `get_task_local_pool()` | Returns the task-local pool instance. | -| `empty!(pool)` | Clears all internal storage, releasing all memory. | - -## Convenience Functions - -Shortcuts for common `acquire!` + initialization patterns. Default element type is `Float64` (CPU) or `Float32` (CUDA). - -### View-returning (like `acquire!`) - -| Function | Description | -|----------|-------------| -| `zeros!(pool, [T,] dims...)` | Zero-initialized view. Equivalent to `acquire!` + `fill!(0)`. | -| `ones!(pool, [T,] dims...)` | One-initialized view. Equivalent to `acquire!` + `fill!(1)`. | -| `similar!(pool, A)` | View matching `eltype(A)` and `size(A)`. | -| `similar!(pool, A, T)` | View with type `T`, size from `A`. | -| `similar!(pool, A, dims...)` | View with `eltype(A)`, specified dimensions. | -| `similar!(pool, A, T, dims...)` | View with type `T`, specified dimensions. | - -### Array-returning (like `unsafe_acquire!`) - -| Function | Description | -|----------|-------------| -| `unsafe_zeros!(pool, [T,] dims...)` | Zero-initialized raw `Array`. | -| `unsafe_ones!(pool, [T,] dims...)` | One-initialized raw `Array`. | -| `unsafe_similar!(pool, A, ...)` | Raw `Array` with same signatures as `similar!`. | - -All convenience functions support tuple dimensions: `zeros!(pool, (3, 4))`. - -**CUDA note**: Default type is `Float32` to match `CUDA.zeros()` behavior. - -## Types - -| Type | Description | -|------|-------------| -| `AdaptiveArrayPool` | The main pool type. Create with `AdaptiveArrayPool()`. | -| `DisabledPool{Backend}` | Sentinel type when pooling is disabled. Preserves backend context (`:cpu` or `:cuda`). | - -## Utility Functions - -| Function | Description | -|----------|-------------| -| `pooling_enabled(pool)` | Returns `true` if pool is active, `false` if `DisabledPool`. Use instead of `pool === nothing`. | -| `default_eltype(pool)` | Returns default element type: `Float64` (CPU) or `Float32` (CUDA). 
| - -## Constants - -| Constant | Description | -|----------|-------------| -| `USE_POOLING` | Compile-time constant. Set via `Preferences.jl` to disable all pooling. | -| `MAYBE_POOLING_ENABLED` | Runtime `Ref{Bool}`. Only affects `@maybe_with_pool`. | -| `POOL_DEBUG` | Runtime `Ref{Bool}`. Enable safety validation for debugging. | -| `CACHE_WAYS` | Compile-time constant. N-way cache size for `unsafe_acquire!` (default: 4, range: 1-16). | - -## Configuration Functions - -| Function | Description | -|----------|-------------| -| `set_cache_ways!(n)` | Set N-way cache size. Requires Julia restart. | - -## Safety Notes - -Arrays acquired from a pool are **only valid within the `@with_pool` scope**. Do not: -- Return pool-backed arrays from functions -- Store them in global variables -- Capture them in closures that outlive the scope -- Call `resize!`, `push!`, or `append!` on arrays from `unsafe_acquire!` - -Use `POOL_DEBUG[] = true` during development to catch direct returns of pool-backed arrays. - -## `acquire!` vs `unsafe_acquire!` - -| Function | 1D Return | N-D Return | Allocation | -|----------|-----------|------------|------------| -| `acquire!` | `SubArray{T,1}` | `ReshapedArray{T,N}` | Always 0 bytes (stack-based views) | -| `unsafe_acquire!` | `Vector{T}` | `Array{T,N}` | 0 bytes (hit) / ~100 bytes header (miss) | - -Both share the same underlying pool memory. Even on cache miss, only the `Array` header is allocated—**data memory is always reused from the pool**. **Use `acquire!` by default**—BLAS/LAPACK are fully optimized for `StridedArray`, so there's no performance difference. - -Use `unsafe_acquire!` only when you need a concrete `Array` type (FFI, type signatures, runtime dispatch). - -**Caching**: -- `acquire!` 1D uses simple 1:1 cache (reuses `SubArray` if same length) -- `unsafe_acquire!` (all dimensions) uses N-way cache (up to `CACHE_WAYS`, default: 4) per slot; exceeding this causes eviction - -> **Header size by dimensionality**: The `~100 bytes` is an average. Actual `Array` header allocation varies: 1D → 80 bytes, 2D-3D → 112 bytes, 4D-5D → 144 bytes. This is Julia's internal `Array` metadata; actual data memory is always reused from the pool. diff --git a/docs/configuration.md b/docs/configuration.md deleted file mode 100644 index fdc1d16..0000000 --- a/docs/configuration.md +++ /dev/null @@ -1,102 +0,0 @@ -# Configuration - -AdaptiveArrayPools can be configured via `LocalPreferences.toml`: - -```toml -[AdaptiveArrayPools] -use_pooling = false # ⭐ Primary: Disable pooling entirely -cache_ways = 8 # Advanced: N-way cache size (default: 4) -``` - -## Compile-time: USE_POOLING (⭐ Primary) - -**The most important configuration.** Completely disable pooling to make `acquire!` behave like standard allocation. 
- -```toml -# LocalPreferences.toml -[AdaptiveArrayPools] -use_pooling = false -``` - -Or programmatically: - -```julia -using Preferences -Preferences.set_preferences!(AdaptiveArrayPools, "use_pooling" => false) -# Restart Julia for changes to take effect -``` - -When `USE_POOLING = false`: -- `pool` becomes `DisabledPool{backend}()` instead of an active pool -- All pool functions fall back to standard allocation -- Backend context is preserved: `:cuda` still returns `CuArray` - -```julia -# These become equivalent: -@with_pool pool acquire!(pool, Float64, n, n) → Matrix{Float64}(undef, n, n) -@with_pool pool acquire!(pool, Float64, n) → Vector{Float64}(undef, n) - -# With CUDA backend: -@with_pool :cuda pool zeros!(pool, 100) → CUDA.zeros(Float32, 100) -``` - -Use `pooling_enabled(pool)` to check if pooling is active. - -**Use cases:** -- **Debugging**: Compare behavior with/without pooling -- **Benchmarking**: Measure pooling overhead vs direct allocation -- **Gradual adoption**: Add `@with_pool` annotations now, enable pooling later -- **CI/Testing**: Run tests without pooling to isolate issues - -All pooling code is **completely eliminated at compile time** (zero overhead). - -## Runtime: MAYBE_POOLING_ENABLED - -Only affects `@maybe_with_pool`. Toggle without restart. - -```julia -MAYBE_POOLING_ENABLED[] = false # Disable -MAYBE_POOLING_ENABLED[] = true # Enable (default) -``` - -## Runtime: POOL_DEBUG - -Enable safety validation to catch direct returns of pool-backed arrays. - -```julia -POOL_DEBUG[] = true # Enable safety checks (development) -POOL_DEBUG[] = false # Disable (default, production) -``` - -When enabled, returning a pool-backed array from a `@with_pool` block will throw an error. - -## Compile-time: CACHE_WAYS - -Configure the N-way cache size for `unsafe_acquire!`. Higher values reduce cache eviction but increase memory per slot. - -```toml -# LocalPreferences.toml -[AdaptiveArrayPools] -cache_ways = 8 # Default: 4, Range: 1-16 -``` - -Or programmatically: - -```julia -using AdaptiveArrayPools -set_cache_ways!(8) -# Restart Julia for changes to take effect -``` - -**When to increase**: If your code alternates between more than 4 dimension patterns per pool slot, increase `cache_ways` to avoid cache eviction (~100 bytes header per miss). - -> **Scope**: `cache_ways` affects **all `unsafe_acquire!`** calls (including 1D). Only `acquire!` 1D uses simple 1:1 caching. - -## Summary - -| Setting | Scope | Restart? | Priority | Affects | -|---------|-------|----------|----------|---------| -| `use_pooling` | Compile-time | Yes | ⭐ Primary | All macros, `acquire!` behavior | -| `cache_ways` | Compile-time | Yes | Advanced | `unsafe_acquire!` N-D caching | -| `MAYBE_POOLING_ENABLED` | Runtime | No | Optional | `@maybe_with_pool` only | -| `POOL_DEBUG` | Runtime | No | Debug | Safety validation | diff --git a/docs/cuda.md b/docs/cuda.md deleted file mode 100644 index c5778c8..0000000 --- a/docs/cuda.md +++ /dev/null @@ -1,123 +0,0 @@ -# CUDA Backend - -AdaptiveArrayPools provides native CUDA support through a package extension that loads automatically when CUDA.jl is available. 
- -## Quick Start - -```julia -using AdaptiveArrayPools, CUDA - -# Use :cuda backend for GPU arrays -@with_pool :cuda pool function gpu_computation(n) - A = acquire!(pool, Float64, n, n) # CuArray view - B = acquire!(pool, Float64, n, n) # CuArray view - - fill!(A, 1.0) - fill!(B, 2.0) - - return sum(A .+ B) -end - -# Zero GPU allocation in hot loops -for i in 1:1000 - gpu_computation(100) # GPU memory reused from pool -end -``` - -## API - -The CUDA backend uses the same API as CPU, with `:cuda` backend specifier: - -| Macro/Function | Description | -|----------------|-------------| -| `@with_pool :cuda pool expr` | GPU pool with automatic checkpoint/rewind | -| `acquire!(pool, T, dims...)` | Returns `CuArray` view (always 0 bytes GPU alloc) | -| `unsafe_acquire!(pool, T, dims...)` | Returns raw `CuArray` (for FFI/type constraints) | -| `get_task_local_cuda_pool()` | Returns the task-local CUDA pool | -| `pool_stats(:cuda)` | Print CUDA pool statistics | - -## Return Types - -| Function | 1D Return | N-D Return | -|----------|-----------|------------| -| `acquire!` | `CuArray{T,1}` (view) | `CuArray{T,N}` (view) | -| `unsafe_acquire!` | `CuArray{T,1}` | `CuArray{T,N}` | - -## Allocation Behavior - -**GPU Memory**: Always 0 bytes allocation after warmup. The underlying `CuVector` is resized as needed and reused. - -**CPU Memory**: -- Cache hit (≤4 dimension patterns per slot): 0 bytes -- Cache miss (>4 patterns): ~100 bytes for wrapper metadata - -```julia -# Example: 4 patterns fit in 4-way cache → zero CPU allocation -dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) -for dims in dims_list - @with_pool :cuda p begin - A = acquire!(p, Float64, dims...) - # Use A... - end -end -``` - -## Fixed Slot Types - -Optimized types with pre-allocated slots (same as CPU): - -| Type | Field | -|------|-------| -| `Float64` | `.float64` | -| `Float32` | `.float32` | -| `Float16` | `.float16` | -| `Int64` | `.int64` | -| `Int32` | `.int32` | -| `ComplexF64` | `.complexf64` | -| `ComplexF32` | `.complexf32` | -| `Bool` | `.bool` | - -Other types use the fallback dictionary (`.others`). - -## Limitations - -- **No `@maybe_with_pool :cuda`**: Runtime toggle not supported for CUDA backend -- **Task-local only**: Each Task gets its own CUDA pool, same as CPU -- **Same device**: All arrays in a pool use the same CUDA device - -## Example: Matrix Multiplication - -```julia -using AdaptiveArrayPools, CUDA, LinearAlgebra - -@with_pool :cuda pool function gpu_matmul(n) - A = acquire!(pool, Float64, n, n) - B = acquire!(pool, Float64, n, n) - C = acquire!(pool, Float64, n, n) - - rand!(A); rand!(B) - mul!(C, A, B) - - return sum(C) -end - -# Warmup -gpu_matmul(100) - -# Benchmark - zero GPU allocation -using BenchmarkTools -@benchmark gpu_matmul(1000) -``` - -## Debugging - -```julia -# Check pool state -pool_stats(:cuda) - -# Output: -# CuAdaptiveArrayPool (device 0) -# Float64 (fixed) [GPU] -# slots: 3 (active: 0) -# elements: 30000 (234.375 KiB) -``` diff --git a/docs/make.jl b/docs/make.jl index 7517dbd..2147369 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,24 +15,17 @@ function write_if_changed(path::String, content::String) write(path, content) end -""" -Copy file only if content changed (prevents mtime update triggering rebuild). 
-""" -function cp_if_changed(src::String, dst::String) - if isfile(dst) && read(src) == read(dst) - return # Content unchanged, skip copy - end - cp(src, dst; force=true) -end - # ============================================ # Helper: Rewrite relative paths in README # ============================================ +const GITHUB_PAGES_BASE = "https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable" +const REPO_URL = "https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl" + """ Rewrite relative paths in README.md for Documenter structure. -Converts: +Converts GitHub repo links to internal Documenter links: - `docs/api.md` → `usage/api.md` - `docs/cuda.md` → `usage/cuda.md` - `docs/safety.md` → `guide/safety.md` @@ -43,8 +36,6 @@ Converts: Also handles anchor links (e.g., `docs/api.md#convenience-functions`). """ function rewrite_readme_paths(content::String) - repo_url = "https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl" - # Usage docs (with optional anchors) content = replace(content, r"\(docs/api\.md(#[^)]+)?\)" => s"(usage/api.md\1)") content = replace(content, r"\(docs/cuda\.md(#[^)]+)?\)" => s"(usage/cuda.md\1)") @@ -58,50 +49,24 @@ function rewrite_readme_paths(content::String) content = replace(content, r"\(docs/multi-threading\.md(#[^)]+)?\)" => s"(advanced/multi-threading.md\1)") # LICENSE link → GitHub - content = replace(content, "(LICENSE)" => "($(repo_url)/blob/master/LICENSE)") + content = replace(content, "(LICENSE)" => "($(REPO_URL)/blob/master/LICENSE)") return content end # ============================================ -# Step 1: Setup directories +# Generate index.md from README # ============================================ const DOCS_DIR = @__DIR__ const DOCS_SRC = joinpath(DOCS_DIR, "src") -# Create directory structure -mkpath(DOCS_SRC) -mkpath(joinpath(DOCS_SRC, "guide")) -mkpath(joinpath(DOCS_SRC, "usage")) -mkpath(joinpath(DOCS_SRC, "advanced")) - -# ============================================ -# Step 2: Copy and transform content -# ============================================ - # README.md → index.md (with path rewriting) readme_content = read(joinpath(DOCS_DIR, "../README.md"), String) write_if_changed(joinpath(DOCS_SRC, "index.md"), rewrite_readme_paths(readme_content)) -# Copy existing docs to their new locations (with path fixes) - -# Guide section - fix relative links -safety_content = read(joinpath(DOCS_DIR, "safety.md"), String) -safety_content = replace(safety_content, "(multi-threading.md)" => "(../advanced/multi-threading.md)") -write_if_changed(joinpath(DOCS_SRC, "guide/safety.md"), safety_content) - -# Usage section -cp_if_changed(joinpath(DOCS_DIR, "api.md"), joinpath(DOCS_SRC, "usage/api.md")) -cp_if_changed(joinpath(DOCS_DIR, "configuration.md"), joinpath(DOCS_SRC, "usage/configuration.md")) -cp_if_changed(joinpath(DOCS_DIR, "maybe_with_pool.md"), joinpath(DOCS_SRC, "usage/maybe_with_pool.md")) -cp_if_changed(joinpath(DOCS_DIR, "cuda.md"), joinpath(DOCS_SRC, "usage/cuda.md")) - -# Advanced section -cp_if_changed(joinpath(DOCS_DIR, "multi-threading.md"), joinpath(DOCS_SRC, "advanced/multi-threading.md")) - # ============================================ -# Step 3: Build documentation +# Build documentation # ============================================ makedocs( diff --git a/docs/maybe_with_pool.md b/docs/maybe_with_pool.md deleted file mode 100644 index 39c31b7..0000000 --- a/docs/maybe_with_pool.md +++ /dev/null @@ -1,53 +0,0 @@ -# @maybe_with_pool - -Runtime-toggleable pooling. 
Users can enable/disable via `MAYBE_POOLING_ENABLED[]`. - -## Usage - -```julia -@maybe_with_pool pool function compute(n) - v = acquire!(pool, Float64, n) - v .= 1.0 - sum(v) -end - -# Toggle at runtime -MAYBE_POOLING_ENABLED[] = false # Normal allocation -MAYBE_POOLING_ENABLED[] = true # Uses pool -``` - -## When to Use - -- Library code where end-users should control pooling behavior -- Debugging: disable pooling to isolate memory issues -- Benchmarking: compare pooled vs non-pooled performance - -## How It Works - -When `MAYBE_POOLING_ENABLED[] == false`: -- `pool` becomes `DisabledPool{backend}()` (e.g., `DisabledPool{:cpu}()` or `DisabledPool{:cuda}()`) -- All pool functions (`acquire!`, `zeros!`, etc.) fall back to standard allocation -- Backend context is preserved: `:cuda` → `CuArray`, `:cpu` → `Array` - -Use `pooling_enabled(pool)` to check if pooling is active: -```julia -@maybe_with_pool pool begin - if pooling_enabled(pool) - # Using pooled memory - else - # Using standard allocation (DisabledPool) - end -end -``` - -## vs @with_pool - -| | `@with_pool` | `@maybe_with_pool` | -|---|---|---| -| Runtime toggle | No | Yes | -| Overhead when disabled | None | Branch check | -| Use case | Application code | Library code | - -## Safety - -Same rules as `@with_pool`: arrays are only valid within the scope. Do not return or store them externally. diff --git a/docs/multi-threading.md b/docs/multi-threading.md deleted file mode 100644 index 4135019..0000000 --- a/docs/multi-threading.md +++ /dev/null @@ -1,284 +0,0 @@ -# Multi-Threading Guide - -AdaptiveArrayPools uses `task_local_storage()` for **task-local isolation**: each Julia Task gets its own independent pool. This design ensures thread safety when used correctly. - -## Table of Contents - -- [Understanding Julia's Task/Thread Model](#understanding-julias-taskthread-model) -- [How Pools Work with @threads](#how-pools-work-with-threads) -- [Safe Patterns](#safe-patterns) -- [Unsafe Patterns](#unsafe-patterns) -- [Why Task-Local (Not Thread-Local)?](#why-task-local-not-thread-local) -- [User Responsibility](#user-responsibility) - ---- - -## Understanding Julia's Task/Thread Model - -Julia uses an **M:N threading model** where multiple Tasks (lightweight coroutines) can run on multiple OS threads. - -``` -┌─────────────────────────────────────────────────────────────┐ -│ Julia Process │ -│ │ -│ Thread 1 Thread 2 Thread 3 │ -│ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ -│ │ Task A │ │ Task C │ │ Task E │ │ -│ │ (TLS-A) │ │ (TLS-C) │ │ (TLS-E) │ │ -│ └─────────┘ └─────────┘ └─────────┘ │ -│ ┌─────────┐ ┌─────────┐ │ -│ │ Task B │ │ Task D │ │ -│ │ (TLS-B) │ │ (TLS-D) │ │ -│ └─────────┘ └─────────┘ │ -└─────────────────────────────────────────────────────────────┘ -``` - -Key concepts: - -| Concept | Description | -|---------|-------------| -| **Thread** | OS-level execution unit. Fixed count at Julia startup. | -| **Task** | Julia's lightweight coroutine (Green Thread). Created dynamically. | -| **task_local_storage()** | Per-Task storage. Each Task has its own isolated TLS. | - -### Important: One Thread Can Run Multiple Tasks - -A single thread can execute multiple Tasks by switching between them at **yield points** (I/O, `sleep()`, `yield()`, etc.): - -```julia -# Both tasks run on Thread 1, interleaved! 
-task_a = @spawn begin - println("A start") - sleep(0.1) # yield point - switch to Task B - println("A end") -end - -task_b = @spawn begin - println("B start") - sleep(0.1) # yield point - switch back to Task A - println("B end") -end - -# Output (single thread): -# A start -# B start -# A end -# B end -``` - ---- - -## How Pools Work with @threads - -When you use `Threads.@threads`, Julia distributes iterations across threads. Each thread gets **one Task** that processes its assigned iterations. - -``` -Threads.@threads for i in 1:100_000 (4 threads) -│ -├─ Thread 1: Task-1 → Pool-1 -│ └─ Processes i = 1..25,000 (same pool reused for all!) -│ -├─ Thread 2: Task-2 → Pool-2 -│ └─ Processes i = 25,001..50,000 -│ -├─ Thread 3: Task-3 → Pool-3 -│ └─ Processes i = 50,001..75,000 -│ -└─ Thread 4: Task-4 → Pool-4 - └─ Processes i = 75,001..100,000 - -Total: 4 pools created, each reused ~25,000 times -``` - -### Key Insight - -- `@threads` creates **one Task per thread** (not one per iteration!) -- Each Task has its own `task_local_storage()` → its own pool -- Within one `@threads` block, pools are efficiently reused -- Calling `@threads` **multiple times** creates new Tasks → new pools each time - ---- - -## Safe Patterns - -### Pattern 1: `@with_pool` Inside `@threads` - -```julia -Threads.@threads for i in 1:N - @with_pool pool begin - a = acquire!(pool, Float64, 100) - # ... computation ... - end # pool automatically rewinds -end -``` - -Each thread's Task gets its own pool. Safe and efficient. - -### Pattern 2: Function Defined with `@with_pool` - -```julia -# Define function with @with_pool -@with_pool pool function inner_work(x) - tmp = acquire!(pool, Float64, length(x)) - tmp .= x - return sum(tmp) -end - -# Call from @threads - each thread gets its own pool -Threads.@threads for i in 1:N - result = inner_work(data[i]) -end -``` - -The pool is created per-Task when the function is called, not when defined. - -### Pattern 3: Nested Functions - -```julia -@with_pool outer_pool function outer_work(data) - # outer_pool belongs to Main Task - tmp = acquire!(outer_pool, Float64, 100) - - Threads.@threads for i in 1:length(data) - # inner_work creates its own pool per thread - inner_work(data[i]) # Inner pool ≠ outer_pool (safe!) - end -end -``` - -Outer and inner pools are completely independent. - ---- - -## Unsafe Patterns - -### Pattern 1: `@with_pool` Outside `@threads` - -```julia -# ❌ DANGER: Race condition! -@with_pool pool Threads.@threads for i in 1:N - a = acquire!(pool, Float64, 100) # All threads share ONE pool! -end -``` - -**Why it fails**: `pool` is created in the Main Task's TLS. All threads access the same pool simultaneously. - -### Pattern 2: Sharing Pool Reference - -```julia -# ❌ DANGER: Race condition! -pool = get_task_local_pool() # Main Task's pool -Threads.@threads for i in 1:N - a = acquire!(pool, Float64, 100) # Shared access! -end -``` - -### Pattern 3: Passing Pool to `@spawn` - -```julia -# ❌ DANGER: Race condition! -@with_pool pool begin - tasks = [Threads.@spawn begin - a = acquire!(pool, Float64, 100) # Multiple tasks, one pool! - end for _ in 1:4] - wait.(tasks) -end -``` - ---- - -## Why Task-Local (Not Thread-Local)? - -You might wonder: "Why not use thread-local pools? They persist across `@threads` calls!" - -### The Stack Discipline Problem - -AdaptiveArrayPools uses `checkpoint!` and `rewind!` - a **stack-based** allocation system: - -```julia -@with_pool pool begin - checkpoint!(pool) # Push current state - a = acquire!(pool, ...) 
- b = acquire!(pool, ...) - # ... - rewind!(pool) # Pop and restore state (LIFO!) -end -``` - -This requires **strict LIFO ordering**: the Task that checkpoints first must rewind last. - -### Why Thread-Local Fails with `@spawn` - -With `@spawn`, multiple Tasks can interleave on the same thread: - -``` -Thread 1 (with Thread-Local Pool): - -Time → -Task A: checkpoint! ──── acquire! ──── sleep ────────────── rewind! -Task B: checkpoint! ──── acquire! ──── sleep ──── rewind! - ↑ - A finishes first! -``` - -**Stack corruption occurs:** - -1. Task A: `checkpoint!` → stack = `[0]` -2. Task B: `checkpoint!` → stack = `[0, 1]` -3. Task A: `rewind!` → pops `1` (B's checkpoint!) → stack = `[0]` -4. Task B: `rewind!` → pops `0` (A's checkpoint!) → **WRONG!** - -**Result**: B's arrays may be reused while B is still using them → memory corruption. - -### Locks Don't Help - -Adding locks only prevents **simultaneous access**, not **LIFO violations**. The stack still gets corrupted because Tasks finish in unpredictable order. - -### Task-Local: The Only Safe Solution - -With Task-local pools: -- Each Task has its own pool -- Each pool has its own stack -- No interleaving possible → LIFO always preserved - ---- - -## User Responsibility - -### The Core Rule - -> **Pool objects must not be shared across Tasks.** - -This library prioritizes **zero-overhead performance** over runtime safety checks. No locks are added because: - -1. Locks would defeat the purpose of zero-allocation pooling -2. Even with locks, stack corruption would occur (LIFO violations) - -### Quick Reference - -| Pattern | Safety | Reason | -|---------|--------|--------| -| `@with_pool` inside `@threads` | ✅ Safe | Each Task gets own pool | -| `@with_pool` outside `@threads` | ❌ Unsafe | All threads share one pool | -| Function with `@with_pool` called from `@threads` | ✅ Safe | Pool created per-Task at call time | -| Passing pool to `@spawn` | ❌ Unsafe | Multiple Tasks access same pool | -| Nested `@with_pool` (outer/inner) | ✅ Safe | Each level has independent pool | - -### Debugging Tips - -If you encounter unexpected behavior: - -1. **Check pool placement**: Is `@with_pool` inside or outside `@threads`? -2. **Check pool sharing**: Is the same pool variable accessed from multiple Tasks? -3. **Enable POOL_DEBUG**: `POOL_DEBUG[] = true` catches some (not all) misuse patterns - ---- - -## Summary - -- AdaptiveArrayPools uses **Task-local isolation** for thread safety -- Each Julia Task gets its own independent pool via `task_local_storage()` -- `@threads` creates one Task per thread → pools are reused within the block -- **Always place `@with_pool` inside `@threads`**, not outside -- Thread-local pools are **not an alternative** due to stack discipline requirements -- Correct usage is the user's responsibility (no runtime checks for performance) diff --git a/docs/safety.md b/docs/safety.md deleted file mode 100644 index 0016d5a..0000000 --- a/docs/safety.md +++ /dev/null @@ -1,110 +0,0 @@ -# Safety Guide - -AdaptiveArrayPools achieves zero allocation by reusing memory across calls. This requires one simple rule: **acquired arrays are only valid within their `@with_pool` scope**. - -## The Scope Rule - -When `@with_pool` ends, all arrays acquired within that scope are recycled. Using them after the scope ends leads to undefined behavior. 
- -```julia -@with_pool pool begin - v = acquire!(pool, Float64, 100) - - result = sum(v) # ✅ compute and return values - copied = copy(v) # ✅ copy if you need data outside -end -# v is no longer valid here -``` - -## What NOT to Do - -### Don't return pool-backed arrays - -```julia -# ❌ Wrong: returning the array itself -@with_pool pool function bad_example() - v = acquire!(pool, Float64, 100) - return v # v will be recycled after this returns! -end - -# ✅ Correct: return computed values or copies -@with_pool pool function good_example() - v = acquire!(pool, Float64, 100) - return sum(v) # scalar result -end -``` - -### Don't store in globals or closures - -```julia -# ❌ Wrong: storing in global -global_ref = nothing -@with_pool pool begin - global_ref = acquire!(pool, Float64, 100) -end -# global_ref now points to recycled memory - -# ❌ Wrong: capturing in closure -@with_pool pool begin - v = acquire!(pool, Float64, 100) - callback = () -> sum(v) # v captured but will be invalid -end -``` - -### Don't resize or push! to unsafe_acquire! arrays - -```julia -@with_pool pool begin - v = unsafe_acquire!(pool, Float64, 100) - # ❌ These break pool memory management: - # resize!(v, 200) - # push!(v, 1.0) - # append!(v, [1.0, 2.0]) -end -``` - -## Debugging with POOL_DEBUG - -Enable runtime safety checks during development: - -```julia -using AdaptiveArrayPools -AdaptiveArrayPools.POOL_DEBUG[] = true - -@with_pool pool function test() - v = acquire!(pool, Float64, 100) - return v # Will warn about returning pool-backed array -end -``` - -## acquire! vs unsafe_acquire! - -| Function | Returns | Best For | -|----------|---------|----------| -| `acquire!` | View types (`SubArray`, `ReshapedArray`) | General use, BLAS/LAPACK | -| `unsafe_acquire!` | Native `Array`/`CuArray` | FFI, type constraints | - -Both follow the same scope rules. Use `acquire!` by default—views work with all standard Julia linear algebra operations. - -## Thread Safety - -Pools are task-local, so each thread automatically gets its own pool: - -```julia -# ✅ Safe: each task has independent pool -Threads.@threads for i in 1:N - @with_pool pool begin - a = acquire!(pool, Float64, 100) - # work with a... - end -end - -# ❌ Unsafe: pool created outside threaded region -@with_pool pool begin - Threads.@threads for i in 1:N - a = acquire!(pool, Float64, 100) # race condition! - end -end -``` - -See [Multi-Threading](multi-threading.md) for more patterns. From 6c13552e6accdeaa597757331a616f036b95c389 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 11:29:22 -0800 Subject: [PATCH 7/8] docs: restructure to 4-stage learning journey Reorganize documentation from Guide/Usage/Advanced to: - Basics: quick-start, @with_pool patterns, essential API, safety rules - Features: @maybe_with_pool, CUDA, multi-threading, configuration - Reference: full API - Architecture: how-it-works, type dispatch, macro internals, design docs New content: - basics/with-pool-patterns.md: function decorator vs block wrapper - basics/api-essentials.md: core API with reset! and fill! 
patterns - architecture/type-dispatch.md: N-way cache + View vs Array guide - architecture/design-docs.md: links to design documents Improvements: - safety-rules.md: visual emphasis with tables - how-it-works.md: zero-alloc diagram + try...finally explanation - make.jl: mapping table approach for path rewriting --- docs/make.jl | 69 +++++---- docs/src/architecture/design-docs.md | 44 ++++++ .../how-it-works.md} | 65 ++++++-- .../macro-internals.md | 6 +- docs/src/architecture/type-dispatch.md | 140 ++++++++++++++++++ docs/src/basics/api-essentials.md | 120 +++++++++++++++ .../quick-start.md} | 14 +- .../safety.md => basics/safety-rules.md} | 41 ++++- docs/src/basics/with-pool-patterns.md | 111 ++++++++++++++ docs/src/{usage => features}/configuration.md | 0 .../cuda.md => features/cuda-support.md} | 0 .../maybe-with-pool.md} | 2 +- .../{advanced => features}/multi-threading.md | 0 docs/src/index.md | 20 +-- docs/src/{usage => reference}/api.md | 0 15 files changed, 564 insertions(+), 68 deletions(-) create mode 100644 docs/src/architecture/design-docs.md rename docs/src/{advanced/internals.md => architecture/how-it-works.md} (72%) rename docs/src/{advanced => architecture}/macro-internals.md (97%) create mode 100644 docs/src/architecture/type-dispatch.md create mode 100644 docs/src/basics/api-essentials.md rename docs/src/{guide/getting-started.md => basics/quick-start.md} (85%) rename docs/src/{guide/safety.md => basics/safety-rules.md} (66%) create mode 100644 docs/src/basics/with-pool-patterns.md rename docs/src/{usage => features}/configuration.md (100%) rename docs/src/{usage/cuda.md => features/cuda-support.md} (100%) rename docs/src/{usage/maybe_with_pool.md => features/maybe-with-pool.md} (98%) rename docs/src/{advanced => features}/multi-threading.md (100%) rename docs/src/{usage => reference}/api.md (100%) diff --git a/docs/make.jl b/docs/make.jl index 2147369..f385e83 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -22,31 +22,32 @@ end const GITHUB_PAGES_BASE = "https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable" const REPO_URL = "https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl" +# Path mapping table: (pattern, replacement) +# Order matters for overlapping patterns +const README_PATH_MAPPINGS = [ + # Reference + (r"\(docs/api\.md(#[^)]+)?\)", s"(reference/api.md\1)"), + + # Features + (r"\(docs/cuda\.md(#[^)]+)?\)", s"(features/cuda-support.md\1)"), + (r"\(docs/configuration\.md(#[^)]+)?\)", s"(features/configuration.md\1)"), + (r"\(docs/maybe_with_pool\.md(#[^)]+)?\)", s"(features/maybe-with-pool.md\1)"), + (r"\(docs/multi-threading\.md(#[^)]+)?\)", s"(features/multi-threading.md\1)"), + + # Basics + (r"\(docs/safety\.md(#[^)]+)?\)", s"(basics/safety-rules.md\1)"), +] + """ Rewrite relative paths in README.md for Documenter structure. -Converts GitHub repo links to internal Documenter links: -- `docs/api.md` → `usage/api.md` -- `docs/cuda.md` → `usage/cuda.md` -- `docs/safety.md` → `guide/safety.md` -- `docs/multi-threading.md` → `advanced/multi-threading.md` -- `docs/configuration.md` → `usage/configuration.md` -- `docs/maybe_with_pool.md` → `usage/maybe_with_pool.md` - +Uses mapping table to convert GitHub repo links to internal Documenter links. Also handles anchor links (e.g., `docs/api.md#convenience-functions`). 
""" function rewrite_readme_paths(content::String) - # Usage docs (with optional anchors) - content = replace(content, r"\(docs/api\.md(#[^)]+)?\)" => s"(usage/api.md\1)") - content = replace(content, r"\(docs/cuda\.md(#[^)]+)?\)" => s"(usage/cuda.md\1)") - content = replace(content, r"\(docs/configuration\.md(#[^)]+)?\)" => s"(usage/configuration.md\1)") - content = replace(content, r"\(docs/maybe_with_pool\.md(#[^)]+)?\)" => s"(usage/maybe_with_pool.md\1)") - - # Guide docs - content = replace(content, r"\(docs/safety\.md(#[^)]+)?\)" => s"(guide/safety.md\1)") - - # Advanced docs - content = replace(content, r"\(docs/multi-threading\.md(#[^)]+)?\)" => s"(advanced/multi-threading.md\1)") + for (pattern, replacement) in README_PATH_MAPPINGS + content = replace(content, pattern => replacement) + end # LICENSE link → GitHub content = replace(content, "(LICENSE)" => "($(REPO_URL)/blob/master/LICENSE)") @@ -80,20 +81,26 @@ makedocs( ), pages = [ "Home" => "index.md", - "Guide" => [ - "Getting Started" => "guide/getting-started.md", - "Safety Rules" => "guide/safety.md", + "Basics" => [ + "Quick Start" => "basics/quick-start.md", + "@with_pool Patterns" => "basics/with-pool-patterns.md", + "Essential API" => "basics/api-essentials.md", + "Safety Rules" => "basics/safety-rules.md", + ], + "Features" => [ + "@maybe_with_pool" => "features/maybe-with-pool.md", + "CUDA Support" => "features/cuda-support.md", + "Multi-threading" => "features/multi-threading.md", + "Configuration" => "features/configuration.md", ], - "Usage" => [ - "API Reference" => "usage/api.md", - "Configuration" => "usage/configuration.md", - "@maybe_with_pool" => "usage/maybe_with_pool.md", - "CUDA Support" => "usage/cuda.md", + "Reference" => [ + "Full API" => "reference/api.md", ], - "Advanced" => [ - "Multi-threading" => "advanced/multi-threading.md", - "How @with_pool Works" => "advanced/macro-internals.md", - "Internals" => "advanced/internals.md", + "Architecture" => [ + "How It Works" => "architecture/how-it-works.md", + "Type Dispatch & Cache" => "architecture/type-dispatch.md", + "@with_pool Internals" => "architecture/macro-internals.md", + "Design Documents" => "architecture/design-docs.md", ], ], doctest = false, # Doctests not set up in existing docs diff --git a/docs/src/architecture/design-docs.md b/docs/src/architecture/design-docs.md new file mode 100644 index 0000000..4b5e051 --- /dev/null +++ b/docs/src/architecture/design-docs.md @@ -0,0 +1,44 @@ +# Design Documents + +For in-depth analysis of design decisions, implementation tradeoffs, and architectural choices, see the design documents in the repository: + +## API Design + +- **[hybrid_api_design.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/hybrid_api_design.md)** + Two-API strategy (`acquire!` vs `unsafe_acquire!`) and type stability analysis + +## Caching & Performance + +- **[nd_array_approach_comparison.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/nd_array_approach_comparison.md)** + N-way cache design, boxing analysis, and ReshapedArray benchmarks + +- **[fixed_slots_codegen_design.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/fixed_slots_codegen_design.md)** + Zero-allocation iteration via `@generated` functions and fixed-slot type dispatch + +## Macro Internals + +- **[untracked_acquire_design.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/untracked_acquire_design.md)** + Macro-based 
untracked acquire detection and 1-based sentinel pattern + +## Backend Extensions + +- **[cuda_extension_design.md](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/blob/master/docs/design/cuda_extension_design.md)** + CUDA backend architecture and package extension loading + +--- + +## Document Overview + +| Document | Focus Area | Key Insights | +|----------|------------|--------------| +| hybrid_api_design | API strategy | View types for zero-alloc, Array for FFI | +| nd_array_approach_comparison | Caching | N-way associative cache reduces header allocation | +| fixed_slots_codegen_design | Codegen | @generated functions enable type-stable iteration | +| untracked_acquire_design | Macro safety | Sentinel pattern ensures correct cleanup | +| cuda_extension_design | GPU support | Seamless CPU/CUDA API parity | + +## See Also + +- [How It Works](how-it-works.md) - High-level architecture overview +- [Type Dispatch & Cache](type-dispatch.md) - Technical deep-dive +- [@with_pool Macro Internals](macro-internals.md) - Macro transformation details diff --git a/docs/src/advanced/internals.md b/docs/src/architecture/how-it-works.md similarity index 72% rename from docs/src/advanced/internals.md rename to docs/src/architecture/how-it-works.md index a31ec81..39840a2 100644 --- a/docs/src/advanced/internals.md +++ b/docs/src/architecture/how-it-works.md @@ -1,27 +1,68 @@ -# Internals +# How It Works -This page provides an overview of the internal architecture of AdaptiveArrayPools.jl. For detailed design documents, see the [`docs/design/`](https://github.com/ProjectTorreyPines/AdaptiveArrayPools.jl/tree/master/docs/design) folder in the repository. +This page explains the core mechanisms that enable zero-allocation array reuse. + +## The Zero-Allocation Promise + +``` ++-------------------------------------------------------------+ +| Call 1 (warmup): | +| checkpoint! --> acquire! x 3 --> rewind! | +| | | +| +-- backing memory allocated | +| | +| Call 2+ (zero-alloc): | +| checkpoint! --> acquire! x 3 --> rewind! | +| | | +| +-- same memory reused, 0 bytes allocated | ++-------------------------------------------------------------+ +``` ## Checkpoint/Rewind Lifecycle -The core mechanism that enables zero-allocation reuse: +The core mechanism that enables memory reuse: ``` @with_pool pool function foo() - │ - ├─► checkpoint!(pool) # Save current state (n_active counters) - │ - │ A = acquire!(pool, ...) # n_active += 1 - │ B = acquire!(pool, ...) # n_active += 1 - │ C = acquire!(pool, ...) # n_active += 1 - │ ... compute ... - │ - └─► rewind!(pool) # Restore n_active → all arrays recycled + | + +---> checkpoint!(pool) # Save current state (n_active counters) + | + | A = acquire!(pool, ...) # n_active += 1 + | B = acquire!(pool, ...) # n_active += 1 + | C = acquire!(pool, ...) # n_active += 1 + | ... compute ... + | + +---> rewind!(pool) # Restore n_active, arrays recycled end ``` On repeated calls, the same memory is reused without any allocation. 
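A quick way to check the reuse behavior is to time a repeat call with Base's `@allocated`. The following is a minimal sketch, not part of the package: the function name `poolsum` is illustrative, and it assumes only the `@with_pool`/`acquire!` API described above.

```julia
using AdaptiveArrayPools

@with_pool pool function poolsum(n)
    A = acquire!(pool, Float64, n)   # view backed by pooled memory
    fill!(A, 1.0)
    return sum(A)                    # return a value, not the array
end

poolsum(10_000)              # first call: backing memory is allocated and cached
@allocated poolsum(10_000)   # subsequent calls are expected to report 0 bytes
```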
+## Exception Safety: try...finally + +The `@with_pool` macro generates code with exception-safe cleanup: + +```julia +# What you write: +@with_pool pool begin + A = acquire!(pool, Float64, 100) + result = compute(A) +end + +# What the macro generates: +let pool = get_task_local_pool() + checkpoint!(pool) + try + A = acquire!(pool, Float64, 100) + result = compute(A) + finally + rewind!(pool) # Always executes, even on exception + end +end +``` + +**Key guarantee**: The `finally` block ensures `rewind!` is called even if an exception occurs, preventing memory leaks and state corruption. + ## Fixed-Slot Type Dispatch To achieve zero-lookup overhead, common types have dedicated struct fields: diff --git a/docs/src/advanced/macro-internals.md b/docs/src/architecture/macro-internals.md similarity index 97% rename from docs/src/advanced/macro-internals.md rename to docs/src/architecture/macro-internals.md index e7aa69d..d8e5f93 100644 --- a/docs/src/advanced/macro-internals.md +++ b/docs/src/architecture/macro-internals.md @@ -251,6 +251,6 @@ end ## See Also -- [Internals](internals.md) — Overview of pool architecture -- [Safety Rules](../guide/safety.md) — Scope rules and best practices -- [Configuration](../usage/configuration.md) — Performance tuning options +- [How It Works](how-it-works.md) — Overview of pool architecture +- [Safety Rules](../basics/safety-rules.md) — Scope rules and best practices +- [Configuration](../features/configuration.md) — Performance tuning options diff --git a/docs/src/architecture/type-dispatch.md b/docs/src/architecture/type-dispatch.md new file mode 100644 index 0000000..896d17f --- /dev/null +++ b/docs/src/architecture/type-dispatch.md @@ -0,0 +1,140 @@ +# Type Dispatch & Caching + +This page explains the internal mechanisms that enable zero-allocation performance. + +## Fixed-Slot Type Dispatch + +To achieve zero-lookup overhead, common types have dedicated struct fields: + +```julia +struct AdaptiveArrayPool + float64::TypedPool{Float64} + float32::TypedPool{Float32} + int64::TypedPool{Int64} + int32::TypedPool{Int32} + complexf64::TypedPool{ComplexF64} + complexf32::TypedPool{ComplexF32} + bool::TypedPool{Bool} + others::IdDict{DataType, Any} # Fallback for rare types +end +``` + +When you call `acquire!(pool, Float64, n)`, the compiler inlines directly to `pool.float64` - no dictionary lookup, no type instability. + +## N-Way Set Associative Cache + +For `unsafe_acquire!` (which returns native `Array` types), we use an N-way cache to reduce header allocation: + +``` + CACHE_WAYS = 4 (default) + +----+----+----+----+ +Slot 0 (Float64): |way0|way1|way2|way3| <-- round-robin eviction + +----+----+----+----+ + +----+----+----+----+ +Slot 1 (Float32): |way0|way1|way2|way3| + +----+----+----+----+ + ... +``` + +### Cache Lookup Logic + +```julia +function unsafe_acquire!(pool, T, dims...) + typed_pool = get_typed_pool!(pool, T) + slot = n_active + 1 + base = (slot - 1) * CACHE_WAYS + + # Search all ways for matching dimensions + for k in 1:CACHE_WAYS + idx = base + k + if dims == typed_pool.nd_dims[idx] + # Cache hit! Check if underlying vector was resized + if pointer matches + return typed_pool.nd_arrays[idx] + end + end + end + + # Cache miss: create new Array header, store in next way (round-robin) + way = typed_pool.nd_next_way[slot] + typed_pool.nd_next_way[slot] = (way % CACHE_WAYS) + 1 + # ... create and cache Array ... +end +``` + +**Key insight**: Even on cache miss, only the `Array` header (~80-144 bytes) is allocated. 
The actual data memory is always reused from the pool. + +--- + +## View vs Array: When to Use What? + +| API | Return Type | Allocation | Recommended For | +|-----|-------------|------------|-----------------| +| `acquire!` | `SubArray` / `ReshapedArray` | **Always 0 bytes** | 99% of cases | +| `unsafe_acquire!` | `Vector` / `Array` | 0-144 bytes | FFI, type constraints | + +### Why View is the Default + +1. **Zero-allocation guarantee**: Compiler eliminates view wrappers via SROA (Scalar Replacement of Aggregates) +2. **BLAS/LAPACK compatible**: Processed as `StridedArray`, no performance difference +3. **Type stable**: Always returns the same wrapper types + +### When to Use unsafe_acquire! + +1. **C FFI**: When `ccall` requires `Ptr{T}` from contiguous memory + +```julia +arr = unsafe_acquire!(pool, Float64, 100) +ccall(:c_function, Cvoid, (Ptr{Float64}, Cint), arr, 100) +``` + +2. **Type signature constraints**: Function explicitly requires `Array{T,N}` + +```julia +function process(data::Array{Float64,2}) + # Only accepts Array, not AbstractArray +end + +m = unsafe_acquire!(pool, Float64, 10, 10) +process(m) # Works +``` + +3. **Runtime dispatch avoidance**: When types are determined at runtime + +```julia +# Polymorphic code where type stability matters +function dispatch_heavy(pool, T) + arr = unsafe_acquire!(pool, T, 100) # Concrete Array type + # ... operations that would trigger dispatch with views +end +``` + +### Performance Comparison + +| Operation | acquire! (View) | unsafe_acquire! (Array) | +|-----------|-----------------|-------------------------| +| Allocation (cached) | 0 bytes | 0 bytes | +| Allocation (miss) | 0 bytes | 80-144 bytes | +| BLAS operations | Identical | Identical | +| Type stability | Guaranteed | Guaranteed | +| FFI compatibility | Requires conversion | Direct | + +### Header Size by Dimensionality + +When `unsafe_acquire!` has a cache miss: + +| Dimensions | Header Size | +|------------|-------------| +| 1D (Vector) | 80 bytes | +| 2D-3D | 112 bytes | +| 4D-5D | 144 bytes | + +This is Julia's internal `Array` metadata; actual data memory is always reused from the pool. + +--- + +## See Also + +- [How It Works](how-it-works.md) - Checkpoint/Rewind mechanism +- [Design Documents](design-docs.md) - Detailed design analysis +- [Configuration](../features/configuration.md) - Cache tuning options diff --git a/docs/src/basics/api-essentials.md b/docs/src/basics/api-essentials.md new file mode 100644 index 0000000..1c2e63e --- /dev/null +++ b/docs/src/basics/api-essentials.md @@ -0,0 +1,120 @@ +# Essential API + +This page covers the core functions you'll use 99% of the time. For the complete API reference, see [Full API](../reference/api.md). + +## Array Acquisition + +### `acquire!(pool, T, dims...)` + +The primary function. Returns a view (`SubArray` for 1D, `ReshapedArray` for N-D). + +```julia +@with_pool pool begin + v = acquire!(pool, Float64, 100) # 1D: SubArray{Float64,1} + m = acquire!(pool, Float64, 10, 10) # 2D: ReshapedArray{Float64,2} + t = acquire!(pool, Int64, 2, 3, 4) # 3D: ReshapedArray{Int64,3} +end +``` + +**Always use `acquire!` by default.** Views are zero-allocation and work with all BLAS/LAPACK operations. + +### `unsafe_acquire!(pool, T, dims...)` + +Returns a native `Array` type. 
Only use when you specifically need `Array{T,N}`: + +```julia +@with_pool pool begin + # Use when you need Array for: + arr = unsafe_acquire!(pool, Float64, 100) + + # - FFI/ccall requiring Ptr{T} + ccall(:some_c_function, Cvoid, (Ptr{Float64}, Cint), arr, length(arr)) + + # - Functions with strict Array{T,N} type signatures +end +``` + +## Convenience Functions + +Zero-initialized arrays: + +```julia +@with_pool pool begin + z = zeros!(pool, Float64, 10, 10) # All zeros + o = ones!(pool, Float64, 100) # All ones +end +``` + +Match existing array properties: + +```julia +@with_pool pool begin + A = acquire!(pool, Float64, 50, 50) + B = similar!(pool, A) # Same type and size as A + C = similar!(pool, A, ComplexF64) # Same size, different type +end +``` + +### Custom Initialization with `fill!` + +For values other than 0 or 1, use Julia's built-in `fill!`: + +```julia +@with_pool pool begin + v = acquire!(pool, Float64, 100) + fill!(v, 3.14) # Fill with pi + + m = acquire!(pool, Int64, 10, 10) + fill!(m, -1) # Fill with sentinel value +end +``` + +This pattern works because pool arrays are mutable views into the underlying storage. + +## Pool Management + +### `reset!(pool)` + +Releases all memory held by the pool. Useful for long-running processes: + +```julia +# After processing a large batch +@with_pool pool begin + # ... large computation ... +end + +# Optionally release memory if pool grew too large +reset!(get_task_local_pool()) +``` + +### `pooling_enabled(pool)` + +Check if pooling is active (returns `false` for `DisabledPool`): + +```julia +@maybe_with_pool pool begin + if pooling_enabled(pool) + println("Using pool") + else + println("Pooling disabled") + end +end +``` + +## Quick Reference + +| Function | Returns | Allocation | Use Case | +|----------|---------|------------|----------| +| `acquire!(pool, T, dims...)` | View type | 0 bytes | Default choice | +| `unsafe_acquire!(pool, T, dims...)` | `Array{T,N}` | 0-144 bytes | FFI, type constraints | +| `zeros!(pool, [T,] dims...)` | View type | 0 bytes | Zero-initialized | +| `ones!(pool, [T,] dims...)` | View type | 0 bytes | One-initialized | +| `similar!(pool, A)` | View type | 0 bytes | Match existing array | +| `reset!(pool)` | `nothing` | - | Release all memory | +| `pooling_enabled(pool)` | `Bool` | - | Check pool status | + +## See Also + +- [Full API Reference](../reference/api.md) - Complete function list +- [@with_pool Patterns](with-pool-patterns.md) - Usage patterns +- [Safety Rules](safety-rules.md) - Scope rules diff --git a/docs/src/guide/getting-started.md b/docs/src/basics/quick-start.md similarity index 85% rename from docs/src/guide/getting-started.md rename to docs/src/basics/quick-start.md index 76bb0c3..2c4f89a 100644 --- a/docs/src/guide/getting-started.md +++ b/docs/src/basics/quick-start.md @@ -1,4 +1,4 @@ -# Getting Started +# Quick Start This guide will help you get up and running with AdaptiveArrayPools.jl in minutes. @@ -105,12 +105,12 @@ end end ``` -For complete safety guidelines, see [Safety Rules](safety.md). +For complete safety guidelines, see [Safety Rules](safety-rules.md). 
## Next Steps -- [Safety Rules](safety.md) - Complete scope rules and anti-patterns -- [API Reference](../usage/api.md) - Full function and macro reference -- [Configuration](../usage/configuration.md) - Preferences and cache tuning -- [Multi-threading](../advanced/multi-threading.md) - Task/thread safety patterns -- [CUDA Support](../usage/cuda.md) - GPU backend usage +- [Safety Rules](safety-rules.md) - Complete scope rules and anti-patterns +- [Full API Reference](../reference/api.md) - Complete function and macro reference +- [Configuration](../features/configuration.md) - Preferences and cache tuning +- [Multi-threading](../features/multi-threading.md) - Task/thread safety patterns +- [CUDA Support](../features/cuda-support.md) - GPU backend usage diff --git a/docs/src/guide/safety.md b/docs/src/basics/safety-rules.md similarity index 66% rename from docs/src/guide/safety.md rename to docs/src/basics/safety-rules.md index e4eedc1..ab4685c 100644 --- a/docs/src/guide/safety.md +++ b/docs/src/basics/safety-rules.md @@ -1,8 +1,41 @@ -# Safety Guide +# Safety Rules -AdaptiveArrayPools achieves zero allocation by reusing memory across calls. This requires one simple rule: **acquired arrays are only valid within their `@with_pool` scope**. +AdaptiveArrayPools achieves zero allocation by reusing memory across calls. This requires understanding one critical rule. -## The Scope Rule +--- + +## The One Rule + +``` ++-------------------------------------------------------------+ +| | +| Pool arrays are ONLY valid within their @with_pool scope | +| | +| When the scope ends, the memory is recycled. | +| Using arrays after scope ends = UNDEFINED BEHAVIOR | +| | ++-------------------------------------------------------------+ +``` + +### What's Safe + +| Pattern | Example | Why It Works | +|---------|---------|--------------| +| Return computed values | `return sum(v)` | Scalar escapes, not the array | +| Return copies | `return copy(v)` | New allocation, independent data | +| Use within scope | `result = A * B` | Arrays valid during computation | + +### What's Dangerous + +| Pattern | Example | Why It Fails | +|---------|---------|--------------| +| Return array | `return v` | Array recycled after return | +| Store in global | `global_ref = v` | Points to recycled memory | +| Capture in closure | `() -> sum(v)` | v invalid when closure runs | + +--- + +## The Scope Rule in Detail When `@with_pool` ends, all arrays acquired within that scope are recycled. Using them after the scope ends leads to undefined behavior. @@ -107,4 +140,4 @@ end end ``` -See [Multi-Threading](../advanced/multi-threading.md) for more patterns. +See [Multi-Threading](../features/multi-threading.md) for more patterns. diff --git a/docs/src/basics/with-pool-patterns.md b/docs/src/basics/with-pool-patterns.md new file mode 100644 index 0000000..83906dd --- /dev/null +++ b/docs/src/basics/with-pool-patterns.md @@ -0,0 +1,111 @@ +# @with_pool Patterns + +The `@with_pool` macro provides automatic memory lifecycle management. It supports two usage patterns depending on your needs. + +## Pattern 1: Function Decorator + +Wraps an entire function with pool management. The pool is active for the function's full duration. + +```julia +@with_pool pool function compute(n) + A = acquire!(pool, Float64, n, n) + B = zeros!(pool, Float64, n) + + # ... compute with A and B ... 
+ + return sum(A) + sum(B) # Return computed values, not arrays +end + +# Usage +result = compute(100) # Zero-allocation after warmup +``` + +**Best for:** +- Functions that exclusively use pooled arrays +- Hot-path functions called repeatedly +- Clear ownership semantics + +## Pattern 2: Block Wrapper + +Wraps only a portion of a function. Useful when you need pool arrays for part of the computation. + +```julia +function process_data(data) + # Pre-processing (no pool needed) + n = length(data) + + @with_pool pool begin + # Pool is only active inside this block + temp = acquire!(pool, Float64, n) + temp .= data .* 2 + result = sum(temp) + end # Pool arrays recycled here + + # Post-processing + return result * 1.5 +end +``` + +**Best for:** +- Functions with mixed allocation needs +- Gradual adoption in existing code +- Fine-grained scope control + +## Pattern Comparison + +| Aspect | Function Decorator | Block Wrapper | +|--------|-------------------|---------------| +| Scope | Entire function | begin...end block | +| Syntax | `@with_pool pool function ...` | `@with_pool pool begin ... end` | +| Pool lifetime | Function start to return | Block entry to exit | +| Nesting | Functions can call each other | Blocks can be nested | + +## Nested Pools + +Both patterns support nesting. Each scope maintains independent checkpoint state: + +```julia +@with_pool pool function outer(n) + A = acquire!(pool, Float64, n) + + @with_pool pool begin + # Inner scope - new checkpoint + B = acquire!(pool, Float64, n * 2) + inner_result = sum(B) + end # B recycled here + + # A still valid here + return sum(A) + inner_result +end +``` + +## Common Mistakes + +### Returning pool arrays (wrong) + +```julia +@with_pool pool function bad() + v = acquire!(pool, Float64, 100) + return v # v is recycled after return! +end +``` + +### Correct: return computed values + +```julia +@with_pool pool function good() + v = acquire!(pool, Float64, 100) + return sum(v) # Scalar result is safe +end + +# Or copy if you need the array +@with_pool pool function also_good() + v = acquire!(pool, Float64, 100) + return copy(v) # Explicit copy is safe +end +``` + +## See Also + +- [Essential API](api-essentials.md) - Core functions for pool operations +- [Safety Rules](safety-rules.md) - Important scope rules diff --git a/docs/src/usage/configuration.md b/docs/src/features/configuration.md similarity index 100% rename from docs/src/usage/configuration.md rename to docs/src/features/configuration.md diff --git a/docs/src/usage/cuda.md b/docs/src/features/cuda-support.md similarity index 100% rename from docs/src/usage/cuda.md rename to docs/src/features/cuda-support.md diff --git a/docs/src/usage/maybe_with_pool.md b/docs/src/features/maybe-with-pool.md similarity index 98% rename from docs/src/usage/maybe_with_pool.md rename to docs/src/features/maybe-with-pool.md index 39c31b7..bb0ec30 100644 --- a/docs/src/usage/maybe_with_pool.md +++ b/docs/src/features/maybe-with-pool.md @@ -1,4 +1,4 @@ -# @maybe_with_pool +# `@maybe_with_pool` Runtime-toggleable pooling. Users can enable/disable via `MAYBE_POOLING_ENABLED[]`. 
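A rough usage sketch follows; the helper `scaled_sum` is illustrative, the toggle is assumed to be reachable by qualifying it with the module name, and it assumes that the same `acquire!` call works whether pooling is on or off, as this page describes.

```julia
using AdaptiveArrayPools

function scaled_sum(x)
    @maybe_with_pool pool begin
        buf = acquire!(pool, Float64, length(x))
        buf .= x .* 2
        sum(buf)                     # value of the block is returned
    end
end

AdaptiveArrayPools.MAYBE_POOLING_ENABLED[] = true
scaled_sum(rand(100))                # pooled path

AdaptiveArrayPools.MAYBE_POOLING_ENABLED[] = false
scaled_sum(rand(100))                # same code, pooling bypassed at runtime
```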
diff --git a/docs/src/advanced/multi-threading.md b/docs/src/features/multi-threading.md similarity index 100% rename from docs/src/advanced/multi-threading.md rename to docs/src/features/multi-threading.md diff --git a/docs/src/index.md b/docs/src/index.md index c89a222..82d9275 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -64,7 +64,7 @@ end | Allocations | ⚠️ 90,000 (2.75 GiB) | ✅ **0** | 100% eliminated | | GC Time | ⚠️ 31% | ✅ **0%** | No GC pauses | -> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](usage/cuda.md). +> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/cuda). ## How It Works @@ -76,11 +76,11 @@ end This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors. -`acquire!` returns lightweight views (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK. If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`—see [API Reference](usage/api.md). +`acquire!` returns lightweight views (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK. If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`—see [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api). -> **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](guide/safety.md). +> **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/guide/safety). -**Thread-safe by design**: Each Julia Task gets its own independent pool—no locks needed. See [Multi-Threading](advanced/multi-threading.md) for patterns. +**Thread-safe by design**: Each Julia Task gets its own independent pool—no locks needed. See [Multi-Threading](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/advanced/multi-threading) for patterns. ### Convenience Functions @@ -92,7 +92,7 @@ Common initialization patterns have convenience functions: | `ones!(pool, Float32, 3, 3)` | `acquire!` + `fill!(1)` | | `similar!(pool, A)` | `acquire!` matching `eltype(A)`, `size(A)` | -These return views like `acquire!`. For raw `Array` types, use `unsafe_acquire!` or its convenience variants (`unsafe_zeros!`, `unsafe_ones!`, `unsafe_similar!`). See [API Reference](usage/api.md#convenience-functions). +These return views like `acquire!`. For raw `Array` types, use `unsafe_acquire!` or its convenience variants (`unsafe_zeros!`, `unsafe_ones!`, `unsafe_similar!`). See [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api#convenience-functions). 
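For instance, a single pooled scope can mix these helpers; the snippet below is a minimal sketch using only the functions listed above.

```julia
using AdaptiveArrayPools

@with_pool pool begin
    Z = zeros!(pool, Float64, 4, 4)       # view, zero-filled
    O = ones!(pool, Float32, 3, 3)        # view, one-filled
    S = similar!(pool, Z)                 # view matching eltype and size of Z
    fill!(S, 2.0)
    W = unsafe_zeros!(pool, Float64, 8)   # native Vector{Float64}, zero-filled
    sum(Z) + sum(O) + sum(S) + sum(W)
end
```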
## Installation @@ -106,11 +106,11 @@ Pkg.add("AdaptiveArrayPools") | Guide | Description | |-------|-------------| -| [API Reference](usage/api.md) | Complete function and macro reference | -| [CUDA Backend](usage/cuda.md) | GPU-specific usage and examples | -| [Safety Guide](guide/safety.md) | Scope rules and best practices | -| [Multi-Threading](advanced/multi-threading.md) | Task/thread safety patterns | -| [Configuration](usage/configuration.md) | Preferences and cache tuning | +| [API Reference](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/api) | Complete function and macro reference | +| [CUDA Backend](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/cuda) | GPU-specific usage and examples | +| [Safety Guide](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/guide/safety) | Scope rules and best practices | +| [Multi-Threading](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/advanced/multi-threading) | Task/thread safety patterns | +| [Configuration](https://projecttorreypines.github.io/AdaptiveArrayPools.jl/stable/usage/configuration) | Preferences and cache tuning | ## License diff --git a/docs/src/usage/api.md b/docs/src/reference/api.md similarity index 100% rename from docs/src/usage/api.md rename to docs/src/reference/api.md From b7cb5bf5c65386c45cbcc3c2016c87744c6fe4e3 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 5 Jan 2026 12:29:35 -0800 Subject: [PATCH 8/8] docs: improve technical accuracy and add advanced pool patterns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add advanced/pool-patterns.md with direct pool access pattern - Fix memory semantics: "recycled" → "marked available for reuse" - Add warning admonition explaining undefined behavior after scope - Clarify unsafe_acquire! is zero-allocation on cache hit - Add tip admonition for cache behavior documentation - Separate basics and advanced pool patterns into distinct sections --- README.md | 2 +- docs/make.jl | 11 +- docs/src/advanced/pool-patterns.md | 199 ++++++++++++++++++++++++++ docs/src/architecture/how-it-works.md | 2 +- docs/src/basics/api-essentials.md | 7 +- docs/src/basics/quick-start.md | 2 +- docs/src/basics/safety-rules.md | 27 ++-- docs/src/basics/with-pool-patterns.md | 74 ++++------ docs/src/index.md | 2 +- 9 files changed, 258 insertions(+), 68 deletions(-) create mode 100644 docs/src/advanced/pool-patterns.md diff --git a/README.md b/README.md index 248854a..0d572ab 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ end 1. **Checkpoint** — Saves current pool state when entering the block 2. **Acquire** — `acquire!` returns arrays backed by pooled memory -3. **Rewind** — When the block ends, all acquired arrays are recycled for reuse +3. **Rewind** — When the block ends, all acquired arrays are marked available for reuse This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors. 
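As a minimal before/after sketch (the function names are illustrative, not part of the package), the swap usually amounts to replacing the constructor with the pooled equivalent:

```julia
using AdaptiveArrayPools

# Allocating version: a fresh scratch buffer on every call
function rowsums(A)
    buf = zeros(size(A, 1))
    for j in axes(A, 2)
        buf .+= @view A[:, j]
    end
    return sum(buf)
end

# Pooled version: same logic, scratch buffer comes from the task-local pool
@with_pool pool function rowsums_pooled(A)
    buf = zeros!(pool, Float64, size(A, 1))
    for j in axes(A, 2)
        buf .+= @view A[:, j]
    end
    return sum(buf)
end
```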
diff --git a/docs/make.jl b/docs/make.jl
index f385e83..d441c21 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -83,14 +83,17 @@ makedocs(
         "Home" => "index.md",
         "Basics" => [
             "Quick Start" => "basics/quick-start.md",
-            "@with_pool Patterns" => "basics/with-pool-patterns.md",
+            "`@with_pool` Patterns" => "basics/with-pool-patterns.md",
             "Essential API" => "basics/api-essentials.md",
             "Safety Rules" => "basics/safety-rules.md",
         ],
+        "Advanced" => [
+            "Pool Patterns" => "advanced/pool-patterns.md",
+            "Multi-threading" => "features/multi-threading.md",
+        ],
         "Features" => [
-            "@maybe_with_pool" => "features/maybe-with-pool.md",
+            "`@maybe_with_pool`" => "features/maybe-with-pool.md",
            "CUDA Support" => "features/cuda-support.md",
-            "Multi-threading" => "features/multi-threading.md",
             "Configuration" => "features/configuration.md",
         ],
         "Reference" => [
@@ -99,7 +102,7 @@ makedocs(
         "Architecture" => [
             "How It Works" => "architecture/how-it-works.md",
             "Type Dispatch & Cache" => "architecture/type-dispatch.md",
-            "@with_pool Internals" => "architecture/macro-internals.md",
+            "`@with_pool` Internals" => "architecture/macro-internals.md",
             "Design Documents" => "architecture/design-docs.md",
         ],
     ],
diff --git a/docs/src/advanced/pool-patterns.md b/docs/src/advanced/pool-patterns.md
new file mode 100644
index 0000000..8aedc3e
--- /dev/null
+++ b/docs/src/advanced/pool-patterns.md
@@ -0,0 +1,199 @@
+# Advanced Pool Patterns
+
+This page covers advanced usage patterns for experienced users.
+
+## Calling Other `@with_pool` Functions
+
+Each `@with_pool` function manages its own checkpoint. They can call each other freely:
+
+```julia
+@with_pool pool function step1(n)
+    A = zeros!(pool, Float64, n)
+    fill!(A, 1.0)
+    return sum(A)
+end
+
+@with_pool pool function step2(n)
+    B = zeros!(pool, Float64, n)
+    fill!(B, 2.0)
+    return sum(B)
+end
+
+@with_pool pool function pipeline(n)
+    a = step1(n)   # step1's arrays marked for reuse when it returns
+    b = step2(n)   # step2's arrays marked for reuse when it returns
+    C = acquire!(pool, Float64, n)
+    fill!(C, a + b)
+    return sum(C)
+end
+```
+
+## Passing Pool as Argument
+
+For complex call hierarchies, use `@with_pool` only at the top level and pass the pool through function arguments:
+
+```julia
+# Inner functions receive pool as argument - no @with_pool needed
+function compute_step!(pool, data, result)
+    temp = acquire!(pool, Float64, length(data))
+    temp .= data .* 2
+    result[] += sum(temp)
+end
+
+function process_chunk!(pool, chunk, result)
+    temp = zeros!(pool, Float64, length(chunk))
+    temp .= chunk                        # stage the chunk in pooled memory
+    compute_step!(pool, temp, result)    # forward the pool and the Ref accumulator
+end
+
+# Only the entry point uses @with_pool
+@with_pool pool function main_computation(chunks)
+    result = Ref(0.0)
+    for chunk in chunks
+        process_chunk!(pool, chunk, result)
+    end
+    return result[]
+end
+```
+
+**Benefits:**
+- Single checkpoint/rewind at top level
+- Inner functions are simpler (no macro needed)
+- Pool lifetime is explicit and controlled
+
+## Direct Pool Access in Inner Functions
+
+An alternative to passing the pool as an argument: inner functions call `get_task_local_pool()` directly, while a top-level `@with_pool` function controls the lifecycle.
+```julia
+# Inner functions access pool directly - no argument needed
+function compute_step!(data, result)
+    pool = get_task_local_pool()   # Direct access
+    temp = acquire!(pool, Float64, length(data))
+    temp .= data .* 2
+    result[] += sum(temp)
+    # temp NOT released here - stays active
+end
+
+function process_chunk!(chunk, accumulator)
+    pool = get_task_local_pool()   # Direct access
+    buffer = zeros!(pool, Float64, length(chunk))
+    buffer .= chunk                       # stage the chunk in pooled memory
+    compute_step!(buffer, accumulator)    # accumulator is the Ref from the caller
+    # buffer NOT released here - stays active
+end
+
+# Top-level controls lifecycle with @with_pool
+@with_pool pool function main_pipeline(chunks)
+    # checkpoint!() ─────────────────────────────────┐
+    accumulator = Ref(0.0)                          # │
+    for chunk in chunks                             # │
+        process_chunk!(chunk, accumulator)          # │ All arrays from
+        # └─ compute_step! allocates temp           # │ inner functions
+        # └─ process_chunk! allocates buffer        # │ accumulate here
+    end                                             # │
+    return accumulator[]                            # │
+    # rewind!() ─────────────────────────────────────┘
+    # └─ ALL arrays (temp, buffer, ...) marked for reuse
+end
+```
+
+### Memory Flow Visualization
+
+```
+main_pipeline(chunks)                    Inner Functions
+  │
+  checkpoint!()
+  │
+  ├──► process_chunk!()
+  │        │
+  │        ├──► get_task_local_pool() ──► buffer allocated
+  │        │
+  │        └──► compute_step!()
+  │                 │
+  │                 └──► get_task_local_pool() ──► temp allocated
+  │
+  ├──► process_chunk!()  (next iteration)
+  │        └──► ... more allocations ...
+  │
+  ▼
+  rewind!() ◄─────── ALL arrays marked for reuse
+```
+
+### ⚠️ User Responsibility Warning
+
+This pattern requires **you** to guarantee that inner functions are **always** called through a `@with_pool` entry point:
+
+```julia
+# SAFE: Called through main_pipeline
+main_pipeline(my_chunks)   # ✓ Lifecycle managed
+
+# DANGEROUS: Direct call without @with_pool wrapper
+compute_step!(some_data, some_ref)   # ✗ No checkpoint/rewind!
+# └─ Arrays allocated but NEVER marked for reuse → the pool grows without bound
+```
+
+**When to use this pattern:**
+- Deep call hierarchies where threading the pool through every function is tedious
+- Performance-critical code where you want to avoid argument passing overhead
+- You can enforce that all entry points use `@with_pool`
+
+**When to prefer "Passing Pool as Argument":**
+- Functions may be called from various contexts (some pooled, some not)
+- Library code where you can't control the caller
+- You want explicit documentation of the pool dependency in function signatures
+
+## Manual Checkpoint/Rewind
+
+For fine-grained control, use `checkpoint!` and `rewind!` directly:
+
+```julia
+function manual_control()
+    pool = get_task_local_pool()
+
+    checkpoint!(pool)
+    try
+        A = acquire!(pool, Float64, 100)
+        B = acquire!(pool, Float64, 100)
+        # ... compute ...
+        return sum(A) + sum(B)
+    finally
+        rewind!(pool)
+    end
+end
+```
+
+This is what `@with_pool` generates internally. Use manual control when:
+- Integrating with existing try/catch blocks
+- Conditional checkpoint/rewind logic is needed
+- Building custom pool management abstractions
+
+## Scope-Only `@with_pool`
+
+You can omit the pool name when inner functions handle their own `acquire!` calls:
+
+```julia
+@with_pool p function step1()
+    v = acquire!(p, Float64, 100)
+    sum(v)
+end
+
+@with_pool p function step2()
+    v = acquire!(p, Float64, 200)
+    sum(v)
+end
+
+# Outer function just provides scope management
+@with_pool function orchestrate()
+    a = step1()
+    b = step2()
+    return a + b
+end
+```
+
+The name-less `@with_pool` still performs checkpoint/rewind but doesn't expose the pool variable.
This is useful when you're orchestrating other `@with_pool` functions. + +## See Also + +- [`@with_pool` Patterns](../basics/with-pool-patterns.md) - Basic usage patterns +- [Safety Rules](../basics/safety-rules.md) - Scope rules diff --git a/docs/src/architecture/how-it-works.md b/docs/src/architecture/how-it-works.md index 39840a2..94ac084 100644 --- a/docs/src/architecture/how-it-works.md +++ b/docs/src/architecture/how-it-works.md @@ -32,7 +32,7 @@ The core mechanism that enables memory reuse: | C = acquire!(pool, ...) # n_active += 1 | ... compute ... | - +---> rewind!(pool) # Restore n_active, arrays recycled + +---> rewind!(pool) # Restore n_active, arrays available for reuse end ``` diff --git a/docs/src/basics/api-essentials.md b/docs/src/basics/api-essentials.md index 1c2e63e..b0c5e42 100644 --- a/docs/src/basics/api-essentials.md +++ b/docs/src/basics/api-essentials.md @@ -20,7 +20,7 @@ end ### `unsafe_acquire!(pool, T, dims...)` -Returns a native `Array` type. Only use when you specifically need `Array{T,N}`: +Returns a native `Array` type. **Zero-allocation on cache hit**—only allocates a small header (~80-144 bytes) on cache miss. Use when you specifically need `Array{T,N}`: ```julia @with_pool pool begin @@ -34,6 +34,9 @@ Returns a native `Array` type. Only use when you specifically need `Array{T,N}`: end ``` +!!! tip "Cache behavior" + Same dimension pattern → **0 bytes**. Different pattern → 80-144 bytes header only (data memory always reused). See [N-Way Cache](../architecture/type-dispatch.md#n-way-set-associative-cache) for details. + ## Convenience Functions Zero-initialized arrays: @@ -106,7 +109,7 @@ end | Function | Returns | Allocation | Use Case | |----------|---------|------------|----------| | `acquire!(pool, T, dims...)` | View type | 0 bytes | Default choice | -| `unsafe_acquire!(pool, T, dims...)` | `Array{T,N}` | 0-144 bytes | FFI, type constraints | +| `unsafe_acquire!(pool, T, dims...)` | `Array{T,N}` | 0 (hit) / 80-144 (miss) | FFI, type constraints | | `zeros!(pool, [T,] dims...)` | View type | 0 bytes | Zero-initialized | | `ones!(pool, [T,] dims...)` | View type | 0 bytes | One-initialized | | `similar!(pool, A)` | View type | 0 bytes | Match existing array | diff --git a/docs/src/basics/quick-start.md b/docs/src/basics/quick-start.md index 2c4f89a..87eaa5b 100644 --- a/docs/src/basics/quick-start.md +++ b/docs/src/basics/quick-start.md @@ -95,7 +95,7 @@ Arrays from the pool are **only valid within the `@with_pool` scope**: # DO NOT return pool-backed arrays @with_pool pool function bad_example() A = acquire!(pool, Float64, 10) - return A # WRONG - A will be recycled after this scope! + return A # WRONG - A marked for reuse, data may be overwritten! end # Return computed values instead diff --git a/docs/src/basics/safety-rules.md b/docs/src/basics/safety-rules.md index ab4685c..0faec74 100644 --- a/docs/src/basics/safety-rules.md +++ b/docs/src/basics/safety-rules.md @@ -11,7 +11,7 @@ AdaptiveArrayPools achieves zero allocation by reusing memory across calls. This | | | Pool arrays are ONLY valid within their @with_pool scope | | | -| When the scope ends, the memory is recycled. | +| When the scope ends, arrays are marked for reuse. | | Using arrays after scope ends = UNDEFINED BEHAVIOR | | | +-------------------------------------------------------------+ @@ -29,15 +29,15 @@ AdaptiveArrayPools achieves zero allocation by reusing memory across calls. 
This | Pattern | Example | Why It Fails | |---------|---------|--------------| -| Return array | `return v` | Array recycled after return | -| Store in global | `global_ref = v` | Points to recycled memory | -| Capture in closure | `() -> sum(v)` | v invalid when closure runs | +| Return array | `return v` | Array marked for reuse after return | +| Store in global | `global_ref = v` | Points to reusable memory | +| Capture in closure | `() -> sum(v)` | v may be overwritten when closure runs | --- ## The Scope Rule in Detail -When `@with_pool` ends, all arrays acquired within that scope are recycled. Using them after the scope ends leads to undefined behavior. +When `@with_pool` ends, all arrays acquired within that scope are **marked available for reuse**—not immediately freed. This is what makes zero-allocation possible on subsequent calls. ```julia @with_pool pool begin @@ -46,9 +46,18 @@ When `@with_pool` ends, all arrays acquired within that scope are recycled. Usin result = sum(v) # ✅ compute and return values copied = copy(v) # ✅ copy if you need data outside end -# v is no longer valid here +# v is no longer valid here - it's marked for reuse ``` +!!! warning "Why Undefined Behavior?" + After scope ends, using `v` is undefined because: + + - **Subsequent `acquire!` calls may overwrite the data** — the memory is available for reuse + - **Task termination may trigger GC** — the pool itself could be garbage collected + - **It might "work" by luck** — data unchanged until next acquire, but don't rely on this + + The worst case is **silent data corruption**: your code appears to work but produces wrong results intermittently. + ## What NOT to Do ### Don't return pool-backed arrays @@ -57,7 +66,7 @@ end # ❌ Wrong: returning the array itself @with_pool pool function bad_example() v = acquire!(pool, Float64, 100) - return v # v will be recycled after this returns! + return v # v marked for reuse after return! end # ✅ Correct: return computed values or copies @@ -75,12 +84,12 @@ global_ref = nothing @with_pool pool begin global_ref = acquire!(pool, Float64, 100) end -# global_ref now points to recycled memory +# global_ref now points to reusable memory - data may be overwritten # ❌ Wrong: capturing in closure @with_pool pool begin v = acquire!(pool, Float64, 100) - callback = () -> sum(v) # v captured but will be invalid + callback = () -> sum(v) # v captured but may be overwritten later end ``` diff --git a/docs/src/basics/with-pool-patterns.md b/docs/src/basics/with-pool-patterns.md index 83906dd..e77aa88 100644 --- a/docs/src/basics/with-pool-patterns.md +++ b/docs/src/basics/with-pool-patterns.md @@ -1,55 +1,55 @@ -# @with_pool Patterns +# `@with_pool` Patterns The `@with_pool` macro provides automatic memory lifecycle management. It supports two usage patterns depending on your needs. +## Pool Name: Choose Any Identifier + +The first argument to `@with_pool` is a **variable name** you choose - it doesn't have to be `pool`: + +```julia +@with_pool p function foo() ... end +@with_pool mypool function bar() ... end +@with_pool scratch function baz() ... end +``` + +Use whatever name makes your code clearest. + ## Pattern 1: Function Decorator -Wraps an entire function with pool management. The pool is active for the function's full duration. +Wraps an entire function with pool management: ```julia @with_pool pool function compute(n) A = acquire!(pool, Float64, n, n) B = zeros!(pool, Float64, n) - - # ... compute with A and B ... 
- - return sum(A) + sum(B) # Return computed values, not arrays + # ... compute ... + return sum(A) + sum(B) end -# Usage result = compute(100) # Zero-allocation after warmup ``` -**Best for:** -- Functions that exclusively use pooled arrays -- Hot-path functions called repeatedly -- Clear ownership semantics +**Best for:** Functions that exclusively use pooled arrays, hot-path functions. ## Pattern 2: Block Wrapper -Wraps only a portion of a function. Useful when you need pool arrays for part of the computation. +Wraps only a portion of a function: ```julia function process_data(data) - # Pre-processing (no pool needed) n = length(data) @with_pool pool begin - # Pool is only active inside this block temp = acquire!(pool, Float64, n) temp .= data .* 2 result = sum(temp) - end # Pool arrays recycled here + end # temp marked for reuse here - # Post-processing return result * 1.5 end ``` -**Best for:** -- Functions with mixed allocation needs -- Gradual adoption in existing code -- Fine-grained scope control +**Best for:** Functions with mixed allocation needs, gradual adoption. ## Pattern Comparison @@ -58,50 +58,26 @@ end | Scope | Entire function | begin...end block | | Syntax | `@with_pool pool function ...` | `@with_pool pool begin ... end` | | Pool lifetime | Function start to return | Block entry to exit | -| Nesting | Functions can call each other | Blocks can be nested | - -## Nested Pools - -Both patterns support nesting. Each scope maintains independent checkpoint state: - -```julia -@with_pool pool function outer(n) - A = acquire!(pool, Float64, n) - - @with_pool pool begin - # Inner scope - new checkpoint - B = acquire!(pool, Float64, n * 2) - inner_result = sum(B) - end # B recycled here - - # A still valid here - return sum(A) + inner_result -end -``` ## Common Mistakes -### Returning pool arrays (wrong) - ```julia +# WRONG: returning the array itself @with_pool pool function bad() v = acquire!(pool, Float64, 100) - return v # v is recycled after return! + return v # v marked for reuse after return! end -``` -### Correct: return computed values - -```julia +# CORRECT: return computed values @with_pool pool function good() v = acquire!(pool, Float64, 100) return sum(v) # Scalar result is safe end -# Or copy if you need the array +# CORRECT: return a copy if you need the data @with_pool pool function also_good() v = acquire!(pool, Float64, 100) - return copy(v) # Explicit copy is safe + return copy(v) end ``` diff --git a/docs/src/index.md b/docs/src/index.md index 82d9275..8b95f77 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -72,7 +72,7 @@ end 1. **Checkpoint** — Saves current pool state when entering the block 2. **Acquire** — `acquire!` returns arrays backed by pooled memory -3. **Rewind** — When the block ends, all acquired arrays are recycled for reuse +3. **Rewind** — When the block ends, all acquired arrays are marked available for reuse This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors.