From f9765dff8b5b442c2b2bd1241953124a3220dc7a Mon Sep 17 00:00:00 2001 From: Tim Waugh Date: Sun, 23 Nov 2025 10:23:32 +0000 Subject: [PATCH] fix(rag): store full hybrid ID in page-level chunk metadata Page-level chunks (__PAGE__) were stored with block_id="__PAGE__" in ChromaDB metadata, causing all pages to map to the same short ID ("1") in LLM prompts. This meant the LLM couldn't distinguish between different page-level chunks when making integration decisions. Changes: - Store full hybrid ID (e.g., "Andrew McNamara::__PAGE__") instead of just "__PAGE__" in metadata block_id field - Bump INDEX_SCHEMA_VERSION to 5 to trigger automatic reindex - Each page-level chunk now gets a unique short ID (1, 2, 3, ...) Impact: - Before: short ID "1" for all pages (collision) - After: short IDs "1", "2", "3" (unique) - LLM can now reference specific page-level chunks correctly The schema version bump ensures automatic index rebuild on next logsqueak extract/search command with no user intervention needed. Assisted-by: Claude Code --- src/logsqueak/services/page_indexer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/logsqueak/services/page_indexer.py b/src/logsqueak/services/page_indexer.py index 0b4fa20..685f4a4 100644 --- a/src/logsqueak/services/page_indexer.py +++ b/src/logsqueak/services/page_indexer.py @@ -23,7 +23,8 @@ # - 2: Added deleted page cleanup, version tracking # - 3: Added page_frontmatter to metadata (avoids re-parsing pages during RAG search) # - 4: Pre-clean contexts during indexing (strip id:: and page properties) -INDEX_SCHEMA_VERSION = 4 +# - 5: Store full hybrid ID in page-level chunk metadata to avoid ID collisions in LLM prompts +INDEX_SCHEMA_VERSION = 5 def generate_graph_db_name(graph_path: Path) -> str: @@ -421,7 +422,7 @@ def _prepare_page_chunks( "document": page_context, "metadata": { "page_name": page_name, - "block_id": "__PAGE__", + "block_id": page_chunk_id, # Full hybrid ID to avoid collision in LLM prompts "mtime": mtime, "page_title": 
page_title, # Store title:: for display "page_frontmatter": json.dumps(page_frontmatter)