From f75b46bcf9fa72c99104ce2aa5f52cb93facd53e Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Feb 2026 04:18:24 +0000 Subject: [PATCH 01/39] mit-salvage: reintroduce _graph edges accelerator + admin lifecycle parity feat(graph-edges): add qdrant _graph backfill + symbol_graph accel; mirror admin lifecycle feat: materialize _graph edges and use for symbol_graph; ensure clone/delete handles companion collections graph: add _graph edge collection + backfill; prune/watcher cleanup; admin copy/delete parity feat(admin+graph): copy/delete companion _graph collections; add UI status + tests --- scripts/collection_admin.py | 33 ++- scripts/indexing_admin.py | 12 + scripts/ingest/graph_edges.py | 363 ++++++++++++++++++++++++++ scripts/ingest/pipeline.py | 66 +++++ scripts/ingest_code.py | 22 ++ scripts/mcp_impl/symbol_graph.py | 189 +++++++++++++- scripts/prune.py | 39 +-- scripts/upload_service.py | 54 +++- scripts/watch_index_core/processor.py | 9 + scripts/watch_index_core/pseudo.py | 26 +- templates/admin/acl.html | 26 ++ tests/test_admin_collection_delete.py | 23 ++ tests/test_staging_lifecycle.py | 48 ++++ 13 files changed, 879 insertions(+), 31 deletions(-) create mode 100644 scripts/ingest/graph_edges.py diff --git a/scripts/collection_admin.py b/scripts/collection_admin.py index d970e941..2882fc78 100644 --- a/scripts/collection_admin.py +++ b/scripts/collection_admin.py @@ -193,6 +193,7 @@ def delete_collection_everywhere( out: Dict[str, Any] = { "collection": name, "qdrant_deleted": False, + "qdrant_graph_deleted": False, "registry_marked_deleted": False, "deleted_state_files": 0, "deleted_managed_workspaces": 0, @@ -209,6 +210,14 @@ def delete_collection_everywhere( out["qdrant_deleted"] = True except Exception: out["qdrant_deleted"] = False + # Best-effort: also delete companion graph edges collection when present. + # This branch stores file-level edges in `_graph`. 
+ if not name.endswith("_graph"): + try: + cli.delete_collection(collection_name=f"{name}_graph") + out["qdrant_graph_deleted"] = True + except Exception: + out["qdrant_graph_deleted"] = False except Exception: out["qdrant_deleted"] = False @@ -359,8 +368,10 @@ def _manual_copy_points() -> None: vectors_config = None sparse_vectors_config = None + # Support vector-less collections (e.g. payload-only graph edge collections). if vectors_config is None: - raise RuntimeError(f"Cannot determine vectors config for source collection {src}") + vectors_config = {} + vectorless = isinstance(vectors_config, dict) and not vectors_config try: cli.create_collection( @@ -401,7 +412,7 @@ def _manual_copy_points() -> None: limit=batch_limit, offset=offset, with_payload=True, - with_vectors=True, + with_vectors=(not vectorless), ) except Exception as exc: raise RuntimeError(f"Failed to scroll points from {src}: {exc}") from exc @@ -414,7 +425,9 @@ def _manual_copy_points() -> None: point_id = getattr(record, "id", None) payload = getattr(record, "payload", None) vector = None - if hasattr(record, "vector") and getattr(record, "vector") is not None: + if vectorless: + vector = {} + elif hasattr(record, "vector") and getattr(record, "vector") is not None: vector = getattr(record, "vector") elif hasattr(record, "vectors") and getattr(record, "vectors") is not None: vector = getattr(record, "vectors") @@ -477,4 +490,18 @@ def _count_points(name: str) -> Optional[int]: # The manual path guarantees the destination gets the exact same points/payloads/vectors. _manual_copy_points() + # Best-effort: copy the companion graph collection when copying a base collection. + # Graph edges are derived data and can be rebuilt, but copying avoids a cold-start window + # during staging cutovers where the clone has no graph. 
+ if not src.endswith("_graph") and not dest.endswith("_graph"): + try: + copy_collection_qdrant( + source=f"{src}_graph", + target=f"{dest}_graph", + qdrant_url=base_url, + overwrite=overwrite, + ) + except Exception: + pass + return dest diff --git a/scripts/indexing_admin.py b/scripts/indexing_admin.py index f3bb69d8..260feecf 100644 --- a/scripts/indexing_admin.py +++ b/scripts/indexing_admin.py @@ -927,6 +927,12 @@ def delete_collection_qdrant(*, qdrant_url: str, api_key: Optional[str], collect return try: cli.delete_collection(collection_name=name) + # Best-effort: also delete companion graph edges collection when present. + if not name.endswith("_graph"): + try: + cli.delete_collection(collection_name=f"{name}_graph") + except Exception: + pass except Exception: pass finally: @@ -951,6 +957,12 @@ def recreate_collection_qdrant(*, qdrant_url: str, api_key: Optional[str], colle cli.delete_collection(collection_name=name) except Exception as delete_error: raise RuntimeError(f"Failed to delete existing collection '{name}' in Qdrant: {delete_error}") from delete_error + # Best-effort: also delete companion graph edges collection when present. + if not name.endswith("_graph"): + try: + cli.delete_collection(collection_name=f"{name}_graph") + except Exception: + pass finally: try: cli.close() diff --git a/scripts/ingest/graph_edges.py b/scripts/ingest/graph_edges.py new file mode 100644 index 00000000..d7167e8f --- /dev/null +++ b/scripts/ingest/graph_edges.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python3 +""" +ingest/graph_edges.py - Materialized graph edges in Qdrant. 
+ +This is a small, MIT-safe reimplementation of the "graph edges collection" idea: +- Maintain a dedicated Qdrant collection named `_graph` +- Store payload-only edge docs for fast lookups: + - callers/importers queries become simple keyword filters on an indexed payload field + +Design goals for this branch: +- Keep this as an *accelerator* (symbol_graph still works without it) +- Avoid Neo4j/PageRank/GraphRAG complexity +- Avoid CLI flags; watcher can backfill opportunistically +""" + +from __future__ import annotations + +import hashlib +import logging +import os +from typing import Any, Dict, Iterable, List, Optional, Tuple + +logger = logging.getLogger(__name__) + +GRAPH_COLLECTION_SUFFIX = "_graph" + +EDGE_TYPE_CALLS = "calls" +EDGE_TYPE_IMPORTS = "imports" + +GRAPH_INDEX_FIELDS: Tuple[str, ...] = ( + "caller_path", + "callee_symbol", + "edge_type", + "repo", +) + +_ENSURED_GRAPH_COLLECTIONS: set[str] = set() +_GRAPH_VECTOR_MODE: dict[str, str] = {} +_MISSING_GRAPH_COLLECTIONS: set[str] = set() +_BACKFILL_OFFSETS: dict[tuple[str, Optional[str]], Any] = {} + +_EDGE_VECTOR_NAME = "_edge" +_EDGE_VECTOR_VALUE = [0.0] + + +def _normalize_path(path: str) -> str: + if not path: + return "" + try: + normalized = os.path.normpath(str(path)) + except Exception: + normalized = str(path) + return normalized.replace("\\", "/") + + +def get_graph_collection_name(base_collection: str) -> str: + return f"{base_collection}{GRAPH_COLLECTION_SUFFIX}" + + +def _edge_vector_for_upsert(graph_collection: str) -> dict: + mode = _GRAPH_VECTOR_MODE.get(graph_collection) + if mode == "named": + return {_EDGE_VECTOR_NAME: _EDGE_VECTOR_VALUE} + return {} + + +def ensure_graph_collection(client: Any, base_collection: str) -> Optional[str]: + """Ensure `_graph` exists and has payload indexes.""" + from qdrant_client import models as qmodels + + if not base_collection: + return None + graph_coll = get_graph_collection_name(base_collection) + if graph_coll in _ENSURED_GRAPH_COLLECTIONS: + 
return graph_coll + + def _detect_vector_mode(info: Any) -> str: + try: + vectors = getattr( + getattr(getattr(info, "config", None), "params", None), "vectors", None + ) + if isinstance(vectors, dict): + return "none" if not vectors else "named" + return "none" if vectors is None else "named" + except Exception: + return "named" + + try: + info = client.get_collection(graph_coll) + _GRAPH_VECTOR_MODE[graph_coll] = _detect_vector_mode(info) + _ENSURED_GRAPH_COLLECTIONS.add(graph_coll) + _MISSING_GRAPH_COLLECTIONS.discard(graph_coll) + return graph_coll + except Exception: + pass + + try: + # Prefer vector-less collection when supported by server/client. + try: + client.create_collection( + collection_name=graph_coll, + vectors_config={}, + ) + _GRAPH_VECTOR_MODE[graph_coll] = "none" + except Exception: + client.create_collection( + collection_name=graph_coll, + vectors_config={ + _EDGE_VECTOR_NAME: qmodels.VectorParams( + size=1, distance=qmodels.Distance.COSINE + ) + }, + ) + _GRAPH_VECTOR_MODE[graph_coll] = "named" + + # Create payload indexes (best-effort). 
+ for field in GRAPH_INDEX_FIELDS: + try: + client.create_payload_index( + collection_name=graph_coll, + field_name=field, + field_schema=qmodels.PayloadSchemaType.KEYWORD, + ) + except Exception: + pass + + _ENSURED_GRAPH_COLLECTIONS.add(graph_coll) + _MISSING_GRAPH_COLLECTIONS.discard(graph_coll) + return graph_coll + except Exception as e: + logger.debug("Failed to ensure graph collection %s: %s", graph_coll, e) + return None + + +def _edge_id(edge_type: str, repo: str, caller_path: str, callee_symbol: str) -> str: + key = f"{edge_type}:{repo}:{caller_path}:{callee_symbol}" + return hashlib.sha256(key.encode("utf-8", errors="ignore")).hexdigest()[:32] + + +def _iter_edges( + *, + caller_path: str, + repo: str, + calls: Iterable[str] = (), + imports: Iterable[str] = (), +) -> List[Dict[str, Any]]: + norm_path = _normalize_path(caller_path) + repo_s = (repo or "").strip() or "default" + + edges: List[Dict[str, Any]] = [] + for sym in calls or []: + s = str(sym).strip() + if not s: + continue + edges.append( + { + "id": _edge_id(EDGE_TYPE_CALLS, repo_s, norm_path, s), + "payload": { + "caller_path": norm_path, + "callee_symbol": s, + "edge_type": EDGE_TYPE_CALLS, + "repo": repo_s, + }, + } + ) + for sym in imports or []: + s = str(sym).strip() + if not s: + continue + edges.append( + { + "id": _edge_id(EDGE_TYPE_IMPORTS, repo_s, norm_path, s), + "payload": { + "caller_path": norm_path, + "callee_symbol": s, + "edge_type": EDGE_TYPE_IMPORTS, + "repo": repo_s, + }, + } + ) + return edges + + +def upsert_file_edges( + client: Any, + base_collection: str, + *, + caller_path: str, + repo: str | None, + calls: List[str] | None = None, + imports: List[str] | None = None, +) -> int: + graph_coll = ensure_graph_collection(client, base_collection) + if not graph_coll: + return 0 + edges = _iter_edges( + caller_path=caller_path, + repo=repo or "default", + calls=calls or [], + imports=imports or [], + ) + if not edges: + return 0 + + from qdrant_client import models as 
qmodels + + points = [ + qmodels.PointStruct( + id=e["id"], + vector=_edge_vector_for_upsert(graph_coll), + payload=e["payload"], + ) + for e in edges + ] + try: + client.upsert(collection_name=graph_coll, points=points, wait=True) + return len(points) + except Exception as e: + logger.debug("Graph edge upsert failed for %s: %s", caller_path, e) + return 0 + + +def delete_edges_by_path( + client: Any, + base_collection: str, + *, + caller_path: str, + repo: str | None = None, +) -> int: + graph_coll = get_graph_collection_name(base_collection) + if graph_coll in _MISSING_GRAPH_COLLECTIONS: + return 0 + + from qdrant_client import models as qmodels + + norm_path = _normalize_path(caller_path) + must: list[Any] = [ + qmodels.FieldCondition( + key="caller_path", match=qmodels.MatchValue(value=norm_path) + ) + ] + if repo: + r = str(repo).strip() + if r and r != "*": + must.append( + qmodels.FieldCondition(key="repo", match=qmodels.MatchValue(value=r)) + ) + + try: + client.delete( + collection_name=graph_coll, + points_selector=qmodels.FilterSelector(filter=qmodels.Filter(must=must)), + ) + return 1 + except Exception as e: + err = str(e).lower() + if "404" in err or "doesn't exist" in err or "not found" in err: + _MISSING_GRAPH_COLLECTIONS.add(graph_coll) + return 0 + + +def graph_edges_backfill_tick( + client: Any, + base_collection: str, + *, + repo_name: str | None = None, + max_files: int = 128, +) -> int: + """Best-effort incremental backfill from `` into `_graph`. + + This scans the main collection and upserts file-level edges into the graph collection. + It's idempotent (deterministic IDs) and safe to run continuously in a watcher worker. 
+ """ + from qdrant_client import models as qmodels + + if not base_collection or max_files <= 0: + return 0 + + graph_coll = ensure_graph_collection(client, base_collection) + if not graph_coll: + return 0 + + must: list[Any] = [] + if repo_name: + must.append( + qmodels.FieldCondition( + key="metadata.repo", match=qmodels.MatchValue(value=repo_name) + ) + ) + flt = qmodels.Filter(must=must or None) + + processed_files = 0 + seen_paths: set[str] = set() + + key = (base_collection, repo_name) + next_offset = _BACKFILL_OFFSETS.get(key) + + # We may need to overscan because the main collection is chunked. + overscan = max_files * 8 + while processed_files < max_files: + try: + points, next_offset = client.scroll( + collection_name=base_collection, + scroll_filter=flt, + limit=min(64, overscan), + with_payload=True, + with_vectors=False, + offset=next_offset, + ) + except Exception: + break + + if not points: + break + + for rec in points: + if processed_files >= max_files: + break + payload = getattr(rec, "payload", None) or {} + md = payload.get("metadata") or {} + path = md.get("path") or "" + if not path: + continue + norm_path = _normalize_path(str(path)) + if norm_path in seen_paths: + continue + seen_paths.add(norm_path) + + repo = md.get("repo") or repo_name or "default" + calls = md.get("calls") or [] + imports = md.get("imports") or [] + if not isinstance(calls, list): + calls = [] + if not isinstance(imports, list): + imports = [] + + upsert_file_edges( + client, + base_collection, + caller_path=norm_path, + repo=str(repo), + calls=[str(x) for x in calls if x], + imports=[str(x) for x in imports if x], + ) + processed_files += 1 + + if next_offset is None: + break + + _BACKFILL_OFFSETS[key] = next_offset + return processed_files + + +__all__ = [ + "GRAPH_COLLECTION_SUFFIX", + "EDGE_TYPE_CALLS", + "EDGE_TYPE_IMPORTS", + "get_graph_collection_name", + "ensure_graph_collection", + "upsert_file_edges", + "delete_edges_by_path", + "graph_edges_backfill_tick", +] 
diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index 45716049..0e0aae1f 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -653,6 +653,40 @@ def make_point(pid, dense_vec, lex_vec, payload, lex_text: str = "", code_text: for i, v, lx, m, lt, ct in zip(batch_ids, vectors, batch_lex, batch_meta, batch_lex_text, batch_code) ] upsert_points(client, collection, points) + # Optional: materialize file-level graph edges in a companion `_graph` store. + # This is an accelerator for symbol_graph callers/importers and is safe to skip on failure. + try: + enabled = str(os.environ.get("GRAPH_EDGES_ENABLE", "1") or "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + if enabled: + from scripts.ingest.graph_edges import ( + delete_edges_by_path as _delete_edges_by_path, + ensure_graph_collection as _ensure_graph_collection, + upsert_file_edges as _upsert_file_edges, + ) + + _ensure_graph_collection(client, collection) + # Important: delete stale edges for this file before upserting the new set. + _delete_edges_by_path( + client, + collection, + caller_path=str(file_path), + repo=repo_tag, + ) + _upsert_file_edges( + client, + collection, + caller_path=str(file_path), + repo=repo_tag, + calls=calls, + imports=imports, + ) + except Exception: + pass try: ws = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" if set_cached_file_hash: @@ -1367,6 +1401,38 @@ def process_file_with_smart_reindexing( if all_points: _upsert_points_fn(client, current_collection, all_points) + # Optional: materialize file-level graph edges (best-effort). 
+ try: + enabled = str(os.environ.get("GRAPH_EDGES_ENABLE", "1") or "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + if enabled: + from scripts.ingest.graph_edges import ( + delete_edges_by_path as _delete_edges_by_path, + ensure_graph_collection as _ensure_graph_collection, + upsert_file_edges as _upsert_file_edges, + ) + + _ensure_graph_collection(client, current_collection) + _delete_edges_by_path( + client, + current_collection, + caller_path=str(file_path), + repo=per_file_repo, + ) + _upsert_file_edges( + client, + current_collection, + caller_path=str(file_path), + repo=per_file_repo, + calls=calls, + imports=imports, + ) + except Exception: + pass try: if set_cached_symbols: diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 2da24911..93088ae0 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -212,6 +212,24 @@ index_repo, process_file_with_smart_reindexing, ) + +# --------------------------------------------------------------------------- +# Graph edges (optional accelerator) +# --------------------------------------------------------------------------- +try: + from scripts.ingest.graph_edges import ( + graph_edges_backfill_tick, + delete_edges_by_path as delete_graph_edges_by_path, + upsert_file_edges as upsert_graph_edges_for_file, + ) +except ImportError: + graph_edges_backfill_tick = None # type: ignore[assignment] + + def delete_graph_edges_by_path(*_args, **_kwargs) -> int: + return 0 + + def upsert_graph_edges_for_file(*_args, **_kwargs) -> int: + return 0 # --------------------------------------------------------------------------- # Re-exports from ingest/cli.py # --------------------------------------------------------------------------- @@ -338,6 +356,10 @@ def main(): "index_repo", "process_file_with_smart_reindexing", "pseudo_backfill_tick", + # Graph edges (optional) + "graph_edges_backfill_tick", + "delete_graph_edges_by_path", + "upsert_graph_edges_for_file", # CLI "main", # Backward compat 
diff --git a/scripts/mcp_impl/symbol_graph.py b/scripts/mcp_impl/symbol_graph.py index da518ac4..2967035b 100644 --- a/scripts/mcp_impl/symbol_graph.py +++ b/scripts/mcp_impl/symbol_graph.py @@ -24,6 +24,9 @@ logger = logging.getLogger(__name__) +GRAPH_COLLECTION_SUFFIX = "_graph" +_MISSING_GRAPH_COLLECTIONS: set[str] = set() + __all__ = [ "_symbol_graph_impl", "_format_symbol_graph_toon", @@ -195,16 +198,28 @@ async def _symbol_graph_impl( try: if query_type == "callers": - # Find chunks where metadata.calls array contains the symbol (exact match) - results = await _query_array_field( + # Prefer graph edges collection when available (fast keyword filters). + results = await _query_graph_edges_collection( client=client, collection=coll, - field_key="metadata.calls", - value=symbol, + symbol=symbol, + edge_type="calls", limit=limit, language=language, + repo_filter=None, under=_norm_under(under), ) + if not results: + # Fall back to array field lookup in the main collection. + results = await _query_array_field( + client=client, + collection=coll, + field_key="metadata.calls", + value=symbol, + limit=limit, + language=language, + under=_norm_under(under), + ) elif query_type == "definition": # Find chunks where symbol_path matches the symbol results = await _query_definition( @@ -216,16 +231,27 @@ async def _symbol_graph_impl( under=_norm_under(under), ) elif query_type == "importers": - # Find chunks where metadata.imports array contains the symbol - results = await _query_array_field( + results = await _query_graph_edges_collection( client=client, collection=coll, - field_key="metadata.imports", - value=symbol, + symbol=symbol, + edge_type="imports", limit=limit, language=language, + repo_filter=None, under=_norm_under(under), ) + if not results: + # Fall back to array field lookup in the main collection. 
+ results = await _query_array_field( + client=client, + collection=coll, + field_key="metadata.imports", + value=symbol, + limit=limit, + language=language, + under=_norm_under(under), + ) # If no results, fall back to semantic search if not results: @@ -259,6 +285,153 @@ async def _symbol_graph_impl( } +async def _query_graph_edges_collection( + client: Any, + collection: str, + symbol: str, + edge_type: str, + limit: int, + language: Optional[str] = None, + repo_filter: str | None = None, + under: str | None = None, +) -> List[Dict[str, Any]]: + """Query `_graph` and hydrate results from the main collection. + + The graph collection stores file-level edges: + - caller_path -> callee_symbol (calls/imports) + """ + from qdrant_client import models as qmodels + + graph_coll = f"{collection}{GRAPH_COLLECTION_SUFFIX}" + if graph_coll in _MISSING_GRAPH_COLLECTIONS: + return [] + + # Build graph filter + must: list[Any] = [ + qmodels.FieldCondition( + key="edge_type", match=qmodels.MatchValue(value=str(edge_type)) + ) + ] + if repo_filter: + rf = str(repo_filter).strip() + if rf and rf != "*": + must.append( + qmodels.FieldCondition(key="repo", match=qmodels.MatchValue(value=rf)) + ) + + # Try exact match, then symbol variants. 
+ callee_variants = _symbol_variants(symbol) or [symbol] + seen_paths: set[str] = set() + caller_paths: List[str] = [] + + for variant in callee_variants: + if len(caller_paths) >= limit: + break + v = str(variant).strip() + if not v: + continue + flt = qmodels.Filter( + must=must + + [ + qmodels.FieldCondition( + key="callee_symbol", match=qmodels.MatchValue(value=v) + ) + ] + ) + + def _scroll(): + return client.scroll( + collection_name=graph_coll, + scroll_filter=flt, + limit=max(32, limit * 4), + with_payload=True, + with_vectors=False, + ) + + try: + points, _ = await asyncio.to_thread(_scroll) + except Exception as e: + err = str(e).lower() + if "404" in err or "doesn't exist" in err or "not found" in err: + _MISSING_GRAPH_COLLECTIONS.add(graph_coll) + return [] + logger.exception( + "_query_graph_edges_collection scroll failed for %s", graph_coll + ) + raise + + for rec in points or []: + payload = getattr(rec, "payload", None) or {} + p = payload.get("caller_path") or "" + if not p: + continue + path_s = str(p) + if under and not str(path_s).startswith(str(under)): + continue + if path_s in seen_paths: + continue + seen_paths.add(path_s) + caller_paths.append(path_s) + if len(caller_paths) >= limit: + break + + if not caller_paths: + return [] + + # Hydrate caller paths back into normal symbol_graph point-shaped results. 
+ hydrated: List[Dict[str, Any]] = [] + for p in caller_paths[:limit]: + if len(hydrated) >= limit: + break + + def _scroll_main(): + must = [ + qmodels.FieldCondition( + key="metadata.path", match=qmodels.MatchValue(value=p) + ) + ] + if language: + must.append( + qmodels.FieldCondition( + key="metadata.language", + match=qmodels.MatchValue(value=str(language).lower()), + ) + ) + return client.scroll( + collection_name=collection, + scroll_filter=qmodels.Filter( + must=must + ), + limit=1, + with_payload=True, + with_vectors=False, + ) + + try: + pts, _ = await asyncio.to_thread(_scroll_main) + except Exception: + pts = [] + + if pts: + hydrated.append(_format_point(pts[0])) + else: + # If language filtering was requested but no matching main-collection doc + # exists (or hydration failed), skip returning a placeholder to avoid + # producing language-inconsistent results. + if not language: + hydrated.append( + { + "path": p, + "symbol": "", + "symbol_path": "", + "start_line": 0, + "end_line": 0, + } + ) + + return hydrated + + async def _query_array_field( client: Any, collection: str, diff --git a/scripts/prune.py b/scripts/prune.py index 5e2f14fb..90e22863 100755 --- a/scripts/prune.py +++ b/scripts/prune.py @@ -39,26 +39,33 @@ def delete_by_path(client: QdrantClient, path_str: str) -> int: return 0 -def delete_graph_edges_by_path(client: QdrantClient, path_str: str) -> int: +def delete_graph_edges_by_path(client: QdrantClient, path_str: str, repo: str | None = None) -> int: """Best-effort deletion for graph-edge collections (if present). Some deployments store symbol-graph edges in a separate Qdrant collection - (commonly `${COLLECTION}_graph`). Those points may reference a file path as - either caller or callee; delete both to prevent stale graph results. + (commonly `${COLLECTION}_graph`). On this branch, edge docs are file-level and + reference a file path as `caller_path`. 
""" if not path_str: return 0 - - flt = models.Filter( - should=[ - models.FieldCondition( - key="caller_path", match=models.MatchValue(value=path_str) - ), - models.FieldCondition( - key="callee_path", match=models.MatchValue(value=path_str) - ), - ] - ) + try: + path_str = os.path.normpath(str(path_str)).replace("\\", "/") + except Exception: + path_str = str(path_str) + + must = [ + models.FieldCondition(key="caller_path", match=models.MatchValue(value=path_str)) + ] + if repo: + try: + r = str(repo).strip() + except Exception: + r = "" + if r and r != "*": + must.append( + models.FieldCondition(key="repo", match=models.MatchValue(value=r)) + ) + flt = models.Filter(must=must) try: res = client.delete( collection_name=GRAPH_COLLECTION, @@ -116,13 +123,13 @@ def main(): ) if not abs_path.exists(): removed_missing += delete_by_path(client, path_str) - removed_graph_edges += delete_graph_edges_by_path(client, path_str) + removed_graph_edges += delete_graph_edges_by_path(client, path_str, md.get("repo")) print(f"[prune] removed missing file points: {path_str}") continue current_hash = sha1_file(abs_path) if file_hash and current_hash and current_hash != file_hash: removed_mismatch += delete_by_path(client, path_str) - removed_graph_edges += delete_graph_edges_by_path(client, path_str) + removed_graph_edges += delete_graph_edges_by_path(client, path_str, md.get("repo")) print(f"[prune] removed outdated points (hash mismatch): {path_str}") if next_page is None: diff --git a/scripts/upload_service.py b/scripts/upload_service.py index 6771d652..067e2e48 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -935,7 +935,7 @@ async def admin_delete_collection( cleanup_fs = False try: - delete_collection_everywhere( + out = delete_collection_everywhere( collection=name, work_dir=WORK_DIR, qdrant_url=QDRANT_URL, @@ -949,7 +949,23 @@ async def admin_delete_collection( back_href="/admin/acl", ) - return RedirectResponse(url="/admin/acl", status_code=302) + 
graph_deleted: Optional[str] = None + try: + if isinstance(out, dict) and not name.endswith("_graph"): + graph_deleted = "1" if bool(out.get("qdrant_graph_deleted")) else "0" + except Exception: + graph_deleted = None + + try: + from urllib.parse import urlencode + + params = {"deleted": name} + if graph_deleted is not None: + params["graph_deleted"] = graph_deleted + url = "/admin/acl?" + urlencode(params) + except Exception: + url = "/admin/acl" + return RedirectResponse(url=url, status_code=302) @app.post("/admin/staging/start") @@ -1222,7 +1238,39 @@ async def admin_copy_collection( back_href="/admin/acl", ) - return RedirectResponse(url="/admin/acl", status_code=302) + graph_copied: Optional[str] = None + try: + if not name.endswith("_graph") and not str(new_name).endswith("_graph"): + from qdrant_client import QdrantClient # type: ignore + + cli = QdrantClient( + url=QDRANT_URL, + api_key=os.environ.get("QDRANT_API_KEY"), + timeout=float(os.environ.get("QDRANT_TIMEOUT", "5") or 5), + ) + try: + cli.get_collection(collection_name=f"{new_name}_graph") + graph_copied = "1" + except Exception: + graph_copied = "0" + finally: + try: + cli.close() + except Exception: + pass + except Exception: + graph_copied = None + + try: + from urllib.parse import urlencode + + params = {"copied": name, "new": new_name} + if graph_copied is not None: + params["graph_copied"] = graph_copied + url = "/admin/acl?" 
+ urlencode(params) + except Exception: + url = "/admin/acl" + return RedirectResponse(url=url, status_code=302) @app.post("/admin/users") diff --git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index 45e9db7e..6f529cf7 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -251,6 +251,15 @@ def _process_paths( if client is not None: try: idx.delete_points_by_path(client, collection, str(p)) + try: + idx.delete_graph_edges_by_path( + client, + collection, + caller_path=str(p), + repo=repo_name, + ) + except Exception: + pass safe_print(f"[deleted] {p} -> {collection}") except Exception: pass diff --git a/scripts/watch_index_core/pseudo.py b/scripts/watch_index_core/pseudo.py index dc7bb0a8..ec76d7a0 100644 --- a/scripts/watch_index_core/pseudo.py +++ b/scripts/watch_index_core/pseudo.py @@ -90,6 +90,31 @@ def _worker() -> None: "[pseudo_backfill] repo=%s collection=%s processed=%d", repo_name or "default", coll, processed, ) + # Optional: backfill graph edge collection from main points. + # Controlled separately because it may scan large collections over time. 
+ if get_boolean_env("GRAPH_EDGES_BACKFILL"): + try: + files_done = idx.graph_edges_backfill_tick( + client, + coll, + repo_name=repo_name, + max_files=max_points, + ) + if files_done: + logger.info( + "[graph_backfill] repo=%s collection=%s files=%d", + repo_name or "default", + coll, + files_done, + ) + except Exception as exc: + logger.error( + "[graph_backfill] error repo=%s collection=%s: %s", + repo_name or "default", + coll, + exc, + exc_info=True, + ) except Exception as exc: logger.error( "[pseudo_backfill] error repo=%s collection=%s: %s", @@ -110,4 +135,3 @@ def _worker() -> None: __all__ = ["_start_pseudo_backfill_worker"] - diff --git a/templates/admin/acl.html b/templates/admin/acl.html index 952a0ce9..98292beb 100644 --- a/templates/admin/acl.html +++ b/templates/admin/acl.html @@ -1,6 +1,32 @@ {% extends "admin/base.html" %} {% block content %} + {% set qp = request.query_params %} + {% if qp and ((qp.get("copied") and qp.get("new")) or qp.get("deleted")) %} +
+ {% if qp.get("copied") and qp.get("new") %} +
+ Copied collection {{ qp.get("copied") }}{{ qp.get("new") }}. + {% if qp.get("graph_copied") == "1" %} + (graph clone copied) + {% elif qp.get("graph_copied") == "0" %} + (graph clone not copied; will rebuild/backfill) + {% endif %} +
+ {% endif %} + {% if qp.get("deleted") %} +
+ Deleted collection {{ qp.get("deleted") }}. + {% if qp.get("graph_deleted") == "1" %} + (graph clone deleted) + {% elif qp.get("graph_deleted") == "0" %} + (graph clone not deleted or missing) + {% endif %} +
+ {% endif %} +
+ {% endif %} +

Users

diff --git a/tests/test_admin_collection_delete.py b/tests/test_admin_collection_delete.py index ad42807e..c407b5da 100644 --- a/tests/test_admin_collection_delete.py +++ b/tests/test_admin_collection_delete.py @@ -52,6 +52,29 @@ def test_admin_role_gate_blocks_non_admin(monkeypatch): assert resp.json().get("detail") == "Admin required" +@pytest.mark.unit +def test_delete_redirect_includes_graph_deleted_param(monkeypatch): + monkeypatch.setenv("CTXCE_AUTH_ENABLED", "1") + monkeypatch.setenv("CTXCE_ADMIN_COLLECTION_DELETE_ENABLED", "1") + + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + + monkeypatch.setattr(srv, "_require_admin_session", lambda _req: {"user_id": "admin"}) + + def _fake_delete_collection_everywhere(**_kwargs): + return {"qdrant_deleted": True, "qdrant_graph_deleted": True} + + monkeypatch.setattr(srv, "delete_collection_everywhere", _fake_delete_collection_everywhere) + + client = TestClient(srv.app) + resp = client.post("/admin/collections/delete", data={"collection": "c1", "delete_fs": ""}, follow_redirects=False) + assert resp.status_code == 302 + loc = resp.headers.get("location") or "" + assert "deleted=c1" in loc + assert "graph_deleted=1" in loc + + @pytest.mark.unit def test_collection_admin_refuses_when_env_disabled(monkeypatch): monkeypatch.setenv("CTXCE_ADMIN_COLLECTION_DELETE_ENABLED", "0") diff --git a/tests/test_staging_lifecycle.py b/tests/test_staging_lifecycle.py index 01734e32..d7bb9cbf 100644 --- a/tests/test_staging_lifecycle.py +++ b/tests/test_staging_lifecycle.py @@ -542,6 +542,54 @@ def fake_abort(**kwargs): assert calls["abort"] == 1 +def test_admin_copy_endpoint_reports_graph_clone_in_redirect(monkeypatch: pytest.MonkeyPatch): + import sys + import types + from urllib.parse import parse_qs, urlparse + + from scripts import upload_service + + monkeypatch.setattr(upload_service, "AUTH_ENABLED", True) + monkeypatch.setattr(upload_service, "_require_admin_session", lambda request: 
{"user_id": "admin"}) + monkeypatch.setattr(upload_service, "WORK_DIR", "/fake/work") + monkeypatch.setenv("WORK_DIR", "/fake/work") + + def fake_copy_collection_qdrant(**kwargs): + assert kwargs.get("source") == "src" + assert kwargs.get("target") == "dst" + return "dst" + + monkeypatch.setattr(upload_service, "copy_collection_qdrant", fake_copy_collection_qdrant) + + class _FakeQdrantClient: + def __init__(self, *args, **kwargs): + pass + + def get_collection(self, collection_name: str): + if collection_name == "dst_graph": + return {"name": collection_name} + raise RuntimeError("not found") + + def close(self): + return None + + monkeypatch.setitem(sys.modules, "qdrant_client", types.SimpleNamespace(QdrantClient=_FakeQdrantClient)) + + client = TestClient(upload_service.app) + resp = client.post( + "/admin/staging/copy", + data={"collection": "src", "target": "dst", "overwrite": ""}, + follow_redirects=False, + ) + assert resp.status_code == 302 + loc = resp.headers.get("location") or "" + parsed = urlparse(loc) + qs = parse_qs(parsed.query) + assert qs.get("copied") == ["src"] + assert qs.get("new") == ["dst"] + assert qs.get("graph_copied") == ["1"] + + def test_watcher_collection_resolution_prefers_serving_state_when_staging_enabled(monkeypatch: pytest.MonkeyPatch, tmp_path: Path): from scripts.watch_index_core import utils as watch_utils From d4be80a83b5347136fa0d0f59ae77f701d643588 Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Feb 2026 11:14:06 +0000 Subject: [PATCH 02/39] Admin: fix option to clear indexing caches Adds a command-line option to clear indexing caches before running the ingest process. This ensures a clean indexing run by removing any stale file hash or symbol caches. This is useful for scenarios where the underlying code has changed significantly, invalidating the existing cache. 
Also, ensures `CTXCE_FORCE_COLLECTION_NAME` disables multi-repo enumeration in `ingest_code`, forcing the use of the specified collection name, and clarifies its purpose in `indexing_admin.py`. Broken by commit 2e6317d --- scripts/indexing_admin.py | 9 +++---- scripts/ingest/cli.py | 38 ++++++++++++++++++++++++-- tests/test_ingest_cli.py | 57 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 8 deletions(-) create mode 100644 tests/test_ingest_cli.py diff --git a/scripts/indexing_admin.py b/scripts/indexing_admin.py index 260feecf..7ab1415c 100644 --- a/scripts/indexing_admin.py +++ b/scripts/indexing_admin.py @@ -996,12 +996,9 @@ def spawn_ingest_code( env.pop(k, None) else: env[str(k)] = str(v) - # When we provide env overrides for a run (e.g. staging rebuild), we also want to - # force ingest_code to honor the explicit COLLECTION_NAME instead of routing based - # on per-repo state/serving_collection in multi-repo mode. - # CTXCE_FORCE_COLLECTION_NAME is only used for these subprocess runs; normal watcher - # and indexer flows do not set it. - env["CTXCE_FORCE_COLLECTION_NAME"] = "1" # Force ingest_code to use COLLECTION_NAME for staging/pending env overrides + # For admin-triggered subprocess runs (recreate/reindex/staging), force ingest_code to + # honor explicit COLLECTION_NAME and avoid multi-repo enumeration. 
+ env["CTXCE_FORCE_COLLECTION_NAME"] = "1" env["COLLECTION_NAME"] = collection env["WATCH_ROOT"] = work_dir env["WORKSPACE_PATH"] = work_dir diff --git a/scripts/ingest/cli.py b/scripts/ingest/cli.py index 8561a9c2..aa76864a 100644 --- a/scripts/ingest/cli.py +++ b/scripts/ingest/cli.py @@ -15,6 +15,7 @@ is_multi_repo_mode, get_collection_name, ) +from scripts import workspace_state as _ws from scripts.ingest.pipeline import index_repo from scripts.ingest.pseudo import generate_pseudo_tags @@ -40,6 +41,11 @@ def parse_args(): action="store_true", help="Do not skip files whose content hash matches existing index", ) + parser.add_argument( + "--clear-indexing-caches", + action="store_true", + help="Clear local indexing caches (file hash/symbol caches) before indexing", + ) parser.add_argument( "--schema-mode", type=str, @@ -186,13 +192,35 @@ def main(): ) return + def _clear_indexing_caches(workspace_root: Path, repo_name: str | None) -> None: + try: + _ws.clear_symbol_cache(workspace_path=str(workspace_root), repo_name=repo_name) + except Exception: + pass + try: + if _ws.is_multi_repo_mode() and repo_name: + state_dir = _ws._get_repo_state_dir(repo_name) + cache_path = state_dir / _ws.CACHE_FILENAME + else: + cache_path = _ws._get_cache_path(workspace_root) + if cache_path.exists(): + cache_path.unlink() + except Exception: + pass + qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333") api_key = os.environ.get("QDRANT_API_KEY") collection = os.environ.get("COLLECTION_NAME") or os.environ.get("DEFAULT_COLLECTION") or "codebase" model_name = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") # Resolve collection name based on multi-repo mode - multi_repo = bool(is_multi_repo_mode and is_multi_repo_mode()) + force_collection = (os.environ.get("CTXCE_FORCE_COLLECTION_NAME") or "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + multi_repo = bool(is_multi_repo_mode and is_multi_repo_mode()) and not force_collection if multi_repo: 
print("[multi_repo] Multi-repo mode enabled - will create separate collections per repository") @@ -231,6 +259,9 @@ def main(): if not repo_collection: repo_collection = "codebase" + if args.clear_indexing_caches: + _clear_indexing_caches(root_path, repo_name) + index_repo( repo_root, qdrant_url, @@ -249,7 +280,7 @@ def main(): try: resolved = get_collection_name(str(Path(args.root).resolve())) placeholders = {"", "default-collection", "my-collection", "codebase"} - if resolved and collection in placeholders: + if resolved and collection in placeholders and not force_collection: collection = resolved except Exception: pass @@ -260,6 +291,9 @@ def main(): flag = (os.environ.get("PSEUDO_DEFER_TO_WORKER") or "").strip().lower() pseudo_mode = "off" if flag in {"1", "true", "yes", "on"} else "full" + if args.clear_indexing_caches: + _clear_indexing_caches(Path(args.root).resolve(), None) + index_repo( Path(args.root).resolve(), qdrant_url, diff --git a/tests/test_ingest_cli.py b/tests/test_ingest_cli.py new file mode 100644 index 00000000..493d1a68 --- /dev/null +++ b/tests/test_ingest_cli.py @@ -0,0 +1,57 @@ +import sys +from pathlib import Path + +import pytest + + +@pytest.mark.unit +def test_cli_force_collection_disables_multi_repo_enumeration(monkeypatch, tmp_path: Path): + from scripts.ingest import cli + + # Create fake repo dirs to prove we are not enumerating them. 
+ (tmp_path / "repo_a").mkdir() + (tmp_path / "repo_b").mkdir() + + calls = [] + + def _fake_index_repo( + root, + qdrant_url, + api_key, + collection, + model_name, + recreate, + dedupe, + skip_unchanged, + pseudo_mode, + schema_mode, + ): + calls.append( + { + "root": Path(root), + "collection": collection, + "recreate": recreate, + "dedupe": dedupe, + "skip_unchanged": skip_unchanged, + } + ) + + monkeypatch.setattr(cli, "index_repo", _fake_index_repo) + monkeypatch.setattr(cli, "is_multi_repo_mode", lambda: True) + monkeypatch.setattr(cli, "get_collection_name", lambda *_: "should-not-use") + + monkeypatch.setenv("MULTI_REPO_MODE", "1") + monkeypatch.setenv("COLLECTION_NAME", "forced-collection") + monkeypatch.setenv("CTXCE_FORCE_COLLECTION_NAME", "1") + + monkeypatch.setattr( + sys, + "argv", + ["ingest_code.py", "--root", str(tmp_path)], + ) + + cli.main() + + assert len(calls) == 1 + assert calls[0]["root"] == tmp_path + assert calls[0]["collection"] == "forced-collection" From 2e59455a4c6c2517aae92ccd8c6f2d0b1952de30 Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Feb 2026 11:15:40 +0000 Subject: [PATCH 03/39] fix(upload-client): stop dev-workspace recursion in dev-remote watch paths --- scripts/remote_upload_client.py | 52 +++++++++++++++++++++-------- scripts/standalone_upload_client.py | 48 ++++++++++++++++++++------ 2 files changed, 75 insertions(+), 25 deletions(-) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index fcc7d6ba..2f87337c 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -520,6 +520,34 @@ def log_mapping_summary(self) -> None: logger.info(f" source_path: {info['source_path']}") logger.info(f" container_path: {info['container_path']}") + def _excluded_dirnames(self) -> set: + # Keep in sync with standalone_upload_client exclusions. 
+ excluded = { + "node_modules", "vendor", "dist", "build", "target", "out", + ".git", ".hg", ".svn", ".vscode", ".idea", ".venv", "venv", + "__pycache__", ".pytest_cache", ".mypy_cache", ".cache", + ".context-engine", ".context-engine-uploader", ".codebase", + } + dev_remote = os.environ.get("DEV_REMOTE_MODE") == "1" or os.environ.get("REMOTE_UPLOAD_MODE") == "development" + if dev_remote: + excluded.add("dev-workspace") + return excluded + + def _is_ignored_path(self, path: Path) -> bool: + """Return True when path is outside workspace or under excluded dirs.""" + try: + workspace_root = Path(self.workspace_path).resolve() + rel = path.resolve().relative_to(workspace_root) + except Exception: + return True + + parts = set(rel.parts) + if parts & self._excluded_dirnames(): + return True + if any(p.startswith(".") for p in rel.parts): + return True + return False + def _get_temp_bundle_dir(self) -> Path: """Get or create temporary directory for bundle creation.""" if not self.temp_dir: @@ -547,6 +575,8 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: } for path in changed_paths: + if self._is_ignored_path(path): + continue # Resolve to an absolute path for stable cache keys try: abs_path = str(path.resolve()) @@ -1297,15 +1327,7 @@ def get_all_code_files(self) -> List[Path]: # Single walk with early pruning similar to standalone client ext_suffixes = {str(ext).lower() for ext in idx.CODE_EXTS if str(ext).startswith('.')} name_matches = {str(ext) for ext in idx.CODE_EXTS if not str(ext).startswith('.')} - dev_remote = os.environ.get("DEV_REMOTE_MODE") == "1" or os.environ.get("REMOTE_UPLOAD_MODE") == "development" - excluded = { - "node_modules", "vendor", "dist", "build", "target", "out", - ".git", ".hg", ".svn", ".vscode", ".idea", ".venv", "venv", - "__pycache__", ".pytest_cache", ".mypy_cache", ".cache", - ".context-engine", ".context-engine-uploader", ".codebase" - } - if dev_remote: - excluded.add("dev-workspace") + excluded = 
self._excluded_dirnames() seen = set() for root, dirnames, filenames in os.walk(workspace_path): @@ -1315,6 +1337,8 @@ def get_all_code_files(self) -> List[Path]: if filename.startswith('.'): continue candidate = Path(root) / filename + if self._is_ignored_path(candidate): + continue suffix = candidate.suffix.lower() if filename in name_matches or suffix in ext_suffixes: resolved = candidate.resolve() @@ -1368,13 +1392,13 @@ def on_any_event(self, event): # Always check src_path src_path = Path(event.src_path) - if idx.CODE_EXTS.get(src_path.suffix.lower(), "unknown") != "unknown": + if not self.client._is_ignored_path(src_path) and idx.CODE_EXTS.get(src_path.suffix.lower(), "unknown") != "unknown": paths_to_process.append(src_path) # For FileMovedEvent, also process the destination path if hasattr(event, 'dest_path') and event.dest_path: dest_path = Path(event.dest_path) - if idx.CODE_EXTS.get(dest_path.suffix.lower(), "unknown") != "unknown": + if not self.client._is_ignored_path(dest_path) and idx.CODE_EXTS.get(dest_path.suffix.lower(), "unknown") != "unknown": paths_to_process.append(dest_path) if not paths_to_process: @@ -1413,9 +1437,9 @@ def _process_pending_changes(self): self.client.workspace_path, self.client.repo_name ) - all_paths = list(set(pending + [ - Path(p) for p in cached_file_hashes.keys() - ])) + cached_paths = [Path(p) for p in cached_file_hashes.keys()] + cached_paths = [p for p in cached_paths if not self.client._is_ignored_path(p)] + all_paths = list(set(pending + cached_paths)) else: all_paths = pending diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 7cbd9dd1..77f3fdb2 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -730,6 +730,34 @@ def log_mapping_summary(self) -> None: logger.info(f" source_path: {info['source_path']}") logger.info(f" container_path: {info['container_path']}") + def _excluded_dirnames(self) -> set: + # Keep in sync with 
get_all_code_files exclusions. + excluded = { + "node_modules", "vendor", "dist", "build", "target", "out", + ".git", ".hg", ".svn", ".vscode", ".idea", ".venv", "venv", + "__pycache__", ".pytest_cache", ".mypy_cache", ".cache", + ".context-engine", ".context-engine-uploader", ".codebase", + } + dev_remote = os.environ.get("DEV_REMOTE_MODE") == "1" or os.environ.get("REMOTE_UPLOAD_MODE") == "development" + if dev_remote: + excluded.add("dev-workspace") + return excluded + + def _is_ignored_path(self, path: Path) -> bool: + """Return True when path is outside workspace or under excluded dirs.""" + try: + workspace_root = Path(self.workspace_path).resolve() + rel = path.resolve().relative_to(workspace_root) + except Exception: + return True + + parts = set(rel.parts) + if parts & self._excluded_dirnames(): + return True + if any(p.startswith(".") for p in rel.parts): + return True + return False + def _get_temp_bundle_dir(self) -> Path: """Get or create temporary directory for bundle creation.""" if not self.temp_dir: @@ -757,6 +785,8 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: } for path in changed_paths: + if self._is_ignored_path(path): + continue try: abs_path = str(path.resolve()) except Exception: @@ -1532,13 +1562,13 @@ def on_any_event(self, event): # Always check src_path src_path = Path(event.src_path) - if detect_language(src_path) != "unknown": + if not self.client._is_ignored_path(src_path) and detect_language(src_path) != "unknown": paths_to_process.append(src_path) # For FileMovedEvent, also process the destination path if hasattr(event, 'dest_path') and event.dest_path: dest_path = Path(event.dest_path) - if detect_language(dest_path) != "unknown": + if not self.client._is_ignored_path(dest_path) and detect_language(dest_path) != "unknown": paths_to_process.append(dest_path) if not paths_to_process: @@ -1569,9 +1599,11 @@ def _process_pending_changes(self): try: # Only include cached paths when deletion-related 
events occurred if check_deletions: - all_paths = list(set(pending + [ + cached_paths = [ Path(p) for p in get_all_cached_paths(self.client.repo_name) - ])) + ] + cached_paths = [p for p in cached_paths if not self.client._is_ignored_path(p)] + all_paths = list(set(pending + cached_paths)) else: all_paths = pending @@ -1722,13 +1754,7 @@ def get_all_code_files(self) -> List[Path]: extensionless_names = set(EXTENSIONLESS_FILES.keys()) # Always exclude dev-workspace to prevent recursive upload loops # (upload service creates dev-workspace// which would otherwise get re-uploaded) - excluded = { - "node_modules", "vendor", "dist", "build", "target", "out", - ".git", ".hg", ".svn", ".vscode", ".idea", ".venv", "venv", - "__pycache__", ".pytest_cache", ".mypy_cache", ".cache", - ".context-engine", ".context-engine-uploader", ".codebase", - "dev-workspace" - } + excluded = self._excluded_dirnames() seen = set() for root, dirnames, filenames in os.walk(workspace_path): From 881f1f417a9e0ea455737fd421d5a58a5779faa6 Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Feb 2026 12:35:08 +0000 Subject: [PATCH 04/39] fix(indexer): harden graph edge backfill + align upload client ignore rules --- scripts/ingest/graph_edges.py | 55 ++++++++++++++++++++++------- scripts/remote_upload_client.py | 25 ++++++++++--- scripts/standalone_upload_client.py | 10 ++++-- scripts/watch_index_core/pseudo.py | 46 +++++++++++++++--------- tests/test_staging_lifecycle.py | 4 ++- 5 files changed, 103 insertions(+), 37 deletions(-) diff --git a/scripts/ingest/graph_edges.py b/scripts/ingest/graph_edges.py index d7167e8f..297f393b 100644 --- a/scripts/ingest/graph_edges.py +++ b/scripts/ingest/graph_edges.py @@ -67,6 +67,7 @@ def _edge_vector_for_upsert(graph_collection: str) -> dict: def ensure_graph_collection(client: Any, base_collection: str) -> Optional[str]: """Ensure `_graph` exists and has payload indexes.""" from qdrant_client import models as qmodels + from qdrant_client.http.exceptions import 
UnexpectedResponse if not base_collection: return None @@ -91,8 +92,19 @@ def _detect_vector_mode(info: Any) -> str: _ENSURED_GRAPH_COLLECTIONS.add(graph_coll) _MISSING_GRAPH_COLLECTIONS.discard(graph_coll) return graph_coll - except Exception: - pass + except UnexpectedResponse as e: + # Only a 404 means "missing"; any other HTTP failure should be visible. + if getattr(e, "status_code", None) != 404: + logger.exception( + "Failed to get graph collection %s (status=%s): %s", + graph_coll, + getattr(e, "status_code", None), + e, + ) + return None + except Exception as e: + logger.exception("Failed to get graph collection %s: %s", graph_coll, e) + return None try: # Prefer vector-less collection when supported by server/client. @@ -298,17 +310,34 @@ def graph_edges_backfill_tick( # We may need to overscan because the main collection is chunked. overscan = max_files * 8 while processed_files < max_files: - try: - points, next_offset = client.scroll( - collection_name=base_collection, - scroll_filter=flt, - limit=min(64, overscan), - with_payload=True, - with_vectors=False, - offset=next_offset, - ) - except Exception: - break + attempts = 0 + while True: + try: + points, next_offset = client.scroll( + collection_name=base_collection, + scroll_filter=flt, + limit=min(64, overscan), + with_payload=True, + with_vectors=False, + offset=next_offset, + ) + break + except Exception as e: + attempts += 1 + logger.exception( + "Graph edge backfill scroll failed (collection=%s repo=%s offset=%s attempt=%d): %s", + base_collection, + repo_name or "default", + next_offset, + attempts, + e, + ) + # Retry a couple times for transient errors, then raise so failures are not silent. 
+ if attempts >= 3: + raise + import time + + time.sleep(0.25 * (2 ** (attempts - 1))) if not points: break diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 2f87337c..8c88c754 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -544,7 +544,15 @@ def _is_ignored_path(self, path: Path) -> bool: parts = set(rel.parts) if parts & self._excluded_dirnames(): return True - if any(p.startswith(".") for p in rel.parts): + # Ignore hidden directories anywhere under the workspace, but allow + # extensionless dotfiles like `.gitignore` that we explicitly support. + if any(p.startswith(".") for p in rel.parts[:-1]): + return True + try: + extensionless = set((idx.EXTENSIONLESS_FILES or {}).keys()) + except Exception: + extensionless = set() + if rel.name.startswith(".") and rel.name.lower() not in extensionless: return True return False @@ -1326,7 +1334,10 @@ def get_all_code_files(self) -> List[Path]: # Single walk with early pruning similar to standalone client ext_suffixes = {str(ext).lower() for ext in idx.CODE_EXTS if str(ext).startswith('.')} - name_matches = {str(ext) for ext in idx.CODE_EXTS if not str(ext).startswith('.')} + try: + extensionless_names = {k.lower() for k in (idx.EXTENSIONLESS_FILES or {}).keys()} + except Exception: + extensionless_names = set() excluded = self._excluded_dirnames() seen = set() @@ -1334,13 +1345,19 @@ def get_all_code_files(self) -> List[Path]: dirnames[:] = [d for d in dirnames if d not in excluded and not d.startswith('.')] for filename in filenames: - if filename.startswith('.'): + # Allow dotfiles that are in EXTENSIONLESS_FILES (e.g., .gitignore) + fname_lower = filename.lower() + if filename.startswith('.') and fname_lower not in extensionless_names: continue candidate = Path(root) / filename if self._is_ignored_path(candidate): continue suffix = candidate.suffix.lower() - if filename in name_matches or suffix in ext_suffixes: + if ( + suffix in ext_suffixes + 
or fname_lower in extensionless_names + or fname_lower.startswith("dockerfile") + ): resolved = candidate.resolve() if resolved not in seen: seen.add(resolved) diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 77f3fdb2..7d407d9e 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -754,7 +754,11 @@ def _is_ignored_path(self, path: Path) -> bool: parts = set(rel.parts) if parts & self._excluded_dirnames(): return True - if any(p.startswith(".") for p in rel.parts): + # Ignore hidden directories anywhere under the workspace, but allow + # extensionless dotfiles like `.gitignore` that we explicitly support. + if any(p.startswith(".") for p in rel.parts[:-1]): + return True + if rel.name.startswith(".") and rel.name.lower() not in EXTENSIONLESS_FILES: return True return False @@ -1751,7 +1755,7 @@ def get_all_code_files(self) -> List[Path]: # Single walk with early pruning and set-based matching to reduce IO ext_suffixes = {str(ext).lower() for ext in CODE_EXTS if str(ext).startswith('.')} - extensionless_names = set(EXTENSIONLESS_FILES.keys()) + extensionless_names = {k.lower() for k in EXTENSIONLESS_FILES.keys()} # Always exclude dev-workspace to prevent recursive upload loops # (upload service creates dev-workspace// which would otherwise get re-uploaded) excluded = self._excluded_dirnames() @@ -1767,6 +1771,8 @@ def get_all_code_files(self) -> List[Path]: if filename.startswith('.') and fname_lower not in extensionless_names: continue candidate = Path(root) / filename + if self._is_ignored_path(candidate): + continue suffix = candidate.suffix.lower() # Match by extension, extensionless name, or Dockerfile.* prefix if (suffix in ext_suffixes or diff --git a/scripts/watch_index_core/pseudo.py b/scripts/watch_index_core/pseudo.py index ec76d7a0..33fb6f46 100644 --- a/scripts/watch_index_core/pseudo.py +++ b/scripts/watch_index_core/pseudo.py @@ -49,12 +49,21 @@ def 
_start_pseudo_backfill_worker( max_points = 256 if max_points <= 0: max_points = 1 + try: + graph_max_files = int( + os.environ.get("GRAPH_EDGES_BACKFILL_MAX_FILES", "128") or 128 + ) + except Exception: + graph_max_files = 128 + if graph_max_files <= 0: + graph_max_files = 1 shutdown_event = threading.Event() def _worker() -> None: while not shutdown_event.is_set(): try: + graph_backfill_enabled = get_boolean_env("GRAPH_EDGES_BACKFILL") try: mappings = get_collection_mappings(search_root=str(ROOT)) except Exception: @@ -90,31 +99,34 @@ def _worker() -> None: "[pseudo_backfill] repo=%s collection=%s processed=%d", repo_name or "default", coll, processed, ) - # Optional: backfill graph edge collection from main points. - # Controlled separately because it may scan large collections over time. - if get_boolean_env("GRAPH_EDGES_BACKFILL"): - try: + # Optional: backfill graph edge collection from main points. + # Controlled separately because it may scan large collections over time. + # Run under its own lock to avoid blocking pseudo/tag backfill workers. 
+ if graph_backfill_enabled: + try: + graph_lock_path = state_dir / "graph_edges.lock" + with _cross_process_lock(graph_lock_path): files_done = idx.graph_edges_backfill_tick( client, coll, repo_name=repo_name, - max_files=max_points, + max_files=graph_max_files, ) - if files_done: - logger.info( - "[graph_backfill] repo=%s collection=%s files=%d", - repo_name or "default", - coll, - files_done, - ) - except Exception as exc: - logger.error( - "[graph_backfill] error repo=%s collection=%s: %s", + if files_done: + logger.info( + "[graph_backfill] repo=%s collection=%s files=%d", repo_name or "default", coll, - exc, - exc_info=True, + files_done, ) + except Exception as exc: + logger.error( + "[graph_backfill] error repo=%s collection=%s: %s", + repo_name or "default", + coll, + exc, + exc_info=True, + ) except Exception as exc: logger.error( "[pseudo_backfill] error repo=%s collection=%s: %s", diff --git a/tests/test_staging_lifecycle.py b/tests/test_staging_lifecycle.py index d7bb9cbf..eb8a93cf 100644 --- a/tests/test_staging_lifecycle.py +++ b/tests/test_staging_lifecycle.py @@ -698,7 +698,9 @@ class _Proc: env = captured["env"] assert env["BASE_ONLY"] == "system" assert env["COLLECTION_NAME"] == "primary-coll" - assert "CTXCE_FORCE_COLLECTION_NAME" not in env + # Admin-spawned ingests should never enumerate `/work/*` in multi-repo mode; + # force exact collection/root handling even when no explicit overrides are provided. 
+ assert env.get("CTXCE_FORCE_COLLECTION_NAME") == "1" def test_promote_pending_env_without_pending_config(staging_workspace: dict): From 7961a72346612750620d23349e002faff39091d7 Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Feb 2026 13:04:32 +0000 Subject: [PATCH 05/39] chore: harden graph-edge ops, cache uploader excludes, and sync helpers --- scripts/collection_admin.py | 9 +- scripts/indexing_admin.py | 18 +++- scripts/ingest/cli.py | 13 +-- scripts/ingest/graph_edges.py | 9 +- scripts/ingest/pipeline.py | 126 ++++++++++++++-------------- scripts/ingest_code.py | 2 + scripts/mcp_impl/symbol_graph.py | 19 +++-- scripts/prune.py | 3 +- scripts/remote_upload_client.py | 11 ++- scripts/standalone_upload_client.py | 11 ++- scripts/upload_service.py | 50 +++++++---- 11 files changed, 160 insertions(+), 111 deletions(-) diff --git a/scripts/collection_admin.py b/scripts/collection_admin.py index 2882fc78..55715403 100644 --- a/scripts/collection_admin.py +++ b/scripts/collection_admin.py @@ -501,7 +501,12 @@ def _count_points(name: str) -> Optional[int]: qdrant_url=base_url, overwrite=overwrite, ) - except Exception: - pass + except Exception as exc: + logger.debug( + "Best-effort graph collection copy %s_graph -> %s_graph failed: %s", + src, + dest, + exc, + ) return dest diff --git a/scripts/indexing_admin.py b/scripts/indexing_admin.py index 7ab1415c..4ab8932d 100644 --- a/scripts/indexing_admin.py +++ b/scripts/indexing_admin.py @@ -931,8 +931,13 @@ def delete_collection_qdrant(*, qdrant_url: str, api_key: Optional[str], collect if not name.endswith("_graph"): try: cli.delete_collection(collection_name=f"{name}_graph") - except Exception: - pass + except Exception as exc: + try: + print( + f"[indexing_admin] best-effort graph collection delete failed for {name}_graph: {exc}" + ) + except Exception: + pass except Exception: pass finally: @@ -961,8 +966,13 @@ def recreate_collection_qdrant(*, qdrant_url: str, api_key: Optional[str], colle if not 
name.endswith("_graph"): try: cli.delete_collection(collection_name=f"{name}_graph") - except Exception: - pass + except Exception as exc: + try: + print( + f"[indexing_admin] best-effort graph collection delete failed for {name}_graph: {exc}" + ) + except Exception: + pass finally: try: cli.close() diff --git a/scripts/ingest/cli.py b/scripts/ingest/cli.py index aa76864a..676ee77a 100644 --- a/scripts/ingest/cli.py +++ b/scripts/ingest/cli.py @@ -16,6 +16,7 @@ get_collection_name, ) from scripts import workspace_state as _ws +from scripts.collection_health import clear_indexing_caches as _clear_indexing_caches_impl from scripts.ingest.pipeline import index_repo from scripts.ingest.pseudo import generate_pseudo_tags @@ -194,17 +195,7 @@ def main(): def _clear_indexing_caches(workspace_root: Path, repo_name: str | None) -> None: try: - _ws.clear_symbol_cache(workspace_path=str(workspace_root), repo_name=repo_name) - except Exception: - pass - try: - if _ws.is_multi_repo_mode() and repo_name: - state_dir = _ws._get_repo_state_dir(repo_name) - cache_path = state_dir / _ws.CACHE_FILENAME - else: - cache_path = _ws._get_cache_path(workspace_root) - if cache_path.exists(): - cache_path.unlink() + _clear_indexing_caches_impl(str(workspace_root), repo_name=repo_name) except Exception: pass diff --git a/scripts/ingest/graph_edges.py b/scripts/ingest/graph_edges.py index 297f393b..e3b5cb66 100644 --- a/scripts/ingest/graph_edges.py +++ b/scripts/ingest/graph_edges.py @@ -114,7 +114,12 @@ def _detect_vector_mode(info: Any) -> str: vectors_config={}, ) _GRAPH_VECTOR_MODE[graph_coll] = "none" - except Exception: + except Exception as vec_exc: + logger.debug( + "Vector-less creation failed for %s, trying named vector: %s", + graph_coll, + vec_exc, + ) client.create_collection( collection_name=graph_coll, vectors_config={ @@ -145,7 +150,7 @@ def _detect_vector_mode(info: Any) -> str: def _edge_id(edge_type: str, repo: str, caller_path: str, callee_symbol: str) -> str: - key = 
f"{edge_type}:{repo}:{caller_path}:{callee_symbol}" + key = f"{edge_type}\x00{repo}\x00{caller_path}\x00{callee_symbol}" return hashlib.sha256(key.encode("utf-8", errors="ignore")).hexdigest()[:32] diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index 0e0aae1f..f43c7d3a 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -227,6 +227,53 @@ def _normalize_info_for_dense(s: str) -> str: return text +def _sync_graph_edges_best_effort( + client: QdrantClient, + collection: str, + file_path: str, + repo: str | None, + calls: list[str] | None, + imports: list[str] | None, +) -> None: + """Best-effort sync of file-level graph edges. Safe to skip on failure.""" + enabled = str(os.environ.get("GRAPH_EDGES_ENABLE", "1") or "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + if not enabled: + return + try: + from scripts.ingest.graph_edges import ( + delete_edges_by_path, + ensure_graph_collection, + upsert_file_edges, + ) + + ensure_graph_collection(client, collection) + # Important: delete stale edges for this file before upserting the new set. + delete_edges_by_path( + client, + collection, + caller_path=str(file_path), + repo=repo, + ) + upsert_file_edges( + client, + collection, + caller_path=str(file_path), + repo=repo, + calls=calls, + imports=imports, + ) + except Exception as exc: + try: + print(f"[graph_edges] best-effort sync failed for {file_path}: {exc}") + except Exception: + pass + + def build_information( language: str, path: Path, start: int, end: int, first_line: str ) -> str: @@ -655,38 +702,14 @@ def make_point(pid, dense_vec, lex_vec, payload, lex_text: str = "", code_text: upsert_points(client, collection, points) # Optional: materialize file-level graph edges in a companion `_graph` store. # This is an accelerator for symbol_graph callers/importers and is safe to skip on failure. 
- try: - enabled = str(os.environ.get("GRAPH_EDGES_ENABLE", "1") or "").strip().lower() in { - "1", - "true", - "yes", - "on", - } - if enabled: - from scripts.ingest.graph_edges import ( - delete_edges_by_path as _delete_edges_by_path, - ensure_graph_collection as _ensure_graph_collection, - upsert_file_edges as _upsert_file_edges, - ) - - _ensure_graph_collection(client, collection) - # Important: delete stale edges for this file before upserting the new set. - _delete_edges_by_path( - client, - collection, - caller_path=str(file_path), - repo=repo_tag, - ) - _upsert_file_edges( - client, - collection, - caller_path=str(file_path), - repo=repo_tag, - calls=calls, - imports=imports, - ) - except Exception: - pass + _sync_graph_edges_best_effort( + client, + collection, + str(file_path), + repo_tag, + calls, + imports, + ) try: ws = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" if set_cached_file_hash: @@ -1402,37 +1425,14 @@ def process_file_with_smart_reindexing( if all_points: _upsert_points_fn(client, current_collection, all_points) # Optional: materialize file-level graph edges (best-effort). 
- try: - enabled = str(os.environ.get("GRAPH_EDGES_ENABLE", "1") or "").strip().lower() in { - "1", - "true", - "yes", - "on", - } - if enabled: - from scripts.ingest.graph_edges import ( - delete_edges_by_path as _delete_edges_by_path, - ensure_graph_collection as _ensure_graph_collection, - upsert_file_edges as _upsert_file_edges, - ) - - _ensure_graph_collection(client, current_collection) - _delete_edges_by_path( - client, - current_collection, - caller_path=str(file_path), - repo=per_file_repo, - ) - _upsert_file_edges( - client, - current_collection, - caller_path=str(file_path), - repo=per_file_repo, - calls=calls, - imports=imports, - ) - except Exception: - pass + _sync_graph_edges_best_effort( + client, + current_collection, + str(file_path), + per_file_repo, + calls, + imports, + ) try: if set_cached_symbols: diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 93088ae0..574457a5 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -223,6 +223,8 @@ upsert_file_edges as upsert_graph_edges_for_file, ) except ImportError: + # graph_edges_backfill_tick is optional and intentionally left as None to + # force callers to explicitly guard long-running backfill behavior. 
graph_edges_backfill_tick = None # type: ignore[assignment] def delete_graph_edges_by_path(*_args, **_kwargs) -> int: diff --git a/scripts/mcp_impl/symbol_graph.py b/scripts/mcp_impl/symbol_graph.py index 2967035b..6c574dff 100644 --- a/scripts/mcp_impl/symbol_graph.py +++ b/scripts/mcp_impl/symbol_graph.py @@ -24,7 +24,12 @@ logger = logging.getLogger(__name__) -GRAPH_COLLECTION_SUFFIX = "_graph" +try: + from scripts.ingest.graph_edges import GRAPH_COLLECTION_SUFFIX as _GRAPH_SUFFIX +except Exception: + _GRAPH_SUFFIX = "_graph" + +GRAPH_COLLECTION_SUFFIX = _GRAPH_SUFFIX _MISSING_GRAPH_COLLECTIONS: set[str] = set() __all__ = [ @@ -339,10 +344,10 @@ async def _query_graph_edges_collection( ] ) - def _scroll(): + def _scroll(_flt=flt): return client.scroll( collection_name=graph_coll, - scroll_filter=flt, + scroll_filter=_flt, limit=max(32, limit * 4), with_payload=True, with_vectors=False, @@ -384,17 +389,17 @@ def _scroll(): if len(hydrated) >= limit: break - def _scroll_main(): + def _scroll_main(_p=p, _language=language): must = [ qmodels.FieldCondition( - key="metadata.path", match=qmodels.MatchValue(value=p) + key="metadata.path", match=qmodels.MatchValue(value=_p) ) ] - if language: + if _language: must.append( qmodels.FieldCondition( key="metadata.language", - match=qmodels.MatchValue(value=str(language).lower()), + match=qmodels.MatchValue(value=str(_language).lower()), ) ) return client.scroll( diff --git a/scripts/prune.py b/scripts/prune.py index 90e22863..d654132a 100755 --- a/scripts/prune.py +++ b/scripts/prune.py @@ -49,9 +49,10 @@ def delete_graph_edges_by_path(client: QdrantClient, path_str: str, repo: str | if not path_str: return 0 try: - path_str = os.path.normpath(str(path_str)).replace("\\", "/") + path_str = os.path.normpath(str(path_str)) except Exception: path_str = str(path_str) + path_str = str(path_str).replace("\\", "/") must = [ models.FieldCondition(key="caller_path", match=models.MatchValue(value=path_str)) diff --git 
a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 8c88c754..42a726e9 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -522,6 +522,9 @@ def log_mapping_summary(self) -> None: def _excluded_dirnames(self) -> set: # Keep in sync with standalone_upload_client exclusions. + cached = getattr(self, "_excluded_dirnames_cache", None) + if cached is not None: + return cached excluded = { "node_modules", "vendor", "dist", "build", "target", "out", ".git", ".hg", ".svn", ".vscode", ".idea", ".venv", "venv", @@ -531,7 +534,9 @@ def _excluded_dirnames(self) -> set: dev_remote = os.environ.get("DEV_REMOTE_MODE") == "1" or os.environ.get("REMOTE_UPLOAD_MODE") == "development" if dev_remote: excluded.add("dev-workspace") - return excluded + cached = frozenset(excluded) + self._excluded_dirnames_cache = cached + return cached def _is_ignored_path(self, path: Path) -> bool: """Return True when path is outside workspace or under excluded dirs.""" @@ -541,8 +546,8 @@ def _is_ignored_path(self, path: Path) -> bool: except Exception: return True - parts = set(rel.parts) - if parts & self._excluded_dirnames(): + dir_parts = set(rel.parts[:-1]) if len(rel.parts) > 1 else set() + if dir_parts & self._excluded_dirnames(): return True # Ignore hidden directories anywhere under the workspace, but allow # extensionless dotfiles like `.gitignore` that we explicitly support. diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 7d407d9e..1ef65161 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -732,6 +732,9 @@ def log_mapping_summary(self) -> None: def _excluded_dirnames(self) -> set: # Keep in sync with get_all_code_files exclusions. 
+ cached = getattr(self, "_excluded_dirnames_cache", None) + if cached is not None: + return cached excluded = { "node_modules", "vendor", "dist", "build", "target", "out", ".git", ".hg", ".svn", ".vscode", ".idea", ".venv", "venv", @@ -741,7 +744,9 @@ def _excluded_dirnames(self) -> set: dev_remote = os.environ.get("DEV_REMOTE_MODE") == "1" or os.environ.get("REMOTE_UPLOAD_MODE") == "development" if dev_remote: excluded.add("dev-workspace") - return excluded + cached = frozenset(excluded) + self._excluded_dirnames_cache = cached + return cached def _is_ignored_path(self, path: Path) -> bool: """Return True when path is outside workspace or under excluded dirs.""" @@ -751,8 +756,8 @@ def _is_ignored_path(self, path: Path) -> bool: except Exception: return True - parts = set(rel.parts) - if parts & self._excluded_dirnames(): + dir_parts = set(rel.parts[:-1]) if len(rel.parts) > 1 else set() + if dir_parts & self._excluded_dirnames(): return True # Ignore hidden directories anywhere under the workspace, but allow # extensionless dotfiles like `.gitignore` that we explicitly support. 
diff --git a/scripts/upload_service.py b/scripts/upload_service.py index 067e2e48..c4e19064 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -86,6 +86,10 @@ except Exception: delete_collection_everywhere = None copy_collection_qdrant = None +try: + from scripts.qdrant_client_manager import pooled_qdrant_client +except Exception: + pooled_qdrant_client = None try: from scripts.admin_ui import ( render_admin_acl, @@ -1241,23 +1245,39 @@ async def admin_copy_collection( graph_copied: Optional[str] = None try: if not name.endswith("_graph") and not str(new_name).endswith("_graph"): - from qdrant_client import QdrantClient # type: ignore - - cli = QdrantClient( - url=QDRANT_URL, - api_key=os.environ.get("QDRANT_API_KEY"), - timeout=float(os.environ.get("QDRANT_TIMEOUT", "5") or 5), - ) - try: - cli.get_collection(collection_name=f"{new_name}_graph") - graph_copied = "1" - except Exception: - graph_copied = "0" - finally: + used_pooled = False + if pooled_qdrant_client is not None: try: - cli.close() + with pooled_qdrant_client( + url=QDRANT_URL, + api_key=os.environ.get("QDRANT_API_KEY"), + ) as cli: + cli.get_collection(collection_name=f"{new_name}_graph") + graph_copied = "1" + used_pooled = True except Exception: - pass + used_pooled = False + if not used_pooled: + try: + from qdrant_client import QdrantClient # type: ignore + + cli = QdrantClient( + url=QDRANT_URL, + api_key=os.environ.get("QDRANT_API_KEY"), + timeout=float(os.environ.get("QDRANT_TIMEOUT", "5") or 5), + ) + try: + cli.get_collection(collection_name=f"{new_name}_graph") + graph_copied = "1" + except Exception: + graph_copied = "0" + finally: + try: + cli.close() + except Exception: + pass + except Exception: + graph_copied = "0" except Exception: graph_copied = None From 2476d6cc67d7245444f9f86f28d356bf571ac2b4 Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 13 Feb 2026 19:41:33 +0000 Subject: [PATCH 06/39] collection_admin: Fix missing logging --- scripts/collection_admin.py 
| 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/collection_admin.py b/scripts/collection_admin.py index 55715403..149dadf0 100644 --- a/scripts/collection_admin.py +++ b/scripts/collection_admin.py @@ -1,12 +1,15 @@ +import logging import os import json import re import shutil import time -from pathlib import Path from datetime import datetime +from pathlib import Path from typing import Any, Dict, Optional, List +logger = logging.getLogger(__name__) + from scripts.auth_backend import mark_collection_deleted try: From b43054b417b3055bf248b4fa4eb0a5792c79d23f Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 13 Feb 2026 19:42:36 +0000 Subject: [PATCH 07/39] Adds debug mode to repo search Implements a `debug` parameter for the repo search functionality. When `debug=true`, includes verbose internal fields (like components, rerank_counters, and code_signals) in the search results. When `debug=false` (default), strips these fields to reduce token consumption. --- scripts/mcp_impl/search.py | 67 ++++++++++++++++++++++++++++++++--- scripts/mcp_indexer_server.py | 5 ++- 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/scripts/mcp_impl/search.py b/scripts/mcp_impl/search.py index e4dc3766..a06a009e 100644 --- a/scripts/mcp_impl/search.py +++ b/scripts/mcp_impl/search.py @@ -54,6 +54,43 @@ ) +# Fields to strip from results when debug=False (internal/debugging fields) +_DEBUG_RESULT_FIELDS = { + "components", # Internal scoring breakdown (dense_rrf, lexical, fname_boost, etc.) 
+ "doc_id", # Internal benchmark ID (often null/opaque) + "code_id", # Internal benchmark ID (often null/opaque) + "payload", # Duplicates other fields (information, document, pseudo, tags) + "why", # Often empty []; debugging explanation list + "span_budgeted", # Internal budget flag + "relations", # Call graph info (imports, calls) - useful but often noise + "related_paths", # Optional related file paths + "budget_tokens_used", # Internal token accounting + "fname_boost", # Internal boost value (already applied to score) + "host_path", # Internal dual-path (host side) - use path/client_path instead + "container_path", # Internal dual-path (container side) - use path/client_path instead +} + +# Top-level response fields to strip when debug=False +_DEBUG_TOP_LEVEL_FIELDS = { + "rerank_counters", # Internal reranking metrics (inproc_hybrid, timeout, etc.) + "code_signals", # Internal code signal detection results +} + + +def _strip_debug_fields(item: dict, keep_paths: bool = True) -> dict: + """Strip internal/debug fields from a result item. + + Args: + item: Result dict to strip + keep_paths: If True, keep host_path/container_path (client paths) + + Returns: + New dict with debug fields removed + """ + result = {k: v for k, v in item.items() if k not in _DEBUG_RESULT_FIELDS} + return result + + async def _repo_search_impl( query: Any = None, queries: Any = None, # Alias for query (many clients use this) @@ -89,6 +126,7 @@ async def _repo_search_impl( repo: Any = None, # str, list[str], or "*" to search all repos # Response shaping compact: Any = None, + debug: Any = None, # When True, include verbose internal fields (components, rerank_counters, etc.) output_format: Any = None, # "json" (default) or "toon" for token-efficient format args: Any = None, # Compatibility shim for mcp-remote/Claude wrappers that send args/kwargs kwargs: Any = None, @@ -119,16 +157,21 @@ async def _repo_search_impl( Use repo=["frontend","backend"] to search related repos together. 
- Filters (optional): language, under (path prefix), kind, symbol, ext, path_regex, path_glob (str or list[str]), not_glob (str or list[str]), not_ (negative text), case. + - debug: bool (default false). When true, includes verbose internal fields like + components, rerank_counters, code_signals. Default false saves ~60-80% tokens. Returns: - Dict with keys: - - results: list of {score, path, symbol, start_line, end_line, why[, components][, relations][, related_paths][, snippet]} - - total: int; used_rerank: bool; rerank_counters: dict + - results: list of {score, path, symbol, start_line, end_line[, snippet][, tags][, host_path][, container_path]} + When debug=true, also includes: components, why, relations, related_paths, doc_id, code_id + - total: int; used_rerank: bool - If compact=true (and snippets not requested), results contain only {path,start_line,end_line}. + - If debug=true, response also includes: rerank_counters, code_signals Examples: - path_glob=["scripts/**","**/*.py"], language="python" - symbol="context_answer", under="scripts" + - debug=true # Include internal scoring details for query tuning """ sess = require_auth_session_fn(session) if require_auth_session_fn else session @@ -252,6 +295,8 @@ async def _repo_search_impl( case = _extra.get("case") if compact in (None, "") and _extra.get("compact") is not None: compact = _extra.get("compact") + if debug in (None, "") and _extra.get("debug") is not None: + debug = _extra.get("debug") # Optional mode hint: "code_first", "docs_first", "balanced" if ( mode is None or (isinstance(mode, str) and str(mode).strip() == "") @@ -446,6 +491,11 @@ def _to_str_list(x): if include_snippet: compact = False + # Debug mode: when False (default), strip internal/debug fields from results + # to reduce token bloat. Set debug=True to see components, rerank_counters, etc. 
+ debug_raw = debug + debug = _to_bool(debug, False) + # Default behavior: exclude commit-history docs (which use path=".git") from # generic repo_search calls, unless the caller explicitly asks for git # content. This prevents normal code queries from surfacing commit-index @@ -1491,6 +1541,11 @@ def _read_snip(args): } for r in results ] + elif not debug: + # Strip debug/internal fields from results to reduce token bloat + # Keeps: score, path, host_path, container_path, symbol, snippet, + # start_line, end_line, tags, pseudo + results = [_strip_debug_fields(r) for r in results] response = { "args": { @@ -1518,13 +1573,17 @@ def _read_snip(args): "compact": (_to_bool(compact_raw, compact)), }, "used_rerank": bool(used_rerank), - "rerank_counters": rerank_counters, - "code_signals": code_signals if code_signals.get("has_code_signals") else None, "total": len(results), "results": results, **res, } + # Only include debug fields when explicitly requested + if debug: + response["rerank_counters"] = rerank_counters + if code_signals.get("has_code_signals"): + response["code_signals"] = code_signals + # Apply TOON formatting if requested or enabled globally # Full mode (compact=False) still saves tokens vs JSON while preserving all fields if _should_use_toon(output_format): diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index d12aee9b..46d3db1e 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -1082,6 +1082,7 @@ async def repo_search( case: Any = None, repo: Any = None, compact: Any = None, + debug: Any = None, output_format: Any = None, args: Any = None, kwargs: Any = None, @@ -1098,12 +1099,13 @@ async def repo_search( - per_path: int (default 2). Max results per file. - include_snippet/context_lines: return inline snippets near hits when true. - rerank_*: ONNX reranker is ON by default for best relevance; timeouts fall back to hybrid. + - debug: bool (default false). 
Include verbose internal fields (components, rerank_counters, etc). - output_format: "json" (default) or "toon" for token-efficient TOON format. - collection: str. Target collection; defaults to workspace state or env COLLECTION_NAME. - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos. Returns: - - Dict with keys: results, total, used_rerank, rerank_counters + - Dict with keys: results, total, used_rerank, [rerank_counters if debug=true] """ return await _repo_search_impl( query=query, @@ -1134,6 +1136,7 @@ async def repo_search( case=case, repo=repo, compact=compact, + debug=debug, output_format=output_format, args=args, kwargs=kwargs, From 843d79f6da079290b9bc02114bf97eede68bf3e4 Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 13 Feb 2026 20:23:44 +0000 Subject: [PATCH 08/39] bridge: make MCP list timeouts configurable and gate OAuth metadata for Codex compatibility --- ctx-mcp-bridge/src/mcpServer.js | 210 +++++++++++++++++++++++++++----- scripts/mcp_indexer_server.py | 17 +++ 2 files changed, 199 insertions(+), 28 deletions(-) diff --git a/ctx-mcp-bridge/src/mcpServer.js b/ctx-mcp-bridge/src/mcpServer.js index 53cb05b7..aa55ea73 100644 --- a/ctx-mcp-bridge/src/mcpServer.js +++ b/ctx-mcp-bridge/src/mcpServer.js @@ -8,7 +8,13 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js" import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js"; import { Client } from "@modelcontextprotocol/sdk/client/index.js"; import { StreamableHTTPClientTransport } from "@modelcontextprotocol/sdk/client/streamableHttp.js"; -import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js"; +import { + CallToolRequestSchema, + ListToolsRequestSchema, + ListResourcesRequestSchema, + ListResourceTemplatesRequestSchema, + ReadResourceRequestSchema, +} from "@modelcontextprotocol/sdk/types.js"; import { loadAnyAuthEntry, loadAuthEntry, readConfig, saveAuthEntry 
} from "./authConfig.js"; import { maybeRemapToolArgs, maybeRemapToolResult } from "./resultPathMapping.js"; import * as oauthHandler from "./oauthHandler.js"; @@ -58,14 +64,46 @@ function dedupeTools(tools) { return out; } +function dedupeResources(resources) { + const seen = new Set(); + const out = []; + for (const resource of resources) { + const uri = resource && typeof resource.uri === "string" ? resource.uri : ""; + if (!uri || seen.has(uri)) { + continue; + } + seen.add(uri); + out.push(resource); + } + return out; +} + +function dedupeResourceTemplates(templates) { + const seen = new Set(); + const out = []; + for (const template of templates) { + const uri = + template && template.resourceTemplate && typeof template.resourceTemplate.uriTemplate === "string" + ? template.resourceTemplate.uriTemplate + : ""; + if (!uri || seen.has(uri)) { + continue; + } + seen.add(uri); + out.push(template); + } + return out; +} + async function listMemoryTools(client) { if (!client) { return []; } try { + const timeoutMs = getBridgeListTimeoutMs(); const remote = await withTimeout( client.listTools(), - 5000, + timeoutMs, "memory tools/list", ); return Array.isArray(remote?.tools) ? remote.tools.slice() : []; @@ -75,6 +113,42 @@ async function listMemoryTools(client) { } } +async function listResourcesSafe(client, label) { + if (!client) { + return []; + } + try { + const timeoutMs = getBridgeListTimeoutMs(); + const remote = await withTimeout( + client.listResources(), + timeoutMs, + `${label} resources/list`, + ); + return Array.isArray(remote?.resources) ? 
remote.resources.slice() : []; + } catch (err) { + debugLog(`[ctxce] Error calling ${label} resources/list: ` + String(err)); + return []; + } +} + +async function listResourceTemplatesSafe(client, label) { + if (!client) { + return []; + } + try { + const timeoutMs = getBridgeListTimeoutMs(); + const remote = await withTimeout( + client.listResourceTemplates(), + timeoutMs, + `${label} resources/templates/list`, + ); + return Array.isArray(remote?.resourceTemplates) ? remote.resourceTemplates.slice() : []; + } catch (err) { + debugLog(`[ctxce] Error calling ${label} resources/templates/list: ` + String(err)); + return []; + } +} + function withTimeout(promise, ms, label) { return new Promise((resolve, reject) => { let settled = false; @@ -125,6 +199,25 @@ function getBridgeToolTimeoutMs() { } } +function getBridgeListTimeoutMs() { + try { + // Keep list operations on a separate budget from tools/call. + // Some streamable-http clients (including Codex) probe tools/resources early, + // and a short timeout here can make the bridge appear unavailable. 
+ const raw = process.env.CTXCE_LIST_TIMEOUT_MSEC; + if (!raw) { + return 60000; + } + const parsed = Number.parseInt(String(raw), 10); + if (!Number.isFinite(parsed) || parsed <= 0) { + return 60000; + } + return parsed; + } catch { + return 60000; + } +} + function selectClientForTool(name, indexerClient, memoryClient) { if (!name) { return indexerClient; @@ -651,6 +744,7 @@ async function createBridgeServer(options) { { capabilities: { tools: {}, + resources: {}, }, }, ); @@ -664,9 +758,10 @@ async function createBridgeServer(options) { if (!indexerClient) { throw new Error("Indexer MCP client not initialized"); } + const timeoutMs = getBridgeListTimeoutMs(); remote = await withTimeout( indexerClient.listTools(), - 10000, + timeoutMs, "indexer tools/list", ); } catch (err) { @@ -693,6 +788,57 @@ async function createBridgeServer(options) { return { tools }; }); + server.setRequestHandler(ListResourcesRequestSchema, async () => { + // Proxy resource discovery/read-through so clients that use MCP resources + // (not only tools) can access upstream indexer/memory resources directly. 
+ await initializeRemoteClients(false); + const indexerResources = await listResourcesSafe(indexerClient, "indexer"); + const memoryResources = await listResourcesSafe(memoryClient, "memory"); + const resources = dedupeResources([...indexerResources, ...memoryResources]); + debugLog(`[ctxce] resources/list: returning ${resources.length} resources`); + return { resources }; + }); + + server.setRequestHandler(ListResourceTemplatesRequestSchema, async () => { + await initializeRemoteClients(false); + const indexerTemplates = await listResourceTemplatesSafe(indexerClient, "indexer"); + const memoryTemplates = await listResourceTemplatesSafe(memoryClient, "memory"); + const resourceTemplates = dedupeResourceTemplates([...indexerTemplates, ...memoryTemplates]); + debugLog(`[ctxce] resources/templates/list: returning ${resourceTemplates.length} templates`); + return { resourceTemplates }; + }); + + server.setRequestHandler(ReadResourceRequestSchema, async (request) => { + await initializeRemoteClients(false); + const params = request.params || {}; + const timeoutMs = getBridgeToolTimeoutMs(); + const uri = + params && typeof params.uri === "string" ? 
params.uri : ""; + debugLog(`[ctxce] resources/read: ${uri}`); + + const tryRead = async (client, label) => { + if (!client) { + return null; + } + try { + return await client.readResource(params, { timeout: timeoutMs }); + } catch (err) { + debugLog(`[ctxce] resources/read failed on ${label}: ` + String(err)); + return null; + } + }; + + const indexerResult = await tryRead(indexerClient, "indexer"); + if (indexerResult) { + return indexerResult; + } + const memoryResult = await tryRead(memoryClient, "memory"); + if (memoryResult) { + return memoryResult; + } + throw new Error(`Resource ${uri} not available on any configured MCP server`); + }); + // tools/call → proxied to indexer or memory server server.setRequestHandler(CallToolRequestSchema, async (request) => { const params = request.params || {}; @@ -843,6 +989,13 @@ export async function runHttpMcpServer(options) { typeof options.port === "number" ? options.port : Number.parseInt(process.env.CTXCE_HTTP_PORT || "30810", 10) || 30810; + // TODO(auth): replace this boolean toggle with explicit auth modes (none|required). + // In required mode, enforce Bearer auth on /mcp with consistent 401 challenges and + // only advertise OAuth metadata/endpoints when authentication is mandatory. + // In local/dev mode, leaving OAuth discovery off avoids clients entering an + // unnecessary OAuth path for otherwise unauthenticated bridge usage. 
+ const oauthEnabled = String(process.env.CTXCE_ENABLE_OAUTH || "").trim().toLowerCase(); + const oauthEndpointsEnabled = oauthEnabled === "1" || oauthEnabled === "true" || oauthEnabled === "yes"; const transport = new StreamableHTTPServerTransport({ sessionIdGenerator: undefined, @@ -865,34 +1018,36 @@ export async function runHttpMcpServer(options) { // OAuth 2.0 Endpoints (RFC9728 Protected Resource Metadata + RFC7591) // ================================================================ - // OAuth metadata endpoint (RFC9728) - if (parsedUrl.pathname === "/.well-known/oauth-authorization-server") { - oauthHandler.handleOAuthMetadata(req, res, issuerUrl); - return; - } + if (oauthEndpointsEnabled) { + // OAuth metadata endpoint (RFC9728) + if (parsedUrl.pathname === "/.well-known/oauth-authorization-server") { + oauthHandler.handleOAuthMetadata(req, res, issuerUrl); + return; + } - // OAuth Dynamic Client Registration endpoint (RFC7591) - if (parsedUrl.pathname === "/oauth/register" && req.method === "POST") { - oauthHandler.handleOAuthRegister(req, res); - return; - } + // OAuth Dynamic Client Registration endpoint (RFC7591) + if (parsedUrl.pathname === "/oauth/register" && req.method === "POST") { + oauthHandler.handleOAuthRegister(req, res); + return; + } - // OAuth authorize endpoint - if (parsedUrl.pathname === "/oauth/authorize") { - oauthHandler.handleOAuthAuthorize(req, res, parsedUrl.searchParams); - return; - } + // OAuth authorize endpoint + if (parsedUrl.pathname === "/oauth/authorize") { + oauthHandler.handleOAuthAuthorize(req, res, parsedUrl.searchParams); + return; + } - // Store session endpoint (helper for login page) - if (parsedUrl.pathname === "/oauth/store-session" && req.method === "POST") { - oauthHandler.handleOAuthStoreSession(req, res); - return; - } + // Store session endpoint (helper for login page) + if (parsedUrl.pathname === "/oauth/store-session" && req.method === "POST") { + oauthHandler.handleOAuthStoreSession(req, res); + return; 
+ } - // OAuth token endpoint - if (parsedUrl.pathname === "/oauth/token" && req.method === "POST") { - oauthHandler.handleOAuthToken(req, res); - return; + // OAuth token endpoint + if (parsedUrl.pathname === "/oauth/token" && req.method === "POST") { + oauthHandler.handleOAuthToken(req, res); + return; + } } // ================================================================ @@ -1058,4 +1213,3 @@ function detectRepoName(workspace, config) { const leaf = workspace ? path.basename(workspace) : ""; return leaf && SLUGGED_REPO_RE.test(leaf) ? leaf : null; } - diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index 46d3db1e..4d10325b 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -300,6 +300,23 @@ def _highlight_snippet(snippet, tokens): # type: ignore ) mcp = FastMCP(APP_NAME, transport_security=_security_settings) +# Minimal resource so MCP clients can verify resource wiring. +@mcp.resource( + "resource://context-engine/indexer/info", + name="context-engine-indexer-info", + title="Context Engine Indexer Info", + description="Basic metadata about the running indexer MCP server.", + mime_type="application/json", +) +def _indexer_info_resource(): + return { + "app": APP_NAME, + "host": HOST, + "port": PORT, + "qdrant_url": QDRANT_URL, + "default_collection": DEFAULT_COLLECTION, + } + # Capture tool registry automatically by wrapping the decorator once _TOOLS_REGISTRY: list[dict] = [] From 2f14efff09bbc98b7b80f83301f95566bfd38867 Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 13 Feb 2026 20:43:51 +0000 Subject: [PATCH 09/39] fix(mcp): correct template dedupe uri source and clean debug field handling --- ctx-mcp-bridge/src/mcpServer.js | 4 ++-- scripts/mcp_impl/search.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ctx-mcp-bridge/src/mcpServer.js b/ctx-mcp-bridge/src/mcpServer.js index aa55ea73..7180afff 100644 --- a/ctx-mcp-bridge/src/mcpServer.js +++ 
b/ctx-mcp-bridge/src/mcpServer.js @@ -83,8 +83,8 @@ function dedupeResourceTemplates(templates) { const out = []; for (const template of templates) { const uri = - template && template.resourceTemplate && typeof template.resourceTemplate.uriTemplate === "string" - ? template.resourceTemplate.uriTemplate + template && typeof template.uriTemplate === "string" + ? template.uriTemplate : ""; if (!uri || seen.has(uri)) { continue; diff --git a/scripts/mcp_impl/search.py b/scripts/mcp_impl/search.py index a06a009e..bc310f14 100644 --- a/scripts/mcp_impl/search.py +++ b/scripts/mcp_impl/search.py @@ -82,12 +82,15 @@ def _strip_debug_fields(item: dict, keep_paths: bool = True) -> dict: Args: item: Result dict to strip - keep_paths: If True, keep host_path/container_path (client paths) + keep_paths: If True, keep host_path/container_path Returns: New dict with debug fields removed """ - result = {k: v for k, v in item.items() if k not in _DEBUG_RESULT_FIELDS} + strip_fields = _DEBUG_RESULT_FIELDS + if keep_paths: + strip_fields = _DEBUG_RESULT_FIELDS - {"host_path", "container_path"} + result = {k: v for k, v in item.items() if k not in strip_fields} return result @@ -493,7 +496,6 @@ def _to_str_list(x): # Debug mode: when False (default), strip internal/debug fields from results # to reduce token bloat. Set debug=True to see components, rerank_counters, etc. - debug_raw = debug debug = _to_bool(debug, False) # Default behavior: exclude commit-history docs (which use path=".git") from From a8656de1806c0745c1baee068846cd69c79d5332 Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 13 Feb 2026 22:15:09 +0000 Subject: [PATCH 10/39] vscode-ext: Adds bundled MCP bridge mode Adds a new "bundled" mode for the MCP bridge, allowing the extension to use a pre-packaged bridge binary. The extension now attempts to use the bundled bridge if the `mcpBridgeMode` setting is set to "bundled". If not found, it falls back to external resolution. 
Also rewords the "localOnly" setting description to reflect the addition of "mcpBridgeMode". --- vscode-extension/build/build.sh | 21 ++++++++++++ .../context-engine-uploader/extension.js | 3 ++ .../context-engine-uploader/mcp_bridge.js | 33 ++++++++++++++++++- .../context-engine-uploader/package.json | 12 ++++++- 4 files changed, 67 insertions(+), 2 deletions(-) diff --git a/vscode-extension/build/build.sh b/vscode-extension/build/build.sh index f3e4d9fa..6607ac68 100755 --- a/vscode-extension/build/build.sh +++ b/vscode-extension/build/build.sh @@ -76,6 +76,27 @@ if [[ "$BUNDLE_DEPS" == "--bundle-deps" ]]; then fi fi +# Bundle MCP bridge npm package into the staged extension +BRIDGE_SRC="$SCRIPT_DIR/../../ctx-mcp-bridge" +BRIDGE_DIR="ctx-mcp-bridge" + +if [[ -d "$BRIDGE_SRC" && -f "$BRIDGE_SRC/package.json" ]]; then + echo "Bundling MCP bridge npm package into staged extension..." + mkdir -p "$STAGE_DIR/$BRIDGE_DIR" + cp -a "$BRIDGE_SRC/bin" "$STAGE_DIR/$BRIDGE_DIR/" + cp -a "$BRIDGE_SRC/src" "$STAGE_DIR/$BRIDGE_DIR/" + cp "$BRIDGE_SRC/package.json" "$STAGE_DIR/$BRIDGE_DIR/" + + if [[ -d "$BRIDGE_SRC/node_modules" ]]; then + cp -a "$BRIDGE_SRC/node_modules" "$STAGE_DIR/$BRIDGE_DIR/" + else + echo "Warning: Bridge node_modules not found. Run 'npm install' in ctx-mcp-bridge first." + fi + echo "MCP bridge bundled successfully." +else + echo "Warning: MCP bridge source not found at $BRIDGE_SRC" +fi + pushd "$STAGE_DIR" >/dev/null echo "Packaging extension..." npx @vscode/vsce package --no-dependencies --out "$OUT_DIR" diff --git a/vscode-extension/context-engine-uploader/extension.js b/vscode-extension/context-engine-uploader/extension.js index 9a387c66..217d14e7 100644 --- a/vscode-extension/context-engine-uploader/extension.js +++ b/vscode-extension/context-engine-uploader/extension.js @@ -230,6 +230,7 @@ function activate(context) { path, fs, log, + extensionRoot, getEffectiveConfig, resolveBridgeWorkspacePath: () => configResolver ? 
configResolver.resolveBridgeWorkspacePath() : undefined, attachOutput: (child, label) => processManager ? processManager.attachOutput(child, label) : undefined, @@ -425,6 +426,7 @@ function activate(context) { event.affectsConfiguration('contextEngineUploader.mcpBridgeBinPath') || event.affectsConfiguration('contextEngineUploader.mcpBridgePort') || event.affectsConfiguration('contextEngineUploader.mcpBridgeLocalOnly') || + event.affectsConfiguration('contextEngineUploader.mcpBridgeMode') || event.affectsConfiguration('contextEngineUploader.windsurfMcpPath') || event.affectsConfiguration('contextEngineUploader.augmentMcpPath') || event.affectsConfiguration('contextEngineUploader.antigravityMcpPath') || @@ -439,6 +441,7 @@ function activate(context) { event.affectsConfiguration('contextEngineUploader.mcpBridgePort') || event.affectsConfiguration('contextEngineUploader.mcpBridgeBinPath') || event.affectsConfiguration('contextEngineUploader.mcpBridgeLocalOnly') || + event.affectsConfiguration('contextEngineUploader.mcpBridgeMode') || event.affectsConfiguration('contextEngineUploader.mcpIndexerUrl') || event.affectsConfiguration('contextEngineUploader.mcpMemoryUrl') || event.affectsConfiguration('contextEngineUploader.mcpServerMode') || diff --git a/vscode-extension/context-engine-uploader/mcp_bridge.js b/vscode-extension/context-engine-uploader/mcp_bridge.js index d9825177..b7c1b441 100644 --- a/vscode-extension/context-engine-uploader/mcp_bridge.js +++ b/vscode-extension/context-engine-uploader/mcp_bridge.js @@ -4,6 +4,7 @@ function createBridgeManager(deps) { const path = deps.path; const fs = deps.fs; const log = deps.log; + const extensionRoot = deps.extensionRoot; const getEffectiveConfig = deps.getEffectiveConfig; const resolveBridgeWorkspacePath = deps.resolveBridgeWorkspacePath; @@ -42,7 +43,36 @@ function createBridgeManager(deps) { } } + function getBridgeMode() { + try { + const settings = getEffectiveConfig(); + return (settings.get('mcpBridgeMode') || 
'bundled').trim(); + } catch (_) { + return 'bundled'; + } + } + + function findBundledBridgeBin() { + if (!extensionRoot) return undefined; + const bundledPath = path.join(extensionRoot, 'ctx-mcp-bridge', 'bin', 'ctxce.js'); + if (fs.existsSync(bundledPath)) { + return path.resolve(bundledPath); + } + return undefined; + } + function findLocalBridgeBin() { + // First check for bundled bridge if mode is 'bundled' + const mode = getBridgeMode(); + if (mode === 'bundled') { + const bundledBin = findBundledBridgeBin(); + if (bundledBin) { + return bundledBin; + } + log('Bundled bridge requested but not found; falling back to external resolution'); + } + + // External mode logic (existing behavior) let localOnly = true; let configured = ''; try { @@ -68,11 +98,12 @@ function createBridgeManager(deps) { function resolveBridgeCliInvocation() { const binPath = findLocalBridgeBin(); + const mode = getBridgeMode(); if (binPath) { return { command: 'node', args: [binPath], - kind: 'local' + kind: mode === 'bundled' ? 'bundled' : 'local' }; } const isWindows = process.platform === 'win32'; diff --git a/vscode-extension/context-engine-uploader/package.json b/vscode-extension/context-engine-uploader/package.json index d5e3584f..77b86261 100644 --- a/vscode-extension/context-engine-uploader/package.json +++ b/vscode-extension/context-engine-uploader/package.json @@ -297,7 +297,17 @@ "contextEngineUploader.mcpBridgeLocalOnly": { "type": "boolean", "default": false, - "description": "Development toggle. When true (default) the extension prefers local bridge binaries resolved from mcpBridgeBinPath or CTXCE_BRIDGE_BIN before falling back to the published npm build via npx." + "description": "Development toggle. When true and mcpBridgeMode='external', prefers local bridge binaries resolved from mcpBridgeBinPath or CTXCE_BRIDGE_BIN before falling back to the published npm build via npx. Ignored when mcpBridgeMode='bundled'." 
+ }, + "contextEngineUploader.mcpBridgeMode": { + "type": "string", + "enum": ["bundled", "external"], + "default": "bundled", + "description": "Bridge invocation mode. 'bundled' uses the bundled bridge inside the extension (offline, no npx required). 'external' uses external binary path or npx (current behavior).", + "enumDescriptions": [ + "Use the bundled MCP bridge inside the extension (works offline).", + "Use external binary path or npx to run the bridge (requires internet for first npx install)." + ] }, "contextEngineUploader.mcpServerMode": { "type": "string", From 6cecc26c799cd94f7040596b34a8f9d5ff4b2a2e Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 14 Feb 2026 06:32:41 +0000 Subject: [PATCH 11/39] Add back Claude Code workflow for GH --- .github/workflows/claude.yaml | 68 +++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 .github/workflows/claude.yaml diff --git a/.github/workflows/claude.yaml b/.github/workflows/claude.yaml new file mode 100644 index 00000000..732de78c --- /dev/null +++ b/.github/workflows/claude.yaml @@ -0,0 +1,68 @@ +name: Claude Code + +on: + issue_comment: + types: [created] + pull_request_review_comment: + types: [created] + issues: + types: [opened] + pull_request_review: + types: [submitted] + pull_request_target: + types: [opened, synchronize] + +jobs: + claude: + # This simplified condition is more robust and correctly checks permissions. 
+ if: > + (contains(github.event.comment.body, '@claude') || + contains(github.event.review.body, '@claude') || + contains(github.event.issue.body, '@claude') || + contains(github.event.pull_request.body, '@claude')) && + (github.event.sender.type == 'User' && ( + github.event.comment.author_association == 'OWNER' || + github.event.comment.author_association == 'MEMBER' || + github.event.comment.author_association == 'COLLABORATOR' + )) + runs-on: ubuntu-latest + permissions: + # CRITICAL: Write permissions are required for the action to push branches and update issues/PRs. + contents: write + pull-requests: write + issues: write + id-token: write # Required for OIDC token exchange + actions: read # Required for Claude to read CI results on PRs + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + # This correctly checks out the PR's head commit for pull_request_target events. + ref: ${{ github.event.pull_request.head.sha }} + + - name: Create Claude settings file + run: | + mkdir -p /home/runner/.claude + cat > /home/runner/.claude/settings.json << 'EOF' + { + "env": { + "ANTHROPIC_BASE_URL": "https://api.z.ai/api/anthropic", + "ANTHROPIC_AUTH_TOKEN": "${{ secrets.CUSTOM_ENDPOINT_API_KEY }}" + } + } + EOF + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@v1 + with: + # Still need this to satisfy the action's validation + anthropic_api_key: ${{ secrets.CUSTOM_ENDPOINT_API_KEY }} + + # Use the same variable names as your local setup + settings: '{"env": {"ANTHROPIC_BASE_URL": "https://api.z.ai/api/anthropic", "ANTHROPIC_AUTH_TOKEN": "${{ secrets.CUSTOM_ENDPOINT_API_KEY }}"}}' + + track_progress: true + claude_args: | + --allowedTools "Bash,Edit,Read,Write,Glob,Grep" From ec69b2b7df02a650ff4f36211c553d18e3986116 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 14 Feb 2026 06:33:24 +0000 Subject: [PATCH 12/39] Improves upload client and code search handling - Updates code search to accept additional parameters for 
debugging and output formatting. - Caches the excluded directory names in the upload clients to improve performance and prevent unintended behavior from runtime changes to exclusion settings. - Adds a fallback mechanism for Qdrant client connections in the admin copy collection function, ensuring resilience in case pooled client acquisition fails. --- scripts/mcp_indexer_server.py | 6 ++++++ scripts/remote_upload_client.py | 6 +++++- scripts/standalone_upload_client.py | 6 +++++- scripts/upload_service.py | 10 +++++++--- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index 4d10325b..cdd1912b 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -1662,6 +1662,9 @@ async def code_search( case: Any = None, session: Any = None, compact: Any = None, + debug: Any = None, + output_format: Any = None, + repo: Any = None, kwargs: Any = None, ) -> Dict[str, Any]: """Exact alias of repo_search (hybrid code search with reranking enabled by default). @@ -1694,6 +1697,9 @@ async def code_search( case=case, session=session, compact=compact, + debug=debug, + output_format=output_format, + repo=repo, kwargs=kwargs, ) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 42a726e9..bf9b980f 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -520,8 +520,12 @@ def log_mapping_summary(self) -> None: logger.info(f" source_path: {info['source_path']}") logger.info(f" container_path: {info['container_path']}") - def _excluded_dirnames(self) -> set: + def _excluded_dirnames(self) -> frozenset: # Keep in sync with standalone_upload_client exclusions. + # NOTE: This caches the exclusion set per RemoteUploadClient instance. + # Runtime changes to DEV_REMOTE_MODE/REMOTE_UPLOAD_MODE won't be reflected + # until a new client is created (typically via process restart), which is + # acceptable for the upload client use case. 
cached = getattr(self, "_excluded_dirnames_cache", None) if cached is not None: return cached diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 1ef65161..62982e91 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -730,8 +730,12 @@ def log_mapping_summary(self) -> None: logger.info(f" source_path: {info['source_path']}") logger.info(f" container_path: {info['container_path']}") - def _excluded_dirnames(self) -> set: + def _excluded_dirnames(self) -> frozenset: # Keep in sync with get_all_code_files exclusions. + # NOTE: This caches the exclusion set per client instance. + # Runtime changes to DEV_REMOTE_MODE/REMOTE_UPLOAD_MODE won't be reflected + # until a new client is created (typically via process restart), which is + # acceptable for the standalone upload client use case. cached = getattr(self, "_excluded_dirnames_cache", None) if cached is not None: return cached diff --git a/scripts/upload_service.py b/scripts/upload_service.py index c4e19064..9acd9931 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -1247,15 +1247,19 @@ async def admin_copy_collection( if not name.endswith("_graph") and not str(new_name).endswith("_graph"): used_pooled = False if pooled_qdrant_client is not None: + used_pooled = True try: with pooled_qdrant_client( url=QDRANT_URL, api_key=os.environ.get("QDRANT_API_KEY"), ) as cli: - cli.get_collection(collection_name=f"{new_name}_graph") - graph_copied = "1" - used_pooled = True + try: + cli.get_collection(collection_name=f"{new_name}_graph") + graph_copied = "1" + except Exception: + graph_copied = "0" except Exception: + # Failed to acquire pooled client; fall back to non-pooled used_pooled = False if not used_pooled: try: From ba3d336cdc01b232aa599d75d4a6f3a914d413d4 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 14 Feb 2026 06:33:50 +0000 Subject: [PATCH 13/39] Prompts for venv creation when auto-detection fails Ensures the 
user is prompted to create a virtual environment and install dependencies when both the initial Python environment and auto-detected environments fail to provide the required modules. Removes the conditional check that prevented the prompt when modules were missing from an auto-detected interpreter. --- vscode-extension/context-engine-uploader/python_env.js | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vscode-extension/context-engine-uploader/python_env.js b/vscode-extension/context-engine-uploader/python_env.js index 190f9945..9e5e7fd5 100644 --- a/vscode-extension/context-engine-uploader/python_env.js +++ b/vscode-extension/context-engine-uploader/python_env.js @@ -339,10 +339,7 @@ function createPythonEnvManager(deps) { } // As a last resort, offer to create a private venv and install deps via pip - if (!allowPrompt) { - log('Skipping auto-install prompt; interpreter was auto-detected and missing modules.'); - return false; - } + // Always prompt at this point - we've exhausted all other options (initial Python + auto-detected both failed) const choice = await vscode.window.showErrorMessage( 'Context Engine Uploader: missing Python modules. Create isolated environment and auto-install?', 'Auto-install to private venv', From a6201250ec5f80e49fc807f01924091f4d647a96 Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 2 Mar 2026 11:05:35 +0000 Subject: [PATCH 14/39] Updates session defaults on ID change Ensures that when a user's session ID changes, such as after an authentication event, the `defaultsPayload` is updated with the new session ID and re-sent to the indexer and memory clients. This keeps backend services synchronized with the current active session. Also adds the session ID to the defaults payload during initial setup. 
--- ctx-mcp-bridge/src/mcpServer.js | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/ctx-mcp-bridge/src/mcpServer.js b/ctx-mcp-bridge/src/mcpServer.js index 7180afff..c5b2c66b 100644 --- a/ctx-mcp-bridge/src/mcpServer.js +++ b/ctx-mcp-bridge/src/mcpServer.js @@ -726,6 +726,7 @@ async function createBridgeServer(options) { indexerClient = nextIndexerClient; memoryClient = nextMemoryClient; + defaultsPayload.session = sessionId; if (Object.keys(defaultsPayload).length > 1 && indexerClient) { await sendSessionDefaults(indexerClient, defaultsPayload, "indexer"); if (memoryClient) { @@ -847,16 +848,22 @@ async function createBridgeServer(options) { debugLog(`[ctxce] tools/call: ${name || ""}`); - // Refresh session before each call; re-init clients if session changes. + // Check if session changed (e.g., after auth login), and re-send defaults if so. const freshSession = resolveSessionId() || sessionId; if (freshSession && freshSession !== sessionId) { sessionId = freshSession; - try { - await initializeRemoteClients(true); - } catch (err) { - debugLog("[ctxce] Failed to reinitialize clients after session refresh: " + String(err)); + defaultsPayload.session = sessionId; + if (Object.keys(defaultsPayload).length > 1) { + await initializeRemoteClients(false); + if (indexerClient) { + await sendSessionDefaults(indexerClient, defaultsPayload, "indexer"); + } + if (memoryClient) { + await sendSessionDefaults(memoryClient, defaultsPayload, "memory"); + } } } + if (sessionId && (args === undefined || args === null || typeof args === "object")) { const obj = args && typeof args === "object" ? 
{ ...args } : {}; if (!Object.prototype.hasOwnProperty.call(obj, "session")) { From 7ed96d90a348bfa67a447df8f07ef5e9b4c77461 Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 6 Mar 2026 12:17:40 +0000 Subject: [PATCH 15/39] refactor(bridge): consolidate session defaults sync Extract repeated session defaults synchronization into ensureRemoteDefaults function with deduplication tracking via lastDefaultsSyncedSessionId to prevent redundant sync operations across session lifecycle events. --- ctx-mcp-bridge/src/mcpServer.js | 43 ++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/ctx-mcp-bridge/src/mcpServer.js b/ctx-mcp-bridge/src/mcpServer.js index c5b2c66b..8f309331 100644 --- a/ctx-mcp-bridge/src/mcpServer.js +++ b/ctx-mcp-bridge/src/mcpServer.js @@ -33,16 +33,18 @@ function debugLog(message) { async function sendSessionDefaults(client, payload, label) { if (!client) { - return; + return false; } try { await client.callTool({ name: "set_session_defaults", arguments: payload, }); + return true; } catch (err) { // eslint-disable-next-line no-console console.error(`[ctxce] Failed to call set_session_defaults on ${label}:`, err); + return false; } } function dedupeTools(tools) { @@ -533,6 +535,7 @@ async function createBridgeServer(options) { let indexerClient = null; let memoryClient = null; + let lastDefaultsSyncedSessionId = ""; // Derive a simple session identifier for this bridge process. In the // future this can be made user-aware (e.g. 
from auth), but for now we @@ -661,6 +664,23 @@ async function createBridgeServer(options) { defaultsPayload.under = defaultUnder; } + async function ensureRemoteDefaults(force = false) { + defaultsPayload.session = sessionId; + if (!sessionId || Object.keys(defaultsPayload).length <= 1) { + return; + } + if (!force && lastDefaultsSyncedSessionId === sessionId) { + return; + } + const indexerOk = await sendSessionDefaults(indexerClient, defaultsPayload, "indexer"); + if (memoryClient) { + await sendSessionDefaults(memoryClient, defaultsPayload, "memory"); + } + if (indexerOk) { + lastDefaultsSyncedSessionId = sessionId; + } + } + async function initializeRemoteClients(forceRecreate = false) { if (!forceRecreate && indexerClient) { return; @@ -726,13 +746,7 @@ async function createBridgeServer(options) { indexerClient = nextIndexerClient; memoryClient = nextMemoryClient; - defaultsPayload.session = sessionId; - if (Object.keys(defaultsPayload).length > 1 && indexerClient) { - await sendSessionDefaults(indexerClient, defaultsPayload, "indexer"); - if (memoryClient) { - await sendSessionDefaults(memoryClient, defaultsPayload, "memory"); - } - } + await ensureRemoteDefaults(true); } await initializeRemoteClients(false); @@ -853,15 +867,8 @@ async function createBridgeServer(options) { if (freshSession && freshSession !== sessionId) { sessionId = freshSession; defaultsPayload.session = sessionId; - if (Object.keys(defaultsPayload).length > 1) { - await initializeRemoteClients(false); - if (indexerClient) { - await sendSessionDefaults(indexerClient, defaultsPayload, "indexer"); - } - if (memoryClient) { - await sendSessionDefaults(memoryClient, defaultsPayload, "memory"); - } - } + await initializeRemoteClients(false); + await ensureRemoteDefaults(true); } if (sessionId && (args === undefined || args === null || typeof args === "object")) { @@ -887,6 +894,7 @@ async function createBridgeServer(options) { } await initializeRemoteClients(false); + await 
ensureRemoteDefaults(false); const timeoutMs = getBridgeToolTimeoutMs(); const maxAttempts = getBridgeRetryAttempts(); @@ -923,6 +931,7 @@ async function createBridgeServer(options) { String(err), ); await initializeRemoteClients(true); + await ensureRemoteDefaults(true); sessionRetried = true; continue; } From 036c6777822d85979a70cd7a2e4612a1aa80fd1e Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 7 Mar 2026 11:27:41 +0000 Subject: [PATCH 16/39] fix(search): change `under` filter to recursive subtree scope Refactor path scoping logic into shared `scripts/path_scope` module and change `under` parameter semantics from exact path_prefix match to recursive subtree filtering. This allows `under="scripts"` to match all files under scripts/** rather than requiring exact prefix equality. - Add `scripts/path_scope.py` with `normalize_under`, `metadata_matches_under`, and `path_matches_under` helpers for consistent path scoping - Update hybrid_search, expand.py, symbol_graph, and rerank_tools/local.py to use shared path scoping module with client-side post-filtering - Remove server-side path_prefix equality filters in favor of recursive client-side matching against multiple path forms (repo_rel_path, host_path, container_path, etc.) 
- Add overfetch multiplier for rerank when `under` is specified to ensure sufficient candidates before client-side filtering - Update docstrings to clarify `under` as recursive workspace subtree filter - Add comprehensive tests for new path scoping behavior - Modernize test asyncio usage from deprecated get_event_loop() to asyncio.run() --- scripts/hybrid/expand.py | 39 ++--- scripts/hybrid_search.py | 42 ++--- scripts/mcp_impl/search.py | 157 +++++++++++++----- scripts/mcp_impl/symbol_graph.py | 90 +++++----- scripts/mcp_indexer_server.py | 4 +- scripts/path_scope.py | 227 ++++++++++++++++++++++++++ scripts/rerank_tools/local.py | 71 ++++---- tests/test_change_history_for_path.py | 5 +- tests/test_globs_and_snippet.py | 26 +++ tests/test_integration_qdrant.py | 9 +- tests/test_negative_args.py | 3 +- tests/test_path_scope.py | 61 +++++++ tests/test_rerank_under_scope.py | 66 ++++++++ tests/test_reranker_verification.py | 73 ++++++++- tests/test_server_helpers.py | 3 +- tests/test_service_qdrant_status.py | 3 +- tests/test_staging_lifecycle.py | 1 + tests/test_symbol_graph_tool.py | 76 ++++++--- 18 files changed, 755 insertions(+), 201 deletions(-) create mode 100644 scripts/path_scope.py create mode 100644 tests/test_path_scope.py create mode 100644 tests/test_rerank_under_scope.py diff --git a/scripts/hybrid/expand.py b/scripts/hybrid/expand.py index 20a43834..b228f4ca 100644 --- a/scripts/hybrid/expand.py +++ b/scripts/hybrid/expand.py @@ -29,6 +29,11 @@ from typing import List, Dict, Any, TYPE_CHECKING from pathlib import Path +from scripts.path_scope import ( + normalize_under as _normalize_under_scope, + metadata_matches_under as _metadata_matches_under, +) + logger = logging.getLogger("hybrid_expand") # Import QdrantClient type for annotations @@ -542,20 +547,8 @@ def expand_via_embeddings( except Exception: vec_name = None - def _norm_under(u: str | None) -> str | None: - if not u: - return None - u = str(u).strip().replace("\\", "/") - u = "/".join([p for 
p in u.split("/") if p]) - if not u: - return None - if u.startswith("/work/"): - return u - if not u.startswith("/"): - return "/work/" + u - return "/work/" + u.lstrip("/") - flt = None + eff_under = _normalize_under_scope(under) try: from qdrant_client import models @@ -567,15 +560,6 @@ def _norm_under(u: str | None) -> str | None: match=models.MatchValue(value=language), ) ) - if under: - eff_under = _norm_under(under) - if eff_under: - must.append( - models.FieldCondition( - key="metadata.path_prefix", - match=models.MatchValue(value=eff_under), - ) - ) if kind: must.append( models.FieldCondition( @@ -637,6 +621,17 @@ def _norm_under(u: str | None) -> str | None: if not results: return [] + if eff_under: + _scoped = [] + for hit in results: + payload = getattr(hit, "payload", None) or {} + md = payload.get("metadata") or {} + if _metadata_matches_under(md, eff_under): + _scoped.append(hit) + results = _scoped + if not results: + return [] + # Extract unique terms from neighbors extracted_terms: set[str] = set() query_tokens = set(combined_query.lower().split()) diff --git a/scripts/hybrid_search.py b/scripts/hybrid_search.py index 70b5b094..f893f7a7 100644 --- a/scripts/hybrid_search.py +++ b/scripts/hybrid_search.py @@ -296,6 +296,10 @@ from scripts.utils import sanitize_vector_name as _sanitize_vector_name from scripts.ingest_code import ensure_collection as _ensure_collection_raw from scripts.ingest_code import project_mini as _project_mini +from scripts.path_scope import ( + normalize_under as _normalize_under_scope, + metadata_matches_under as _metadata_matches_under, +) # --------------------------------------------------------------------------- # Module logger @@ -440,7 +444,7 @@ def run_pure_dense_search( model: Embedding model (will load default if None) collection: Qdrant collection name language: Optional language filter - under: Optional path prefix filter + under: Optional recursive workspace subtree filter repo: Optional repo filter Returns: @@ 
-465,12 +469,10 @@ def run_pure_dense_search( vec_name = sanitize_vector_name(model_name) coll = collection or _collection() - # Build filter + # Build server-side filter (exclude `under` here; recursive under is post-filtered) must = [] if language: must.append(models.FieldCondition(key="metadata.language", match=models.MatchValue(value=language))) - if under: - must.append(models.FieldCondition(key="metadata.path_prefix", match=models.MatchValue(value=under))) if repo and repo != "*": if isinstance(repo, list): must.append(models.FieldCondition(key="metadata.repo", match=models.MatchAny(any=repo))) @@ -504,10 +506,13 @@ def run_pure_dense_search( ranked_points = dense_query(client, vec_name, vec_list, flt, limit, coll, query_text=query) # Build output + eff_under = _normalize_under_scope(under) results = [] for p in ranked_points: payload = p.payload or {} md = payload.get("metadata") or {} + if eff_under and not _metadata_matches_under(md, eff_under): + continue # Prefer host_path when available (consistent with hybrid search) _path = md.get("host_path") or payload.get("path") or md.get("path") or "" @@ -690,21 +695,8 @@ def _normalize_globs(globs: list[str]) -> list[str]: eff_path_globs_norm = _normalize_globs(eff_path_globs) eff_not_globs_norm = _normalize_globs(eff_not_globs) - # Normalize under - def _norm_under(u: str | None) -> str | None: - if not u: - return None - u = str(u).strip().replace("\\", "/") - u = "/".join([p for p in u.split("/") if p]) - if not u: - return None - if not u.startswith("/"): - v = "/work/" + u - else: - v = "/work/" + u.lstrip("/") if not u.startswith("/work/") else u - return v - - eff_under = _norm_under(eff_under) + # Normalize under as a user-facing recursive subtree scope. 
+ eff_under = _normalize_under_scope(eff_under) # Expansion knobs that affect query construction/results (must be part of cache key) try: @@ -810,12 +802,8 @@ def _norm_under(u: str | None) -> str | None: key="metadata.repo", match=models.MatchValue(value=eff_repo) ) ) - if eff_under: - must.append( - models.FieldCondition( - key="metadata.path_prefix", match=models.MatchValue(value=eff_under) - ) - ) + # NOTE: `under` is recursive and user-facing; we enforce it in client-side + # filtering via normalized metadata paths instead of exact path_prefix equality. if eff_kind: must.append( models.FieldCondition( @@ -2105,7 +2093,7 @@ def _match_glob(pat: str, path: str) -> bool: return _fnm.fnmatchcase(path, pat) return _fnm.fnmatchcase(path.lower(), pat.lower()) - if eff_not or eff_path_regex or eff_ext or eff_path_globs or eff_not_globs: + if eff_under or eff_not or eff_path_regex or eff_ext or eff_path_globs or eff_not_globs: def _pass_filters(m: Dict[str, Any]) -> bool: md = (m["pt"].payload or {}).get("metadata") or {} @@ -2118,6 +2106,8 @@ def _pass_filters(m: Dict[str, Any]) -> bool: nn = eff_not if case_sensitive else eff_not.lower() if nn in p_for_sub or nn in pp_for_sub: return False + if eff_under and not _metadata_matches_under(md, eff_under): + return False if eff_not_globs_norm and any(_match_glob(g, path) or _match_glob(g, rel) for g in eff_not_globs_norm): return False if eff_ext: diff --git a/scripts/mcp_impl/search.py b/scripts/mcp_impl/search.py index bc310f14..750dca17 100644 --- a/scripts/mcp_impl/search.py +++ b/scripts/mcp_impl/search.py @@ -43,6 +43,9 @@ from scripts.mcp_impl.admin_tools import _detect_current_repo, _run_async from scripts.mcp_toon import _should_use_toon, _format_results_as_toon from scripts.mcp_auth import require_collection_access as _require_collection_access +from scripts.path_scope import ( + normalize_under as _normalize_under_scope, +) # Constants QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") @@ -158,7 
+161,7 @@ async def _repo_search_impl( - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos (disable auto-filter). By default, auto-detects current repo from CURRENT_REPO env and filters to it. Use repo=["frontend","backend"] to search related repos together. - - Filters (optional): language, under (path prefix), kind, symbol, ext, path_regex, + - Filters (optional): language, under (recursive workspace subtree), kind, symbol, ext, path_regex, path_glob (str or list[str]), not_glob (str or list[str]), not_ (negative text), case. - debug: bool (default false). When true, includes verbose internal fields like components, rerank_counters, code_signals. Default false saves ~60-80% tokens. @@ -438,7 +441,7 @@ def _to_str(x, default=""): under = under_hint language = _to_str(language, "").strip() - under = _to_str(under, "").strip() + under = _normalize_under_scope(_to_str(under, "").strip()) kind = _to_str(kind, "").strip() symbol = _to_str(symbol, "").strip() path_regex = _to_str(path_regex, "").strip() @@ -488,6 +491,99 @@ def _to_str_list(x): if detected_repo: repo_filter = [detected_repo] + case_sensitive = str(case or "").strip().lower() in { + "sensitive", + "true", + "1", + "yes", + "on", + } + path_globs_norm = [g if case_sensitive else g.lower() for g in path_globs] + not_globs_norm = [g if case_sensitive else g.lower() for g in not_globs] + + def _norm_case(v: str) -> str: + return v if case_sensitive else v.lower() + + def _match_glob(glob_pat: str, path_val: str) -> bool: + import fnmatch as _fnm + if not glob_pat: + return False + p = _norm_case(path_val).replace("\\", "/").strip("/") + if _fnm.fnmatchcase(p, glob_pat): + return True + # Allow repo-relative globs (e.g., scripts/**) to match absolute paths + # by testing suffix windows of the normalized path. 
+ if not glob_pat.startswith("/") and "/" in p: + parts = [seg for seg in p.split("/") if seg] + for i in range(1, len(parts)): + tail = "/".join(parts[i:]) + if _fnm.fnmatchcase(tail, glob_pat): + return True + return False + + def _result_passes_path_filters(item: dict) -> bool: + import re as _re + + path = str(item.get("path") or "") + if not path: + return False + + # Evaluate filters against all known path forms carried by this result. + path_vals = [] + for key in ("path", "rel_path", "client_path", "host_path", "container_path"): + v = item.get(key) + if isinstance(v, str) and v.strip(): + path_vals.append(v.strip().replace("\\", "/")) + if not path_vals: + path_vals = [path] + if path.startswith("/work/"): + path_vals.append(path[len("/work/") :]) + + # Deduplicate while preserving order. + seen = set() + norm_paths = [] + for pv in path_vals: + if pv not in seen: + norm_paths.append(pv) + seen.add(pv) + + if not_: + needle = _norm_case(str(not_)) + if any(needle in _norm_case(pv) for pv in norm_paths): + return False + + if ext: + ext_norm = str(ext).lower().lstrip(".") + if not any(_norm_case(pv).endswith("." 
+ ext_norm) for pv in norm_paths): + return False + + if path_regex: + flags = 0 if case_sensitive else _re.IGNORECASE + try: + if not any(_re.search(path_regex, pv, flags=flags) for pv in norm_paths): + return False + except Exception: + pass + + if path_globs_norm and not any( + _match_glob(g, pv) for g in path_globs_norm for pv in norm_paths + ): + return False + + if not_globs_norm and any( + _match_glob(g, pv) for g in not_globs_norm for pv in norm_paths + ): + return False + + return True + + def _apply_result_filters(items: list[dict]) -> list[dict]: + if not items: + return [] + if not (not_ or path_regex or ext or path_globs_norm or not_globs_norm): + return items + return [it for it in items if _result_passes_path_filters(it)] + compact_raw = compact compact = _to_bool(compact, False) # If snippets are requested, do not compact (we need snippet field in results) @@ -607,46 +703,9 @@ def _to_str_list(x): ) ) - # Apply post-filters (path_regex, path_glob, not_glob, not_) that aren't - # supported by run_pure_dense_search's server-side filters - case_sensitive = str(case or "").strip().lower() in {"sensitive", "true", "1", "yes", "on"} - import fnmatch as _fnm - import re as _re - - def _norm_path(p: str) -> str: - return p if case_sensitive else p.lower() - - path_globs_norm = [g if case_sensitive else g.lower() for g in path_globs] - not_globs_norm = [g if case_sensitive else g.lower() for g in not_globs] - path_regex_norm = path_regex or "" - - def _match_glob(glob_pat: str, path_val: str) -> bool: - if not glob_pat: - return False - return _fnm.fnmatchcase(_norm_path(path_val), glob_pat) - for item in items: path = item.get("path") or "" - - # Apply path_regex filter - if path_regex_norm: - flags = 0 if case_sensitive else _re.IGNORECASE - try: - if not _re.search(path_regex_norm, path, flags=flags): - continue - except Exception: - pass - - # Apply path_glob filter - if path_globs_norm and not any(_match_glob(g, path) for g in path_globs_norm): - 
continue - - # Apply not_glob filter - if not_globs_norm and any(_match_glob(g, path) for g in not_globs_norm): - continue - - # Apply not_ text filter - if not_ and not_.lower() in _norm_path(path): + if not _result_passes_path_filters({"path": path}): continue payload = item.get("payload") or {} @@ -1269,6 +1328,10 @@ def _doc_for(obj: dict) -> str: item["tags"] = obj.get("tags") results.append(item) + # Enforce strict filter semantics regardless of retrieval/rerank branch. + # This closes gaps where fallback rerank paths may bypass path_glob/not_glob. + results = _apply_result_filters(results) + # Mode-aware reordering: nudge core implementation code vs docs and non-core when requested def _is_doc_path(p: str) -> bool: pl = str(p or "").lower() @@ -1549,6 +1612,12 @@ def _read_snip(args): # start_line, end_line, tags, pseudo results = [_strip_debug_fields(r) for r in results] + _res_ok = bool(res.get("ok", True)) if isinstance(res, dict) else True + try: + _res_code = int((res or {}).get("code", 0)) + except Exception: + _res_code = 0 + response = { "args": { "queries": queries, @@ -1577,11 +1646,17 @@ def _read_snip(args): "used_rerank": bool(used_rerank), "total": len(results), "results": results, - **res, + "ok": _res_ok, + "code": _res_code, } + # Expose a concise failure reason without leaking raw subprocess streams by default. 
+ if (not _res_ok or _res_code != 0) and not results: + response["error"] = "search backend execution failed" + # Only include debug fields when explicitly requested if debug: + response["subprocess"] = res response["rerank_counters"] = rerank_counters if code_signals.get("has_code_signals"): response["code_signals"] = code_signals diff --git a/scripts/mcp_impl/symbol_graph.py b/scripts/mcp_impl/symbol_graph.py index 6c574dff..e15955b8 100644 --- a/scripts/mcp_impl/symbol_graph.py +++ b/scripts/mcp_impl/symbol_graph.py @@ -22,6 +22,12 @@ import re from typing import Any, Dict, List, Optional, Set +from scripts.path_scope import ( + normalize_under as _normalize_under_scope, + metadata_matches_under as _metadata_matches_under, + path_matches_under as _path_matches_under, +) + logger = logging.getLogger(__name__) try: @@ -113,23 +119,18 @@ def _symbol_variants(symbol: str) -> List[str]: return list(dict.fromkeys(variants)) # Dedupe preserving order def _norm_under(u: Optional[str]) -> Optional[str]: - """Normalize an `under` path to match ingest's stored `metadata.path_prefix` values. + """Normalize user-facing `under` to recursive subtree scope token.""" + return _normalize_under_scope(u) - This mirrors the engine's convention: normalize to a /work/... style path. - Note: `under` in this engine is an exact directory filter (not recursive). - """ - if not u: - return None - s = str(u).strip().replace("\\", "/") - s = "/".join([p for p in s.split("/") if p]) - if not s: - return None - # Normalize to /work/... 
- if not s.startswith("/"): - v = "/work/" + s - else: - v = "/work/" + s.lstrip("/") if not s.startswith("/work/") else s - return v.rstrip("/") + +def _point_matches_under(pt: Any, under: Optional[str]) -> bool: + if not under: + return True + payload = getattr(pt, "payload", None) or {} + md = payload.get("metadata", payload) + if not isinstance(md, dict): + md = {} + return _metadata_matches_under(md, under) async def _symbol_graph_impl( @@ -150,7 +151,7 @@ async def _symbol_graph_impl( query_type: One of "callers", "definition", "importers" limit: Maximum number of results language: Optional language filter - under: Optional path prefix filter + under: Optional recursive workspace subtree filter collection: Optional collection override session: Optional session ID for collection routing ctx: MCP context (optional) @@ -201,6 +202,8 @@ async def _symbol_graph_impl( results = [] + norm_under = _norm_under(under) + try: if query_type == "callers": # Prefer graph edges collection when available (fast keyword filters). @@ -212,7 +215,7 @@ async def _symbol_graph_impl( limit=limit, language=language, repo_filter=None, - under=_norm_under(under), + under=norm_under, ) if not results: # Fall back to array field lookup in the main collection. @@ -223,7 +226,7 @@ async def _symbol_graph_impl( value=symbol, limit=limit, language=language, - under=_norm_under(under), + under=norm_under, ) elif query_type == "definition": # Find chunks where symbol_path matches the symbol @@ -233,7 +236,7 @@ async def _symbol_graph_impl( symbol=symbol, limit=limit, language=language, - under=_norm_under(under), + under=norm_under, ) elif query_type == "importers": results = await _query_graph_edges_collection( @@ -244,7 +247,7 @@ async def _symbol_graph_impl( limit=limit, language=language, repo_filter=None, - under=_norm_under(under), + under=norm_under, ) if not results: # Fall back to array field lookup in the main collection. 
@@ -255,7 +258,7 @@ async def _symbol_graph_impl( value=symbol, limit=limit, language=language, - under=_norm_under(under), + under=norm_under, ) # If no results, fall back to semantic search @@ -265,6 +268,7 @@ async def _symbol_graph_impl( query_type=query_type, limit=limit, language=language, + under=norm_under, collection=coll, session=session, ) @@ -277,6 +281,7 @@ async def _symbol_graph_impl( query_type=query_type, limit=limit, language=language, + under=norm_under, collection=coll, session=session, ) @@ -371,7 +376,9 @@ def _scroll(_flt=flt): if not p: continue path_s = str(p) - if under and not str(path_s).startswith(str(under)): + if under and not _path_matches_under( + path_s, under, repo_hint=(payload.get("repo") or repo_filter) + ): continue if path_s in seen_paths: continue @@ -468,14 +475,6 @@ async def _query_array_field( match=qmodels.MatchValue(value=language.lower()), ) ) - if under: - base_conditions.append( - qmodels.FieldCondition( - key="metadata.path_prefix", - match=qmodels.MatchValue(value=under), - ) - ) - # Strategy 1: Exact match with MatchAny (most reliable for array fields) try: filter1 = qmodels.Filter( @@ -499,6 +498,8 @@ def scroll1(): scroll_result = await asyncio.to_thread(scroll1) points = scroll_result[0] if scroll_result else [] for pt in points: + if under and not _point_matches_under(pt, under): + continue pt_id = str(getattr(pt, "id", id(pt))) if pt_id not in seen_ids: seen_ids.add(pt_id) @@ -534,6 +535,8 @@ def scroll2(): scroll_result = await asyncio.to_thread(scroll2) points = scroll_result[0] if scroll_result else [] for pt in points: + if under and not _point_matches_under(pt, under): + continue pt_id = str(getattr(pt, "id", id(pt))) if pt_id not in seen_ids: seen_ids.add(pt_id) @@ -565,6 +568,8 @@ def scroll3(): scroll_result = await asyncio.to_thread(scroll3) points = scroll_result[0] if scroll_result else [] for pt in points: + if under and not _point_matches_under(pt, under): + continue pt_id = str(getattr(pt, 
"id", id(pt))) if pt_id not in seen_ids: seen_ids.add(pt_id) @@ -600,14 +605,6 @@ async def _query_definition( match=qmodels.MatchValue(value=language.lower()), ) ) - if under: - base_conditions.append( - qmodels.FieldCondition( - key="metadata.path_prefix", - match=qmodels.MatchValue(value=under), - ) - ) - # Strategy 1: Exact match on symbol_path (e.g., "MyClass.my_method") try: filter1 = qmodels.Filter( @@ -692,6 +689,8 @@ def scroll3(): seen_ids = set() unique_results = [] for pt in results: + if under and not _point_matches_under(pt, under): + continue pt_id = getattr(pt, "id", None) if pt_id not in seen_ids: seen_ids.add(pt_id) @@ -748,6 +747,7 @@ async def _fallback_semantic_search( query_type: str, limit: int = 20, language: Optional[str] = None, + under: Optional[str] = None, collection: Optional[str] = None, session: Optional[str] = None, ) -> List[Dict[str, Any]]: @@ -769,6 +769,7 @@ async def _fallback_semantic_search( query=query, limit=limit, language=language, + under=under, session=session, output_format="json", # Avoid TOON encoding for internal calls ) @@ -833,7 +834,7 @@ async def _compute_called_by( symbol: The symbol name to find callers for limit: Maximum number of callers to return language: Optional language filter - under: Optional path prefix filter + under: Optional recursive workspace subtree filter collection: Optional collection override Returns: @@ -881,13 +882,6 @@ async def _compute_called_by( ) ) norm_under = _norm_under(under) - if norm_under: - base_conditions.append( - qmodels.FieldCondition( - key="metadata.path_prefix", - match=qmodels.MatchValue(value=norm_under), - ) - ) callers: List[Dict[str, Any]] = [] seen_ids: Set[str] = set() @@ -921,6 +915,8 @@ def do_scroll(): points = scroll_result[0] if scroll_result else [] for pt in points: + if norm_under and not _point_matches_under(pt, norm_under): + continue pt_id = str(getattr(pt, "id", id(pt))) if pt_id in seen_ids: continue diff --git a/scripts/mcp_indexer_server.py 
b/scripts/mcp_indexer_server.py index cdd1912b..444b98ac 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -1431,7 +1431,7 @@ async def symbol_graph( - query_type: str. One of "callers", "definition", "importers". - limit: int (default 20). Maximum results to return. - language: str (optional). Filter by programming language. - - under: str (optional). Filter by path prefix. + - under: str (optional). Filter by recursive workspace subtree (e.g., "scripts" -> scripts/**). - output_format: "json" (default) or "toon" for token-efficient format. Returns: @@ -1746,7 +1746,7 @@ async def info_request( - include_relationships: bool (default false). Add imports_from, calls, related_paths to results. - limit: int (default 10). Maximum results to return. - language: str. Filter by programming language. - - under: str. Limit search to specific directory. + - under: str. Limit search to a recursive workspace subtree. - repo: str or list[str]. Filter by repository name(s). - output_format: "json" (default) or "toon" for token-efficient TOON format. diff --git a/scripts/path_scope.py b/scripts/path_scope.py new file mode 100644 index 00000000..7eb76889 --- /dev/null +++ b/scripts/path_scope.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python3 +""" +Shared helpers for user-facing path scoping (`under`) across search tools. + +`under` is treated as a recursive subtree scope from the user's workspace +perspective (for example: "space" matches ".../space/**"). +""" + +from __future__ import annotations + +import os +import re +from functools import lru_cache +from typing import Any, Mapping, Optional, Set + +_MULTI_SLASH_RE = re.compile(r"/+") + + +def _normalize_path_token(value: Any) -> str: + s = str(value or "").strip().replace("\\", "/") + if not s: + return "" + s = _MULTI_SLASH_RE.sub("/", s) + # Normalize common "file://" style inputs. 
+ if s.startswith("file://"): + s = s[7:] + return s.strip("/") + + +def _normalize_repo_hint(repo_hint: Any) -> str: + r = _normalize_path_token(repo_hint) + if not r: + return "" + return r.split("/")[-1] + + +def _repo_root_hint() -> str: + """Best-effort repository root (directory containing scripts/).""" + try: + return os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + except Exception: + return "" + + +def _maybe_expand_from_cwd(token: str) -> str: + """Recover under values that were relativized from the current subdirectory.""" + s = str(token or "").strip().strip("/") + if not s or "/" in s: + return s + try: + root = _repo_root_hint() + if not root: + return s + cwd = os.path.abspath(os.getcwd()) + if not (cwd == root or cwd.startswith(root + os.sep)): + return s + rel_cwd = os.path.relpath(cwd, root).replace("\\", "/").strip("/") + if not rel_cwd: + return s + rebased = f"{rel_cwd}/{s}" + rebased_path = os.path.join(root, *rebased.split("/")) + top_level_path = os.path.join(root, s) + if os.path.exists(rebased_path) and not os.path.exists(top_level_path): + return rebased + except Exception: + pass + return s + + +@lru_cache(maxsize=256) +def _unique_segment_path(root: str, segment: str) -> str: + """Return unique repo-relative directory path for a segment, else empty.""" + if not root or not segment: + return "" + top = os.path.join(root, segment) + if os.path.exists(top): + return "" + matches: list[str] = [] + skip = { + ".git", + ".codebase", + "__pycache__", + ".venv", + "node_modules", + } + try: + for dirpath, dirnames, _filenames in os.walk(root): + dirnames[:] = [d for d in dirnames if d not in skip and not d.startswith(".")] + if segment in dirnames: + rel = os.path.relpath(os.path.join(dirpath, segment), root).replace("\\", "/") + matches.append(rel.strip("/")) + if len(matches) > 1: + return "" + except Exception: + return "" + return matches[0] if len(matches) == 1 else "" + + +def _maybe_expand_unique_segment(token: str) -> 
str: + """Resolve single-segment under values to a unique subtree when possible.""" + s = str(token or "").strip().strip("/") + if not s or "/" in s: + return s + root = _repo_root_hint() + if not root: + return s + found = _unique_segment_path(root, s) + return found or s + + +def normalize_under(under: Optional[str]) -> Optional[str]: + """Normalize user-provided `under` into a comparable path token.""" + s = _normalize_path_token(under) + if not s or s in {".", "work"}: + return None + # Accept absolute-style workspace prefixes while preserving user-facing scope. + if s.startswith("work/"): + s = s[len("work/") :] + s = _maybe_expand_from_cwd(s) + s = _maybe_expand_unique_segment(s) + if not s or s in {".", "work"}: + return None + return s + + +def _path_forms(path: Any, repo_hint: Any = None) -> Set[str]: + """Generate comparable path forms from a path-like value.""" + p = _normalize_path_token(path) + if not p: + return set() + + forms: Set[str] = {p} + + repo = _normalize_repo_hint(repo_hint) + + if p.startswith("work/"): + rest = p[len("work/") :] + if rest: + forms.add(rest) + if "/" in rest and repo: + head, tail = rest.split("/", 1) + if head.casefold() == repo.casefold() and tail: + forms.add(tail) + + if repo: + repo_cf = repo.casefold() + repo_prefix_cf = repo_cf + "/" + marker = "/" + repo + "/" + marker_cf = "/" + repo_cf + "/" + for f in list(forms): + f_cf = f.casefold() + if f_cf.startswith(repo_prefix_cf): + forms.add(f[len(repo) + 1 :]) + idx = f_cf.find(marker_cf) + if idx >= 0: + tail = f[idx + len(marker) :] + if tail: + forms.add(tail) + + return {x for x in forms if x} + + +def metadata_path_forms(metadata: Mapping[str, Any]) -> Set[str]: + """Collect path forms from a metadata payload.""" + repo_hint = metadata.get("repo") + forms: Set[str] = set() + for key in ( + "repo_rel_path", + "path", + "container_path", + "host_path", + "path_prefix", + "file_path", + "rel_path", + "client_path", + ): + v = metadata.get(key) + if v: + 
forms.update(_path_forms(v, repo_hint=repo_hint)) + return forms + + +def metadata_matches_under(metadata: Mapping[str, Any], under: Optional[str]) -> bool: + """Return True when metadata falls under the requested subtree scope.""" + norm_under = normalize_under(under) + if not norm_under: + return True + + repo_hint = metadata.get("repo") + under_forms = _path_forms(norm_under, repo_hint=repo_hint) + under_forms.add(norm_under) + if not norm_under.startswith("work/"): + under_forms.add("work/" + norm_under) + + under_forms_l = {u.casefold() for u in under_forms if u} + if not under_forms_l: + return True + + has_repo_hint = bool(str(repo_hint or "").strip()) + + for cand in metadata_path_forms(metadata): + cand_forms = {cand} + # Compatibility fallback for points that only store /work//... paths + # but do not carry metadata.repo (older/benchmark/custom payloads). + if not has_repo_hint: + c0 = cand.strip("/") + if c0.startswith("work/"): + rest = c0[len("work/") :] + if "/" in rest: + _head, tail = rest.split("/", 1) + if tail: + cand_forms.add(tail) + + for cf in cand_forms: + c = cf.casefold() + for u in under_forms_l: + if c == u or c.startswith(u + "/"): + return True + return False + + +def path_matches_under(path: Any, under: Optional[str], repo_hint: Any = None) -> bool: + """Path-only convenience wrapper for `under` subtree matching.""" + md = {"path": path} + if repo_hint: + md["repo"] = repo_hint + return metadata_matches_under(md, under) diff --git a/scripts/rerank_tools/local.py b/scripts/rerank_tools/local.py index e2151791..b7b1ba48 100644 --- a/scripts/rerank_tools/local.py +++ b/scripts/rerank_tools/local.py @@ -134,6 +134,10 @@ def _get_rerank_session(): from scripts.utils import sanitize_vector_name as _sanitize_vector_name +from scripts.path_scope import ( + normalize_under as _normalize_under_scope, + metadata_matches_under as _metadata_matches_under, +) def warmup_reranker(): @@ -163,18 +167,14 @@ def _start_background_warmup(): 
_start_background_warmup() -def _norm_under(u: str | None) -> str | None: - if not u: - return None - u = str(u).strip().replace("\\", "/") - u = "/".join([p for p in u.split("/") if p]) - if not u: - return None - if not u.startswith("/"): - return "/work/" + u - if not u.startswith("/work/"): - return "/work/" + u.lstrip("/") - return u +def _point_matches_under(pt: Any, under: str | None) -> bool: + if not under: + return True + payload = getattr(pt, "payload", None) or {} + md = payload.get("metadata") or {} + if not isinstance(md, dict): + md = {} + return _metadata_matches_under(md, under) def _select_dense_vector_name( @@ -366,18 +366,21 @@ def rerank_in_process( key="metadata.language", match=models.MatchValue(value=language) ) ) - eff_under = _norm_under(under) - if eff_under: - must.append( - models.FieldCondition( - key="metadata.path_prefix", match=models.MatchValue(value=eff_under) - ) - ) + eff_under = _normalize_under_scope(under) flt = models.Filter(must=must) if must else None - pts = dense_results(client, _model, vec_name, query, flt, topk, eff_collection) - if not pts and flt is not None: - pts = dense_results(client, _model, vec_name, query, None, topk, eff_collection) + fetch_topk = max(1, int(topk)) + if eff_under: + try: + under_mult = int(os.environ.get("RERANK_UNDER_FETCH_MULT", "4") or 4) + except Exception: + under_mult = 4 + fetch_topk = max(fetch_topk, int(limit) * max(under_mult, 2), fetch_topk * max(under_mult, 2)) + fetch_topk = min(fetch_topk, 2000) + + pts = dense_results(client, _model, vec_name, query, flt, fetch_topk, eff_collection) + if eff_under and pts: + pts = [pt for pt in pts if _point_matches_under(pt, eff_under)] if not pts: return [] @@ -447,19 +450,21 @@ def main(): key="metadata.language", match=models.MatchValue(value=args.language) ) ) - eff_under = _norm_under(args.under) - if eff_under: - must.append( - models.FieldCondition( - key="metadata.path_prefix", match=models.MatchValue(value=eff_under) - ) - ) + 
eff_under = _normalize_under_scope(args.under) flt = models.Filter(must=must) if must else None - pts = dense_results(client, model, vec_name, args.query, flt, args.topk, eff_collection) - # Fallback: if filtered search yields nothing, retry without filters to avoid empty rerank - if not pts and flt is not None: - pts = dense_results(client, model, vec_name, args.query, None, args.topk, eff_collection) + fetch_topk = max(1, int(args.topk)) + if eff_under: + try: + under_mult = int(os.environ.get("RERANK_UNDER_FETCH_MULT", "4") or 4) + except Exception: + under_mult = 4 + fetch_topk = max(fetch_topk, int(args.limit) * max(under_mult, 2), fetch_topk * max(under_mult, 2)) + fetch_topk = min(fetch_topk, 2000) + + pts = dense_results(client, model, vec_name, args.query, flt, fetch_topk, eff_collection) + if eff_under and pts: + pts = [pt for pt in pts if _point_matches_under(pt, eff_under)] if not pts: return pairs = prepare_pairs(args.query, pts) diff --git a/tests/test_change_history_for_path.py b/tests/test_change_history_for_path.py index 52be7592..7d822150 100644 --- a/tests/test_change_history_for_path.py +++ b/tests/test_change_history_for_path.py @@ -15,6 +15,10 @@ def tool(self, *args, **kwargs): def _decorator(fn): return fn return _decorator + def resource(self, *args, **kwargs): + def _decorator(fn): + return fn + return _decorator class _Context: def __init__(self, *args, **kwargs): @@ -98,4 +102,3 @@ async def test_change_history_strict_match_under_work(monkeypatch): assert summary.get("ingested_min") == 90 assert summary.get("ingested_max") == 115 assert summary.get("churn_count_max") == 5 - diff --git a/tests/test_globs_and_snippet.py b/tests/test_globs_and_snippet.py index 4c30ff48..e367bab3 100644 --- a/tests/test_globs_and_snippet.py +++ b/tests/test_globs_and_snippet.py @@ -116,6 +116,32 @@ def test_run_hybrid_search_slugged_path_globs(monkeypatch): assert "/work/other/docs/readme.md" not in paths +@pytest.mark.unit +def 
test_run_hybrid_search_under_recursive_scope(monkeypatch): + pts = [ + _Pt("1", "/work/repo/space/ship/a.py"), + _Pt("2", "/work/repo/direct/tools/b.py"), + ] + monkeypatch.setattr(hyb, "get_qdrant_client", lambda *a, **k: FakeQdrant(pts)) + monkeypatch.setattr(hyb, "return_qdrant_client", lambda *a, **k: None) + monkeypatch.setenv("EMBEDDING_MODEL", "unit-test") + monkeypatch.setenv("QDRANT_URL", "http://localhost:6333") + monkeypatch.setattr(hyb, "TextEmbedding", lambda *a, **k: FakeEmbed()) + monkeypatch.setattr(hyb, "_get_embedding_model", lambda *a, **k: FakeEmbed()) + + items = hyb.run_hybrid_search( + queries=["rotate heading"], + limit=10, + per_path=2, + under="space", + expand=False, + model=FakeEmbed(), + ) + paths = {it.get("path") for it in items} + assert "/work/repo/space/ship/a.py" in paths + assert "/work/repo/direct/tools/b.py" not in paths + + @pytest.mark.unit def test_dense_query_preserves_collection_on_filter_drop(monkeypatch): calls = [] diff --git a/tests/test_integration_qdrant.py b/tests/test_integration_qdrant.py index 65cef7f5..ab5d71d1 100644 --- a/tests/test_integration_qdrant.py +++ b/tests/test_integration_qdrant.py @@ -1,6 +1,7 @@ import os import json import uuid +import asyncio import importlib import pytest @@ -75,7 +76,7 @@ def test_index_and_search_minirepo(tmp_path, monkeypatch, qdrant_container): ) # Search directly via async function - res = srv.asyncio.get_event_loop().run_until_complete( + res = asyncio.run( srv.repo_search( queries=["def f"], limit=5, @@ -127,19 +128,19 @@ def test_filters_language_and_path(tmp_path, monkeypatch, qdrant_container): f_md = str(tmp_path / "pkg" / "b.md") # Filter by language=python should bias toward .py - res1 = srv.asyncio.get_event_loop().run_until_complete( + res1 = asyncio.run( srv.repo_search(queries=["def"], limit=5, language="python", compact=False) ) assert any(f_py in (r.get("path") or "") for r in res1.get("results", [])) # Filter by ext=txt should retrieve text file - res2 = 
srv.asyncio.get_event_loop().run_until_complete( + res2 = asyncio.run( srv.repo_search(queries=["hello"], limit=5, ext="md", compact=False) ) assert any(f_md in (r.get("path") or "") for r in res2.get("results", [])) # Path glob to only allow pkg/*.py - res3 = srv.asyncio.get_event_loop().run_until_complete( + res3 = asyncio.run( srv.repo_search( queries=["def"], limit=5, diff --git a/tests/test_negative_args.py b/tests/test_negative_args.py index 32dadd52..f5181375 100644 --- a/tests/test_negative_args.py +++ b/tests/test_negative_args.py @@ -1,4 +1,5 @@ import os +import asyncio import pytest import scripts.mcp_indexer_server as srv @@ -14,7 +15,7 @@ def test_repo_search_conflicting_filters_empty_ok(monkeypatch): monkeypatch.setattr(hy, "run_hybrid_search", lambda *a, **k: []) - res = srv.asyncio.get_event_loop().run_until_complete( + res = asyncio.run( srv.repo_search(queries=["foo"], limit=3, ext="cpp", compact=True) ) diff --git a/tests/test_path_scope.py b/tests/test_path_scope.py new file mode 100644 index 00000000..06369860 --- /dev/null +++ b/tests/test_path_scope.py @@ -0,0 +1,61 @@ +import importlib + + +ps = importlib.import_module("scripts.path_scope") + + +def test_normalize_under_strips_work_prefix(): + assert ps.normalize_under("/work/scripts/mcp_impl") == "scripts/mcp_impl" + + +def test_normalize_under_keeps_repo_prefixed_path(): + assert ( + ps.normalize_under("/work/Context-Engine/scripts/mcp_impl") + == "Context-Engine/scripts/mcp_impl" + ) + + +def test_normalize_under_rebases_single_segment_from_cwd(monkeypatch, tmp_path): + repo = tmp_path / "repo" + (repo / "nested" / "scope").mkdir(parents=True) + monkeypatch.setattr(ps, "_repo_root_hint", lambda: str(repo)) + monkeypatch.setattr(ps.os, "getcwd", lambda: str(repo / "nested")) + + assert ps.normalize_under("scope") == "nested/scope" + + +def test_normalize_under_does_not_rebase_when_top_level_exists(monkeypatch, tmp_path): + repo = tmp_path / "repo" + (repo / "nested" / 
"scope").mkdir(parents=True) + (repo / "scope").mkdir(parents=True) + monkeypatch.setattr(ps, "_repo_root_hint", lambda: str(repo)) + monkeypatch.setattr(ps.os, "getcwd", lambda: str(repo / "nested")) + + assert ps.normalize_under("scope") == "scope" + + +def test_normalize_under_expands_unique_segment(monkeypatch, tmp_path): + repo = tmp_path / "repo" + (repo / "alpha" / "mcp_impl").mkdir(parents=True) + monkeypatch.setattr(ps, "_repo_root_hint", lambda: str(repo)) + monkeypatch.setattr(ps.os, "getcwd", lambda: str(repo)) + ps._unique_segment_path.cache_clear() + + assert ps.normalize_under("mcp_impl") == "alpha/mcp_impl" + + +def test_normalize_under_keeps_ambiguous_segment(monkeypatch, tmp_path): + repo = tmp_path / "repo" + (repo / "alpha" / "dup").mkdir(parents=True) + (repo / "beta" / "dup").mkdir(parents=True) + monkeypatch.setattr(ps, "_repo_root_hint", lambda: str(repo)) + monkeypatch.setattr(ps.os, "getcwd", lambda: str(repo)) + ps._unique_segment_path.cache_clear() + + assert ps.normalize_under("dup") == "dup" + + +def test_metadata_matches_under_without_repo_hint_for_work_repo_paths(): + md = {"path": "/work/repo/space/ship/a.py"} + assert ps.metadata_matches_under(md, "space") + assert not ps.metadata_matches_under(md, "direct") diff --git a/tests/test_rerank_under_scope.py b/tests/test_rerank_under_scope.py new file mode 100644 index 00000000..080da9cd --- /dev/null +++ b/tests/test_rerank_under_scope.py @@ -0,0 +1,66 @@ +import importlib + + +rr = importlib.import_module("scripts.rerank_tools.local") + + +class _Pt: + def __init__(self, pid: str, path: str): + self.id = pid + self.payload = { + "metadata": { + "path": path, + "start_line": 1, + "end_line": 2, + "symbol": "f", + } + } + + +class _FakeModel: + def embed(self, texts): + for _ in texts: + yield [0.01] * 8 + + +def test_rerank_in_process_under_excludes_out_of_scope(monkeypatch): + monkeypatch.setattr(rr, "QdrantClient", lambda *a, **k: object()) + monkeypatch.setattr(rr, 
"_select_dense_vector_name", lambda *a, **k: "vec") + monkeypatch.setattr( + rr, + "dense_results", + lambda *a, **k: [_Pt("1", "/work/repo/direct/tools/b.py")], + ) + monkeypatch.setattr(rr, "rerank_local", lambda pairs: [0.9] * len(pairs)) + + out = rr.rerank_in_process( + query="rotate heading", + topk=10, + limit=5, + under="space", + model=_FakeModel(), + collection="codebase", + ) + assert out == [] + + +def test_rerank_in_process_under_keeps_in_scope(monkeypatch): + monkeypatch.setattr(rr, "QdrantClient", lambda *a, **k: object()) + monkeypatch.setattr(rr, "_select_dense_vector_name", lambda *a, **k: "vec") + monkeypatch.setattr( + rr, + "dense_results", + lambda *a, **k: [_Pt("1", "/work/repo/space/ship/a.py")], + ) + monkeypatch.setattr(rr, "rerank_local", lambda pairs: [0.9] * len(pairs)) + + out = rr.rerank_in_process( + query="rotate heading", + topk=10, + limit=5, + under="space", + model=_FakeModel(), + collection="codebase", + ) + assert len(out) == 1 + assert out[0]["path"] == "/work/repo/space/ship/a.py" diff --git a/tests/test_reranker_verification.py b/tests/test_reranker_verification.py index e7a05245..b43441eb 100644 --- a/tests/test_reranker_verification.py +++ b/tests/test_reranker_verification.py @@ -16,6 +16,10 @@ def tool(self, *args, **kwargs): def _decorator(fn): return fn return _decorator + def resource(self, *args, **kwargs): + def _decorator(fn): + return fn + return _decorator class _Context: def __init__(self, *args, **kwargs): @@ -102,7 +106,9 @@ def fake_rerank_local(pairs): assert [r["path"] for r in base["results"]] == ["/work/a.py", "/work/b.py"] # With rerank enabled, order should flip to B then A; counters should show inproc_hybrid - rr = await server.repo_search(query="q", limit=2, per_path=2, rerank_enabled=True, compact=True) + rr = await server.repo_search( + query="q", limit=2, per_path=2, rerank_enabled=True, compact=True, debug=True + ) assert rr.get("used_rerank") is True assert rr.get("rerank_counters", 
{}).get("inproc_hybrid", 0) >= 1 assert [r["path"] for r in rr["results"]] == ["/work/b.py", "/work/a.py"] @@ -147,6 +153,69 @@ def fake_rerank_in_process(**kwargs): assert captured.get("collection") == "other-collection" +@pytest.mark.service +@pytest.mark.anyio +async def test_rerank_inproc_dense_respects_path_filters(monkeypatch): + monkeypatch.setenv("HYBRID_IN_PROCESS", "1") + monkeypatch.setenv("RERANK_IN_PROCESS", "1") + + def fake_run_hybrid_search(**kwargs): + return [] + + monkeypatch.setitem(sys.modules, "scripts.hybrid_search", _make_hybrid_stub(fake_run_hybrid_search)) + monkeypatch.delitem(sys.modules, "scripts.mcp_indexer_server", raising=False) + server = importlib.import_module("scripts.mcp_indexer_server") + monkeypatch.setattr(server, "_get_embedding_model", _fake_embedding_model) + + def fake_rerank_in_process(**kwargs): + return [ + {"score": 0.9, "path": "/work/src/a.py", "symbol": "", "start_line": 1, "end_line": 3}, + {"score": 0.8, "path": "/work/tests/b.py", "symbol": "", "start_line": 5, "end_line": 9}, + { + "score": 0.7, + "path": "/home/coder/project/Context-Engine/scripts/mcp_impl/search.py", + "symbol": "", + "start_line": 10, + "end_line": 20, + }, + ] + + monkeypatch.setattr( + importlib.import_module("scripts.rerank_local"), + "rerank_in_process", + fake_rerank_in_process, + ) + + only_tests = await server.repo_search( + query="q", + limit=10, + rerank_enabled=True, + path_glob=["tests/**"], + compact=True, + ) + assert [r["path"] for r in only_tests["results"]] == ["/work/tests/b.py"] + + no_tests = await server.repo_search( + query="q", + limit=10, + rerank_enabled=True, + not_glob=["**/tests/**"], + compact=True, + ) + assert all("/tests/" not in r["path"] for r in no_tests["results"]) + + host_rel_glob = await server.repo_search( + query="q", + limit=10, + rerank_enabled=True, + path_glob=["scripts/mcp_impl/**"], + compact=True, + ) + assert [r["path"] for r in host_rel_glob["results"]] == [ + 
"/home/coder/project/Context-Engine/scripts/mcp_impl/search.py" + ] + + @pytest.mark.service @pytest.mark.anyio async def test_rerank_subprocess_timeout_fallback(monkeypatch): @@ -187,9 +256,9 @@ async def fake_run_async(cmd, env=None, timeout=None): rerank_enabled=True, compact=True, collection="test-coll", + debug=True, ) # Fallback should keep original order from hybrid; timeout counter incremented assert rr.get("used_rerank") is False assert rr.get("rerank_counters", {}).get("timeout", 0) >= 1 assert [r["path"] for r in rr["results"]] == ["/work/a.py", "/work/b.py"] - diff --git a/tests/test_server_helpers.py b/tests/test_server_helpers.py index 94212145..84856ee6 100644 --- a/tests/test_server_helpers.py +++ b/tests/test_server_helpers.py @@ -1,4 +1,5 @@ import json +import asyncio import types import importlib @@ -53,7 +54,7 @@ def test_repo_search_arg_normalization(monkeypatch, tmp_path): # Ensure in-process branch stays off monkeypatch.delenv("HYBRID_IN_PROCESS", raising=False) - res = srv.asyncio.get_event_loop().run_until_complete( + res = asyncio.run( _call_repo_search( queries=["FooBar"], limit="12", # str on purpose to test coercion diff --git a/tests/test_service_qdrant_status.py b/tests/test_service_qdrant_status.py index df04254c..38a1ec6b 100644 --- a/tests/test_service_qdrant_status.py +++ b/tests/test_service_qdrant_status.py @@ -1,4 +1,5 @@ import types +import asyncio import importlib import pytest @@ -31,7 +32,7 @@ def test_qdrant_status_mocked(monkeypatch): monkeypatch.setattr(qdrant_client, "QdrantClient", lambda *a, **k: FakeQdrant()) - out = srv.asyncio.get_event_loop().run_until_complete( + out = asyncio.run( srv.qdrant_status(collection="test") ) # qdrant_status returns a summary shape without an 'ok' key diff --git a/tests/test_staging_lifecycle.py b/tests/test_staging_lifecycle.py index eb8a93cf..ea7ab528 100644 --- a/tests/test_staging_lifecycle.py +++ b/tests/test_staging_lifecycle.py @@ -553,6 +553,7 @@ def 
test_admin_copy_endpoint_reports_graph_clone_in_redirect(monkeypatch: pytest monkeypatch.setattr(upload_service, "_require_admin_session", lambda request: {"user_id": "admin"}) monkeypatch.setattr(upload_service, "WORK_DIR", "/fake/work") monkeypatch.setenv("WORK_DIR", "/fake/work") + monkeypatch.setattr(upload_service, "pooled_qdrant_client", None, raising=False) def fake_copy_collection_qdrant(**kwargs): assert kwargs.get("source") == "src" diff --git a/tests/test_symbol_graph_tool.py b/tests/test_symbol_graph_tool.py index e148fc17..1d3861d0 100644 --- a/tests/test_symbol_graph_tool.py +++ b/tests/test_symbol_graph_tool.py @@ -2,21 +2,43 @@ @pytest.mark.asyncio -async def test_symbol_graph_under_uses_path_prefix_matchvalue(): - # Import internal helper to validate filter construction without needing a real Qdrant instance. - from qdrant_client import models as qmodels +async def test_symbol_graph_under_filters_results_by_recursive_scope(): + # Validate that under applies as recursive subtree filter (user-facing scope). 
from scripts.mcp_impl import symbol_graph as sg - captured = {} + class _Pt: + def __init__(self, pid, path): + self.id = pid + self.payload = { + "metadata": { + "repo": "repo", + "path": path, + "start_line": 1, + "end_line": 2, + "symbol": "f", + "symbol_path": "f", + "language": "python", + "calls": ["foo"], + } + } class FakeClient: + def __init__(self): + self.scroll_filters = [] + def scroll(self, *, collection_name, scroll_filter, limit, with_payload, with_vectors): - captured["collection_name"] = collection_name - captured["scroll_filter"] = scroll_filter - return ([], None) + self.scroll_filters.append(scroll_filter) + return ( + [ + _Pt("1", "/work/repo/scripts/a.py"), + _Pt("2", "/work/repo/tests/b.py"), + ], + None, + ) - await sg._query_array_field( # type: ignore[attr-defined] - client=FakeClient(), + client = FakeClient() + out = await sg._query_array_field( # type: ignore[attr-defined] + client=client, collection="codebase", field_key="metadata.calls", value="foo", @@ -25,15 +47,29 @@ def scroll(self, *, collection_name, scroll_filter, limit, with_payload, with_ve under=sg._norm_under("scripts"), # type: ignore[attr-defined] ) - flt = captured.get("scroll_filter") - assert isinstance(flt, qmodels.Filter) - must = list(flt.must or []) - keys = [getattr(c, "key", None) for c in must] - assert "metadata.path_prefix" in keys - - # Ensure it's an exact match (MatchValue), not substring (MatchText) - cond = next(c for c in must if getattr(c, "key", None) == "metadata.path_prefix") - assert isinstance(cond.match, qmodels.MatchValue) - assert cond.match.value == "/work/scripts" - + # Validate _query_array_field forwards language/value constraints to scroll_filter. 
+ assert client.scroll_filters, "Expected at least one scroll() call" + first_filter = client.scroll_filters[0] + first_must = list(getattr(first_filter, "must", []) or []) + assert any( + getattr(cond, "key", None) == "metadata.calls" + and getattr(getattr(cond, "match", None), "any", None) == ["foo"] + for cond in first_must + ) + assert any( + getattr(cond, "key", None) == "metadata.language" + and getattr(getattr(cond, "match", None), "value", None) == "python" + for cond in first_must + ) + assert any( + any( + getattr(cond, "key", None) == "metadata.calls" + and getattr(getattr(cond, "match", None), "text", None) == "foo" + for cond in list(getattr(sf, "must", []) or []) + ) + for sf in client.scroll_filters + ), "Expected MatchText fallback filter for metadata.calls" + paths = {r.get("path") for r in out} + assert "/work/repo/scripts/a.py" in paths + assert "/work/repo/tests/b.py" not in paths From ba1e9c20bf4c4e606c2327aa5fb9887108c36f49 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 7 Mar 2026 13:36:13 +0000 Subject: [PATCH 17/39] refactor(ingest): add async git history processing and structured logging - Replace blocking subprocess.run with ThreadPoolExecutor for non-blocking git history ingestion in watch processor - Add comprehensive progress logging with metrics (prepared, persisted, failures) in ingest_history.py - Add skip-reason logging for git history collection in upload clients - Support configurable timeout via WATCH_GIT_HISTORY_TIMEOUT_SECONDS - Track in-flight manifests to prevent duplicate processing - Stream stdout/stderr from ingestion subprocess with tail capture --- scripts/ingest_history.py | 107 ++++++++++-- scripts/remote_upload_client.py | 33 ++++ scripts/standalone_upload_client.py | 33 ++++ scripts/watch_index_core/processor.py | 235 +++++++++++++++++++++++--- 4 files changed, 374 insertions(+), 34 deletions(-) diff --git a/scripts/ingest_history.py b/scripts/ingest_history.py index 3f42715c..26fcf448 100644 --- 
a/scripts/ingest_history.py +++ b/scripts/ingest_history.py @@ -4,6 +4,7 @@ import subprocess import shlex import hashlib +import logging from typing import List, Dict, Any import re import time @@ -35,6 +36,10 @@ from scripts.utils import sanitize_vector_name as _sanitize_vector_name +logger = logging.getLogger(__name__) +if not logger.handlers: + logging.basicConfig(level=logging.INFO) + def _manifest_run_id(manifest_path: str) -> str: try: @@ -382,13 +387,43 @@ def _ingest_from_manifest( mode = str(data.get("mode") or "delta").strip().lower() or "delta" points: List[models.PointStruct] = [] - count = 0 - for c in commits: + total_commits = len(commits) + prepared_count = 0 + persisted_count = 0 + invalid_commit_records = 0 + embed_failures = 0 + point_build_failures = 0 + upsert_failures = 0 + processed_count = 0 + progress_step = max(1, total_commits // 10) if total_commits > 0 else 1 + + def _log_progress(force: bool = False) -> None: + if not force and processed_count % progress_step != 0: + return + logger.info( + "[ingest_history] progress run_id=%s processed=%d/%d prepared=%d persisted=%d invalid=%d embed_failures=%d point_failures=%d upsert_failures=%d", + run_id, + processed_count, + total_commits, + prepared_count, + persisted_count, + invalid_commit_records, + embed_failures, + point_build_failures, + upsert_failures, + ) + + for idx, c in enumerate(commits, start=1): + processed_count += 1 try: if not isinstance(c, dict): + invalid_commit_records += 1 + _log_progress() continue commit_id = str(c.get("commit_id") or "").strip() if not commit_id: + invalid_commit_records += 1 + _log_progress() continue author_name = str(c.get("author_name") or "") authored_date = str(c.get("authored_date") or "") @@ -406,7 +441,12 @@ def _ingest_from_manifest( text = build_text(md, include_body=include_body) try: vec = next(model.embed([text])).tolist() - except Exception: + except Exception as e: + embed_failures += 1 + logger.warning( + f"[ingest_history] embed failed 
for commit={commit_id} idx={idx}: {e}", + ) + _log_progress() continue goal: str = "" @@ -451,25 +491,64 @@ def _ingest_from_manifest( pid = stable_id(commit_id) pt = models.PointStruct(id=pid, vector={vec_name: vec}, payload=payload) points.append(pt) - count += 1 + prepared_count += 1 if len(points) >= per_batch: - client.upsert(collection_name=COLLECTION, points=points) - points.clear() - except Exception: + batch_size = len(points) + try: + client.upsert(collection_name=COLLECTION, points=points) + persisted_count += batch_size + except Exception as e: + upsert_failures += batch_size + logger.error( + "[ingest_history] upsert batch failed (size=%d): %s", + batch_size, + e, + ) + finally: + points.clear() + _log_progress() + except Exception as e: + point_build_failures += 1 + logger.warning( + f"[ingest_history] commit processing failed idx={idx}: {e}", + ) + _log_progress() continue if points: - client.upsert(collection_name=COLLECTION, points=points) + batch_size = len(points) + try: + client.upsert(collection_name=COLLECTION, points=points) + persisted_count += batch_size + except Exception as e: + upsert_failures += batch_size + logger.error( + "[ingest_history] final upsert failed (size=%d): %s", + batch_size, + e, + ) + _log_progress(force=True) try: _prune_old_commit_points(client, run_id, mode=mode) - except Exception: - pass + except Exception as e: + logger.warning("[ingest_history] prune failed for run_id=%s: %s", run_id, e) try: _cleanup_manifest_files(manifest_path) - except Exception: - pass - print(f"Ingested {count} commits into {COLLECTION} from manifest {manifest_path}.") - return count + except Exception as e: + logger.warning("[ingest_history] manifest cleanup failed for %s: %s", manifest_path, e) + logger.info( + "Ingested commits from manifest %s into %s: persisted=%d prepared=%d invalid=%d " + "embed_failures=%d point_failures=%d upsert_failures=%d", + manifest_path, + COLLECTION, + persisted_count, + prepared_count, + 
invalid_commit_records, + embed_failures, + point_build_failures, + upsert_failures, + ) + return persisted_count def main(): diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index bf9b980f..6440bfeb 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -46,6 +46,16 @@ # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +_git_history_skip_log_key: Optional[str] = None + + +def _log_git_history_skip_once(reason: str, key: str) -> None: + global _git_history_skip_log_key + marker = f"{reason}:{key}" + if _git_history_skip_log_key == marker: + return + _git_history_skip_log_key = marker + logger.info("[git_history] skip (%s): %s", reason, key) DEFAULT_MAX_TEMP_CLEAN_ATTEMPTS = 3 DEFAULT_TEMP_CLEAN_SLEEP = 1.0 @@ -167,10 +177,12 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str } if max_commits <= 0: + _log_git_history_skip_once("disabled", f"max_commits={max_commits}") return None root = _find_git_root(Path(workspace_path)) if not root: + _log_git_history_skip_once("no_repo", workspace_path) return None # Git history cache: avoid emitting identical manifests when HEAD/settings are unchanged @@ -204,6 +216,7 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str cache = {} if current_head and cache.get("last_head") == current_head and cache.get("max_commits") == max_commits and str(cache.get("since") or "") == since: + _log_git_history_skip_once("cache_hit", f"head={current_head[:10]} since={since or '-'} max={max_commits}") return None base_head = "" @@ -254,12 +267,20 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str errors="replace", ) if proc.returncode != 0 or not proc.stdout.strip(): + _log_git_history_skip_once( + "rev_list_empty", + f"head={current_head[:10] if current_head else '-'} rc={proc.returncode}", + ) return None commits = [l.strip() for l in 
proc.stdout.splitlines() if l.strip()] except Exception: return None if not commits: + _log_git_history_skip_once( + "no_commits", + f"head={current_head[:10] if current_head else '-'}", + ) return None if len(commits) > max_commits: commits = commits[:max_commits] @@ -333,6 +354,10 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str continue if not records: + _log_git_history_skip_once( + "no_records", + f"commits={len(commits)} head={current_head[:10] if current_head else '-'}", + ) return None try: @@ -352,6 +377,14 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str "since": since, "commits": records, } + logger.info( + "[git_history] prepared manifest mode=%s commits=%d head=%s prev=%s base=%s", + manifest["mode"], + len(records), + (current_head[:10] if current_head else "-"), + (prev_head[:10] if prev_head else "-"), + (base_head[:10] if base_head else "-"), + ) # Update git history cache with the HEAD and settings used for this manifest try: diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 62982e91..f9eb8075 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -48,6 +48,16 @@ def get_auth_session(upload_endpoint: str) -> str: # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +_git_history_skip_log_key: Optional[str] = None + + +def _log_git_history_skip_once(reason: str, key: str) -> None: + global _git_history_skip_log_key + marker = f"{reason}:{key}" + if _git_history_skip_log_key == marker: + return + _git_history_skip_log_key = marker + logger.info("[git_history] skip (%s): %s", reason, key) DEFAULT_MAX_TEMP_CLEAN_ATTEMPTS = 3 DEFAULT_TEMP_CLEAN_SLEEP = 1.0 @@ -426,10 +436,12 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str } if max_commits <= 0: + _log_git_history_skip_once("disabled", f"max_commits={max_commits}") return 
None root = _find_git_root(Path(workspace_path)) if not root: + _log_git_history_skip_once("no_repo", workspace_path) return None # Git history cache: avoid emitting identical manifests when HEAD/settings are unchanged @@ -463,6 +475,7 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str cache = {} if current_head and cache.get("last_head") == current_head and cache.get("max_commits") == max_commits and str(cache.get("since") or "") == since: + _log_git_history_skip_once("cache_hit", f"head={current_head[:10]} since={since or '-'} max={max_commits}") return None base_head = "" @@ -513,12 +526,20 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str errors="replace", ) if proc.returncode != 0 or not proc.stdout.strip(): + _log_git_history_skip_once( + "rev_list_empty", + f"head={current_head[:10] if current_head else '-'} rc={proc.returncode}", + ) return None commits = [l.strip() for l in proc.stdout.splitlines() if l.strip()] except Exception: return None if not commits: + _log_git_history_skip_once( + "no_commits", + f"head={current_head[:10] if current_head else '-'}", + ) return None if len(commits) > max_commits: commits = commits[:max_commits] @@ -592,6 +613,10 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str continue if not records: + _log_git_history_skip_once( + "no_records", + f"commits={len(commits)} head={current_head[:10] if current_head else '-'}", + ) return None try: @@ -611,6 +636,14 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str "since": since, "commits": records, } + logger.info( + "[git_history] prepared manifest mode=%s commits=%d head=%s prev=%s base=%s", + manifest["mode"], + len(records), + (current_head[:10] if current_head else "-"), + (prev_head[:10] if prev_head else "-"), + (base_head[:10] if base_head else "-"), + ) # Update git history cache with the HEAD and settings used for this manifest try: diff 
--git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index 6f529cf7..968eba76 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -3,9 +3,14 @@ from __future__ import annotations import hashlib +import json import os import subprocess import sys +import threading +import time +from collections import deque +from concurrent.futures import Future, ThreadPoolExecutor from datetime import datetime from pathlib import Path from typing import Dict, List, Optional @@ -38,39 +43,229 @@ class _SkipUnchanged(Exception): """Sentinel exception to skip unchanged files in the watch loop.""" -def _process_git_history_manifest( +def _env_int(name: str, default: int) -> int: + try: + raw = str(os.environ.get(name, str(default))).strip() + val = int(raw) + return val if val > 0 else default + except Exception: + return default + + +_GIT_HISTORY_MAX_WORKERS = _env_int("WATCH_GIT_HISTORY_MAX_WORKERS", 1) +_GIT_HISTORY_TIMEOUT_SECONDS = _env_int("WATCH_GIT_HISTORY_TIMEOUT_SECONDS", 0) +_GIT_HISTORY_EXECUTOR = ThreadPoolExecutor( + max_workers=_GIT_HISTORY_MAX_WORKERS, + thread_name_prefix="git-history", +) +_GIT_HISTORY_INFLIGHT: set[str] = set() +_GIT_HISTORY_INFLIGHT_LOCK = threading.Lock() + + +def _manifest_key(p: Path) -> str: + try: + return str(p.resolve()) + except Exception: + return str(p) + + +def _manifest_stats(p: Path) -> tuple[str, int]: + run_id = "unknown" + commit_count = -1 + try: + with p.open("r", encoding="utf-8") as fh: + data = json.load(fh) + if isinstance(data, dict): + commits = data.get("commits") or [] + if isinstance(commits, list): + commit_count = len(commits) + name = p.name + run_id = name[:-5] if name.endswith(".json") else name + except Exception: + pass + return run_id, commit_count + + +def _run_git_history_ingest( p: Path, collection: str, repo_name: Optional[str], env_snapshot: Optional[Dict[str, str]] = None, ) -> None: - try: - script = ROOT_DIR / "scripts" / 
"ingest_history.py" - if not script.exists(): - return - cmd = [sys.executable or "python3", str(script), "--manifest-json", str(p)] - env = _build_subprocess_env(collection, repo_name, env_snapshot) + script = ROOT_DIR / "scripts" / "ingest_history.py" + if not script.exists(): + logger.warning("[git_history_manifest] ingest script missing: %s", script) + return + + cmd = [sys.executable or "python3", str(script), "--manifest-json", str(p)] + env = _build_subprocess_env(collection, repo_name, env_snapshot) + started = time.monotonic() + timeout = _GIT_HISTORY_TIMEOUT_SECONDS if _GIT_HISTORY_TIMEOUT_SECONDS > 0 else None + stdout_tail: deque[str] = deque(maxlen=20) + stderr_tail: deque[str] = deque(maxlen=20) + + def _stream_pipe(pipe, label: str, tail: deque[str]) -> None: try: - print( - f"[git_history_manifest] launching ingest_history.py for {p} " - f"collection={collection} repo={repo_name}" - ) + for raw in iter(pipe.readline, ""): + line = (raw or "").rstrip() + if not line: + continue + tail.append(line) + logger.info("[git_history_manifest][%s] %s", label, line) except Exception: pass - # Use subprocess.run for better error observability. - # NOTE: This blocks until ingest_history.py completes. If history ingestion - # is slow, this may need revisiting (e.g., revert to Popen fire-and-forget - # or run in a separate thread) to avoid blocking the watcher. 
- result = subprocess.run(cmd, env=env, capture_output=True, text=True, check=False) - if result.returncode != 0: + finally: + try: + pipe.close() + except Exception: + pass + + proc: Optional[subprocess.Popen] = None + try: + proc = subprocess.Popen( + cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1, + ) + t_out = threading.Thread( + target=_stream_pipe, + args=(proc.stdout, "stdout", stdout_tail), + daemon=True, + ) + t_err = threading.Thread( + target=_stream_pipe, + args=(proc.stderr, "stderr", stderr_tail), + daemon=True, + ) + t_out.start() + t_err.start() + + deadline = (started + timeout) if timeout else None + timed_out = False + while True: + code = proc.poll() + if code is not None: + break + if deadline and time.monotonic() >= deadline: + timed_out = True + try: + proc.kill() + except Exception: + pass + break + time.sleep(0.2) + + # Ensure threads flush trailing output after process exit/kill. + t_out.join(timeout=1.0) + t_err.join(timeout=1.0) + + if timed_out: + elapsed_ms = int((time.monotonic() - started) * 1000) logger.warning( - "[git_history_manifest] ingest_history.py failed for %s: exit=%d stderr=%s", - p, result.returncode, (result.stderr or "")[:500], + "[git_history_manifest] ingest_history.py timeout for %s after %dms (timeout=%ss)", + p, + elapsed_ms, + _GIT_HISTORY_TIMEOUT_SECONDS, ) + if stderr_tail: + logger.warning( + "[git_history_manifest] timeout stderr tail for %s: %s", + p, + " | ".join(list(stderr_tail)[-5:]), + ) + return + + returncode = proc.wait(timeout=1.0) except Exception as e: - logger.warning("[git_history_manifest] error processing %s: %s", p, e) + logger.warning("[git_history_manifest] subprocess error for %s: %s", p, e) + try: + if proc and proc.poll() is None: + proc.kill() + except Exception: + pass + return + + elapsed_ms = int((time.monotonic() - started) * 1000) + if returncode != 0: + logger.warning( + "[git_history_manifest] ingest_history.py failed for %s: exit=%d 
elapsed_ms=%d stderr=%s", + p, + returncode, + elapsed_ms, + " | ".join(list(stderr_tail)[-5:]), + ) return + logger.info( + "[git_history_manifest] completed for %s: exit=0 elapsed_ms=%d", + p, + elapsed_ms, + ) + if stdout_tail: + logger.info( + "[git_history_manifest] stdout tail for %s: %s", + p, + " | ".join(list(stdout_tail)[-5:]), + ) + if stderr_tail: + logger.warning( + "[git_history_manifest] stderr tail for %s: %s", + p, + " | ".join(list(stderr_tail)[-5:]), + ) + + +def _on_git_history_done(manifest_key: str, future: Future) -> None: + with _GIT_HISTORY_INFLIGHT_LOCK: + _GIT_HISTORY_INFLIGHT.discard(manifest_key) + remaining = len(_GIT_HISTORY_INFLIGHT) + logger.info("[git_history_manifest] in-flight remaining=%d", remaining) + try: + future.result() + except Exception as e: + logger.warning("[git_history_manifest] worker crashed for %s: %s", manifest_key, e) + + +def _process_git_history_manifest( + p: Path, + collection: str, + repo_name: Optional[str], + env_snapshot: Optional[Dict[str, str]] = None, +) -> None: + key = _manifest_key(p) + run_id, commit_count = _manifest_stats(p) + queued = 0 + with _GIT_HISTORY_INFLIGHT_LOCK: + if key in _GIT_HISTORY_INFLIGHT: + logger.info( + "[git_history_manifest] skip duplicate in-flight manifest: %s run_id=%s", + p, + run_id, + ) + return + _GIT_HISTORY_INFLIGHT.add(key) + queued = len(_GIT_HISTORY_INFLIGHT) + logger.info( + "[git_history_manifest] queued ingest_history.py for %s run_id=%s commits=%d collection=%s repo=%s in_flight=%d", + p, + run_id, + commit_count, + collection, + repo_name, + queued, + ) + future = _GIT_HISTORY_EXECUTOR.submit( + _run_git_history_ingest, + p, + collection, + repo_name, + env_snapshot, + ) + future.add_done_callback(lambda fut, manifest_key=key: _on_git_history_done(manifest_key, fut)) + def _advance_progress( repo_progress: Dict[str, int], From d30e1c48b7876706fb26c17dd50ffda46955e987 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 7 Mar 2026 13:37:15 +0000 Subject: [PATCH 
18/39] fix(vscode-uploader): restore watch startup after successful auto force-sync (exclude git-history runs) - runSequence('auto') now starts watch after successful force-sync - keeps uploadGitHistory as one-shot (no watch auto-start) --- vscode-extension/context-engine-uploader/extension.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vscode-extension/context-engine-uploader/extension.js b/vscode-extension/context-engine-uploader/extension.js index 217d14e7..d57b64ec 100644 --- a/vscode-extension/context-engine-uploader/extension.js +++ b/vscode-extension/context-engine-uploader/extension.js @@ -543,8 +543,9 @@ async function runSequence(mode = 'auto') { if (code === 0) { setStatusBarState('indexed'); if (processManager) { processManager.ensureIndexedWatcher(options.targetPath); } - // Only start watching after a regular force sync, not after git history upload - if (mode === 'force' && options.startWatchAfterForce && processManager) { + // Start watch after successful force sync in normal flows (`force` and `auto`), + // but keep git-history upload as one-shot. 
+ if (mode !== 'uploadGitHistory' && options.startWatchAfterForce && processManager) { processManager.startWatch(options); } } else { From 24b7c3f0602e746e540330f37d23382fdfb88373 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 7 Mar 2026 13:44:25 +0000 Subject: [PATCH 19/39] refactor(ingest): improve logging practices and thread safety - Move logging.basicConfig to main() and use NullHandler at module level - Convert f-string logging to lazy % formatting for better performance - Add exc_info=True for exception logging context - Add thread-safe locking for stdout/stderr tail deques in processor - Add atexit handler for graceful thread pool executor shutdown - Add error handling with detailed logging for batch upsert operations --- scripts/ingest_history.py | 44 ++++++++++++++++++++++----- scripts/watch_index_core/processor.py | 33 +++++++++++++++----- 2 files changed, 61 insertions(+), 16 deletions(-) diff --git a/scripts/ingest_history.py b/scripts/ingest_history.py index 26fcf448..602b882f 100644 --- a/scripts/ingest_history.py +++ b/scripts/ingest_history.py @@ -37,8 +37,7 @@ from scripts.utils import sanitize_vector_name as _sanitize_vector_name logger = logging.getLogger(__name__) -if not logger.handlers: - logging.basicConfig(level=logging.INFO) +logger.addHandler(logging.NullHandler()) def _manifest_run_id(manifest_path: str) -> str: @@ -444,7 +443,10 @@ def _log_progress(force: bool = False) -> None: except Exception as e: embed_failures += 1 logger.warning( - f"[ingest_history] embed failed for commit={commit_id} idx={idx}: {e}", + "[ingest_history] embed failed for commit=%s idx=%d: %s", + commit_id, + idx, + e, ) _log_progress() continue @@ -507,10 +509,12 @@ def _log_progress(force: bool = False) -> None: finally: points.clear() _log_progress() - except Exception as e: + except Exception: point_build_failures += 1 logger.warning( - f"[ingest_history] commit processing failed idx={idx}: {e}", + "[ingest_history] commit processing failed idx=%d", + idx, + 
exc_info=True, ) _log_progress() continue @@ -552,6 +556,7 @@ def _log_progress(force: bool = False) -> None: def main(): + logging.basicConfig(level=logging.INFO) ap = argparse.ArgumentParser( description="Ingest Git history into Qdrant deterministically" ) @@ -662,10 +667,33 @@ def main(): point = models.PointStruct(id=pid, vector={vec_name: vec}, payload=payload) points.append(point) if len(points) >= args.per_batch: - client.upsert(collection_name=COLLECTION, points=points) - points.clear() + batch_size = len(points) + try: + client.upsert(collection_name=COLLECTION, points=points) + except Exception as e: + logger.error( + "[ingest_history] batch upsert failed collection=%s repo=%s size=%d path=%s: %s", + COLLECTION, + REPO_NAME, + batch_size, + args.path or "", + e, + ) + finally: + points.clear() if points: - client.upsert(collection_name=COLLECTION, points=points) + final_size = len(points) + try: + client.upsert(collection_name=COLLECTION, points=points) + except Exception as e: + logger.error( + "[ingest_history] final upsert failed collection=%s repo=%s size=%d path=%s: %s", + COLLECTION, + REPO_NAME, + final_size, + args.path or "", + e, + ) print(f"Ingested {len(commits)} commits into {COLLECTION}.") diff --git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index 968eba76..a530ce89 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -7,6 +7,7 @@ import os import subprocess import sys +import atexit import threading import time from collections import deque @@ -58,6 +59,16 @@ def _env_int(name: str, default: int) -> int: max_workers=_GIT_HISTORY_MAX_WORKERS, thread_name_prefix="git-history", ) + + +def _shutdown_git_history_executor() -> None: + try: + _GIT_HISTORY_EXECUTOR.shutdown(wait=False) + except Exception: + pass + + +atexit.register(_shutdown_git_history_executor) _GIT_HISTORY_INFLIGHT: set[str] = set() _GIT_HISTORY_INFLIGHT_LOCK = threading.Lock() @@ -103,14 +114,20 
@@ def _run_git_history_ingest( timeout = _GIT_HISTORY_TIMEOUT_SECONDS if _GIT_HISTORY_TIMEOUT_SECONDS > 0 else None stdout_tail: deque[str] = deque(maxlen=20) stderr_tail: deque[str] = deque(maxlen=20) + tail_lock = threading.Lock() + + def _tail_snapshot(tail: deque[str], limit: int = 5) -> str: + with tail_lock: + return " | ".join(list(tail)[-limit:]) - def _stream_pipe(pipe, label: str, tail: deque[str]) -> None: + def _stream_pipe(pipe, label: str, tail: deque[str], lock: threading.Lock) -> None: try: for raw in iter(pipe.readline, ""): line = (raw or "").rstrip() if not line: continue - tail.append(line) + with lock: + tail.append(line) logger.info("[git_history_manifest][%s] %s", label, line) except Exception: pass @@ -132,12 +149,12 @@ def _stream_pipe(pipe, label: str, tail: deque[str]) -> None: ) t_out = threading.Thread( target=_stream_pipe, - args=(proc.stdout, "stdout", stdout_tail), + args=(proc.stdout, "stdout", stdout_tail, tail_lock), daemon=True, ) t_err = threading.Thread( target=_stream_pipe, - args=(proc.stderr, "stderr", stderr_tail), + args=(proc.stderr, "stderr", stderr_tail, tail_lock), daemon=True, ) t_out.start() @@ -174,7 +191,7 @@ def _stream_pipe(pipe, label: str, tail: deque[str]) -> None: logger.warning( "[git_history_manifest] timeout stderr tail for %s: %s", p, - " | ".join(list(stderr_tail)[-5:]), + _tail_snapshot(stderr_tail), ) return @@ -195,7 +212,7 @@ def _stream_pipe(pipe, label: str, tail: deque[str]) -> None: p, returncode, elapsed_ms, - " | ".join(list(stderr_tail)[-5:]), + _tail_snapshot(stderr_tail), ) return @@ -208,13 +225,13 @@ def _stream_pipe(pipe, label: str, tail: deque[str]) -> None: logger.info( "[git_history_manifest] stdout tail for %s: %s", p, - " | ".join(list(stdout_tail)[-5:]), + _tail_snapshot(stdout_tail), ) if stderr_tail: logger.warning( "[git_history_manifest] stderr tail for %s: %s", p, - " | ".join(list(stderr_tail)[-5:]), + _tail_snapshot(stderr_tail), ) From 
fb560c194e1a27d54c25d28b83a6d90375323c4b Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 7 Mar 2026 15:03:12 +0000 Subject: [PATCH 20/39] fix(uploader): restore incremental sync cache and reduce Windows Python check races - persist standalone file hash cache after successful uploads - restore `.context-engine/file_cache.json` creation for repeat incremental syncs - deduplicate concurrent Python dependency checks per workspace - delay configured-python errors until fallback interpreter discovery is exhausted - log bundled `python_libs` usage during dependency checks --- scripts/standalone_upload_client.py | 13 ++ .../context-engine-uploader/python_env.js | 155 +++++++++++------- 2 files changed, 108 insertions(+), 60 deletions(-) diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index f9eb8075..789db42d 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -300,6 +300,10 @@ def remove_hash(self, file_path: str) -> None: self._cache = file_hashes self._cache_loaded = True + def flush(self) -> None: + """Persist the current in-memory cache state to disk.""" + self._save_cache(dict(self._load_cache())) + def _cache_seems_stale(self, file_hashes: Dict[str, str]) -> bool: """Return True if a large portion of cached paths no longer exist on disk.""" total = len(file_hashes) @@ -351,6 +355,13 @@ def remove_cached_file(file_path: str, repo_name: Optional[str] = None) -> None: _hash_cache.remove_hash(file_path) +def flush_cached_file_hashes() -> None: + """Persist the current workspace hash cache to disk.""" + global _hash_cache + if _hash_cache: + _hash_cache.flush() + + def _find_git_root(start: Path) -> Optional[Path]: """Best-effort detection of the git repository root for a workspace. 
@@ -1542,6 +1553,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: processed_ops = response.get('processed_operations', {}) logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") logger.info(f"[remote_upload] Processed operations: {processed_ops}") + flush_cached_file_hashes() # Clean up temporary bundle after successful upload try: @@ -1890,6 +1902,7 @@ def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: processed_ops = response.get('processed_operations', {}) logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") logger.info(f"[remote_upload] Processed operations: {processed_ops}") + flush_cached_file_hashes() # Clean up temporary bundle after successful upload try: diff --git a/vscode-extension/context-engine-uploader/python_env.js b/vscode-extension/context-engine-uploader/python_env.js index 9e5e7fd5..9bf24f6e 100644 --- a/vscode-extension/context-engine-uploader/python_env.js +++ b/vscode-extension/context-engine-uploader/python_env.js @@ -114,11 +114,27 @@ function createPythonEnvManager(deps) { const REQUIRED_PYTHON_MODULES = ['requests', 'urllib3', 'charset_normalizer', 'watchdog']; const depCheckCache = new Map(); + const ensureDepCheckInflight = new Map(); + let hasLoggedBundledDepPath = false; function cacheKey(pythonPath, workingDirectory) { return `${pythonPath || ''}::${workingDirectory || ''}`; } + function getBundledLibsPath(workingDirectory) { + const candidates = []; + if (workingDirectory) { + candidates.push(path.join(workingDirectory, 'python_libs')); + } + candidates.push(path.join(getExtensionRoot(), 'python_libs')); + for (const libsPath of candidates) { + if (libsPath && fs.existsSync(libsPath)) { + return libsPath; + } + } + return undefined; + } + function venvRootDir() { // Prefer workspace storage; fallback to extension storage try { @@ -186,16 +202,13 @@ function createPythonEnvManager(deps) { let pythonError; const env = { 
...process.env }; try { - const candidates = []; - if (workingDirectory) { - candidates.push(path.join(workingDirectory, 'python_libs')); - } - candidates.push(path.join(getExtensionRoot(), 'python_libs')); - for (const libsPath of candidates) { - if (libsPath && fs.existsSync(libsPath)) { - const existing = env.PYTHONPATH || ''; - env.PYTHONPATH = existing ? `${libsPath}${path.delimiter}${existing}` : libsPath; - break; + const libsPath = getBundledLibsPath(workingDirectory); + if (libsPath) { + const existing = env.PYTHONPATH || ''; + env.PYTHONPATH = existing ? `${libsPath}${path.delimiter}${existing}` : libsPath; + if (!hasLoggedBundledDepPath) { + log(`Using bundled python_libs for dependency checks: ${libsPath}`); + hasLoggedBundledDepPath = true; } } } catch (error) { @@ -309,65 +322,87 @@ function createPythonEnvManager(deps) { } async function ensurePythonDependencies(pythonPath, workingDirectory, pythonPathSource) { - // Probe current interpreter with bundled python_libs first - const allowPrompt = pythonPathSource === 'configured' || pythonPathSource === 'override'; - const primaryKey = cacheKey(pythonPath, workingDirectory); - if (depCheckCache.get(primaryKey)) { - return true; - } - let ok = await checkPythonDeps(pythonPath, workingDirectory, { showInterpreterError: allowPrompt }); - if (ok) { - depCheckCache.set(primaryKey, true); - return true; + const inflightKey = cacheKey(pythonPath, workingDirectory); + const existing = ensureDepCheckInflight.get(inflightKey); + if (existing) { + return existing; } - // If that fails, try to auto-detect a better system Python before falling back to a venv - const autoPython = await detectSystemPython(); - if (autoPython && autoPython !== pythonPath) { - log(`Falling back to auto-detected Python interpreter: ${autoPython}`); - const autoKey = cacheKey(autoPython, workingDirectory); - if (depCheckCache.get(autoKey)) { - setPythonOverridePath(autoPython); + const task = (async () => { + const allowPrompt = 
pythonPathSource === 'configured' || pythonPathSource === 'override'; + const primaryKey = cacheKey(pythonPath, workingDirectory); + if (depCheckCache.get(primaryKey)) { return true; } - ok = await checkPythonDeps(autoPython, workingDirectory, { showInterpreterError: allowPrompt }); + + let ok = await checkPythonDeps(pythonPath, workingDirectory, { showInterpreterError: false }); if (ok) { - setPythonOverridePath(autoPython); - depCheckCache.set(autoKey, true); + depCheckCache.set(primaryKey, true); return true; } - } - // As a last resort, offer to create a private venv and install deps via pip - // Always prompt at this point - we've exhausted all other options (initial Python + auto-detected both failed) - const choice = await vscode.window.showErrorMessage( - 'Context Engine Uploader: missing Python modules. Create isolated environment and auto-install?', - 'Auto-install to private venv', - 'Cancel' - ); - if (choice !== 'Auto-install to private venv') { - return false; - } - const created = await ensurePrivateVenv(); - if (!created) return false; - const venvPython = resolvePrivateVenvPython(); - if (!venvPython) { - vscode.window.showErrorMessage('Context Engine Uploader: failed to locate private venv python.'); - return false; - } - const installed = await installDepsInto(venvPython); - if (!installed) return false; - setPythonOverridePath(venvPython); - log(`Using private venv interpreter: ${getPythonOverridePath()}`); - const venvKey = cacheKey(venvPython, workingDirectory); - if (depCheckCache.get(venvKey)) { - return true; - } - const finalOk = await checkPythonDeps(venvPython, workingDirectory, { showInterpreterError: true }); - if (finalOk) { - depCheckCache.set(venvKey, true); + // If that fails, try to auto-detect a better system Python before falling back to a venv. 
+ const autoPython = await detectSystemPython(); + if (autoPython && autoPython !== pythonPath) { + log(`Falling back to auto-detected Python interpreter: ${autoPython}`); + const autoKey = cacheKey(autoPython, workingDirectory); + if (depCheckCache.get(autoKey)) { + setPythonOverridePath(autoPython); + return true; + } + ok = await checkPythonDeps(autoPython, workingDirectory, { showInterpreterError: false }); + if (ok) { + setPythonOverridePath(autoPython); + depCheckCache.set(autoKey, true); + return true; + } + } + + // Delay configured-python noise until after fallback discovery is exhausted. + if (allowPrompt) { + vscode.window.showErrorMessage(`Context Engine Uploader: failed to run ${pythonPath}. Update contextEngineUploader.pythonPath.`); + } + + // As a last resort, offer to create a private venv and install deps via pip + // only after current and auto-detected interpreters have both failed. + const choice = await vscode.window.showErrorMessage( + 'Context Engine Uploader: missing Python modules. 
Create isolated environment and auto-install?', + 'Auto-install to private venv', + 'Cancel' + ); + if (choice !== 'Auto-install to private venv') { + return false; + } + const created = await ensurePrivateVenv(); + if (!created) return false; + const venvPython = resolvePrivateVenvPython(); + if (!venvPython) { + vscode.window.showErrorMessage('Context Engine Uploader: failed to locate private venv python.'); + return false; + } + const installed = await installDepsInto(venvPython); + if (!installed) return false; + setPythonOverridePath(venvPython); + log(`Using private venv interpreter: ${getPythonOverridePath()}`); + const venvKey = cacheKey(venvPython, workingDirectory); + if (depCheckCache.get(venvKey)) { + return true; + } + const finalOk = await checkPythonDeps(venvPython, workingDirectory, { showInterpreterError: true }); + if (finalOk) { + depCheckCache.set(venvKey, true); + } + return finalOk; + })(); + + ensureDepCheckInflight.set(inflightKey, task); + try { + return await task; + } finally { + if (ensureDepCheckInflight.get(inflightKey) === task) { + ensureDepCheckInflight.delete(inflightKey); + } } - return finalOk; } return { From 8c05f4559e0145cd752b9f2b1cbe9899e73fc060 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 7 Mar 2026 16:52:38 +0000 Subject: [PATCH 21/39] feat(upload): add hash-based deduplication and processing status tracking - Add content hash comparison to skip redundant file writes when bundle content matches existing replica cache. This reduces unnecessary I/O during delta bundle processing by checking hashes before applying created/updated operations. Enhance status endpoint to report real-time processing state: - Track upload results including processed operations and timing - Expose last bundle processing details via server_info - Report "processing" status during active bundle handling Refactor status validation into shared helper functions for consistent error handling across remote and standalone upload clients. 
--- scripts/remote_upload_client.py | 59 ++++-- scripts/standalone_upload_client.py | 59 ++++-- scripts/upload_delta_bundle.py | 203 +++++++++++++++----- scripts/upload_service.py | 51 ++++- tests/test_upload_service_path_traversal.py | 174 +++++++++++++++++ tests/test_upload_service_status.py | 117 +++++++++++ 6 files changed, 570 insertions(+), 93 deletions(-) create mode 100644 tests/test_upload_service_status.py diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 6440bfeb..0c840f2f 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -49,6 +49,31 @@ _git_history_skip_log_key: Optional[str] = None +def _is_usable_delta_status(status: Any) -> bool: + if not isinstance(status, dict): + return False + state = str(status.get("status") or "").strip().lower() + return ( + bool(status.get("success")) and + "workspace_path" in status and + "collection_name" in status and + state in {"ready", "processing", "completed"} + ) + + +def _server_status_error_message(status: Any) -> str: + if isinstance(status, dict): + error = status.get("error") + if isinstance(error, dict): + msg = str(error.get("message") or "").strip() + if msg: + return msg + state = str(status.get("status") or "").strip() + if state: + return f"Server status is {state}" + return "Invalid server status response" + + def _log_git_history_skip_once(reason: str, key: str) -> None: global _git_history_skip_log_key marker = f"{reason}:{key}" @@ -1236,7 +1261,16 @@ def get_server_status(self) -> Dict[str, Any]: ) if response.status_code == 200: - return response.json() + payload = response.json() + if not isinstance(payload, dict): + return { + "success": False, + "error": { + "code": "STATUS_INVALID", + "message": "Invalid status response payload", + }, + } + return {"success": True, **payload} # Handle error response error_msg = f"Status check failed with HTTP {response.status_code}" @@ -1933,15 +1967,8 @@ def main(): # Test server connection 
first logger.info("Checking server status...") status = client.get_server_status() - is_success = ( - isinstance(status, dict) and - 'workspace_path' in status and - 'collection_name' in status and - status.get('status') == 'ready' - ) - if not is_success: - error = status.get("error", {}) - logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + if not _is_usable_delta_status(status): + logger.error("Cannot connect to server: %s", _server_status_error_message(status)) return 1 logger.info("Server connection successful") @@ -1977,16 +2004,8 @@ def main(): # Test server connection logger.info("Checking server status...") status = client.get_server_status() - # For delta endpoint, success is indicated by having expected fields (not a "success" boolean) - is_success = ( - isinstance(status, dict) and - 'workspace_path' in status and - 'collection_name' in status and - status.get('status') == 'ready' - ) - if not is_success: - error = status.get("error", {}) - logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + if not _is_usable_delta_status(status): + logger.error("Cannot connect to server: %s", _server_status_error_message(status)) return 1 logger.info("Server connection successful") diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 789db42d..01f1add0 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -51,6 +51,31 @@ def get_auth_session(upload_endpoint: str) -> str: _git_history_skip_log_key: Optional[str] = None +def _is_usable_delta_status(status: Any) -> bool: + if not isinstance(status, dict): + return False + state = str(status.get("status") or "").strip().lower() + return ( + bool(status.get("success")) and + "workspace_path" in status and + "collection_name" in status and + state in {"ready", "processing", "completed"} + ) + + +def _server_status_error_message(status: Any) -> str: + if isinstance(status, dict): + 
error = status.get("error") + if isinstance(error, dict): + msg = str(error.get("message") or "").strip() + if msg: + return msg + state = str(status.get("status") or "").strip() + if state: + return f"Server status is {state}" + return "Invalid server status response" + + def _log_git_history_skip_once(reason: str, key: str) -> None: global _git_history_skip_log_key marker = f"{reason}:{key}" @@ -1448,7 +1473,16 @@ def get_server_status(self) -> Dict[str, Any]: ) if response.status_code == 200: - return response.json() + payload = response.json() + if not isinstance(payload, dict): + return { + "success": False, + "error": { + "code": "STATUS_INVALID", + "message": "Invalid status response payload", + }, + } + return {"success": True, **payload} # Handle error response error_msg = f"Status check failed with HTTP {response.status_code}" @@ -2127,15 +2161,8 @@ def main(): # Test server connection first logger.info("Checking server status...") status = client.get_server_status() - is_success = ( - isinstance(status, dict) and - 'workspace_path' in status and - 'collection_name' in status and - status.get('status') == 'ready' - ) - if not is_success: - error = status.get("error", {}) - logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + if not _is_usable_delta_status(status): + logger.error("Cannot connect to server: %s", _server_status_error_message(status)) return 1 logger.info("Server connection successful") @@ -2172,16 +2199,8 @@ def main(): # Test server connection logger.info("Checking server status...") status = client.get_server_status() - # For delta endpoint, success is indicated by having expected fields (not a "success" boolean) - is_success = ( - isinstance(status, dict) and - 'workspace_path' in status and - 'collection_name' in status and - status.get('status') == 'ready' - ) - if not is_success: - error = status.get("error", {}) - logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + if 
not _is_usable_delta_status(status): + logger.error("Cannot connect to server: %s", _server_status_error_message(status)) return 1 logger.info("Server connection successful") diff --git a/scripts/upload_delta_bundle.py b/scripts/upload_delta_bundle.py index 973be132..f87e8fad 100644 --- a/scripts/upload_delta_bundle.py +++ b/scripts/upload_delta_bundle.py @@ -10,6 +10,7 @@ try: from scripts.workspace_state import ( + _normalize_cache_key_path, _extract_repo_name_from_path, get_staging_targets, get_collection_state_snapshot, @@ -27,6 +28,53 @@ _SLUGGED_REPO_RE = re.compile(r"^.+-[0-9a-f]{16}(?:_old)?$") +def _normalize_hash_value(value: Any) -> str: + raw = str(value or "").strip() + if not raw: + return "" + if ":" in raw: + _, _, digest = raw.partition(":") + if digest.strip(): + return digest.strip().lower() + return raw.lower() + + +def _load_cache_hashes(cache_path: Path) -> Dict[str, str]: + try: + with cache_path.open("r", encoding="utf-8-sig") as f: + data = json.load(f) + except (OSError, ValueError, json.JSONDecodeError): + return {} + + file_hashes = data.get("file_hashes", {}) + if not isinstance(file_hashes, dict): + return {} + + normalized: Dict[str, str] = {} + for path_key, value in file_hashes.items(): + if isinstance(value, dict): + hash_value = value.get("hash") + else: + hash_value = value + digest = _normalize_hash_value(hash_value) + if digest: + normalized[_normalize_cache_key_path(str(path_key))] = digest + return normalized + + +def _load_replica_cache_hashes(workspace_root: Path, slug: str) -> Dict[str, str]: + merged: Dict[str, str] = {} + cache_paths = ( + Path(WORK_DIR) / ".codebase" / "repos" / slug / "cache.json", + workspace_root / ".codebase" / "cache.json", + ) + for cache_path in cache_paths: + if not cache_path.exists(): + continue + merged.update(_load_cache_hashes(cache_path)) + return merged + + def get_workspace_key(workspace_path: str) -> str: """Generate 16-char hash for collision avoidance in remote uploads. 
@@ -69,6 +117,7 @@ def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[ "deleted": 0, "moved": 0, "skipped": 0, + "skipped_hash_match": 0, "failed": 0, } @@ -224,12 +273,42 @@ def _safe_join(base: Path, rel: str) -> Path: raise ValueError(f"Path escapes workspace: {rel}") return candidate + def _member_suffix(name: str, marker: str) -> Optional[str]: + idx = name.find(marker) + if idx < 0: + return None + suffix = name[idx + len(marker):] + return suffix or None + with tarfile.open(bundle_path, "r:gz") as tar: ops_member = None - for member in tar.getnames(): - if member.endswith("metadata/operations.json"): + hashes_member = None + git_member = None + created_members: Dict[str, tarfile.TarInfo] = {} + updated_members: Dict[str, tarfile.TarInfo] = {} + moved_members: Dict[str, tarfile.TarInfo] = {} + for member in tar.getmembers(): + name = member.name + if name.endswith("metadata/operations.json"): ops_member = member - break + continue + if name.endswith("metadata/hashes.json"): + hashes_member = member + continue + if name.endswith("metadata/git_history.json"): + git_member = member + continue + created_rel = _member_suffix(name, "files/created/") + if created_rel: + created_members[created_rel] = member + continue + updated_rel = _member_suffix(name, "files/updated/") + if updated_rel: + updated_members[updated_rel] = member + continue + moved_rel = _member_suffix(name, "files/moved/") + if moved_rel: + moved_members[moved_rel] = member if not ops_member: raise ValueError("operations.json not found in bundle") @@ -240,14 +319,25 @@ def _safe_join(base: Path, rel: str) -> Path: operations_data = json.loads(ops_file.read().decode("utf-8")) operations = operations_data.get("operations", []) + bundle_hashes: Dict[str, str] = {} + if hashes_member: + hashes_file = tar.extractfile(hashes_member) + if hashes_file: + hashes_data = json.loads(hashes_file.read().decode("utf-8")) + raw_hashes = hashes_data.get("file_hashes", {}) + if 
isinstance(raw_hashes, dict): + for rel_path, hash_value in raw_hashes.items(): + digest = _normalize_hash_value(hash_value) + if digest: + bundle_hashes[str(rel_path)] = digest + + replica_cache_hashes = { + slug: _load_replica_cache_hashes(root, slug) + for slug, root in replica_roots.items() + } # Best-effort: extract git history metadata for watcher to ingest try: - git_member = None - for member in tar.getnames(): - if member.endswith("metadata/git_history.json"): - git_member = member - break if git_member: git_file = tar.extractfile(git_member) if git_file: @@ -266,11 +356,16 @@ def _safe_join(base: Path, rel: str) -> Path: except Exception as git_err: logger.debug(f"[upload_service] Error extracting git history metadata: {git_err}") - def _apply_operation_to_workspace(workspace_root: Path) -> bool: - """Apply a single file operation to a workspace. Returns True on success.""" - nonlocal operations_count, op_type, rel_path, tar + def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: + """Apply a single file operation to a workspace.""" + nonlocal operations_count, op_type, rel_path, tar, operation target_path = _safe_join(workspace_root, rel_path) + target_key = _normalize_cache_key_path(str(target_path)) + replica_hashes = replica_cache_hashes.setdefault(slug, {}) + op_content_hash = _normalize_hash_value( + operation.get("content_hash") or bundle_hashes.get(rel_path) + ) safe_source_path = None source_rel_path = None @@ -281,76 +376,84 @@ def _apply_operation_to_workspace(workspace_root: Path) -> bool: try: if op_type == "created": - file_member = None - for member in tar.getnames(): - if member.endswith(f"files/created/{rel_path}"): - file_member = member - break - + if op_content_hash and target_path.exists(): + cached_hash = replica_hashes.get(target_key) + if cached_hash and cached_hash == op_content_hash: + return "skipped_hash_match" + file_member = created_members.get(rel_path) if file_member: file_content = 
tar.extractfile(file_member) if file_content: target_path.parent.mkdir(parents=True, exist_ok=True) target_path.write_bytes(file_content.read()) - return True + if op_content_hash: + replica_hashes[target_key] = op_content_hash + return "applied" else: - return False + return "failed" else: - return False + return "failed" elif op_type == "updated": - file_member = None - for member in tar.getnames(): - if member.endswith(f"files/updated/{rel_path}"): - file_member = member - break - + if op_content_hash and target_path.exists(): + cached_hash = replica_hashes.get(target_key) + if cached_hash and cached_hash == op_content_hash: + return "skipped_hash_match" + file_member = updated_members.get(rel_path) if file_member: file_content = tar.extractfile(file_member) if file_content: target_path.parent.mkdir(parents=True, exist_ok=True) target_path.write_bytes(file_content.read()) - return True + if op_content_hash: + replica_hashes[target_key] = op_content_hash + return "applied" else: - return False + return "failed" else: - return False + return "failed" elif op_type == "deleted": if target_path.exists(): target_path.unlink(missing_ok=True) - return True + replica_hashes.pop(target_key, None) + return "applied" else: - return True # Already deleted + replica_hashes.pop(target_key, None) + return "applied" # Already deleted elif op_type == "moved": if safe_source_path and safe_source_path.exists(): target_path.parent.mkdir(parents=True, exist_ok=True) safe_source_path.rename(target_path) - return True + source_key = _normalize_cache_key_path(str(safe_source_path)) + moved_hash = replica_hashes.pop(source_key, None) + if op_content_hash: + replica_hashes[target_key] = op_content_hash + elif moved_hash: + replica_hashes[target_key] = moved_hash + return "applied" # Remote uploads may not have the source file on the server (e.g. staging # mirrors). In that case, clients can embed the destination content under # files/moved/. 
- file_member = None - for member in tar.getnames(): - if member.endswith(f"files/moved/{rel_path}"): - file_member = member - break + file_member = moved_members.get(rel_path) if file_member: file_content = tar.extractfile(file_member) if file_content: target_path.parent.mkdir(parents=True, exist_ok=True) target_path.write_bytes(file_content.read()) - return True - return False - return False + if op_content_hash: + replica_hashes[target_key] = op_content_hash + return "applied" + return "failed" + return "failed" else: logger.warning(f"[upload_service] Unknown operation type: {op_type}") - return False + return "failed" except Exception as e: logger.debug(f"[upload_service] Failed to apply {op_type} to {rel_path} in {workspace_root}: {e}") - return False + return "failed" for operation in operations: op_type = operation.get("operation") @@ -381,19 +484,25 @@ def _apply_operation_to_workspace(workspace_root: Path) -> bool: rel_path = sanitized_path - replica_results: Dict[str, bool] = {} + replica_results: Dict[str, str] = {} for slug, root in replica_roots.items(): - replica_results[slug] = _apply_operation_to_workspace(root) + replica_results[slug] = _apply_operation_to_workspace(slug, root) - success_any = any(replica_results.values()) - success_all = all(replica_results.values()) - if success_any: + applied_any = any(result == "applied" for result in replica_results.values()) + skipped_hash_match = bool(replica_results) and all( + result == "skipped_hash_match" for result in replica_results.values() + ) + success_all = all(result in {"applied", "skipped_hash_match"} for result in replica_results.values()) + if applied_any: operations_count.setdefault(op_type, 0) operations_count[op_type] = operations_count.get(op_type, 0) + 1 if not success_all: logger.debug( f"[upload_service] Partial success for {op_type} {rel_path}: {replica_results}" ) + elif skipped_hash_match: + operations_count["skipped"] += 1 + operations_count["skipped_hash_match"] += 1 else: 
operations_count["failed"] += 1 diff --git a/scripts/upload_service.py b/scripts/upload_service.py index 9acd9931..6d4d9327 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -204,6 +204,7 @@ def logical_repo_reuse_enabled() -> bool: # type: ignore[no-redef] # In-memory sequence tracking (in production, use persistent storage) _sequence_tracker: Dict[str, int] = {} +_upload_result_tracker: Dict[str, Dict[str, Any]] = {} def _int_env(name: str, default: int) -> int: @@ -484,14 +485,33 @@ async def _process_bundle_background( sequence_number: Optional[int], bundle_id: Optional[str], ) -> None: + key = get_workspace_key(workspace_path) try: start_time = datetime.now() + _upload_result_tracker[key] = { + "workspace_path": workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + "processed_operations": None, + "processing_time_ms": None, + "status": "processing", + "completed_at": None, + } operations_count = await asyncio.to_thread( process_delta_bundle, workspace_path, bundle_path, manifest ) + processing_time = int((datetime.now() - start_time).total_seconds() * 1000) if sequence_number is not None: - key = get_workspace_key(workspace_path) _sequence_tracker[key] = sequence_number + _upload_result_tracker[key] = { + "workspace_path": workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + "processed_operations": operations_count, + "processing_time_ms": processing_time, + "status": "completed", + "completed_at": datetime.now().isoformat(), + } if log_activity: try: repo = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None @@ -507,11 +527,21 @@ async def _process_bundle_background( ) except Exception as activity_err: logger.debug(f"[upload_service] Failed to log activity for bundle {bundle_id}: {activity_err}") - processing_time = (datetime.now() - start_time).total_seconds() * 1000 logger.info( - f"[upload_service] Finished processing bundle {bundle_id} seq 
{sequence_number} in {int(processing_time)}ms" + f"[upload_service] Finished processing bundle {bundle_id} seq {sequence_number} " + f"in {processing_time}ms ops={operations_count}" ) except Exception as e: + _upload_result_tracker[key] = { + "workspace_path": workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + "processed_operations": None, + "processing_time_ms": None, + "status": "error", + "completed_at": datetime.now().isoformat(), + "error": str(e), + } logger.error(f"[upload_service] Error in background processing for bundle {bundle_id}: {e}") finally: try: @@ -1427,8 +1457,12 @@ async def get_status(workspace_path: str): # Get last sequence last_sequence = get_last_sequence(workspace_path) + key = get_workspace_key(workspace_path) + upload_result = _upload_result_tracker.get(key, {}) - last_upload = None + last_upload = upload_result.get("completed_at") + upload_status = str(upload_result.get("status") or "") + status = "processing" if upload_status == "processing" else "ready" return StatusResponse( workspace_path=workspace_path, @@ -1436,11 +1470,16 @@ async def get_status(workspace_path: str): last_sequence=last_sequence, last_upload=last_upload, pending_operations=0, - status="ready", + status=status, server_info={ "version": "1.0.0", "max_bundle_size_mb": MAX_BUNDLE_SIZE_MB, - "supported_formats": ["tar.gz"] + "supported_formats": ["tar.gz"], + "last_bundle_id": upload_result.get("bundle_id"), + "last_processing_time_ms": upload_result.get("processing_time_ms"), + "last_processed_operations": upload_result.get("processed_operations"), + "last_upload_status": upload_status or None, + "last_error": upload_result.get("error"), } ) diff --git a/tests/test_upload_service_path_traversal.py b/tests/test_upload_service_path_traversal.py index 0d01478f..bc426c97 100644 --- a/tests/test_upload_service_path_traversal.py +++ b/tests/test_upload_service_path_traversal.py @@ -1,5 +1,6 @@ import io import json +import os import tarfile 
from pathlib import Path @@ -52,6 +53,59 @@ def _write_bundle_with_created_file(tmp_path: Path, rel_path: str, content: byte return bundle_path +def _write_bundle_with_hash_metadata( + tmp_path: Path, + *, + operations: list[dict], + file_hashes: dict[str, str] | None = None, + created_files: dict[str, bytes] | None = None, + updated_files: dict[str, bytes] | None = None, +) -> Path: + bundle_path = tmp_path / "bundle-hashes.tar.gz" + payload = json.dumps({"operations": operations}).encode("utf-8") + hashes_payload = json.dumps({"file_hashes": file_hashes or {}}).encode("utf-8") + + with tarfile.open(bundle_path, "w:gz") as tar: + info = tarfile.TarInfo(name="metadata/operations.json") + info.size = len(payload) + tar.addfile(info, io.BytesIO(payload)) + + hashes_info = tarfile.TarInfo(name="metadata/hashes.json") + hashes_info.size = len(hashes_payload) + tar.addfile(hashes_info, io.BytesIO(hashes_payload)) + + for rel_path, content in (created_files or {}).items(): + file_info = tarfile.TarInfo(name=f"files/created/{rel_path}") + file_info.size = len(content) + tar.addfile(file_info, io.BytesIO(content)) + + for rel_path, content in (updated_files or {}).items(): + file_info = tarfile.TarInfo(name=f"files/updated/{rel_path}") + file_info.size = len(content) + tar.addfile(file_info, io.BytesIO(content)) + + return bundle_path + + +def _write_repo_cache(work_dir: Path, slug: str, rel_path: str, file_hash: str) -> None: + target = (work_dir / slug / rel_path).resolve() + cache_path = work_dir / ".codebase" / "repos" / slug / "cache.json" + cache_path.parent.mkdir(parents=True, exist_ok=True) + cache_path.write_text( + json.dumps( + { + "file_hashes": { + str(target): { + "hash": file_hash, + } + } + }, + indent=2, + ), + encoding="utf-8", + ) + + def test_process_delta_bundle_rejects_traversal_created(tmp_path, monkeypatch): import scripts.upload_delta_bundle as us @@ -197,3 +251,123 @@ def test_process_delta_bundle_rejects_traversal_moved_source(tmp_path, monkeypat 
bundle_path=bundle, manifest={"bundle_id": "b1"}, ) + + +def test_process_delta_bundle_skips_created_write_when_server_hash_matches(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "src/file.txt" + content = b"same-content" + file_hash = "sha1:efb5d7d4d38013264f2c00fceeb401f8c8d77d9f" + + target = work_dir / slug / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_bytes(content) + os.utime(target, ns=(1_000_000_000, 1_000_000_000)) + before_mtime_ns = target.stat().st_mtime_ns + _write_repo_cache(work_dir, slug, rel_path, file_hash) + + bundle = _write_bundle_with_hash_metadata( + tmp_path, + operations=[ + { + "operation": "created", + "path": rel_path, + "content_hash": file_hash, + } + ], + file_hashes={rel_path: file_hash}, + created_files={rel_path: content}, + ) + + counts = us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-skip-created"}, + ) + + assert counts.get("created") == 0 + assert counts.get("skipped") == 1 + assert counts.get("skipped_hash_match") == 1 + assert target.read_bytes() == content + assert target.stat().st_mtime_ns == before_mtime_ns + + +def test_process_delta_bundle_uses_hashes_metadata_for_updated_skip(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "src/keep.txt" + content = b"existing-content" + file_hash = "sha1:2910e29d6f6d3d2f01f8cc52ec386a4936ca9d2f" + + target = work_dir / slug / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_bytes(content) + os.utime(target, ns=(2_000_000_000, 2_000_000_000)) + before_mtime_ns = 
target.stat().st_mtime_ns + _write_repo_cache(work_dir, slug, rel_path, file_hash) + + bundle = _write_bundle_with_hash_metadata( + tmp_path, + operations=[ + { + "operation": "updated", + "path": rel_path, + } + ], + file_hashes={rel_path: file_hash}, + updated_files={rel_path: content}, + ) + + counts = us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-skip-updated"}, + ) + + assert counts.get("updated") == 0 + assert counts.get("skipped") == 1 + assert counts.get("skipped_hash_match") == 1 + assert target.read_bytes() == content + assert target.stat().st_mtime_ns == before_mtime_ns + + +def test_normalize_hash_value_strips_algorithm_prefixes(): + import scripts.upload_delta_bundle as us + + assert us._normalize_hash_value("sha1:ABCDEF") == "abcdef" + assert us._normalize_hash_value("md5:ABCDEF") == "abcdef" + assert us._normalize_hash_value("sha256:ABCDEF") == "abcdef" + assert us._normalize_hash_value("ABCDEF") == "abcdef" + + +def test_process_delta_bundle_uses_first_marker_match_for_created_members(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "nested/files/created/path.txt" + content = b"marker-safe" + bundle = _write_bundle_with_created_file(tmp_path, rel_path, content) + + counts = us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-created-marker"}, + ) + + assert counts.get("created") == 1 + assert (work_dir / slug / rel_path).read_bytes() == content diff --git a/tests/test_upload_service_status.py b/tests/test_upload_service_status.py new file mode 100644 index 00000000..04f2138e --- /dev/null +++ b/tests/test_upload_service_status.py @@ -0,0 +1,117 @@ +import asyncio +import importlib +from pathlib import Path + +import pytest +from 
fastapi.testclient import TestClient + + +@pytest.mark.unit +def test_delta_status_exposes_last_processed_operations(monkeypatch): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + + monkeypatch.setattr(srv, "get_collection_name", lambda _repo=None: "test-coll") + monkeypatch.setattr(srv, "_extract_repo_name_from_path", lambda _path: "repo") + + key = srv.get_workspace_key("/work/repo") + srv._sequence_tracker[key] = 7 + srv._upload_result_tracker[key] = { + "workspace_path": "/work/repo", + "bundle_id": "bundle-123", + "sequence_number": 7, + "processed_operations": { + "created": 1, + "updated": 2, + "deleted": 0, + "moved": 0, + "skipped": 5, + "skipped_hash_match": 4, + "failed": 0, + }, + "processing_time_ms": 321, + "status": "completed", + "completed_at": "2026-03-07T15:40:46.623000", + } + + client = TestClient(srv.app) + resp = client.get("/api/v1/delta/status", params={"workspace_path": "/work/repo"}) + assert resp.status_code == 200 + body = resp.json() + assert body["last_sequence"] == 7 + assert body["last_upload"] == "2026-03-07T15:40:46.623000" + assert body["status"] == "ready" + assert body["server_info"]["last_bundle_id"] == "bundle-123" + assert body["server_info"]["last_processing_time_ms"] == 321 + assert body["server_info"]["last_processed_operations"]["skipped_hash_match"] == 4 + assert body["server_info"]["last_upload_status"] == "completed" + assert body["server_info"]["last_error"] is None + + +@pytest.mark.unit +def test_process_bundle_background_tracks_completed_operations(monkeypatch, tmp_path: Path): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + + bundle_path = tmp_path / "bundle.tar.gz" + bundle_path.write_bytes(b"placeholder") + + monkeypatch.setattr( + srv, + "process_delta_bundle", + lambda workspace_path, bundle_path, manifest: { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 10, + "skipped_hash_match": 10, + "failed": 
0, + }, + ) + monkeypatch.setattr(srv, "log_activity", None) + + asyncio.run( + srv._process_bundle_background( + workspace_path="/work/repo", + bundle_path=bundle_path, + manifest={"bundle_id": "bundle-xyz"}, + sequence_number=3, + bundle_id="bundle-xyz", + ) + ) + + key = srv.get_workspace_key("/work/repo") + tracked = srv._upload_result_tracker[key] + assert tracked["status"] == "completed" + assert tracked["sequence_number"] == 3 + assert tracked["processed_operations"]["skipped_hash_match"] == 10 + assert tracked["processing_time_ms"] is not None + assert not bundle_path.exists() + + +@pytest.mark.unit +def test_delta_status_reports_processing_while_upload_in_progress(monkeypatch): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + + monkeypatch.setattr(srv, "get_collection_name", lambda _repo=None: "test-coll") + monkeypatch.setattr(srv, "_extract_repo_name_from_path", lambda _path: "repo") + + key = srv.get_workspace_key("/work/repo") + srv._upload_result_tracker[key] = { + "workspace_path": "/work/repo", + "bundle_id": "bundle-123", + "sequence_number": 8, + "processed_operations": None, + "processing_time_ms": None, + "status": "processing", + "completed_at": None, + } + + client = TestClient(srv.app) + resp = client.get("/api/v1/delta/status", params={"workspace_path": "/work/repo"}) + assert resp.status_code == 200 + body = resp.json() + assert body["status"] == "processing" + assert body["server_info"]["last_upload_status"] == "processing" From 366b6f4634bd26c7780e0474fe4872a11c1964da Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 7 Mar 2026 18:49:20 +0000 Subject: [PATCH 22/39] feat(upload): cleanup ignored cached paths and prune empty directories on force sync Add proper handling for stale cached paths that become ignored (e.g., dev-workspace in dev-remote mode). 
When paths transition from tracked to ignored, mark them as deleted in change detection and ensure force mode actively removes them from the remote side. Enhance delta bundle processing to prune empty parent directories after delete and move operations, and sweep stranded empty directories at the end of each bundle processing cycle. Protected top-level directories (.codebase, .remote-git) are preserved even when empty. Optimize indexing pipeline by allowing preloaded file content, hash, and language to be passed through, avoiding redundant file reads when the caller has already loaded this data. --- scripts/ingest/pipeline.py | 26 +++-- scripts/remote_upload_client.py | 104 ++++++++++++++++- scripts/standalone_upload_client.py | 98 +++++++++++++++- scripts/upload_delta_bundle.py | 35 ++++++ scripts/watch_index_core/processor.py | 3 + tests/test_upload_client_ignore_cleanup.py | 111 ++++++++++++++++++ tests/test_upload_service_path_traversal.py | 118 ++++++++++++++++++++ tests/test_watch_index_cache.py | 36 ++++++ 8 files changed, 518 insertions(+), 13 deletions(-) create mode 100644 tests/test_upload_client_ignore_cleanup.py diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index f43c7d3a..2ebac7af 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -298,6 +298,9 @@ def index_single_file( repo_name_for_cache: str | None = None, allowed_vectors: set[str] | None = None, allowed_sparse: set[str] | None = None, + preloaded_text: str | None = None, + preloaded_file_hash: str | None = None, + preloaded_language: str | None = None, ) -> bool: """Index a single file path. 
Returns True if indexed, False if skipped.""" try: @@ -330,6 +333,9 @@ def index_single_file( repo_name_for_cache=repo_name_for_cache, allowed_vectors=allowed_vectors, allowed_sparse=allowed_sparse, + preloaded_text=preloaded_text, + preloaded_file_hash=preloaded_file_hash, + preloaded_language=preloaded_language, ) finally: if _file_lock_ctx is not None: @@ -353,6 +359,9 @@ def _index_single_file_inner( repo_name_for_cache: str | None = None, allowed_vectors: set[str] | None = None, allowed_sparse: set[str] | None = None, + preloaded_text: str | None = None, + preloaded_file_hash: str | None = None, + preloaded_language: str | None = None, ) -> bool: """Inner implementation of index_single_file (after lock is acquired).""" if trust_cache is None: @@ -380,15 +389,18 @@ def _index_single_file_inner( except Exception: pass - try: - text = file_path.read_text(encoding="utf-8", errors="ignore") - except Exception as e: - print(f"Skipping {file_path}: {e}") - return False + if preloaded_text is None: + try: + text = file_path.read_text(encoding="utf-8", errors="ignore") + except Exception as e: + print(f"Skipping {file_path}: {e}") + return False + else: + text = preloaded_text - language = detect_language(file_path) + language = preloaded_language or detect_language(file_path) is_text_like = _is_text_like_language(language) - file_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() + file_hash = preloaded_file_hash or hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() repo_tag = repo_name_for_cache or _detect_repo_name_from_path(file_path) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 0c840f2f..6b0b7d86 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -476,6 +476,12 @@ def _load_local_cache_file_hashes(workspace_path: str, repo_name: Optional[str]) return {} +def get_all_cached_paths(repo_name: Optional[str] = None) -> List[str]: + """Return cached file paths 
from the local workspace cache.""" + workspace_path = os.environ.get("WORKSPACE_PATH") or os.getcwd() + return list(_load_local_cache_file_hashes(workspace_path, repo_name).keys()) + + class RemoteUploadClient: """Client for uploading delta bundles to remote server.""" @@ -651,6 +657,17 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: for path in changed_paths: if self._is_ignored_path(path): + try: + abs_path = str(path.resolve()) + except Exception: + continue + cached_hash = get_cached_file_hash(abs_path, self.repo_name) + if cached_hash: + changes["deleted"].append(path) + try: + self._stat_cache.pop(abs_path, None) + except Exception: + pass continue # Resolve to an absolute path for stable cache keys try: @@ -1294,6 +1311,89 @@ def has_meaningful_changes(self, changes: Dict[str, List]) -> bool: total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") return total_changes > 0 + def _collect_force_cleanup_paths(self) -> List[Path]: + """ + Return ignored paths that force mode should actively delete remotely. + + In dev-remote mode, dev-workspace is intentionally ignored during upload + scans to avoid recursive dogfooding. If that tree already exists on the + remote side from an older buggy upload, force mode should remove it even + when the local cache does not contain those paths. 
+ """ + cleanup_paths: List[Path] = [] + if "dev-workspace" not in self._excluded_dirnames(): + return cleanup_paths + + dev_root = Path(self.workspace_path) / "dev-workspace" + if not dev_root.exists(): + return cleanup_paths + + for root, dirnames, filenames in os.walk(dev_root): + dirnames[:] = [d for d in dirnames if not d.startswith(".")] + for filename in filenames: + path = Path(root) / filename + try: + if path.is_file(): + cleanup_paths.append(path) + except Exception: + continue + return cleanup_paths + + def build_force_changes(self, all_files: List[Path]) -> Dict[str, List]: + """ + Build force-upload changes while still cleaning stale cached paths. + + Force mode should re-upload every currently managed file, but it must also + emit deletes for files that only exist in the local cache now, including + paths that are ignored under the current client policy such as + dev-workspace in dev-remote mode. + """ + path_map: Dict[Path, Path] = {} + for path in all_files: + try: + resolved = path.resolve() + except Exception: + continue + path_map[resolved] = path + + for cached_abs in get_all_cached_paths(self.repo_name): + try: + cached_path = Path(cached_abs) + resolved = cached_path.resolve() + except Exception: + continue + if resolved not in path_map: + path_map[resolved] = cached_path + + force_cleanup_paths = self._collect_force_cleanup_paths() + for cleanup_path in force_cleanup_paths: + try: + resolved = cleanup_path.resolve() + except Exception: + continue + if resolved not in path_map: + path_map[resolved] = cleanup_path + + probed = self.detect_file_changes(list(path_map.values())) + deleted_by_resolved: Dict[Path, Path] = {} + for deleted_path in probed.get("deleted", []): + try: + deleted_by_resolved[deleted_path.resolve()] = deleted_path + except Exception: + continue + for cleanup_path in force_cleanup_paths: + try: + deleted_by_resolved.setdefault(cleanup_path.resolve(), cleanup_path) + except Exception: + continue + return { + "created": 
all_files, + "updated": [], + "deleted": list(deleted_by_resolved.values()), + "moved": [], + "unchanged": [], + } + def upload_git_history_only(self, git_history: Dict[str, Any]) -> bool: try: empty_changes = { @@ -1531,7 +1631,6 @@ def _process_pending_changes(self): self.client.repo_name ) cached_paths = [Path(p) for p in cached_file_hashes.keys()] - cached_paths = [p for p in cached_paths if not self.client._is_ignored_path(p)] all_paths = list(set(pending + cached_paths)) else: all_paths = pending @@ -2031,8 +2130,7 @@ def main(): # Detect changes (treat all files as changes for initial upload) if args.force: - # Force mode: treat all files as created - changes = {"created": all_files, "updated": [], "deleted": [], "moved": [], "unchanged": []} + changes = client.build_force_changes(all_files) else: changes = client.detect_file_changes(all_files) diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 01f1add0..53982736 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -868,6 +868,17 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: for path in changed_paths: if self._is_ignored_path(path): + try: + abs_path = str(path.resolve()) + except Exception: + continue + cached_hash = get_cached_file_hash(abs_path, self.repo_name) + if cached_hash: + changes["deleted"].append(path) + try: + self._stat_cache.pop(abs_path, None) + except Exception: + pass continue try: abs_path = str(path.resolve()) @@ -1506,6 +1517,89 @@ def has_meaningful_changes(self, changes: Dict[str, List]) -> bool: total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") return total_changes > 0 + def _collect_force_cleanup_paths(self) -> List[Path]: + """ + Return ignored paths that force mode should actively delete remotely. + + In dev-remote mode, dev-workspace is intentionally ignored during upload + scans to avoid recursive dogfooding. 
If that tree already exists on the + remote side from an older buggy upload, force mode should remove it even + when the standalone client's cache does not know about those paths. + """ + cleanup_paths: List[Path] = [] + if "dev-workspace" not in self._excluded_dirnames(): + return cleanup_paths + + dev_root = Path(self.workspace_path) / "dev-workspace" + if not dev_root.exists(): + return cleanup_paths + + for root, dirnames, filenames in os.walk(dev_root): + dirnames[:] = [d for d in dirnames if not d.startswith(".")] + for filename in filenames: + path = Path(root) / filename + try: + if path.is_file(): + cleanup_paths.append(path) + except Exception: + continue + return cleanup_paths + + def build_force_changes(self, all_files: List[Path]) -> Dict[str, List]: + """ + Build force-upload changes while still cleaning stale cached paths. + + Force mode should re-upload every currently managed file, but it must also + emit deletes for files that only exist in the local cache now, including + paths that are ignored under the current client policy such as + dev-workspace in dev-remote mode. 
+ """ + path_map: Dict[Path, Path] = {} + for path in all_files: + try: + resolved = path.resolve() + except Exception: + continue + path_map[resolved] = path + + for cached_abs in get_all_cached_paths(self.repo_name): + try: + cached_path = Path(cached_abs) + resolved = cached_path.resolve() + except Exception: + continue + if resolved not in path_map: + path_map[resolved] = cached_path + + force_cleanup_paths = self._collect_force_cleanup_paths() + for cleanup_path in force_cleanup_paths: + try: + resolved = cleanup_path.resolve() + except Exception: + continue + if resolved not in path_map: + path_map[resolved] = cleanup_path + + probed = self.detect_file_changes(list(path_map.values())) + deleted_by_resolved: Dict[Path, Path] = {} + for deleted_path in probed.get("deleted", []): + try: + deleted_by_resolved[deleted_path.resolve()] = deleted_path + except Exception: + continue + for cleanup_path in force_cleanup_paths: + try: + deleted_by_resolved.setdefault(cleanup_path.resolve(), cleanup_path) + except Exception: + continue + return { + "created": all_files, + "updated": [], + "deleted": list(deleted_by_resolved.values()), + "moved": [], + "unchanged": [], + } + def upload_git_history_only(self, git_history: Dict[str, Any]) -> bool: try: empty_changes = { @@ -1694,7 +1788,6 @@ def _process_pending_changes(self): cached_paths = [ Path(p) for p in get_all_cached_paths(self.client.repo_name) ] - cached_paths = [p for p in cached_paths if not self.client._is_ignored_path(p)] all_paths = list(set(pending + cached_paths)) else: all_paths = pending @@ -2219,8 +2312,7 @@ def main(): # Detect changes (treat all files as changes for initial upload) if args.force: - # Force mode: treat all files as created - changes = {"created": all_files, "updated": [], "deleted": [], "moved": [], "unchanged": []} + changes = client.build_force_changes(all_files) else: changes = client.detect_file_changes(all_files) diff --git a/scripts/upload_delta_bundle.py 
b/scripts/upload_delta_bundle.py index f87e8fad..5ae3ae2f 100644 --- a/scripts/upload_delta_bundle.py +++ b/scripts/upload_delta_bundle.py @@ -109,6 +109,35 @@ def _cleanup_empty_dirs(path: Path, stop_at: Path) -> None: break +def _sweep_empty_workspace_dirs(workspace_root: Path) -> None: + """Best-effort prune of empty directories under a workspace root.""" + protected_top_level = {".codebase", ".remote-git"} + try: + workspace_root = workspace_root.resolve() + except Exception: + pass + + try: + for root, dirnames, _filenames in os.walk(workspace_root, topdown=True): + current = Path(root) + if current == workspace_root: + dirnames[:] = [d for d in dirnames if d not in protected_top_level] + for root, dirnames, _filenames in os.walk(workspace_root, topdown=False): + current = Path(root) + if current == workspace_root: + continue + if current.parent == workspace_root and current.name in protected_top_level: + continue + try: + if any(current.iterdir()): + continue + current.rmdir() + except Exception: + continue + except Exception: + pass + + def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[str, Any]) -> Dict[str, int]: """Process delta bundle and return operation counts.""" operations_count = { @@ -416,9 +445,11 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: elif op_type == "deleted": if target_path.exists(): target_path.unlink(missing_ok=True) + _cleanup_empty_dirs(target_path.parent, workspace_root) replica_hashes.pop(target_key, None) return "applied" else: + _cleanup_empty_dirs(target_path.parent, workspace_root) replica_hashes.pop(target_key, None) return "applied" # Already deleted @@ -426,6 +457,7 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: if safe_source_path and safe_source_path.exists(): target_path.parent.mkdir(parents=True, exist_ok=True) safe_source_path.rename(target_path) + _cleanup_empty_dirs(safe_source_path.parent, workspace_root) source_key = 
_normalize_cache_key_path(str(safe_source_path)) moved_hash = replica_hashes.pop(source_key, None) if op_content_hash: @@ -506,6 +538,9 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: else: operations_count["failed"] += 1 + for root in replica_roots.values(): + _sweep_empty_workspace_dirs(root) + return operations_count except Exception as e: diff --git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index a530ce89..e84f2dec 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -644,6 +644,9 @@ def _run_indexing_strategy( skip_unchanged=False, pseudo_mode=pseudo_mode, repo_name_for_cache=repo_name, + preloaded_text=text, + preloaded_file_hash=file_hash, + preloaded_language=language if text is not None else None, ) return ok diff --git a/tests/test_upload_client_ignore_cleanup.py b/tests/test_upload_client_ignore_cleanup.py new file mode 100644 index 00000000..c7dcd714 --- /dev/null +++ b/tests/test_upload_client_ignore_cleanup.py @@ -0,0 +1,111 @@ +import importlib +from pathlib import Path + + +def _exercise_ignored_path_cleanup(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + ignored = workspace / "dev-workspace" / "nested.py" + ignored.parent.mkdir(parents=True, exist_ok=True) + ignored.write_text("print('dogfood')\n", encoding="utf-8") + + monkeypatch.setenv("DEV_REMOTE_MODE", "1") + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: "abc123") + monkeypatch.setattr(mod, "set_cached_file_hash", lambda *a, **k: None) + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + changes = client.detect_file_changes([ignored]) + + assert ignored in changes["deleted"] + assert not changes["created"] + assert not changes["updated"] + + +def 
test_remote_upload_client_marks_ignored_cached_paths_deleted(monkeypatch, tmp_path): + _exercise_ignored_path_cleanup("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_marks_ignored_cached_paths_deleted(monkeypatch, tmp_path): + _exercise_ignored_path_cleanup("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def _exercise_force_mode_cleanup(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + stale_ignored = workspace / "dev-workspace" / "nested.py" + stale_ignored.parent.mkdir(parents=True, exist_ok=True) + stale_ignored.write_text("print('stale')\n", encoding="utf-8") + + monkeypatch.setenv("DEV_REMOTE_MODE", "1") + monkeypatch.setattr(mod, "get_all_cached_paths", lambda repo_name=None: [str(stale_ignored)]) + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: "abc123") + monkeypatch.setattr(mod, "set_cached_file_hash", lambda *a, **k: None) + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + changes = client.build_force_changes([current]) + + assert current in changes["created"] + assert stale_ignored in changes["deleted"] + assert not changes["updated"] + assert not changes["moved"] + + +def test_remote_upload_client_force_mode_keeps_creates_and_deletes_ignored_cached_paths(monkeypatch, tmp_path): + _exercise_force_mode_cleanup("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_force_mode_keeps_creates_and_deletes_ignored_cached_paths(monkeypatch, tmp_path): + _exercise_force_mode_cleanup("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def _exercise_force_mode_dev_workspace_cleanup_without_cache(mod_name: 
str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + mirrored = workspace / "dev-workspace" / "nested" / "stale.py" + mirrored.parent.mkdir(parents=True, exist_ok=True) + mirrored.write_text("print('stale')\n", encoding="utf-8") + + monkeypatch.setenv("DEV_REMOTE_MODE", "1") + monkeypatch.setattr(mod, "get_all_cached_paths", lambda repo_name=None: []) + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: None) + monkeypatch.setattr(mod, "set_cached_file_hash", lambda *a, **k: None) + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + changes = client.build_force_changes([current]) + + assert current in changes["created"] + assert mirrored in changes["deleted"] + + +def test_remote_upload_client_force_mode_deletes_dev_workspace_without_cache(monkeypatch, tmp_path): + _exercise_force_mode_dev_workspace_cleanup_without_cache("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_force_mode_deletes_dev_workspace_without_cache(monkeypatch, tmp_path): + _exercise_force_mode_dev_workspace_cleanup_without_cache("scripts.standalone_upload_client", monkeypatch, tmp_path) diff --git a/tests/test_upload_service_path_traversal.py b/tests/test_upload_service_path_traversal.py index bc426c97..a182d693 100644 --- a/tests/test_upload_service_path_traversal.py +++ b/tests/test_upload_service_path_traversal.py @@ -371,3 +371,121 @@ def test_process_delta_bundle_uses_first_marker_match_for_created_members(tmp_pa assert counts.get("created") == 1 assert (work_dir / slug / rel_path).read_bytes() == content + + +def test_process_delta_bundle_deleted_prunes_empty_parent_dirs(tmp_path, monkeypatch): + import 
scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "dev-workspace/nested/stale.py" + target = work_dir / slug / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text("stale\n", encoding="utf-8") + + bundle = _write_bundle( + tmp_path, + [{"operation": "deleted", "path": rel_path}], + ) + + counts = us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-delete-prune"}, + ) + + assert counts.get("deleted") == 1 + assert not target.exists() + assert not (work_dir / slug / "dev-workspace" / "nested").exists() + assert not (work_dir / slug / "dev-workspace").exists() + assert (work_dir / slug).exists() + + +def test_process_delta_bundle_moved_prunes_empty_source_parent_dirs(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + src = work_dir / slug / "dev-workspace" / "nested" / "from.py" + dest_rel_path = "dest/to.py" + src.parent.mkdir(parents=True, exist_ok=True) + src.write_text("payload\n", encoding="utf-8") + + bundle = _write_bundle( + tmp_path, + [{"operation": "moved", "path": dest_rel_path, "source_path": "dev-workspace/nested/from.py"}], + ) + + counts = us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-move-prune"}, + ) + + assert counts.get("moved") == 1 + assert not src.exists() + assert (work_dir / slug / dest_rel_path).read_text(encoding="utf-8") == "payload\n" + assert not (work_dir / slug / "dev-workspace" / "nested").exists() + assert not (work_dir / slug / "dev-workspace").exists() + assert (work_dir / slug).exists() + + +def 
test_process_delta_bundle_sweeps_stranded_empty_dirs_without_file_ops(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + stranded = work_dir / slug / "dev-workspace" / "nested" / "empty" + stranded.mkdir(parents=True, exist_ok=True) + + bundle = _write_bundle(tmp_path, []) + + counts = us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-sweep-empty"}, + ) + + assert counts == { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + } + assert not stranded.exists() + assert not (work_dir / slug / "dev-workspace").exists() + assert (work_dir / slug).exists() + + +def test_process_delta_bundle_preserves_protected_top_level_dirs_when_empty(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + protected = work_dir / slug / ".remote-git" + protected.mkdir(parents=True, exist_ok=True) + + bundle = _write_bundle(tmp_path, []) + + us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-protected-empty"}, + ) + + assert protected.exists() diff --git a/tests/test_watch_index_cache.py b/tests/test_watch_index_cache.py index c5065af1..1d8d3be6 100644 --- a/tests/test_watch_index_cache.py +++ b/tests/test_watch_index_cache.py @@ -150,3 +150,39 @@ def test_processor_delete_clears_cache_even_without_client(monkeypatch, tmp_path ) remove_mock.assert_called_once_with(str(missing), "repo") + + +def test_run_indexing_strategy_reuses_preloaded_file_state(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") 
+ + path = tmp_path / "file.py" + path.write_text("print('x')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod.idx, "ensure_collection_and_indexes_once", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_read_text_and_sha1", lambda _p: ("print('x')\n", "abc123")) + monkeypatch.setattr(proc_mod, "get_cached_file_hash", lambda *a, **k: None) + monkeypatch.setattr(proc_mod.idx, "detect_language", lambda _p: "python") + monkeypatch.setattr(proc_mod.idx, "should_use_smart_reindexing", lambda *a, **k: (False, "changed")) + + captured = {} + + def fake_index_single_file(*args, **kwargs): + captured.update(kwargs) + return True + + monkeypatch.setattr(proc_mod.idx, "index_single_file", fake_index_single_file) + + ok = proc_mod._run_indexing_strategy( + path, + client=object(), + model=object(), + collection="coll", + vector_name="vec", + model_dim=1, + repo_name="repo", + ) + + assert ok is True + assert captured["preloaded_text"] == "print('x')\n" + assert captured["preloaded_file_hash"] == "abc123" + assert captured["preloaded_language"] == "python" From 0a380b90eaf8c65903e1c1317d2d0ba64e2e7f57 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 7 Mar 2026 18:53:16 +0000 Subject: [PATCH 23/39] feat(upload): add interval-based empty dir sweep and fix force sync ignored paths - Add configurable interval for empty directory sweeps with state tracking via CTXCE_UPLOAD_EMPTY_DIR_SWEEP and CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS - Exclude ignored paths (e.g., dev-workspace) from created files in force mode - Fix move operations to handle existing targets using shutil.move - Preserve nested directories under protected top-level paths (.codebase, .remote-git) - Add MaintenanceInfo to workspace state for tracking last sweep timestamp --- scripts/remote_upload_client.py | 15 ++- scripts/standalone_upload_client.py | 6 +- scripts/upload_delta_bundle.py | 107 +++++++++++++++++-- scripts/workspace_state.py | 8 +- tests/test_upload_client_ignore_cleanup.py | 51 +++++++++ 
tests/test_upload_service_path_traversal.py | 111 ++++++++++++++++++++ tests/test_watch_index_cache.py | 4 +- 7 files changed, 279 insertions(+), 23 deletions(-) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 6b0b7d86..fede2311 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -1348,12 +1348,16 @@ def build_force_changes(self, all_files: List[Path]) -> Dict[str, List]: paths that are ignored under the current client policy such as dev-workspace in dev-remote mode. """ + created_files: List[Path] = [] path_map: Dict[Path, Path] = {} for path in all_files: + if self._is_ignored_path(path): + continue try: resolved = path.resolve() except Exception: continue + created_files.append(path) path_map[resolved] = path for cached_abs in get_all_cached_paths(self.repo_name): @@ -1387,7 +1391,7 @@ def build_force_changes(self, all_files: List[Path]) -> Dict[str, List]: except Exception: continue return { - "created": all_files, + "created": created_files, "updated": [], "deleted": list(deleted_by_resolved.values()), "moved": [], @@ -2113,14 +2117,7 @@ def main(): logger.info("Scanning repository for files...") workspace_path = Path(config['workspace_path']) - # Find all files in the repository - all_files = [] - for file_path in workspace_path.rglob('*'): - if file_path.is_file() and not file_path.name.startswith('.'): - rel_path = file_path.relative_to(workspace_path) - # Skip .codebase directory and other metadata - if not str(rel_path).startswith('.codebase'): - all_files.append(file_path) + all_files = client.get_all_code_files() logger.info(f"Found {len(all_files)} files to upload") diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 53982736..c52a2aef 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -1554,12 +1554,16 @@ def build_force_changes(self, all_files: List[Path]) -> Dict[str, List]: paths that are ignored 
under the current client policy such as dev-workspace in dev-remote mode. """ + created_files: List[Path] = [] path_map: Dict[Path, Path] = {} for path in all_files: + if self._is_ignored_path(path): + continue try: resolved = path.resolve() except Exception: continue + created_files.append(path) path_map[resolved] = path for cached_abs in get_all_cached_paths(self.repo_name): @@ -1593,7 +1597,7 @@ def build_force_changes(self, all_files: List[Path]) -> Dict[str, List]: except Exception: continue return { - "created": all_files, + "created": created_files, "updated": [], "deleted": list(deleted_by_resolved.values()), "moved": [], diff --git a/scripts/upload_delta_bundle.py b/scripts/upload_delta_bundle.py index 5ae3ae2f..8b33324d 100644 --- a/scripts/upload_delta_bundle.py +++ b/scripts/upload_delta_bundle.py @@ -1,9 +1,11 @@ import os import json +import shutil import tarfile import hashlib import re import logging +from datetime import datetime, timezone from pathlib import Path from typing import Dict, Any, Optional @@ -14,7 +16,9 @@ _extract_repo_name_from_path, get_staging_targets, get_collection_state_snapshot, + get_workspace_state, is_staging_enabled, + update_workspace_state, ) except ImportError as exc: raise ImportError( @@ -26,6 +30,24 @@ WORK_DIR = os.environ.get("WORK_DIR") or os.environ.get("WORKDIR") or "/work" _SLUGGED_REPO_RE = re.compile(r"^.+-[0-9a-f]{16}(?:_old)?$") +_DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS = 7 * 24 * 60 * 60 + + +def _env_flag(name: str, default: bool) -> bool: + raw = os.environ.get(name) + if raw is None: + return default + return str(raw).strip().lower() in {"1", "true", "yes", "on"} + + +def _env_int(name: str, default: int) -> int: + raw = os.environ.get(name) + if raw is None: + return default + try: + return int(str(raw).strip()) + except (TypeError, ValueError): + return default def _normalize_hash_value(value: Any) -> str: @@ -116,18 +138,19 @@ def _sweep_empty_workspace_dirs(workspace_root: Path) -> None: 
workspace_root = workspace_root.resolve() except Exception: pass - try: - for root, dirnames, _filenames in os.walk(workspace_root, topdown=True): - current = Path(root) - if current == workspace_root: - dirnames[:] = [d for d in dirnames if d not in protected_top_level] for root, dirnames, _filenames in os.walk(workspace_root, topdown=False): current = Path(root) if current == workspace_root: continue if current.parent == workspace_root and current.name in protected_top_level: continue + try: + rel = current.relative_to(workspace_root) + except Exception: + continue + if rel.parts and rel.parts[0] in protected_top_level: + continue try: if any(current.iterdir()): continue @@ -138,6 +161,65 @@ def _sweep_empty_workspace_dirs(workspace_root: Path) -> None: pass +def _parse_timestamp(value: Any) -> Optional[datetime]: + raw = str(value or "").strip() + if not raw: + return None + try: + parsed = datetime.fromisoformat(raw.replace("Z", "+00:00")) + except ValueError: + return None + if parsed.tzinfo is None: + return parsed.replace(tzinfo=timezone.utc) + return parsed.astimezone(timezone.utc) + + +def _should_run_empty_dir_sweep(workspace_root: Path, slug: str) -> bool: + if not _env_flag("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", True): + return False + + interval_seconds = max( + 0, + _env_int( + "CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", + _DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS, + ), + ) + if interval_seconds == 0: + return True + + try: + state = get_workspace_state(workspace_path=str(workspace_root), repo_name=slug) or {} + except Exception: + return True + + maintenance = state.get("maintenance") or {} + last_sweep_at = _parse_timestamp(maintenance.get("last_empty_dir_sweep_at")) + if last_sweep_at is None: + return True + + age_seconds = (datetime.now(timezone.utc) - last_sweep_at).total_seconds() + return age_seconds >= interval_seconds + + +def _record_empty_dir_sweep(workspace_root: Path, slug: str) -> None: + try: + state = 
get_workspace_state(workspace_path=str(workspace_root), repo_name=slug) or {} + maintenance = dict(state.get("maintenance") or {}) + maintenance["last_empty_dir_sweep_at"] = datetime.now(timezone.utc).isoformat() + update_workspace_state( + workspace_path=str(workspace_root), + repo_name=slug, + updates={"maintenance": maintenance}, + ) + except Exception as exc: + logger.debug( + "[upload_service] Failed to record empty-dir sweep for %s: %s", + workspace_root, + exc, + ) + + def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[str, Any]) -> Dict[str, int]: """Process delta bundle and return operation counts.""" operations_count = { @@ -456,7 +538,12 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: elif op_type == "moved": if safe_source_path and safe_source_path.exists(): target_path.parent.mkdir(parents=True, exist_ok=True) - safe_source_path.rename(target_path) + if target_path.exists(): + if target_path.is_dir(): + shutil.rmtree(target_path) + else: + target_path.unlink() + shutil.move(str(safe_source_path), str(target_path)) _cleanup_empty_dirs(safe_source_path.parent, workspace_root) source_key = _normalize_cache_key_path(str(safe_source_path)) moved_hash = replica_hashes.pop(source_key, None) @@ -527,7 +614,7 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: success_all = all(result in {"applied", "skipped_hash_match"} for result in replica_results.values()) if applied_any: operations_count.setdefault(op_type, 0) - operations_count[op_type] = operations_count.get(op_type, 0) + 1 + operations_count[op_type] += 1 if not success_all: logger.debug( f"[upload_service] Partial success for {op_type} {rel_path}: {replica_results}" @@ -538,8 +625,12 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: else: operations_count["failed"] += 1 - for root in replica_roots.values(): + for slug, root in replica_roots.items(): + if not _should_run_empty_dir_sweep(root, 
slug): + continue + logger.info("[upload_service] Sweeping empty directories under %s", root) _sweep_empty_workspace_dirs(root) + _record_empty_dir_sweep(root, slug) return operations_count diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index b0cb28df..3d523276 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -184,6 +184,10 @@ class StagingInfo(TypedDict, total=False): repo_name: Optional[str] +class MaintenanceInfo(TypedDict, total=False): + last_empty_dir_sweep_at: Optional[str] + + class WorkspaceState(TypedDict, total=False): created_at: str updated_at: str @@ -204,6 +208,7 @@ class WorkspaceState(TypedDict, total=False): active_repo_slug: Optional[str] serving_repo_slug: Optional[str] staging: Optional[StagingInfo] + maintenance: Optional[MaintenanceInfo] def is_multi_repo_mode() -> bool: """Check if multi-repo mode is enabled.""" @@ -2537,6 +2542,3 @@ def _list_workspaces_from_qdrant(seen_paths: set) -> List[Dict[str, Any]]: pass return workspaces - - -# Add missing functions that callers expect (already defined above) \ No newline at end of file diff --git a/tests/test_upload_client_ignore_cleanup.py b/tests/test_upload_client_ignore_cleanup.py index c7dcd714..3f68a819 100644 --- a/tests/test_upload_client_ignore_cleanup.py +++ b/tests/test_upload_client_ignore_cleanup.py @@ -25,6 +25,7 @@ def _exercise_ignored_path_cleanup(mod_name: str, monkeypatch, tmp_path: Path) - assert ignored in changes["deleted"] assert not changes["created"] assert not changes["updated"] + assert not changes["moved"] def test_remote_upload_client_marks_ignored_cached_paths_deleted(monkeypatch, tmp_path): @@ -74,6 +75,54 @@ def test_standalone_upload_client_force_mode_keeps_creates_and_deletes_ignored_c _exercise_force_mode_cleanup("scripts.standalone_upload_client", monkeypatch, tmp_path) +def _exercise_force_mode_excludes_ignored_current_files(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = 
importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + ignored_current = workspace / "dev-workspace" / "ignored.py" + ignored_current.parent.mkdir(parents=True, exist_ok=True) + ignored_current.write_text("print('ignored')\n", encoding="utf-8") + + monkeypatch.setenv("DEV_REMOTE_MODE", "1") + monkeypatch.setattr(mod, "get_all_cached_paths", lambda repo_name=None: []) + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: None) + monkeypatch.setattr(mod, "set_cached_file_hash", lambda *a, **k: None) + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + changes = client.build_force_changes([current, ignored_current]) + + assert current in changes["created"] + assert ignored_current not in changes["created"] + assert ignored_current in changes["deleted"] + assert not changes["updated"] + assert not changes["moved"] + + +def test_remote_upload_client_force_mode_excludes_ignored_current_files(monkeypatch, tmp_path): + _exercise_force_mode_excludes_ignored_current_files( + "scripts.remote_upload_client", + monkeypatch, + tmp_path, + ) + + +def test_standalone_upload_client_force_mode_excludes_ignored_current_files(monkeypatch, tmp_path): + _exercise_force_mode_excludes_ignored_current_files( + "scripts.standalone_upload_client", + monkeypatch, + tmp_path, + ) + + def _exercise_force_mode_dev_workspace_cleanup_without_cache(mod_name: str, monkeypatch, tmp_path: Path) -> None: mod = importlib.import_module(mod_name) @@ -101,6 +150,8 @@ def _exercise_force_mode_dev_workspace_cleanup_without_cache(mod_name: str, monk assert current in changes["created"] assert mirrored in changes["deleted"] + assert not changes["updated"] + assert not changes["moved"] def 
test_remote_upload_client_force_mode_deletes_dev_workspace_without_cache(monkeypatch, tmp_path): diff --git a/tests/test_upload_service_path_traversal.py b/tests/test_upload_service_path_traversal.py index a182d693..78523a64 100644 --- a/tests/test_upload_service_path_traversal.py +++ b/tests/test_upload_service_path_traversal.py @@ -2,6 +2,7 @@ import json import os import tarfile +from datetime import datetime, timedelta, timezone from pathlib import Path import pytest @@ -442,10 +443,27 @@ def test_process_delta_bundle_sweeps_stranded_empty_dirs_without_file_ops(tmp_pa work_dir = tmp_path / "work" work_dir.mkdir(parents=True, exist_ok=True) monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", "1") + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", "604800") slug = "repo-0123456789abcdef" stranded = work_dir / slug / "dev-workspace" / "nested" / "empty" stranded.mkdir(parents=True, exist_ok=True) + state_store = {} + + monkeypatch.setattr( + us, + "get_workspace_state", + lambda workspace_path=None, repo_name=None: state_store.get(repo_name, {}), + ) + + def _fake_update_workspace_state(workspace_path=None, updates=None, repo_name=None): + state = dict(state_store.get(repo_name, {})) + state.update(updates or {}) + state_store[repo_name] = state + return state + + monkeypatch.setattr(us, "update_workspace_state", _fake_update_workspace_state) bundle = _write_bundle(tmp_path, []) @@ -467,6 +485,73 @@ def test_process_delta_bundle_sweeps_stranded_empty_dirs_without_file_ops(tmp_pa assert not stranded.exists() assert not (work_dir / slug / "dev-workspace").exists() assert (work_dir / slug).exists() + assert state_store[slug]["maintenance"]["last_empty_dir_sweep_at"] + + +def test_process_delta_bundle_skips_broad_empty_dir_sweep_when_disabled(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + 
monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", "0") + + slug = "repo-0123456789abcdef" + stranded = work_dir / slug / "dev-workspace" / "nested" / "empty" + stranded.mkdir(parents=True, exist_ok=True) + + bundle = _write_bundle(tmp_path, []) + + us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-sweep-disabled"}, + ) + + assert stranded.exists() + + +def test_process_delta_bundle_skips_broad_empty_dir_sweep_when_recent(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", "1") + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", "604800") + + slug = "repo-0123456789abcdef" + stranded = work_dir / slug / "dev-workspace" / "nested" / "empty" + stranded.mkdir(parents=True, exist_ok=True) + recent = datetime.now(timezone.utc) - timedelta(hours=1) + state_store = { + slug: { + "maintenance": { + "last_empty_dir_sweep_at": recent.isoformat(), + } + } + } + + monkeypatch.setattr( + us, + "get_workspace_state", + lambda workspace_path=None, repo_name=None: state_store.get(repo_name, {}), + ) + monkeypatch.setattr( + us, + "update_workspace_state", + lambda workspace_path=None, updates=None, repo_name=None: state_store.get(repo_name, {}), + ) + + bundle = _write_bundle(tmp_path, []) + + us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-sweep-recent"}, + ) + + assert stranded.exists() def test_process_delta_bundle_preserves_protected_top_level_dirs_when_empty(tmp_path, monkeypatch): @@ -475,6 +560,8 @@ def test_process_delta_bundle_preserves_protected_top_level_dirs_when_empty(tmp_ work_dir = tmp_path / "work" work_dir.mkdir(parents=True, exist_ok=True) monkeypatch.setattr(us, "WORK_DIR", 
str(work_dir)) + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", "1") + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", "0") slug = "repo-0123456789abcdef" protected = work_dir / slug / ".remote-git" @@ -489,3 +576,27 @@ def test_process_delta_bundle_preserves_protected_top_level_dirs_when_empty(tmp_ ) assert protected.exists() + + +def test_process_delta_bundle_preserves_nested_dirs_under_protected_top_level(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", "1") + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", "0") + + slug = "repo-0123456789abcdef" + protected_nested = work_dir / slug / ".codebase" / "repos" / "empty" + protected_nested.mkdir(parents=True, exist_ok=True) + + bundle = _write_bundle(tmp_path, []) + + us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-protected-nested-empty"}, + ) + + assert protected_nested.exists() diff --git a/tests/test_watch_index_cache.py b/tests/test_watch_index_cache.py index 1d8d3be6..a5d28738 100644 --- a/tests/test_watch_index_cache.py +++ b/tests/test_watch_index_cache.py @@ -174,8 +174,8 @@ def fake_index_single_file(*args, **kwargs): ok = proc_mod._run_indexing_strategy( path, - client=object(), - model=object(), + client=MagicMock(), + model=MagicMock(), collection="coll", vector_name="vec", model_dim=1, From 168f22f85c78d01cf76e3afbd0c0c77d71d47585 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 7 Mar 2026 22:26:22 +0000 Subject: [PATCH 24/39] feat(upload): add plan/apply workflow for delta uploads Add a two-phase plan/apply workflow that reduces bandwidth by skipping unchanged files based on hash comparison. 
Clients now send a planning request with file metadata and hashes; the server decides which files actually need content upload. Metadata-only operations such as deletes and moves can be applied without sending a tar bundle. - add /api/v1/delta/plan to compute needed file uploads - add /api/v1/delta/apply_ops for metadata-only delete/move operations - reduce bundle uploads to only files required by the server plan - add async upload result tracking and polling states - harden upload planning/apply paths and related status reporting --- scripts/remote_upload_client.py | 541 ++++++++++++++++--- scripts/standalone_upload_client.py | 553 ++++++++++++++++--- scripts/upload_delta_bundle.py | 567 ++++++++++++++------ scripts/upload_service.py | 233 +++++++- tests/test_upload_client_ignore_cleanup.py | 325 +++++++++++ tests/test_upload_service_path_traversal.py | 174 ++++++ tests/test_upload_service_status.py | 155 ++++++ 7 files changed, 2225 insertions(+), 323 deletions(-) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index fede2311..3efa2422 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -74,6 +74,13 @@ def _server_status_error_message(status: Any) -> str: return "Invalid server status response" +def _env_flag(name: str, default: bool) -> bool: + raw = os.environ.get(name) + if raw is None: + return default + return str(raw).strip().lower() in {"1", "true", "yes", "on"} + + def _log_git_history_skip_once(reason: str, key: str) -> None: global _git_history_skip_log_key marker = f"{reason}:{key}" @@ -549,6 +556,94 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s adapter = HTTPAdapter(max_retries=retry_strategy) self.session.mount("http://", adapter) self.session.mount("https://", adapter) + self.last_upload_result: Dict[str, Any] = {"outcome": "idle"} + self._last_plan_payload: Optional[Dict[str, Any]] = None + + def _set_last_upload_result(self, outcome: str, 
**details: Any) -> Dict[str, Any]: + result: Dict[str, Any] = {"outcome": outcome} + result.update(details) + self.last_upload_result = result + return result + + def log_watch_upload_result(self) -> None: + outcome = str((self.last_upload_result or {}).get("outcome") or "") + if outcome == "skipped_by_plan": + logger.info("[watch] No upload needed after plan") + elif outcome == "queued": + logger.info("[watch] Upload request accepted; server processing asynchronously") + elif outcome == "uploaded_async": + processed = (self.last_upload_result or {}).get("processed_operations") + logger.info("[watch] Upload processed asynchronously: %s", processed or {}) + elif outcome == "uploaded": + logger.info("[watch] Successfully uploaded changes") + elif outcome == "no_changes": + logger.info("[watch] No meaningful changes to upload") + else: + logger.info("[watch] Upload handling completed") + + def _finalize_successful_changes(self, changes: Dict[str, List]) -> None: + for path in changes.get("deleted", []): + try: + abs_path = str(path.resolve()) + remove_cached_file(abs_path, self.repo_name) + self._stat_cache.pop(abs_path, None) + except Exception: + continue + for source_path, _dest_path in changes.get("moved", []): + try: + abs_path = str(source_path.resolve()) + remove_cached_file(abs_path, self.repo_name) + self._stat_cache.pop(abs_path, None) + except Exception: + continue + + def _await_async_upload_result( + self, + bundle_id: Optional[str], + sequence_number: Optional[int], + ) -> Optional[Dict[str, Any]]: + try: + max_wait = float(os.environ.get("CTXCE_REMOTE_UPLOAD_STATUS_WAIT_SECS", "5")) + except Exception: + max_wait = 5.0 + if max_wait <= 0: + return None + + try: + poll_interval = float(os.environ.get("CTXCE_REMOTE_UPLOAD_STATUS_POLL_INTERVAL_SECS", "1")) + except Exception: + poll_interval = 1.0 + poll_interval = max(0.1, poll_interval) + + deadline = time.time() + max_wait + while time.time() < deadline: + status = self.get_server_status() + if not 
status.get("success"): + return None + server_info = status.get("server_info", {}) if isinstance(status, dict) else {} + last_bundle_id = server_info.get("last_bundle_id") + last_upload_status = server_info.get("last_upload_status") + last_sequence = status.get("last_sequence") + bundle_matches = bool(bundle_id) and last_bundle_id == bundle_id + sequence_matches = sequence_number is not None and last_sequence == sequence_number + if bundle_matches or sequence_matches: + if last_upload_status == "completed": + return { + "outcome": "uploaded_async", + "bundle_id": last_bundle_id or bundle_id, + "sequence_number": last_sequence if last_sequence is not None else sequence_number, + "processed_operations": server_info.get("last_processed_operations"), + "processing_time_ms": server_info.get("last_processing_time_ms"), + } + if last_upload_status == "failed": + return { + "outcome": "failed", + "bundle_id": last_bundle_id or bundle_id, + "sequence_number": last_sequence if last_sequence is not None else sequence_number, + "error": server_info.get("last_error"), + } + time.sleep(poll_interval) + return None def __enter__(self): """Context manager entry.""" @@ -629,6 +724,10 @@ def _is_ignored_path(self, path: Path) -> bool: return True return False + def _is_watchable_path(self, path: Path) -> bool: + """Return True when a filesystem event path is eligible for upload processing.""" + return not self._is_ignored_path(path) and idx.CODE_EXTS.get(path.suffix.lower(), "unknown") != "unknown" + def _get_temp_bundle_dir(self) -> Path: """Get or create temporary directory for bundle creation.""" if not self.temp_dir: @@ -1031,6 +1130,264 @@ def create_delta_bundle( return str(bundle_path), manifest + def _build_plan_payload(self, changes: Dict[str, List]) -> Dict[str, Any]: + created_at = datetime.now().isoformat() + bundle_id = str(uuid.uuid4()) + operations: List[Dict[str, Any]] = [] + file_hashes: Dict[str, str] = {} + total_size = 0 + + for path in changes["created"]: + 
rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = path.stat() + operations.append( + { + "operation": "created", + "path": rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown"), + } + ) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except Exception as e: + logger.warning("[remote_upload] Failed to prepare created plan entry for %s: %s", path, e) + + for path in changes["updated"]: + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = path.stat() + operations.append( + { + "operation": "updated", + "path": rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "previous_hash": get_cached_file_hash(str(path.resolve()), self.repo_name), + "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown"), + } + ) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except Exception as e: + logger.warning("[remote_upload] Failed to prepare updated plan entry for %s: %s", path, e) + + for source_path, dest_path in changes["moved"]: + dest_rel_path = dest_path.relative_to(Path(self.workspace_path)).as_posix() + source_rel_path = source_path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = dest_path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = dest_path.stat() + operations.append( + { + "operation": "moved", + "path": dest_rel_path, + "source_path": source_rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "language": idx.CODE_EXTS.get(dest_path.suffix.lower(), "unknown"), + } + ) + file_hashes[dest_rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except Exception as e: + logger.warning( + "[remote_upload] 
Failed to prepare moved plan entry for %s -> %s: %s", + source_path, + dest_path, + e, + ) + + for path in changes["deleted"]: + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + operations.append( + { + "operation": "deleted", + "path": rel_path, + "previous_hash": get_cached_file_hash(str(path.resolve()), self.repo_name), + "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown"), + } + ) + except Exception as e: + logger.warning("[remote_upload] Failed to prepare deleted plan entry for %s: %s", path, e) + + manifest = { + "version": "1.0", + "bundle_id": bundle_id, + "workspace_path": self.workspace_path, + "collection_name": self.collection_name, + "created_at": created_at, + "sequence_number": None, + "parent_sequence": None, + "operations": { + "created": len(changes["created"]), + "updated": len(changes["updated"]), + "deleted": len(changes["deleted"]), + "moved": len(changes["moved"]), + }, + "total_files": len(operations), + "total_size_bytes": total_size, + "compression": "gzip", + "encoding": "utf-8", + } + return { + "manifest": manifest, + "operations": operations, + "file_hashes": file_hashes, + } + + def _plan_delta_upload(self, changes: Dict[str, List]) -> Optional[Dict[str, Any]]: + if not _env_flag("CTXCE_REMOTE_UPLOAD_PLAN_ENABLED", True): + return None + try: + payload = self._build_plan_payload(changes) + self._last_plan_payload = payload + data = { + "workspace_path": self._translate_to_container_path(self.workspace_path), + "collection_name": self.collection_name, + "source_path": self.workspace_path, + "logical_repo_id": _compute_logical_repo_id(self.workspace_path), + "manifest": payload["manifest"], + "operations": payload["operations"], + "file_hashes": payload["file_hashes"], + } + sess = get_auth_session(self.upload_endpoint) + if sess: + data["session"] = sess + if getattr(self, "logical_repo_id", None): + data["logical_repo_id"] = self.logical_repo_id + + response = self.session.post( + 
f"{self.upload_endpoint}/api/v1/delta/plan", + json=data, + timeout=min(self.timeout, 60), + ) + if response.status_code in {404, 405}: + logger.info("[remote_upload] Plan endpoint unavailable; falling back to full bundle upload") + return None + response.raise_for_status() + body = response.json() + if not body.get("success", False): + logger.warning("[remote_upload] Plan request failed; falling back: %s", body.get("error")) + return None + return body + except Exception as e: + logger.warning("[remote_upload] Plan request failed; falling back to full bundle upload: %s", e) + return None + + def _build_apply_only_payload(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Dict[str, Any]: + payload = self._last_plan_payload or self._build_plan_payload(changes) + needed = plan.get("needed_files", {}) if isinstance(plan, dict) else {} + moved_needed = set(needed.get("moved", []) or []) + filtered_ops: List[Dict[str, Any]] = [] + filtered_hashes: Dict[str, str] = {} + for operation in payload.get("operations", []): + op_type = str(operation.get("operation") or "") + rel_path = str(operation.get("path") or "") + if op_type == "deleted": + filtered_ops.append(operation) + continue + if op_type == "moved" and rel_path not in moved_needed: + filtered_ops.append(operation) + hash_value = payload.get("file_hashes", {}).get(rel_path) + if hash_value: + filtered_hashes[rel_path] = hash_value + return { + "manifest": payload.get("manifest", {}), + "operations": filtered_ops, + "file_hashes": filtered_hashes, + } + + def _apply_operations_without_content(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Optional[bool]: + payload = self._build_apply_only_payload(changes, plan) + operations = payload.get("operations", []) + if not operations: + return None + try: + data = { + "workspace_path": self._translate_to_container_path(self.workspace_path), + "collection_name": self.collection_name, + "source_path": self.workspace_path, + "logical_repo_id": 
_compute_logical_repo_id(self.workspace_path), + "manifest": payload["manifest"], + "operations": operations, + "file_hashes": payload["file_hashes"], + } + sess = get_auth_session(self.upload_endpoint) + if sess: + data["session"] = sess + if getattr(self, "logical_repo_id", None): + data["logical_repo_id"] = self.logical_repo_id + + logger.info( + "[remote_upload] Applying metadata-only operations without bundle: deleted=%s moved=%s", + sum(1 for op in operations if op.get("operation") == "deleted"), + sum(1 for op in operations if op.get("operation") == "moved"), + ) + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/apply_ops", + json=data, + timeout=min(self.timeout, 60), + ) + if response.status_code in {404, 405}: + logger.info("[remote_upload] apply_ops endpoint unavailable; falling back to bundle upload") + return None + response.raise_for_status() + body = response.json() + if not body.get("success", False): + logger.warning("[remote_upload] apply_ops failed; falling back to bundle upload: %s", body.get("error")) + return None + self._finalize_successful_changes(changes) + self._set_last_upload_result( + "uploaded", + bundle_id=body.get("bundle_id"), + sequence_number=body.get("sequence_number"), + processed_operations=body.get("processed_operations"), + ) + logger.info( + "[remote_upload] Metadata-only operations applied: %s", + body.get("processed_operations") or {}, + ) + return True + except Exception as e: + logger.warning("[remote_upload] apply_ops failed; falling back to bundle upload: %s", e) + return None + + def _filter_changes_by_plan(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Dict[str, List]: + needed = plan.get("needed_files", {}) if isinstance(plan, dict) else {} + created_needed = set(needed.get("created", []) or []) + updated_needed = set(needed.get("updated", []) or []) + moved_needed = set(needed.get("moved", []) or []) + + filtered_created = [ + path for path in changes["created"] + if 
path.relative_to(Path(self.workspace_path)).as_posix() in created_needed + ] + filtered_updated = [ + path for path in changes["updated"] + if path.relative_to(Path(self.workspace_path)).as_posix() in updated_needed + ] + filtered_moved = [ + (source_path, dest_path) + for source_path, dest_path in changes["moved"] + if dest_path.relative_to(Path(self.workspace_path)).as_posix() in moved_needed + ] + return { + "created": filtered_created, + "updated": filtered_updated, + "deleted": list(changes["deleted"]), + "moved": filtered_moved, + "unchanged": [], + } + def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, Any]: """ Upload delta bundle to remote server with exponential backoff retry. @@ -1442,10 +1799,13 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: # Validate input if not changes: logger.info("[remote_upload] No changes provided") + self._set_last_upload_result("no_changes") return True + if not self.has_meaningful_changes(changes): logger.info("[remote_upload] No meaningful changes detected, skipping upload") + self._set_last_upload_result("no_changes") return True # Log change summary @@ -1454,10 +1814,43 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: f"{len(changes['created'])} created, {len(changes['updated'])} updated, " f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") + planned_changes = changes + plan = self._plan_delta_upload(changes) + if plan: + preview = plan.get("operation_counts_preview", {}) + logger.info( + "[remote_upload] Plan preview: needed created=%s updated=%s deleted=%s moved=%s " + "skipped_hash_match=%s needed_bytes=%s", + preview.get("created", 0), + preview.get("updated", 0), + preview.get("deleted", 0), + preview.get("moved", 0), + preview.get("skipped_hash_match", 0), + plan.get("needed_size_bytes", 0), + ) + planned_changes = self._filter_changes_by_plan(changes, plan) + has_content_work = bool( + planned_changes.get("created") 
+ or planned_changes.get("updated") + or planned_changes.get("moved") + ) + if not has_content_work: + apply_only_result = self._apply_operations_without_content(changes, plan) + if apply_only_result is True: + return True + if not self.has_meaningful_changes(planned_changes): + logger.info("[remote_upload] Plan found no upload work; skipping bundle upload") + self._set_last_upload_result( + "skipped_by_plan", + plan_preview=preview, + needed_size_bytes=plan.get("needed_size_bytes", 0), + ) + return True + # Create delta bundle bundle_path = None try: - bundle_path, manifest = self.create_delta_bundle(changes) + bundle_path, manifest = self.create_delta_bundle(planned_changes) logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " f"(size: {manifest['total_size_bytes']} bytes)") @@ -1469,6 +1862,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: logger.error(f"[remote_upload] Error creating delta bundle: {e}") # Clean up any temporary files on failure self.cleanup() + self._set_last_upload_result("failed", stage="bundle_creation", error=str(e)) return False # Upload bundle with retry logic @@ -1476,9 +1870,47 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: response = self.upload_bundle(bundle_path, manifest) if response.get("success", False): - processed_ops = response.get('processed_operations', {}) - logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") - logger.info(f"[remote_upload] Processed operations: {processed_ops}") + processed_ops = response.get("processed_operations") + if processed_ops is None: + logger.info( + "[remote_upload] Bundle %s accepted by server; processing asynchronously (sequence=%s)", + manifest["bundle_id"], + response.get("sequence_number"), + ) + self._set_last_upload_result( + "queued", + bundle_id=manifest["bundle_id"], + sequence_number=response.get("sequence_number"), + ) + async_result = self._await_async_upload_result( + 
manifest["bundle_id"], + response.get("sequence_number"), + ) + if async_result: + self.last_upload_result = async_result + if async_result["outcome"] == "uploaded_async": + self._finalize_successful_changes(planned_changes) + logger.info( + "[remote_upload] Async processing completed for bundle %s: %s", + manifest["bundle_id"], + async_result.get("processed_operations") or {}, + ) + elif async_result["outcome"] == "failed": + logger.warning( + "[remote_upload] Async processing failed for bundle %s: %s", + manifest["bundle_id"], + async_result.get("error"), + ) + else: + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + logger.info(f"[remote_upload] Processed operations: {processed_ops}") + self._finalize_successful_changes(planned_changes) + self._set_last_upload_result( + "uploaded", + bundle_id=manifest["bundle_id"], + sequence_number=response.get("sequence_number"), + processed_operations=processed_ops, + ) # Clean up temporary bundle after successful upload try: @@ -1494,16 +1926,20 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: else: error_msg = response.get('error', {}).get('message', 'Unknown upload error') logger.error(f"[remote_upload] Upload failed: {error_msg}") + self._set_last_upload_result("failed", stage="upload", error=error_msg) return False except Exception as e: logger.error(f"[remote_upload] Error uploading bundle: {e}") + self._set_last_upload_result("failed", stage="upload", error=str(e)) return False except Exception as e: logger.error(f"[remote_upload] Unexpected error in process_changes_and_upload: {e}") + self._set_last_upload_result("failed", stage="unexpected", error=str(e)) return False + def get_all_code_files(self) -> List[Path]: """Get all code files in the workspace.""" files: List[Path] = [] @@ -1589,13 +2025,13 @@ def on_any_event(self, event): # Always check src_path src_path = Path(event.src_path) - if not self.client._is_ignored_path(src_path) and 
idx.CODE_EXTS.get(src_path.suffix.lower(), "unknown") != "unknown": + if self.client._is_watchable_path(src_path): paths_to_process.append(src_path) # For FileMovedEvent, also process the destination path if hasattr(event, 'dest_path') and event.dest_path: dest_path = Path(event.dest_path) - if not self.client._is_ignored_path(dest_path) and idx.CODE_EXTS.get(dest_path.suffix.lower(), "unknown") != "unknown": + if self.client._is_watchable_path(dest_path): paths_to_process.append(dest_path) if not paths_to_process: @@ -1639,6 +2075,7 @@ def _process_pending_changes(self): else: all_paths = pending + changes = self.client.detect_file_changes(all_paths) meaningful_changes = ( len(changes.get("created", [])) + @@ -1651,7 +2088,7 @@ def _process_pending_changes(self): logger.info(f"[watch] Detected {meaningful_changes} changes: { {k: len(v) for k, v in changes.items() if k != 'unchanged'} }") success = self.client.process_changes_and_upload(changes) if success: - logger.info("[watch] Successfully uploaded changes") + self.client.log_watch_upload_result() else: logger.error("[watch] Failed to upload changes") else: @@ -1746,7 +2183,7 @@ def _watch_loop_polling(self, interval: int = 5): success = self.process_changes_and_upload(changes) if success: - logger.info(f"[watch] Successfully uploaded changes") + self.log_watch_upload_result() else: logger.error(f"[watch] Failed to upload changes") else: @@ -1804,80 +2241,7 @@ def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: except Exception as e: logger.error(f"[remote_upload] Error detecting file changes: {e}") return False - - if not self.has_meaningful_changes(changes): - logger.info("[remote_upload] No meaningful changes detected, skipping upload") - return True - - # Log change summary - total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") - logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: " - f"{len(changes['created'])} created, 
{len(changes['updated'])} updated, " - f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") - - # Create delta bundle - bundle_path = None - try: - bundle_path, manifest = self.create_delta_bundle(changes) - logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " - f"(size: {manifest['total_size_bytes']} bytes)") - - # Validate bundle was created successfully - if not bundle_path or not os.path.exists(bundle_path): - raise RuntimeError(f"Failed to create bundle at {bundle_path}") - - except Exception as e: - logger.error(f"[remote_upload] Error creating delta bundle: {e}") - # Clean up any temporary files on failure - self.cleanup() - return False - - # Upload bundle with retry logic - try: - response = self.upload_bundle(bundle_path, manifest) - - if response.get("success", False): - processed_ops = response.get('processed_operations', {}) - logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") - logger.info(f"[remote_upload] Processed operations: {processed_ops}") - - # Clean up temporary bundle after successful upload - try: - if os.path.exists(bundle_path): - os.remove(bundle_path) - logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}") - # Also clean up the entire temp directory if this is the last bundle - self.cleanup() - except Exception as cleanup_error: - logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") - - return True - else: - error = response.get("error", {}) - error_code = error.get("code", "UNKNOWN") - error_msg = error.get("message", "Unknown error") - - logger.error(f"[remote_upload] Upload failed: {error_msg}") - - # Handle specific error types - # CLI is stateless - server handles sequence management - if error_code in ["BUNDLE_TOO_LARGE", "BUNDLE_NOT_FOUND"]: - # These are unrecoverable errors - logger.error(f"[remote_upload] Unrecoverable error ({error_code}): {error_msg}") - return False - elif error_code in 
["TIMEOUT_ERROR", "CONNECTION_ERROR", "NETWORK_ERROR"]: - # These might be temporary, suggest fallback - logger.warning(f"[remote_upload] Network-related error ({error_code}): {error_msg}") - logger.warning("[remote_upload] Consider falling back to local mode if this persists") - return False - else: - # Other errors - logger.error(f"[remote_upload] Upload error ({error_code}): {error_msg}") - return False - - except Exception as e: - logger.error(f"[remote_upload] Unexpected error during upload: {e}") - return False + return self.process_changes_and_upload(changes) except Exception as e: logger.error(f"[remote_upload] Critical error in process_and_upload_changes: {e}") @@ -2142,7 +2506,18 @@ def main(): success = client.process_changes_and_upload(changes) if success: - logger.info("Repository upload completed successfully!") + outcome = str((client.last_upload_result or {}).get("outcome") or "") + if outcome == "skipped_by_plan": + logger.info("No upload needed after plan") + elif outcome == "queued": + logger.info("Repository upload request accepted; server processing asynchronously") + elif outcome == "uploaded_async": + logger.info( + "Repository upload processed asynchronously: %s", + (client.last_upload_result or {}).get("processed_operations") or {}, + ) + else: + logger.info("Repository upload completed successfully!") logger.info(f"Collection name: {config['collection_name']}") logger.info(f"Files uploaded: {len(all_files)}") else: diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index c52a2aef..2c98d86a 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -76,6 +76,20 @@ def _server_status_error_message(status: Any) -> str: return "Invalid server status response" +def _env_flag(name: str, default: bool) -> bool: + raw = os.environ.get(name) + if raw is None: + return default + return str(raw).strip().lower() in {"1", "true", "yes", "on"} + + +def _format_cached_sha1(value: 
Optional[str]) -> Optional[str]: + raw = str(value or "").strip() + if not raw: + return None + return raw if raw.lower().startswith("sha1:") else f"sha1:{raw}" + + def _log_git_history_skip_once(reason: str, key: str) -> None: global _git_history_skip_log_key marker = f"{reason}:{key}" @@ -764,6 +778,94 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s adapter = HTTPAdapter(max_retries=retry_strategy) self.session.mount("http://", adapter) self.session.mount("https://", adapter) + self.last_upload_result: Dict[str, Any] = {"outcome": "idle"} + self._last_plan_payload: Optional[Dict[str, Any]] = None + + def _set_last_upload_result(self, outcome: str, **details: Any) -> Dict[str, Any]: + result: Dict[str, Any] = {"outcome": outcome} + result.update(details) + self.last_upload_result = result + return result + + def log_watch_upload_result(self) -> None: + outcome = str((self.last_upload_result or {}).get("outcome") or "") + if outcome == "skipped_by_plan": + logger.info("[watch] No upload needed after plan") + elif outcome == "queued": + logger.info("[watch] Upload request accepted; server processing asynchronously") + elif outcome == "uploaded_async": + processed = (self.last_upload_result or {}).get("processed_operations") + logger.info("[watch] Upload processed asynchronously: %s", processed or {}) + elif outcome == "uploaded": + logger.info("[watch] Successfully uploaded changes") + elif outcome == "no_changes": + logger.info("[watch] No meaningful changes to upload") + else: + logger.info("[watch] Upload handling completed") + + def _finalize_successful_changes(self, changes: Dict[str, List]) -> None: + for path in changes.get("deleted", []): + try: + abs_path = str(path.resolve()) + remove_cached_file(abs_path, self.repo_name) + self._stat_cache.pop(abs_path, None) + except Exception: + continue + for source_path, _dest_path in changes.get("moved", []): + try: + abs_path = str(source_path.resolve()) + 
remove_cached_file(abs_path, self.repo_name) + self._stat_cache.pop(abs_path, None) + except Exception: + continue + + def _await_async_upload_result( + self, + bundle_id: Optional[str], + sequence_number: Optional[int], + ) -> Optional[Dict[str, Any]]: + try: + max_wait = float(os.environ.get("CTXCE_REMOTE_UPLOAD_STATUS_WAIT_SECS", "5")) + except Exception: + max_wait = 5.0 + if max_wait <= 0: + return None + + try: + poll_interval = float(os.environ.get("CTXCE_REMOTE_UPLOAD_STATUS_POLL_INTERVAL_SECS", "1")) + except Exception: + poll_interval = 1.0 + poll_interval = max(0.1, poll_interval) + + deadline = time.time() + max_wait + while time.time() < deadline: + status = self.get_server_status() + if not status.get("success"): + return None + server_info = status.get("server_info", {}) if isinstance(status, dict) else {} + last_bundle_id = server_info.get("last_bundle_id") + last_upload_status = server_info.get("last_upload_status") + last_sequence = status.get("last_sequence") + bundle_matches = bool(bundle_id) and last_bundle_id == bundle_id + sequence_matches = sequence_number is not None and last_sequence == sequence_number + if bundle_matches or sequence_matches: + if last_upload_status == "completed": + return { + "outcome": "uploaded_async", + "bundle_id": last_bundle_id or bundle_id, + "sequence_number": last_sequence if last_sequence is not None else sequence_number, + "processed_operations": server_info.get("last_processed_operations"), + "processing_time_ms": server_info.get("last_processing_time_ms"), + } + if last_upload_status == "failed": + return { + "outcome": "failed", + "bundle_id": last_bundle_id or bundle_id, + "sequence_number": last_sequence if last_sequence is not None else sequence_number, + "error": server_info.get("last_error"), + } + time.sleep(poll_interval) + return None def __enter__(self): """Context manager entry.""" @@ -840,6 +942,10 @@ def _is_ignored_path(self, path: Path) -> bool: return True return False + def 
_is_watchable_path(self, path: Path) -> bool: + """Return True when a filesystem event path is eligible for upload processing.""" + return not self._is_ignored_path(path) and detect_language(path) != "unknown" + def _get_temp_bundle_dir(self) -> Path: """Get or create temporary directory for bundle creation.""" if not self.temp_dir: @@ -1238,6 +1344,270 @@ def create_delta_bundle( return str(bundle_path), manifest + def _build_plan_payload(self, changes: Dict[str, List]) -> Dict[str, Any]: + created_at = datetime.now().isoformat() + bundle_id = str(uuid.uuid4()) + operations: List[Dict[str, Any]] = [] + file_hashes: Dict[str, str] = {} + total_size = 0 + + for path in changes["created"]: + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = path.stat() + operations.append( + { + "operation": "created", + "path": rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "language": detect_language(path), + } + ) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except Exception as e: + logger.warning("[remote_upload] Failed to prepare created plan entry for %s: %s", path, e) + + for path in changes["updated"]: + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = path.stat() + previous_hash = _format_cached_sha1( + get_cached_file_hash(str(path.resolve()), self.repo_name) + ) + operations.append( + { + "operation": "updated", + "path": rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "previous_hash": previous_hash, + "language": detect_language(path), + } + ) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except Exception as e: + logger.warning("[remote_upload] Failed to prepare updated plan entry for %s: %s", path, e) + + for source_path, 
dest_path in changes["moved"]: + dest_rel_path = dest_path.relative_to(Path(self.workspace_path)).as_posix() + source_rel_path = source_path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = dest_path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = dest_path.stat() + operations.append( + { + "operation": "moved", + "path": dest_rel_path, + "source_path": source_rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "language": detect_language(dest_path), + } + ) + file_hashes[dest_rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except Exception as e: + logger.warning( + "[remote_upload] Failed to prepare moved plan entry for %s -> %s: %s", + source_path, + dest_path, + e, + ) + + for path in changes["deleted"]: + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + previous_hash = _format_cached_sha1( + get_cached_file_hash(str(path.resolve()), self.repo_name) + ) + operations.append( + { + "operation": "deleted", + "path": rel_path, + "previous_hash": previous_hash, + "language": detect_language(path), + } + ) + except Exception as e: + logger.warning("[remote_upload] Failed to prepare deleted plan entry for %s: %s", path, e) + + manifest = { + "version": "1.0", + "bundle_id": bundle_id, + "workspace_path": self.workspace_path, + "collection_name": self.collection_name, + "created_at": created_at, + "sequence_number": None, + "parent_sequence": None, + "operations": { + "created": len(changes["created"]), + "updated": len(changes["updated"]), + "deleted": len(changes["deleted"]), + "moved": len(changes["moved"]), + }, + "total_files": len(operations), + "total_size_bytes": total_size, + "compression": "gzip", + "encoding": "utf-8", + } + return { + "manifest": manifest, + "operations": operations, + "file_hashes": file_hashes, + } + + def _plan_delta_upload(self, changes: Dict[str, List]) -> Optional[Dict[str, Any]]: + if not 
_env_flag("CTXCE_REMOTE_UPLOAD_PLAN_ENABLED", True): + return None + try: + payload = self._build_plan_payload(changes) + self._last_plan_payload = payload + data = { + "workspace_path": self._translate_to_container_path(self.workspace_path), + "collection_name": self.collection_name, + "source_path": self.workspace_path, + "logical_repo_id": _compute_logical_repo_id(self.workspace_path), + "manifest": payload["manifest"], + "operations": payload["operations"], + "file_hashes": payload["file_hashes"], + } + sess = get_auth_session(self.upload_endpoint) + if sess: + data["session"] = sess + if getattr(self, "logical_repo_id", None): + data["logical_repo_id"] = self.logical_repo_id + + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/plan", + json=data, + timeout=min(self.timeout, 60), + ) + if response.status_code in {404, 405}: + logger.info("[remote_upload] Plan endpoint unavailable; falling back to full bundle upload") + return None + response.raise_for_status() + body = response.json() + if not body.get("success", False): + logger.warning("[remote_upload] Plan request failed; falling back: %s", body.get("error")) + return None + return body + except Exception as e: + logger.warning("[remote_upload] Plan request failed; falling back to full bundle upload: %s", e) + return None + + def _build_apply_only_payload(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Dict[str, Any]: + payload = self._last_plan_payload or self._build_plan_payload(changes) + needed = plan.get("needed_files", {}) if isinstance(plan, dict) else {} + moved_needed = set(needed.get("moved", []) or []) + filtered_ops: List[Dict[str, Any]] = [] + filtered_hashes: Dict[str, str] = {} + for operation in payload.get("operations", []): + op_type = str(operation.get("operation") or "") + rel_path = str(operation.get("path") or "") + if op_type == "deleted": + filtered_ops.append(operation) + continue + if op_type == "moved" and rel_path not in moved_needed: + 
filtered_ops.append(operation) + hash_value = payload.get("file_hashes", {}).get(rel_path) + if hash_value: + filtered_hashes[rel_path] = hash_value + return { + "manifest": payload.get("manifest", {}), + "operations": filtered_ops, + "file_hashes": filtered_hashes, + } + + def _apply_operations_without_content(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Optional[bool]: + payload = self._build_apply_only_payload(changes, plan) + operations = payload.get("operations", []) + if not operations: + return None + try: + data = { + "workspace_path": self._translate_to_container_path(self.workspace_path), + "collection_name": self.collection_name, + "source_path": self.workspace_path, + "logical_repo_id": _compute_logical_repo_id(self.workspace_path), + "manifest": payload["manifest"], + "operations": operations, + "file_hashes": payload["file_hashes"], + } + sess = get_auth_session(self.upload_endpoint) + if sess: + data["session"] = sess + if getattr(self, "logical_repo_id", None): + data["logical_repo_id"] = self.logical_repo_id + + logger.info( + "[remote_upload] Applying metadata-only operations without bundle: deleted=%s moved=%s", + sum(1 for op in operations if op.get("operation") == "deleted"), + sum(1 for op in operations if op.get("operation") == "moved"), + ) + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/apply_ops", + json=data, + timeout=min(self.timeout, 60), + ) + if response.status_code in {404, 405}: + logger.info("[remote_upload] apply_ops endpoint unavailable; falling back to bundle upload") + return None + response.raise_for_status() + body = response.json() + if not body.get("success", False): + logger.warning("[remote_upload] apply_ops failed; falling back to bundle upload: %s", body.get("error")) + return None + self._finalize_successful_changes(changes) + self._set_last_upload_result( + "uploaded", + bundle_id=body.get("bundle_id"), + sequence_number=body.get("sequence_number"), + 
processed_operations=body.get("processed_operations"), + ) + logger.info( + "[remote_upload] Metadata-only operations applied: %s", + body.get("processed_operations") or {}, + ) + return True + except Exception as e: + logger.warning("[remote_upload] apply_ops failed; falling back to bundle upload: %s", e) + return None + + def _filter_changes_by_plan(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Dict[str, List]: + needed = plan.get("needed_files", {}) if isinstance(plan, dict) else {} + created_needed = set(needed.get("created", []) or []) + updated_needed = set(needed.get("updated", []) or []) + moved_needed = set(needed.get("moved", []) or []) + + filtered_created = [ + path for path in changes["created"] + if path.relative_to(Path(self.workspace_path)).as_posix() in created_needed + ] + filtered_updated = [ + path for path in changes["updated"] + if path.relative_to(Path(self.workspace_path)).as_posix() in updated_needed + ] + filtered_moved = [ + (source_path, dest_path) + for source_path, dest_path in changes["moved"] + if dest_path.relative_to(Path(self.workspace_path)).as_posix() in moved_needed + ] + return { + "created": filtered_created, + "updated": filtered_updated, + "deleted": list(changes["deleted"]), + "moved": filtered_moved, + "unchanged": [], + } + def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, Any]: """Upload delta bundle to remote server with exponential backoff retry. 
@@ -1648,10 +2018,12 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: # Validate input if not changes: logger.info("[remote_upload] No changes provided") + self._set_last_upload_result("no_changes") return True if not self.has_meaningful_changes(changes): logger.info("[remote_upload] No meaningful changes detected, skipping upload") + self._set_last_upload_result("no_changes") return True # Log change summary @@ -1660,10 +2032,43 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: f"{len(changes['created'])} created, {len(changes['updated'])} updated, " f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") + planned_changes = changes + plan = self._plan_delta_upload(changes) + if plan: + preview = plan.get("operation_counts_preview", {}) + logger.info( + "[remote_upload] Plan preview: needed created=%s updated=%s deleted=%s moved=%s " + "skipped_hash_match=%s needed_bytes=%s", + preview.get("created", 0), + preview.get("updated", 0), + preview.get("deleted", 0), + preview.get("moved", 0), + preview.get("skipped_hash_match", 0), + plan.get("needed_size_bytes", 0), + ) + planned_changes = self._filter_changes_by_plan(changes, plan) + has_content_work = bool( + planned_changes.get("created") + or planned_changes.get("updated") + or planned_changes.get("moved") + ) + if not has_content_work: + apply_only_result = self._apply_operations_without_content(changes, plan) + if apply_only_result is True: + return True + if not self.has_meaningful_changes(planned_changes): + logger.info("[remote_upload] Plan found no upload work; skipping bundle upload") + self._set_last_upload_result( + "skipped_by_plan", + plan_preview=preview, + needed_size_bytes=plan.get("needed_size_bytes", 0), + ) + return True + # Create delta bundle bundle_path = None try: - bundle_path, manifest = self.create_delta_bundle(changes) + bundle_path, manifest = self.create_delta_bundle(planned_changes) logger.info(f"[remote_upload] Created 
delta bundle: {manifest['bundle_id']} " f"(size: {manifest['total_size_bytes']} bytes)") @@ -1675,6 +2080,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: logger.error(f"[remote_upload] Error creating delta bundle: {e}") # Clean up any temporary files on failure self.cleanup() + self._set_last_upload_result("failed", stage="bundle_creation", error=str(e)) return False # Upload bundle with retry logic @@ -1682,9 +2088,47 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: response = self.upload_bundle(bundle_path, manifest) if response.get("success", False): - processed_ops = response.get('processed_operations', {}) - logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") - logger.info(f"[remote_upload] Processed operations: {processed_ops}") + processed_ops = response.get("processed_operations") + if processed_ops is None: + logger.info( + "[remote_upload] Bundle %s accepted by server; processing asynchronously (sequence=%s)", + manifest["bundle_id"], + response.get("sequence_number"), + ) + self._set_last_upload_result( + "queued", + bundle_id=manifest["bundle_id"], + sequence_number=response.get("sequence_number"), + ) + async_result = self._await_async_upload_result( + manifest["bundle_id"], + response.get("sequence_number"), + ) + if async_result: + self.last_upload_result = async_result + if async_result["outcome"] == "uploaded_async": + self._finalize_successful_changes(planned_changes) + logger.info( + "[remote_upload] Async processing completed for bundle %s: %s", + manifest["bundle_id"], + async_result.get("processed_operations") or {}, + ) + elif async_result["outcome"] == "failed": + logger.warning( + "[remote_upload] Async processing failed for bundle %s: %s", + manifest["bundle_id"], + async_result.get("error"), + ) + else: + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + logger.info(f"[remote_upload] Processed operations: 
{processed_ops}") + self._finalize_successful_changes(planned_changes) + self._set_last_upload_result( + "uploaded", + bundle_id=manifest["bundle_id"], + sequence_number=response.get("sequence_number"), + processed_operations=processed_ops, + ) flush_cached_file_hashes() # Clean up temporary bundle after successful upload @@ -1701,14 +2145,17 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: else: error_msg = response.get('error', {}).get('message', 'Unknown upload error') logger.error(f"[remote_upload] Upload failed: {error_msg}") + self._set_last_upload_result("failed", stage="upload", error=error_msg) return False except Exception as e: logger.error(f"[remote_upload] Error uploading bundle: {e}") + self._set_last_upload_result("failed", stage="upload", error=str(e)) return False except Exception as e: logger.error(f"[remote_upload] Unexpected error in process_changes_and_upload: {e}") + self._set_last_upload_result("failed", stage="unexpected", error=str(e)) return False def watch_loop(self, interval: int = 5): @@ -1752,13 +2199,13 @@ def on_any_event(self, event): # Always check src_path src_path = Path(event.src_path) - if not self.client._is_ignored_path(src_path) and detect_language(src_path) != "unknown": + if self.client._is_watchable_path(src_path): paths_to_process.append(src_path) # For FileMovedEvent, also process the destination path if hasattr(event, 'dest_path') and event.dest_path: dest_path = Path(event.dest_path) - if not self.client._is_ignored_path(dest_path) and detect_language(dest_path) != "unknown": + if self.client._is_watchable_path(dest_path): paths_to_process.append(dest_path) if not paths_to_process: @@ -1795,6 +2242,7 @@ def _process_pending_changes(self): all_paths = list(set(pending + cached_paths)) else: all_paths = pending + changes = self.client.detect_file_changes(all_paths) meaningful_changes = ( @@ -1808,7 +2256,7 @@ def _process_pending_changes(self): logger.info(f"[watch] Detected 
{meaningful_changes} changes: { {k: len(v) for k, v in changes.items() if k != 'unchanged'} }") success = self.client.process_changes_and_upload(changes) if success: - logger.info("[watch] Successfully uploaded changes") + self.client.log_watch_upload_result() else: logger.error("[watch] Failed to upload changes") else: @@ -1897,7 +2345,7 @@ def _watch_loop_polling(self, interval: int = 5): success = self.process_changes_and_upload(changes) if success: - logger.info(f"[watch] Successfully uploaded changes") + self.log_watch_upload_result() else: logger.error(f"[watch] Failed to upload changes") else: @@ -1997,81 +2445,7 @@ def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: except Exception as e: logger.error(f"[remote_upload] Error detecting file changes: {e}") return False - - if not self.has_meaningful_changes(changes): - logger.info("[remote_upload] No meaningful changes detected, skipping upload") - return True - - # Log change summary - total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") - logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: " - f"{len(changes['created'])} created, {len(changes['updated'])} updated, " - f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") - - # Create delta bundle - bundle_path = None - try: - bundle_path, manifest = self.create_delta_bundle(changes) - logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " - f"(size: {manifest['total_size_bytes']} bytes)") - - # Validate bundle was created successfully - if not bundle_path or not os.path.exists(bundle_path): - raise RuntimeError(f"Failed to create bundle at {bundle_path}") - - except Exception as e: - logger.error(f"[remote_upload] Error creating delta bundle: {e}") - # Clean up any temporary files on failure - self.cleanup() - return False - - # Upload bundle with retry logic - try: - response = self.upload_bundle(bundle_path, manifest) - - if 
response.get("success", False): - processed_ops = response.get('processed_operations', {}) - logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") - logger.info(f"[remote_upload] Processed operations: {processed_ops}") - flush_cached_file_hashes() - - # Clean up temporary bundle after successful upload - try: - if os.path.exists(bundle_path): - os.remove(bundle_path) - logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}") - # Also clean up the entire temp directory if this is the last bundle - self.cleanup() - except Exception as cleanup_error: - logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") - - return True - else: - error = response.get("error", {}) - error_code = error.get("code", "UNKNOWN") - error_msg = error.get("message", "Unknown error") - - logger.error(f"[remote_upload] Upload failed: {error_msg}") - - # Handle specific error types - # CLI is stateless - server handles sequence management - if error_code in ["BUNDLE_TOO_LARGE", "BUNDLE_NOT_FOUND"]: - # These are unrecoverable errors - logger.error(f"[remote_upload] Unrecoverable error ({error_code}): {error_msg}") - return False - elif error_code in ["TIMEOUT_ERROR", "CONNECTION_ERROR", "NETWORK_ERROR"]: - # These might be temporary, suggest fallback - logger.warning(f"[remote_upload] Network-related error ({error_code}): {error_msg}") - logger.warning("[remote_upload] Consider falling back to local mode if this persists") - return False - else: - # Other errors - logger.error(f"[remote_upload] Upload error ({error_code}): {error_msg}") - return False - - except Exception as e: - logger.error(f"[remote_upload] Unexpected error during upload: {e}") - return False + return self.process_changes_and_upload(changes) except Exception as e: logger.error(f"[remote_upload] Critical error in process_and_upload_changes: {e}") @@ -2331,7 +2705,18 @@ def main(): success = client.process_changes_and_upload(changes) if 
success: - logger.info("Repository upload completed successfully!") + outcome = str((client.last_upload_result or {}).get("outcome") or "") + if outcome == "skipped_by_plan": + logger.info("No upload needed after plan") + elif outcome == "queued": + logger.info("Repository upload request accepted; server processing asynchronously") + elif outcome == "uploaded_async": + logger.info( + "Repository upload processed asynchronously: %s", + (client.last_upload_result or {}).get("processed_operations") or {}, + ) + else: + logger.info("Repository upload completed successfully!") logger.info(f"Collection name: {config['collection_name']}") logger.info(f"Files uploaded: {len(all_files)}") else: diff --git a/scripts/upload_delta_bundle.py b/scripts/upload_delta_bundle.py index 8b33324d..2985c8d2 100644 --- a/scripts/upload_delta_bundle.py +++ b/scripts/upload_delta_bundle.py @@ -220,8 +220,291 @@ def _record_empty_dir_sweep(workspace_root: Path, slug: str) -> None: ) -def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[str, Any]) -> Dict[str, int]: - """Process delta bundle and return operation counts.""" +def _resolve_replica_roots(workspace_path: str, *, create_missing: bool = True) -> Dict[str, Path]: + workspace_leaf = Path(workspace_path).name + + repo_name_for_state: Optional[str] = None + serving_slug: Optional[str] = None + active_slug: Optional[str] = None + if _extract_repo_name_from_path and get_collection_state_snapshot: + try: + repo_name_for_state = _extract_repo_name_from_path(workspace_path) + if repo_name_for_state: + snapshot = get_collection_state_snapshot( + workspace_path=None, + repo_name=repo_name_for_state, + ) # type: ignore[arg-type] + serving_slug = snapshot.get("serving_repo_slug") + active_slug = snapshot.get("active_repo_slug") + except Exception: + serving_slug = None + active_slug = None + + slug_order: list[str] = [] + serving_candidate: Optional[str] = None + if serving_slug and 
_SLUGGED_REPO_RE.match(serving_slug): + serving_candidate = serving_slug + if active_slug and _SLUGGED_REPO_RE.match(active_slug) and active_slug not in slug_order: + slug_order.append(active_slug) + + staging_active = False + staging_gate = bool(is_staging_enabled() if callable(is_staging_enabled) else False) + try: + if serving_slug and str(serving_slug).endswith("_old"): + staging_active = True + except Exception: + staging_active = False + + if not staging_gate: + staging_active = False + + def _append_slug(slug: Optional[str]) -> None: + if slug and _SLUGGED_REPO_RE.match(slug) and slug not in slug_order: + slug_order.append(slug) + + if repo_name_for_state and _SLUGGED_REPO_RE.match(repo_name_for_state): + canonical_slug = ( + repo_name_for_state[:-4] + if repo_name_for_state.endswith("_old") + else repo_name_for_state + ) + old_slug_candidate = ( + repo_name_for_state + if repo_name_for_state.endswith("_old") + else f"{canonical_slug}_old" + ) + if staging_active: + slug_order = [] + _append_slug(canonical_slug) + _append_slug(old_slug_candidate) + elif not slug_order: + _append_slug(canonical_slug) + old_slug_path = Path(WORK_DIR) / old_slug_candidate + if old_slug_path.exists(): + _append_slug(old_slug_candidate) + + if not slug_order: + if _SLUGGED_REPO_RE.match(workspace_leaf): + slug_order.append(workspace_leaf) + else: + if _extract_repo_name_from_path: + repo_name = _extract_repo_name_from_path(workspace_path) or workspace_leaf + else: + repo_name = workspace_leaf + workspace_key = get_workspace_key(workspace_path) + slug_order.append(f"{repo_name}-{workspace_key}") + + if staging_gate and (not staging_active) and get_staging_targets and _extract_repo_name_from_path: + try: + repo_name_for_staging = _extract_repo_name_from_path(workspace_path) or slug_order[0] + targets = get_staging_targets( + workspace_path=workspace_path, + repo_name=repo_name_for_staging, + ) + if isinstance(targets, dict) and targets.get("staging"): + staging_active = True + 
except Exception as staging_err: + logger.debug("[upload_service] Failed to detect staging: %s", staging_err) + + def _slug_exists(slug: str) -> bool: + try: + return ( + (Path(WORK_DIR) / slug).exists() + or (Path(WORK_DIR) / ".codebase" / "repos" / slug).exists() + ) + except Exception: + return False + + if staging_gate and (not staging_active) and slug_order: + primary = slug_order[0] + if _SLUGGED_REPO_RE.match(primary): + canonical = primary[:-4] if primary.endswith("_old") else primary + inferred_old = primary if primary.endswith("_old") else f"{canonical}_old" + if _slug_exists(inferred_old): + staging_active = True + + if staging_gate and staging_active and slug_order: + primary = slug_order[0] + if _SLUGGED_REPO_RE.match(primary): + canonical = primary[:-4] if primary.endswith("_old") else primary + old_slug = primary if primary.endswith("_old") else f"{canonical}_old" + desired = [canonical, old_slug] + slug_order = [s for s in desired if _SLUGGED_REPO_RE.match(s)] + elif staging_gate and not staging_active and serving_candidate: + if serving_candidate in slug_order: + slug_order = [s for s in slug_order if s != serving_candidate] + + if staging_gate: + try: + logger.info("[upload_service] Delta bundle targets (staging=%s): %s", staging_active, slug_order) + except Exception: + pass + + replica_roots: Dict[str, Path] = {} + for slug in slug_order: + path = Path(WORK_DIR) / slug + if create_missing: + path.mkdir(parents=True, exist_ok=True) + try: + marker_dir = Path(WORK_DIR) / ".codebase" / "repos" / slug + marker_dir.mkdir(parents=True, exist_ok=True) + (marker_dir / ".ctxce_managed_upload").write_text("1\n") + except Exception: + pass + replica_roots[slug] = path.resolve() + return replica_roots + + +def _safe_join(base: Path, rel: str) -> Path: + rp = Path(str(rel)) + if str(rp) in {".", ""}: + raise ValueError("Invalid operation path") + if rp.is_absolute(): + raise ValueError(f"Absolute paths are not allowed: {rel}") + base_resolved = 
base.resolve() + candidate = (base_resolved / rp).resolve() + try: + ok = candidate.is_relative_to(base_resolved) + except Exception: + ok = os.path.commonpath([str(base_resolved), str(candidate)]) == str(base_resolved) + if not ok: + raise ValueError(f"Path escapes workspace: {rel}") + return candidate + + +def _sanitize_operation_path(rel_path: str, replica_roots: Dict[str, Path]) -> Optional[str]: + sanitized_path = rel_path + skipped_due_to_exact_slug = False + for slug in replica_roots.keys(): + if sanitized_path == slug: + skipped_due_to_exact_slug = True + break + prefix = f"{slug}/" + if sanitized_path.startswith(prefix): + sanitized_path = sanitized_path[len(prefix):] + break + if skipped_due_to_exact_slug or not sanitized_path: + return None + return sanitized_path + + +def plan_delta_upload( + workspace_path: str, + operations: list[Dict[str, Any]], + file_hashes: Optional[Dict[str, str]] = None, +) -> Dict[str, Any]: + needed_files = { + "created": [], + "updated": [], + "moved": [], + } + operations_count = { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + } + needed_size_bytes = 0 + replica_roots = _resolve_replica_roots(workspace_path, create_missing=False) + replica_cache_hashes = { + slug: _load_replica_cache_hashes(root, slug) + for slug, root in replica_roots.items() + } + normalized_hashes = { + str(rel_path): _normalize_hash_value(hash_value) + for rel_path, hash_value in (file_hashes or {}).items() + if _normalize_hash_value(hash_value) + } + + for operation in operations: + op_type = str(operation.get("operation") or "") + rel_path = operation.get("path") + if not rel_path: + operations_count["skipped"] += 1 + continue + + sanitized = _sanitize_operation_path(str(rel_path), replica_roots) + if not sanitized: + operations_count["skipped"] += 1 + continue + + if op_type == "deleted": + operations_count["deleted"] += 1 + continue + if op_type == "moved": + 
operations_count["moved"] += 1 + source_rel_path = operation.get("source_path") or operation.get("source_relative_path") + if not source_rel_path: + needed_files["moved"].append(sanitized) + needed_size_bytes += int(operation.get("size_bytes") or 0) + continue + + move_needs_content = False + for _slug, root in replica_roots.items(): + try: + safe_source_path = _safe_join(root, str(source_rel_path)) + except ValueError: + logger.warning( + "[upload_service] Invalid move source path during plan: %s (root=%s)", + source_rel_path, + root, + ) + move_needs_content = True + break + if not safe_source_path.exists(): + move_needs_content = True + break + if move_needs_content: + needed_files["moved"].append(sanitized) + needed_size_bytes += int(operation.get("size_bytes") or 0) + continue + if op_type not in {"created", "updated"}: + operations_count["failed"] += 1 + continue + + op_content_hash = _normalize_hash_value( + operation.get("content_hash") or normalized_hashes.get(sanitized) + ) + if not op_content_hash: + needed_files[op_type].append(sanitized) + operations_count[op_type] += 1 + needed_size_bytes += int(operation.get("size_bytes") or 0) + continue + + needs_content = False + for slug, root in replica_roots.items(): + target_path = _safe_join(root, sanitized) + target_key = _normalize_cache_key_path(str(target_path)) + cached_hash = replica_cache_hashes.get(slug, {}).get(target_key) + if cached_hash != op_content_hash: + needs_content = True + break + + if needs_content: + needed_files[op_type].append(sanitized) + operations_count[op_type] += 1 + needed_size_bytes += int(operation.get("size_bytes") or 0) + else: + operations_count["skipped"] += 1 + operations_count["skipped_hash_match"] += 1 + + return { + "needed_files": needed_files, + "operation_counts_preview": operations_count, + "needed_size_bytes": needed_size_bytes, + "replica_targets": list(replica_roots.keys()), + } + + +def apply_delta_operations( + workspace_path: str, + operations: list[Dict[str, 
Any]], + file_hashes: Optional[Dict[str, str]] = None, +) -> Dict[str, int]: + """Apply metadata-only delta operations without requiring a tar bundle.""" operations_count = { "created": 0, "updated": 0, @@ -233,156 +516,142 @@ def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[ } try: - # CRITICAL: Always materialize writes under WORK_DIR using a slugged repo directory. - # Do NOT write directly into the client-supplied workspace_path, since that may be a host - # path (e.g. /home/user/repo) that is not mounted/visible to the watcher/indexer. - workspace_leaf = Path(workspace_path).name + replica_roots = _resolve_replica_roots(workspace_path) + if not replica_roots: + raise ValueError(f"No replica roots available for workspace: {workspace_path}") + replica_cache_hashes = { + slug: _load_replica_cache_hashes(root, slug) + for slug, root in replica_roots.items() + } + normalized_hashes = { + str(rel_path): _normalize_hash_value(hash_value) + for rel_path, hash_value in (file_hashes or {}).items() + if _normalize_hash_value(hash_value) + } + + for operation in operations: + op_type = str(operation.get("operation") or "") + rel_path = operation.get("path") + + if not rel_path: + operations_count["skipped"] += 1 + continue - repo_name_for_state: Optional[str] = None + sanitized_path = _sanitize_operation_path(str(rel_path), replica_roots) + if not sanitized_path: + operations_count["skipped"] += 1 + continue - serving_slug: Optional[str] = None - active_slug: Optional[str] = None - if _extract_repo_name_from_path and get_collection_state_snapshot: - try: - repo_name_for_state = _extract_repo_name_from_path(workspace_path) - if repo_name_for_state: - snapshot = get_collection_state_snapshot(workspace_path=None, repo_name=repo_name_for_state) # type: ignore[arg-type] - serving_slug = snapshot.get("serving_repo_slug") - active_slug = snapshot.get("active_repo_slug") - except Exception: - serving_slug = None - active_slug = None - - slug_order: 
list[str] = [] - serving_candidate: Optional[str] = None - if serving_slug and _SLUGGED_REPO_RE.match(serving_slug): - serving_candidate = serving_slug - if active_slug and _SLUGGED_REPO_RE.match(active_slug) and active_slug not in slug_order: - slug_order.append(active_slug) - - # If staging is active, we must mirror uploads into BOTH the canonical slug and - # the "*_old" slug. Relying purely on snapshot detection is brittle (e.g. when - # the client workspace_path is a host path). When we can infer a canonical slug, - # force both targets. - staging_active = False - staging_gate = bool(is_staging_enabled() if callable(is_staging_enabled) else False) - try: - if serving_slug and str(serving_slug).endswith("_old"): - staging_active = True - except Exception: - staging_active = False + rel_path = sanitized_path - if not staging_gate: - staging_active = False + if op_type not in {"deleted", "moved"}: + operations_count["failed"] += 1 + continue - def _append_slug(slug: Optional[str]) -> None: - if slug and _SLUGGED_REPO_RE.match(slug) and slug not in slug_order: - slug_order.append(slug) + source_rel_path = None + if op_type == "moved": + raw_source = operation.get("source_path") or operation.get("source_relative_path") + if not raw_source: + operations_count["failed"] += 1 + continue + source_rel_path = _sanitize_operation_path(str(raw_source), replica_roots) + if not source_rel_path: + operations_count["failed"] += 1 + continue - if repo_name_for_state and _SLUGGED_REPO_RE.match(repo_name_for_state): - canonical_slug = repo_name_for_state[:-4] if repo_name_for_state.endswith("_old") else repo_name_for_state - old_slug_candidate = ( - repo_name_for_state if repo_name_for_state.endswith("_old") else f"{canonical_slug}_old" - ) - if staging_active: - slug_order = [] - _append_slug(canonical_slug) - _append_slug(old_slug_candidate) - elif not slug_order: - _append_slug(canonical_slug) - old_slug_path = Path(WORK_DIR) / old_slug_candidate - if old_slug_path.exists(): - 
_append_slug(old_slug_candidate) - - if not slug_order: - if _SLUGGED_REPO_RE.match(workspace_leaf): - slug_order.append(workspace_leaf) + replica_results: Dict[str, str] = {} + for slug, root in replica_roots.items(): + target_path = _safe_join(root, rel_path) + target_key = _normalize_cache_key_path(str(target_path)) + replica_hashes = replica_cache_hashes.setdefault(slug, {}) + op_content_hash = _normalize_hash_value( + operation.get("content_hash") or normalized_hashes.get(rel_path) + ) + + try: + if op_type == "deleted": + if target_path.exists(): + target_path.unlink(missing_ok=True) + _cleanup_empty_dirs(target_path.parent, root) + replica_hashes.pop(target_key, None) + replica_results[slug] = "applied" + continue + + safe_source_path = _safe_join(root, source_rel_path or "") + if not safe_source_path.exists(): + replica_results[slug] = "failed" + continue + + target_path.parent.mkdir(parents=True, exist_ok=True) + if target_path.exists(): + if target_path.is_dir(): + shutil.rmtree(target_path) + else: + target_path.unlink() + shutil.move(str(safe_source_path), str(target_path)) + _cleanup_empty_dirs(safe_source_path.parent, root) + source_key = _normalize_cache_key_path(str(safe_source_path)) + moved_hash = replica_hashes.pop(source_key, None) + if op_content_hash: + replica_hashes[target_key] = op_content_hash + elif moved_hash: + replica_hashes[target_key] = moved_hash + replica_results[slug] = "applied" + except Exception as exc: + logger.debug( + "[upload_service] Failed to apply metadata-only %s to %s in %s: %s", + op_type, + rel_path, + root, + exc, + ) + replica_results[slug] = "failed" + + applied_any = any(result == "applied" for result in replica_results.values()) + success_all = all(result == "applied" for result in replica_results.values()) + if applied_any: + operations_count[op_type] += 1 + if not success_all: + logger.debug( + "[upload_service] Partial metadata-only success for %s %s: %s", + op_type, + rel_path, + replica_results, + ) else: - 
if _extract_repo_name_from_path: - repo_name = _extract_repo_name_from_path(workspace_path) or workspace_leaf - else: - repo_name = workspace_leaf - workspace_key = get_workspace_key(workspace_path) - slug_order.append(f"{repo_name}-{workspace_key}") + operations_count["failed"] += 1 - # Best-effort: if staging is active according to workspace_state, ensure we mirror to - # both the canonical slug and its *_old slug. - if staging_gate and (not staging_active) and get_staging_targets and _extract_repo_name_from_path: - try: - repo_name_for_staging = _extract_repo_name_from_path(workspace_path) or slug_order[0] - targets = get_staging_targets(workspace_path=workspace_path, repo_name=repo_name_for_staging) - if isinstance(targets, dict) and targets.get("staging"): - staging_active = True - except Exception as staging_err: - logger.debug(f"[upload_service] Failed to detect staging: {staging_err}") - - def _slug_exists(slug: str) -> bool: - try: - return ( - (Path(WORK_DIR) / slug).exists() - or (Path(WORK_DIR) / ".codebase" / "repos" / slug).exists() - ) - except Exception: - return False - - if staging_gate and (not staging_active) and slug_order: - primary = slug_order[0] - if _SLUGGED_REPO_RE.match(primary): - canonical = primary[:-4] if primary.endswith("_old") else primary - inferred_old = primary if primary.endswith("_old") else f"{canonical}_old" - if _slug_exists(inferred_old): - staging_active = True - - if staging_gate and staging_active and slug_order: - primary = slug_order[0] - if _SLUGGED_REPO_RE.match(primary): - canonical = primary[:-4] if primary.endswith("_old") else primary - old_slug = primary if primary.endswith("_old") else f"{canonical}_old" - desired = [canonical, old_slug] - slug_order = [s for s in desired if _SLUGGED_REPO_RE.match(s)] - elif staging_gate and not staging_active and serving_candidate: - # Ignore serving slugs when staging is disabled; keep deterministic non-staging writes. 
- if serving_candidate in slug_order: - slug_order = [s for s in slug_order if s != serving_candidate] - - if staging_gate: - try: - logger.info(f"[upload_service] Delta bundle targets (staging={staging_active}): {slug_order}") - except Exception: - pass + for slug, root in replica_roots.items(): + if not _should_run_empty_dir_sweep(root, slug): + continue + logger.info("[upload_service] Sweeping empty directories under %s", root) + _sweep_empty_workspace_dirs(root) + _record_empty_dir_sweep(root, slug) - replica_roots: Dict[str, Path] = {} - for slug in slug_order: - path = Path(WORK_DIR) / slug - path.mkdir(parents=True, exist_ok=True) - try: - marker_dir = Path(WORK_DIR) / ".codebase" / "repos" / slug - marker_dir.mkdir(parents=True, exist_ok=True) - (marker_dir / ".ctxce_managed_upload").write_text("1\n") - except Exception: - pass - replica_roots[slug] = path.resolve() + return operations_count + except Exception as e: + logger.error(f"Error applying metadata-only delta operations: {e}") + raise - primary_slug = slug_order[0] - workspace_root = replica_roots[primary_slug] - def _safe_join(base: Path, rel: str) -> Path: - # SECURITY: Prevent path traversal / absolute-path writes by ensuring the resolved - # candidate path stays within the intended workspace root. 
- rp = Path(str(rel)) - if str(rp) in {".", ""}: - raise ValueError("Invalid operation path") - if rp.is_absolute(): - raise ValueError(f"Absolute paths are not allowed: {rel}") - base_resolved = base.resolve() - candidate = (base_resolved / rp).resolve() - try: - ok = candidate.is_relative_to(base_resolved) - except Exception: - ok = os.path.commonpath([str(base_resolved), str(candidate)]) == str(base_resolved) - if not ok: - raise ValueError(f"Path escapes workspace: {rel}") - return candidate +def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[str, Any]) -> Dict[str, int]: + """Process delta bundle and return operation counts.""" + operations_count = { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + } + + try: + replica_roots = _resolve_replica_roots(workspace_path) + if not replica_roots: + raise ValueError(f"No replica roots available for workspace: {workspace_path}") + primary_slug = next(iter(replica_roots)) + workspace_root = replica_roots[primary_slug] def _member_suffix(name: str, marker: str) -> Optional[str]: idx = name.find(marker) @@ -582,18 +851,8 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: operations_count["skipped"] += 1 continue - sanitized_path = rel_path - skipped_due_to_exact_slug = False - for slug in replica_roots.keys(): - if sanitized_path == slug: - skipped_due_to_exact_slug = True - break - prefix = f"{slug}/" - if sanitized_path.startswith(prefix): - sanitized_path = sanitized_path[len(prefix):] - break - - if skipped_due_to_exact_slug or not sanitized_path: + sanitized_path = _sanitize_operation_path(str(rel_path), replica_roots) + if not sanitized_path: logger.debug( f"[upload_service] Skipping operation {op_type} for path {rel_path}: " "appears to reference slug root directly.", diff --git a/scripts/upload_service.py b/scripts/upload_service.py index 6d4d9327..880b788e 100644 --- 
a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -5,7 +5,7 @@ This FastAPI service receives delta bundles from remote upload clients, processes them, and integrates with the existing indexing pipeline. """ - +# import os import json import tarfile @@ -45,7 +45,12 @@ from fastapi.responses import JSONResponse, RedirectResponse from fastapi.middleware.cors import CORSMiddleware -from scripts.upload_delta_bundle import get_workspace_key, process_delta_bundle +from scripts.upload_delta_bundle import ( + apply_delta_operations, + get_workspace_key, + plan_delta_upload, + process_delta_bundle, +) from scripts.indexing_admin import ( build_admin_collections_view, @@ -229,6 +234,39 @@ class UploadResponse(BaseModel): next_sequence: Optional[int] = None error: Optional[Dict[str, Any]] = None + +class PlanRequest(BaseModel): + workspace_path: str + collection_name: Optional[str] = None + source_path: Optional[str] = None + logical_repo_id: Optional[str] = None + session: Optional[str] = None + manifest: Dict[str, Any] = Field(default_factory=dict) + operations: List[Dict[str, Any]] = Field(default_factory=list) + file_hashes: Dict[str, str] = Field(default_factory=dict) + + +class PlanResponse(BaseModel): + success: bool + workspace_path: str + needed_files: Dict[str, List[str]] + operation_counts_preview: Dict[str, int] + needed_size_bytes: int + replica_targets: List[str] + fallback_used: bool = False + error: Optional[Dict[str, Any]] = None + + +class ApplyOperationsRequest(BaseModel): + workspace_path: str + collection_name: Optional[str] = None + source_path: Optional[str] = None + logical_repo_id: Optional[str] = None + session: Optional[str] = None + manifest: Dict[str, Any] = Field(default_factory=dict) + operations: List[Dict[str, Any]] = Field(default_factory=list) + file_hashes: Dict[str, str] = Field(default_factory=dict) + class StatusResponse(BaseModel): workspace_path: str collection_name: str @@ -1487,6 +1525,197 @@ async def 
get_status(workspace_path: str): logger.error(f"Error getting status: {e}") raise HTTPException(status_code=500, detail=str(e)) + +@app.post("/api/v1/delta/plan", response_model=PlanResponse) +async def plan_delta(request: PlanRequest): + """Plan which file bodies are needed before uploading content.""" + try: + workspace = Path(request.workspace_path) + if not workspace.is_absolute(): + workspace = Path(WORK_DIR) / workspace + workspace_path = str(workspace.resolve()) + + if AUTH_ENABLED: + session_value = str(request.session or "").strip() + try: + record = validate_session(session_value) + except AuthDisabledError: + record = None + except Exception as e: + logger.error(f"[upload_service] Failed to validate auth session for plan: {e}") + raise HTTPException(status_code=500, detail="Failed to validate auth session") + if record is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired session", + ) + + plan = plan_delta_upload( + workspace_path=workspace_path, + operations=request.operations, + file_hashes=request.file_hashes, + ) + return PlanResponse( + success=True, + workspace_path=workspace_path, + needed_files=plan.get("needed_files", {"created": [], "updated": [], "moved": []}), + operation_counts_preview=plan.get( + "operation_counts_preview", + { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + }, + ), + needed_size_bytes=int(plan.get("needed_size_bytes", 0) or 0), + replica_targets=list(plan.get("replica_targets", []) or []), + fallback_used=False, + error=None, + ) + except HTTPException: + raise + except Exception as e: + logger.error(f"[upload_service] Error planning delta upload: {e}") + return PlanResponse( + success=False, + workspace_path=request.workspace_path, + needed_files={"created": [], "updated": [], "moved": []}, + operation_counts_preview={ + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 0, + 
"skipped_hash_match": 0, + "failed": 0, + }, + needed_size_bytes=0, + replica_targets=[], + fallback_used=True, + error={ + "code": "PLAN_ERROR", + "message": str(e), + }, + ) + + +@app.post("/api/v1/delta/apply_ops", response_model=UploadResponse) +async def apply_delta_ops(request: ApplyOperationsRequest): + """Apply metadata-only delta operations without uploading a tar bundle.""" + key: Optional[str] = None + bundle_id: Optional[str] = None + sequence_number: Optional[int] = None + try: + workspace = Path(request.workspace_path) + if not workspace.is_absolute(): + workspace = Path(WORK_DIR) / workspace + workspace_path = str(workspace.resolve()) + + if AUTH_ENABLED: + session_value = str(request.session or "").strip() + try: + record = validate_session(session_value) + except AuthDisabledError: + record = None + except Exception as e: + logger.error(f"[upload_service] Failed to validate auth session for apply_ops: {e}") + raise HTTPException(status_code=500, detail="Failed to validate auth session") + if record is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired session", + ) + + manifest = request.manifest or {} + bundle_id = manifest.get("bundle_id") + manifest_sequence = manifest.get("sequence_number") + key = get_workspace_key(workspace_path) + last_sequence = get_last_sequence(workspace_path) + sequence_number = manifest_sequence if manifest_sequence is not None else last_sequence + 1 + + if sequence_number is not None and sequence_number != last_sequence + 1: + return UploadResponse( + success=False, + error={ + "code": "SEQUENCE_MISMATCH", + "message": f"Expected sequence {last_sequence + 1}, got {sequence_number}", + "expected_sequence": last_sequence + 1, + "received_sequence": sequence_number, + "retry_after": 5000, + }, + ) + + start_time = datetime.now() + _upload_result_tracker[key] = { + "workspace_path": workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + 
"processed_operations": None, + "processing_time_ms": None, + "status": "processing", + "completed_at": None, + } + + operations_count = await asyncio.to_thread( + apply_delta_operations, + workspace_path, + request.operations, + request.file_hashes, + ) + processing_time = int((datetime.now() - start_time).total_seconds() * 1000) + _sequence_tracker[key] = sequence_number + _upload_result_tracker[key] = { + "workspace_path": workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + "processed_operations": operations_count, + "processing_time_ms": processing_time, + "status": "completed", + "completed_at": datetime.now().isoformat(), + } + logger.info( + "[upload_service] Applied metadata-only operations bundle=%s seq=%s in %sms ops=%s", + bundle_id, + sequence_number, + processing_time, + operations_count, + ) + return UploadResponse( + success=True, + bundle_id=bundle_id, + sequence_number=sequence_number, + processed_operations=operations_count, + processing_time_ms=processing_time, + next_sequence=sequence_number + 1 if sequence_number is not None else None, + ) + except HTTPException: + raise + except Exception as e: + logger.error(f"[upload_service] Error applying metadata-only operations: {e}") + if key: + _upload_result_tracker[key] = { + "workspace_path": request.workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + "processed_operations": None, + "processing_time_ms": None, + "status": "error", + "error": str(e), + "message": str(e), + "completed_at": datetime.now().isoformat(), + } + return UploadResponse( + success=False, + error={ + "code": "APPLY_OPS_ERROR", + "message": str(e), + }, + ) + @app.post("/api/v1/delta/upload", response_model=UploadResponse) async def upload_delta_bundle( request: Request, diff --git a/tests/test_upload_client_ignore_cleanup.py b/tests/test_upload_client_ignore_cleanup.py index 3f68a819..bcd93009 100644 --- a/tests/test_upload_client_ignore_cleanup.py +++ 
b/tests/test_upload_client_ignore_cleanup.py @@ -160,3 +160,328 @@ def test_remote_upload_client_force_mode_deletes_dev_workspace_without_cache(mon def test_standalone_upload_client_force_mode_deletes_dev_workspace_without_cache(monkeypatch, tmp_path): _exercise_force_mode_dev_workspace_cleanup_without_cache("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def _exercise_plan_skip_avoids_bundle_upload(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + monkeypatch.setattr( + client, + "_plan_delta_upload", + lambda changes: { + "needed_files": {"created": [], "updated": [], "moved": []}, + "operation_counts_preview": { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 1, + "skipped_hash_match": 1, + "failed": 0, + }, + "needed_size_bytes": 0, + }, + ) + monkeypatch.setattr(client, "create_delta_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not bundle"))) + monkeypatch.setattr(client, "upload_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not upload"))) + + assert client.process_changes_and_upload( + { + "created": [current], + "updated": [], + "deleted": [], + "moved": [], + "unchanged": [], + } + ) is True + assert client.last_upload_result["outcome"] == "skipped_by_plan" + + +def test_remote_upload_client_plan_skip_avoids_bundle_upload(monkeypatch, tmp_path): + _exercise_plan_skip_avoids_bundle_upload("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_plan_skip_avoids_bundle_upload(monkeypatch, tmp_path): + 
_exercise_plan_skip_avoids_bundle_upload("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_plan_payload_prefixes_previous_hash(monkeypatch, tmp_path): + mod = importlib.import_module("scripts.standalone_upload_client") + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + updated = workspace / "app.py" + updated.write_text("print('updated')\n", encoding="utf-8") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: "abc123") + + payload = client._build_plan_payload( + { + "created": [], + "updated": [updated], + "deleted": [updated], + "moved": [], + } + ) + + updated_op = next(op for op in payload["operations"] if op["operation"] == "updated") + deleted_op = next(op for op in payload["operations"] if op["operation"] == "deleted") + assert updated_op["previous_hash"] == "sha1:abc123" + assert deleted_op["previous_hash"] == "sha1:abc123" + + +def _exercise_delete_only_plan_uses_apply_ops(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + deleted = workspace / "old.py" + deleted.write_text("print('old')\n", encoding="utf-8") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + removed_paths = [] + + monkeypatch.setattr( + client, + "_plan_delta_upload", + lambda changes: { + "needed_files": {"created": [], "updated": [], "moved": []}, + "operation_counts_preview": { + "created": 0, + "updated": 0, + "deleted": 1, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + }, + "needed_size_bytes": 0, + }, + ) + monkeypatch.setattr( + client, + "_build_plan_payload", + lambda changes: { 
+ "manifest": {"bundle_id": "b1", "sequence_number": None}, + "operations": [{"operation": "deleted", "path": "old.py"}], + "file_hashes": {}, + }, + ) + monkeypatch.setattr(client, "create_delta_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not bundle"))) + monkeypatch.setattr(client, "upload_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not upload"))) + + class _Resp: + status_code = 200 + + @staticmethod + def raise_for_status(): + return None + + @staticmethod + def json(): + return { + "success": True, + "bundle_id": "b1", + "sequence_number": 3, + "processed_operations": {"deleted": 1, "created": 0, "updated": 0, "moved": 0, "skipped": 0, "skipped_hash_match": 0, "failed": 0}, + } + + monkeypatch.setattr(client.session, "post", lambda *a, **k: _Resp()) + monkeypatch.setattr(mod, "remove_cached_file", lambda path, repo_name=None: removed_paths.append((path, repo_name))) + + assert client.process_changes_and_upload( + { + "created": [], + "updated": [], + "deleted": [deleted], + "moved": [], + "unchanged": [], + } + ) is True + assert client.last_upload_result["outcome"] == "uploaded" + assert client.last_upload_result["processed_operations"]["deleted"] == 1 + assert removed_paths == [(str(deleted.resolve()), client.repo_name)] + + +def test_remote_upload_client_delete_only_plan_uses_apply_ops(monkeypatch, tmp_path): + _exercise_delete_only_plan_uses_apply_ops("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_delete_only_plan_uses_apply_ops(monkeypatch, tmp_path): + _exercise_delete_only_plan_uses_apply_ops("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def _exercise_async_upload_sets_queued_result(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", 
encoding="utf-8") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + bundle_path = workspace / "bundle.tar.gz" + bundle_path.write_bytes(b"bundle") + monkeypatch.setattr(client, "_plan_delta_upload", lambda changes: None) + monkeypatch.setattr( + client, + "create_delta_bundle", + lambda changes: (str(bundle_path), {"bundle_id": "bundle-1", "total_size_bytes": 6}), + ) + monkeypatch.setattr( + client, + "upload_bundle", + lambda *a, **k: {"success": True, "sequence_number": 7, "processed_operations": None}, + ) + monkeypatch.setattr(mod, "flush_cached_file_hashes", lambda: None, raising=False) + + assert client.process_changes_and_upload( + { + "created": [current], + "updated": [], + "deleted": [], + "moved": [], + "unchanged": [], + } + ) is True + assert client.last_upload_result["outcome"] == "queued" + assert client.last_upload_result["sequence_number"] == 7 + + +def _exercise_async_upload_promotes_completed_result(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + bundle_path = workspace / "bundle.tar.gz" + bundle_path.write_bytes(b"bundle") + monkeypatch.setattr(client, "_plan_delta_upload", lambda changes: None) + monkeypatch.setattr( + client, + "create_delta_bundle", + lambda changes: (str(bundle_path), {"bundle_id": "bundle-1", "total_size_bytes": 6}), + ) + monkeypatch.setattr( + client, + "upload_bundle", + lambda *a, **k: {"success": True, "sequence_number": 7, "processed_operations": None}, + ) + monkeypatch.setattr( + client, + "get_server_status", + lambda: { + "success": True, + 
"last_sequence": 7, + "server_info": { + "last_bundle_id": "bundle-1", + "last_upload_status": "completed", + "last_processed_operations": {"updated": 1, "failed": 0}, + "last_processing_time_ms": 12, + }, + }, + ) + monkeypatch.setattr(mod, "flush_cached_file_hashes", lambda: None, raising=False) + + assert client.process_changes_and_upload( + { + "created": [current], + "updated": [], + "deleted": [], + "moved": [], + "unchanged": [], + } + ) is True + assert client.last_upload_result["outcome"] == "uploaded_async" + assert client.last_upload_result["processed_operations"] == {"updated": 1, "failed": 0} + + +def test_remote_upload_client_async_upload_sets_queued_result(monkeypatch, tmp_path): + _exercise_async_upload_sets_queued_result("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_async_upload_sets_queued_result(monkeypatch, tmp_path): + _exercise_async_upload_sets_queued_result("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def test_remote_upload_client_async_upload_promotes_completed_result(monkeypatch, tmp_path): + _exercise_async_upload_promotes_completed_result("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_async_upload_promotes_completed_result(monkeypatch, tmp_path): + _exercise_async_upload_promotes_completed_result("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def _exercise_watchable_path_excludes_ignored_updates(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + source = workspace / "src" / "tracked.py" + source.parent.mkdir(parents=True, exist_ok=True) + source.write_text("print('tracked')\n", encoding="utf-8") + + mirrored = workspace / "dev-workspace" / "nested" / "ignored.py" + mirrored.parent.mkdir(parents=True, exist_ok=True) + mirrored.write_text("print('ignored')\n", encoding="utf-8") + + 
monkeypatch.setenv("DEV_REMOTE_MODE", "1") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + assert client._is_watchable_path(source) is True + assert client._is_watchable_path(mirrored) is False + + +def test_remote_upload_client_watchable_path_excludes_ignored_updates(monkeypatch, tmp_path): + _exercise_watchable_path_excludes_ignored_updates( + "scripts.remote_upload_client", + monkeypatch, + tmp_path, + ) + + +def test_standalone_upload_client_watchable_path_excludes_ignored_updates(monkeypatch, tmp_path): + _exercise_watchable_path_excludes_ignored_updates( + "scripts.standalone_upload_client", + monkeypatch, + tmp_path, + ) diff --git a/tests/test_upload_service_path_traversal.py b/tests/test_upload_service_path_traversal.py index 78523a64..224fb926 100644 --- a/tests/test_upload_service_path_traversal.py +++ b/tests/test_upload_service_path_traversal.py @@ -600,3 +600,177 @@ def test_process_delta_bundle_preserves_nested_dirs_under_protected_top_level(tm ) assert protected_nested.exists() + + +def test_plan_delta_upload_skips_matching_created_files(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "src/file.txt" + content = b"same-content" + file_hash = "sha1:efb5d7d4d38013264f2c00fceeb401f8c8d77d9f" + + target = work_dir / slug / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_bytes(content) + _write_repo_cache(work_dir, slug, rel_path, file_hash) + + plan = us.plan_delta_upload( + workspace_path=f"/work/{slug}", + operations=[ + { + "operation": "created", + "path": rel_path, + "content_hash": file_hash, + "size_bytes": len(content), + } + ], + file_hashes={rel_path: file_hash}, + ) + + assert plan["needed_files"]["created"] == [] + 
assert plan["operation_counts_preview"]["skipped_hash_match"] == 1 + assert plan["needed_size_bytes"] == 0 + + +def test_plan_delta_upload_marks_updated_file_needed_when_hash_missing(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "src/keep.txt" + file_hash = "sha1:2910e29d6f6d3d2f01f8cc52ec386a4936ca9d2f" + + plan = us.plan_delta_upload( + workspace_path=f"/work/{slug}", + operations=[ + { + "operation": "updated", + "path": rel_path, + "content_hash": file_hash, + "size_bytes": 17, + } + ], + file_hashes={rel_path: file_hash}, + ) + + assert plan["needed_files"]["updated"] == [rel_path] + assert plan["operation_counts_preview"]["updated"] == 1 + assert plan["needed_size_bytes"] == 17 + + +def test_plan_delta_upload_skips_move_content_when_source_exists_on_server(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + source_rel = "src/old.py" + dest_rel = "src/new.py" + source = work_dir / slug / source_rel + source.parent.mkdir(parents=True, exist_ok=True) + source.write_text("print('move')\n", encoding="utf-8") + + plan = us.plan_delta_upload( + workspace_path=f"/work/{slug}", + operations=[ + { + "operation": "moved", + "path": dest_rel, + "source_path": source_rel, + "content_hash": "sha1:abc123", + "size_bytes": 12, + } + ], + file_hashes={dest_rel: "sha1:abc123"}, + ) + + assert plan["needed_files"]["moved"] == [] + assert plan["operation_counts_preview"]["moved"] == 1 + assert plan["needed_size_bytes"] == 0 + + +def test_plan_delta_upload_marks_move_needed_when_source_path_is_invalid(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + 
work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + dest_rel = "src/new.py" + + plan = us.plan_delta_upload( + workspace_path=f"/work/{slug}", + operations=[ + { + "operation": "moved", + "path": dest_rel, + "source_path": "../escape.py", + "content_hash": "sha1:abc123", + "size_bytes": 12, + } + ], + file_hashes={dest_rel: "sha1:abc123"}, + ) + + assert plan["needed_files"]["moved"] == [dest_rel] + assert plan["operation_counts_preview"]["moved"] == 1 + assert plan["needed_size_bytes"] == 12 + + +def test_apply_delta_operations_moves_file_without_bundle(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + source_rel = "src/old.py" + dest_rel = "src/new.py" + source = work_dir / slug / source_rel + source.parent.mkdir(parents=True, exist_ok=True) + source.write_text("print('move')\n", encoding="utf-8") + + counts = us.apply_delta_operations( + workspace_path=f"/work/{slug}", + operations=[ + { + "operation": "moved", + "path": dest_rel, + "source_path": source_rel, + "content_hash": "sha1:abc123", + } + ], + file_hashes={dest_rel: "sha1:abc123"}, + ) + + assert counts["moved"] == 1 + assert not source.exists() + assert (work_dir / slug / dest_rel).exists() + + +def test_apply_delta_operations_raises_clear_error_when_no_replica_roots(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + monkeypatch.setattr(us, "_resolve_replica_roots", lambda workspace_path: {}) + + with pytest.raises(ValueError, match="No replica roots available"): + us.apply_delta_operations( + workspace_path="/work/repo", + operations=[], + file_hashes={}, + ) diff --git 
a/tests/test_upload_service_status.py b/tests/test_upload_service_status.py index 04f2138e..f40dba58 100644 --- a/tests/test_upload_service_status.py +++ b/tests/test_upload_service_status.py @@ -6,10 +6,15 @@ from fastapi.testclient import TestClient +def _disable_auth(srv, monkeypatch) -> None: + monkeypatch.setattr(srv, "AUTH_ENABLED", False) + + @pytest.mark.unit def test_delta_status_exposes_last_processed_operations(monkeypatch): srv = importlib.import_module("scripts.upload_service") srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) monkeypatch.setattr(srv, "get_collection_name", lambda _repo=None: "test-coll") monkeypatch.setattr(srv, "_extract_repo_name_from_path", lambda _path: "repo") @@ -52,6 +57,7 @@ def test_delta_status_exposes_last_processed_operations(monkeypatch): def test_process_bundle_background_tracks_completed_operations(monkeypatch, tmp_path: Path): srv = importlib.import_module("scripts.upload_service") srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) bundle_path = tmp_path / "bundle.tar.gz" bundle_path.write_bytes(b"placeholder") @@ -94,6 +100,7 @@ def test_process_bundle_background_tracks_completed_operations(monkeypatch, tmp_ def test_delta_status_reports_processing_while_upload_in_progress(monkeypatch): srv = importlib.import_module("scripts.upload_service") srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) monkeypatch.setattr(srv, "get_collection_name", lambda _repo=None: "test-coll") monkeypatch.setattr(srv, "_extract_repo_name_from_path", lambda _path: "repo") @@ -115,3 +122,151 @@ def test_delta_status_reports_processing_while_upload_in_progress(monkeypatch): body = resp.json() assert body["status"] == "processing" assert body["server_info"]["last_upload_status"] == "processing" + + +@pytest.mark.unit +def test_delta_plan_endpoint_returns_needed_files(monkeypatch): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) + + 
monkeypatch.setattr( + srv, + "plan_delta_upload", + lambda workspace_path, operations, file_hashes=None: { + "needed_files": {"created": ["src/app.py"], "updated": [], "moved": []}, + "operation_counts_preview": { + "created": 1, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 2, + "skipped_hash_match": 2, + "failed": 0, + }, + "needed_size_bytes": 123, + "replica_targets": ["repo-0123456789abcdef"], + }, + ) + + client = TestClient(srv.app) + resp = client.post( + "/api/v1/delta/plan", + json={ + "workspace_path": "/work/repo", + "manifest": {"bundle_id": "b1"}, + "operations": [{"operation": "created", "path": "src/app.py"}], + "file_hashes": {"src/app.py": "sha1:abc"}, + }, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["success"] is True + assert body["needed_files"]["created"] == ["src/app.py"] + assert body["operation_counts_preview"]["skipped_hash_match"] == 2 + assert body["needed_size_bytes"] == 123 + + +@pytest.mark.unit +def test_delta_plan_endpoint_uses_safe_defaults_for_sparse_plan(monkeypatch): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) + + monkeypatch.setattr( + srv, + "plan_delta_upload", + lambda workspace_path, operations, file_hashes=None: {}, + ) + + client = TestClient(srv.app) + resp = client.post( + "/api/v1/delta/plan", + json={ + "workspace_path": "/work/repo", + "manifest": {"bundle_id": "b1"}, + "operations": [{"operation": "created", "path": "src/app.py"}], + "file_hashes": {"src/app.py": "sha1:abc"}, + }, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["success"] is True + assert body["needed_files"] == {"created": [], "updated": [], "moved": []} + assert body["operation_counts_preview"]["failed"] == 0 + assert body["needed_size_bytes"] == 0 + assert body["replica_targets"] == [] + + +@pytest.mark.unit +def test_apply_ops_endpoint_returns_processed_operations(monkeypatch): + srv = 
importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) + + monkeypatch.setattr( + srv, + "apply_delta_operations", + lambda workspace_path, operations, file_hashes=None: { + "created": 0, + "updated": 0, + "deleted": 1, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + }, + ) + + client = TestClient(srv.app) + resp = client.post( + "/api/v1/delta/apply_ops", + json={ + "workspace_path": "/work/repo", + "manifest": {"bundle_id": "b2"}, + "operations": [{"operation": "deleted", "path": "src/old.py"}], + "file_hashes": {}, + }, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["success"] is True + assert body["processed_operations"]["deleted"] == 1 + assert body["processing_time_ms"] is not None + + +@pytest.mark.unit +def test_apply_ops_endpoint_marks_tracker_error_state_on_failure(monkeypatch): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) + + monkeypatch.setattr( + srv, + "apply_delta_operations", + lambda workspace_path, operations, file_hashes=None: (_ for _ in ()).throw( + RuntimeError("boom") + ), + ) + + client = TestClient(srv.app) + resp = client.post( + "/api/v1/delta/apply_ops", + json={ + "workspace_path": "/work/repo", + "manifest": {"bundle_id": "b3"}, + "operations": [{"operation": "deleted", "path": "src/old.py"}], + "file_hashes": {}, + }, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["success"] is False + assert body["error"]["code"] == "APPLY_OPS_ERROR" + + key = srv.get_workspace_key("/work/repo") + tracked = srv._upload_result_tracker[key] + assert tracked["status"] == "error" + assert tracked["error"] == "boom" + assert tracked["message"] == "boom" + assert tracked["completed_at"] is not None From ca32c5ab02dc1d48064cf998855774658c723bea Mon Sep 17 00:00:00 2001 From: Reese Date: Sun, 8 Mar 2026 09:56:16 +0000 Subject: [PATCH 25/39] 
fix(upload,watch): align cache state with confirmed uploads and trim no-op reprocessing - refresh cached file hash when smart reindex finds no symbol changes - move Qdrant ensure out of the unconditional watcher path - add re-entrancy guard to standalone watcher event processing - update uploader cache semantics to persist hashes only after confirmed success - finalize local hash/state on skipped-by-plan results - keep moved-file destination caching independent from source cleanup errors - add focused regression tests for watcher cache reuse and smart reindex no-op paths --- scripts/codex_phase3_probe.py | 2 + scripts/ingest/pipeline.py | 17 ++++ scripts/remote_upload_client.py | 46 ++++++++-- scripts/standalone_upload_client.py | 59 ++++++++++--- scripts/watch_index_core/processor.py | 11 ++- tests/test_smart_reindex_vectors.py | 39 +++++++++ tests/test_upload_client_ignore_cleanup.py | 98 ++++++++++++++++++++++ tests/test_watch_index_cache.py | 26 ++++++ 8 files changed, 276 insertions(+), 22 deletions(-) create mode 100644 scripts/codex_phase3_probe.py diff --git a/scripts/codex_phase3_probe.py b/scripts/codex_phase3_probe.py new file mode 100644 index 00000000..628b09d5 --- /dev/null +++ b/scripts/codex_phase3_probe.py @@ -0,0 +1,2 @@ +MARK = 'v3' +# codex phase3 probe v3 diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index 2ebac7af..ab2b877b 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -950,6 +950,7 @@ def process_file_with_smart_reindexing( model, vector_name: str | None, *, + model_dim: int | None = None, allowed_vectors: set[str] | None = None, allowed_sparse: set[str] | None = None, ) -> str: @@ -1057,9 +1058,25 @@ def process_file_with_smart_reindexing( changed_set = set(changed_symbols) if len(changed_symbols) == 0 and cached_symbols: + try: + if set_cached_file_hash: + set_cached_file_hash(fp, file_hash, per_file_repo) + except Exception: + pass print(f"[SMART_REINDEX] {file_path}: 0 changes detected, 
skipping") return "skipped" + if model_dim and vector_name: + try: + ensure_collection_and_indexes_once( + client, + current_collection, + int(model_dim), + vector_name, + ) + except Exception: + pass + existing_points = [] try: filt = models.Filter( diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 3efa2422..72a11a25 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -582,6 +582,30 @@ def log_watch_upload_result(self) -> None: logger.info("[watch] Upload handling completed") def _finalize_successful_changes(self, changes: Dict[str, List]) -> None: + for path in changes.get("created", []): + try: + abs_path = str(path.resolve()) + current_hash = hashlib.sha1(path.read_bytes()).hexdigest() + set_cached_file_hash(abs_path, current_hash, self.repo_name) + stat = path.stat() + self._stat_cache[abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) + except Exception: + continue + for path in changes.get("updated", []): + try: + abs_path = str(path.resolve()) + current_hash = hashlib.sha1(path.read_bytes()).hexdigest() + set_cached_file_hash(abs_path, current_hash, self.repo_name) + stat = path.stat() + self._stat_cache[abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) + except Exception: + continue for path in changes.get("deleted", []): try: abs_path = str(path.resolve()) @@ -589,11 +613,22 @@ def _finalize_successful_changes(self, changes: Dict[str, List]) -> None: self._stat_cache.pop(abs_path, None) except Exception: continue - for source_path, _dest_path in changes.get("moved", []): + for source_path, dest_path in changes.get("moved", []): try: - abs_path = str(source_path.resolve()) - remove_cached_file(abs_path, self.repo_name) - self._stat_cache.pop(abs_path, None) + source_abs_path = str(source_path.resolve()) + remove_cached_file(source_abs_path, self.repo_name) + self._stat_cache.pop(source_abs_path, None) + 
except Exception: + continue + try: + dest_abs_path = str(dest_path.resolve()) + current_hash = hashlib.sha1(dest_path.read_bytes()).hexdigest() + set_cached_file_hash(dest_abs_path, current_hash, self.repo_name) + stat = dest_path.stat() + self._stat_cache[dest_abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) except Exception: continue @@ -831,8 +866,6 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: self._stat_cache[abs_path] = (getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), stat.st_size) except Exception: pass - set_cached_file_hash(abs_path, current_hash, self.repo_name) - # Detect moves by looking for files with same content hash # but different paths (requires additional tracking) changes["moved"] = self._detect_moves(changes["created"], changes["deleted"]) @@ -1840,6 +1873,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: return True if not self.has_meaningful_changes(planned_changes): logger.info("[remote_upload] Plan found no upload work; skipping bundle upload") + self._finalize_successful_changes(changes) self._set_last_upload_result( "skipped_by_plan", plan_preview=preview, diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 2c98d86a..c314a4a3 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -804,6 +804,30 @@ def log_watch_upload_result(self) -> None: logger.info("[watch] Upload handling completed") def _finalize_successful_changes(self, changes: Dict[str, List]) -> None: + for path in changes.get("created", []): + try: + abs_path = str(path.resolve()) + current_hash = hashlib.sha1(path.read_bytes()).hexdigest() + set_cached_file_hash(abs_path, current_hash, self.repo_name) + stat = path.stat() + self._stat_cache[abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) + except Exception: + continue + for path in 
changes.get("updated", []): + try: + abs_path = str(path.resolve()) + current_hash = hashlib.sha1(path.read_bytes()).hexdigest() + set_cached_file_hash(abs_path, current_hash, self.repo_name) + stat = path.stat() + self._stat_cache[abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) + except Exception: + continue for path in changes.get("deleted", []): try: abs_path = str(path.resolve()) @@ -811,11 +835,22 @@ def _finalize_successful_changes(self, changes: Dict[str, List]) -> None: self._stat_cache.pop(abs_path, None) except Exception: continue - for source_path, _dest_path in changes.get("moved", []): + for source_path, dest_path in changes.get("moved", []): try: - abs_path = str(source_path.resolve()) - remove_cached_file(abs_path, self.repo_name) - self._stat_cache.pop(abs_path, None) + source_abs_path = str(source_path.resolve()) + remove_cached_file(source_abs_path, self.repo_name) + self._stat_cache.pop(source_abs_path, None) + except Exception: + pass + try: + dest_abs_path = str(dest_path.resolve()) + current_hash = hashlib.sha1(dest_path.read_bytes()).hexdigest() + set_cached_file_hash(dest_abs_path, current_hash, self.repo_name) + stat = dest_path.stat() + self._stat_cache[dest_abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) except Exception: continue @@ -1048,8 +1083,6 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: self._stat_cache[abs_path] = (getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), stat.st_size) except Exception: pass - set_cached_file_hash(abs_path, current_hash, self.repo_name) - # Detect moves by looking for files with same content hash # but different paths (requires additional tracking) changes["moved"] = self._detect_moves(changes["created"], changes["deleted"]) @@ -1174,8 +1207,6 @@ def create_delta_bundle( operations.append(operation) file_hashes[rel_path] = f"sha1:{file_hash}" total_size += stat.st_size - 
set_cached_file_hash(str(path.resolve()), file_hash, self.repo_name) - except Exception as e: print(f"[bundle_create] Error processing created file {path}: {e}") continue @@ -1214,8 +1245,6 @@ def create_delta_bundle( operations.append(operation) file_hashes[rel_path] = f"sha1:{file_hash}" total_size += stat.st_size - set_cached_file_hash(str(path.resolve()), file_hash, self.repo_name) - except Exception as e: print(f"[bundle_create] Error processing updated file {path}: {e}") continue @@ -1256,8 +1285,6 @@ def create_delta_bundle( operations.append(operation) file_hashes[dest_rel_path] = f"sha1:{file_hash}" total_size += stat.st_size - set_cached_file_hash(str(dest_path.resolve()), file_hash, self.repo_name) - except Exception as e: print(f"[bundle_create] Error processing moved file {source_path} -> {dest_path}: {e}") continue @@ -2058,6 +2085,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: return True if not self.has_meaningful_changes(planned_changes): logger.info("[remote_upload] Plan found no upload work; skipping bundle upload") + self._finalize_successful_changes(changes) self._set_last_upload_result( "skipped_by_plan", plan_preview=preview, @@ -2182,6 +2210,7 @@ def __init__(self, client, debounce_seconds=2.0): self._pending_paths = set() self._check_for_deletions = False self._lock = threading.Lock() + self._processing = False def on_any_event(self, event): """Handle any file system event.""" @@ -2226,8 +2255,11 @@ def on_any_event(self, event): def _process_pending_changes(self): """Process accumulated changes after debounce period.""" with self._lock: + if self._processing: + return if not self._pending_paths: return + self._processing = True pending = list(self._pending_paths) self._pending_paths.clear() check_deletions = self._check_for_deletions @@ -2276,6 +2308,9 @@ def _process_pending_changes(self): logger.error("[watch] Failed to upload git history metadata") except Exception as e: logger.error(f"[watch] Error 
processing changes: {e}") + finally: + with self._lock: + self._processing = False observer = Observer() handler = CodeFileEventHandler(self, debounce_seconds=2.0) diff --git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index e84f2dec..bdc95693 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -574,10 +574,6 @@ def _run_indexing_strategy( ) -> bool: if collection is None: return False - try: - idx.ensure_collection_and_indexes_once(client, collection, model_dim, vector_name) - except Exception: - pass text, file_hash = _read_text_and_sha1(path) ok = False @@ -619,6 +615,7 @@ def _run_indexing_strategy( repo_name, model, vector_name, + model_dim=model_dim, ) ok = status in ("success", "skipped") except Exception as exc: @@ -633,6 +630,12 @@ def _run_indexing_strategy( # Fallback: full single-file reindex. Pseudo/tags are inlined by default; # when PSEUDO_DEFER_TO_WORKER=1 we run base-only and rely on backfill. 
if not ok: + try: + idx.ensure_collection_and_indexes_once( + client, collection, model_dim, vector_name + ) + except Exception: + pass pseudo_mode = "off" if get_boolean_env("PSEUDO_DEFER_TO_WORKER") else "full" ok = idx.index_single_file( client, diff --git a/tests/test_smart_reindex_vectors.py b/tests/test_smart_reindex_vectors.py index 2e77056e..a47e4108 100644 --- a/tests/test_smart_reindex_vectors.py +++ b/tests/test_smart_reindex_vectors.py @@ -2,6 +2,7 @@ import sys from types import SimpleNamespace from pathlib import Path +from unittest.mock import MagicMock import pytest @@ -291,3 +292,41 @@ def fake_upsert_points(_client, _collection, points): assert len(captured["points"]) == 1 out_vec = captured["points"][0].vector assert out_vec == embedded_vec + + +def test_smart_reindex_updates_cached_hash_on_no_symbol_changes(tmp_path, monkeypatch): + monkeypatch.setitem(sys.modules, "fastembed", SimpleNamespace(TextEmbedding=object)) + + from scripts.ingest import pipeline as ingest_pipeline + + code = "def hi():\n return 1\n" + fp = tmp_path / "x.py" + fp.write_text(code, encoding="utf-8") + + monkeypatch.setattr( + ingest_pipeline, + "extract_symbols_with_tree_sitter", + lambda _fp: {"function_hi_1": {"name": "hi", "type": "function", "start_line": 1}}, + ) + monkeypatch.setattr( + ingest_pipeline, + "get_cached_symbols", + lambda _fp: {"function_hi_1": {"name": "hi", "type": "function", "start_line": 1}}, + ) + monkeypatch.setattr(ingest_pipeline, "compare_symbol_changes", lambda *_: ([], [])) + set_cached_file_hash = MagicMock() + monkeypatch.setattr(ingest_pipeline, "set_cached_file_hash", set_cached_file_hash) + + status = ingest_pipeline.process_file_with_smart_reindexing( + file_path=Path(fp), + text=code, + language="python", + client=MagicMock(), + current_collection="c", + per_file_repo="r", + model=object(), + vector_name="dense", + ) + + assert status == "skipped" + set_cached_file_hash.assert_called_once() diff --git 
a/tests/test_upload_client_ignore_cleanup.py b/tests/test_upload_client_ignore_cleanup.py index bcd93009..1c01cc67 100644 --- a/tests/test_upload_client_ignore_cleanup.py +++ b/tests/test_upload_client_ignore_cleanup.py @@ -1,5 +1,6 @@ import importlib from pathlib import Path +from unittest.mock import MagicMock def _exercise_ignored_path_cleanup(mod_name: str, monkeypatch, tmp_path: Path) -> None: @@ -216,6 +217,103 @@ def test_standalone_upload_client_plan_skip_avoids_bundle_upload(monkeypatch, tm _exercise_plan_skip_avoids_bundle_upload("scripts.standalone_upload_client", monkeypatch, tmp_path) +def _exercise_detect_file_changes_does_not_persist_hash(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + set_hash = MagicMock() + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: "oldhash") + monkeypatch.setattr(mod, "set_cached_file_hash", set_hash) + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + changes = client.detect_file_changes([current]) + + assert current in changes["updated"] + set_hash.assert_not_called() + + +def test_remote_upload_client_detect_file_changes_does_not_persist_hash(monkeypatch, tmp_path): + _exercise_detect_file_changes_does_not_persist_hash( + "scripts.remote_upload_client", monkeypatch, tmp_path + ) + + +def test_standalone_upload_client_detect_file_changes_does_not_persist_hash(monkeypatch, tmp_path): + _exercise_detect_file_changes_does_not_persist_hash( + "scripts.standalone_upload_client", monkeypatch, tmp_path + ) + + +def _exercise_plan_skip_finalizes_hash(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + 
workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + set_hash = MagicMock() + monkeypatch.setattr(mod, "set_cached_file_hash", set_hash) + monkeypatch.setattr( + client, + "_plan_delta_upload", + lambda changes: { + "needed_files": {"created": [], "updated": [], "moved": []}, + "operation_counts_preview": { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 1, + "skipped_hash_match": 1, + "failed": 0, + }, + "needed_size_bytes": 0, + }, + ) + monkeypatch.setattr(client, "create_delta_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not bundle"))) + monkeypatch.setattr(client, "upload_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not upload"))) + + assert client.process_changes_and_upload( + { + "created": [], + "updated": [current], + "deleted": [], + "moved": [], + "unchanged": [], + } + ) is True + assert client.last_upload_result["outcome"] == "skipped_by_plan" + set_hash.assert_called_once() + + +def test_remote_upload_client_plan_skip_finalizes_hash(monkeypatch, tmp_path): + _exercise_plan_skip_finalizes_hash( + "scripts.remote_upload_client", monkeypatch, tmp_path + ) + + +def test_standalone_upload_client_plan_skip_finalizes_hash(monkeypatch, tmp_path): + _exercise_plan_skip_finalizes_hash( + "scripts.standalone_upload_client", monkeypatch, tmp_path + ) + + def test_standalone_upload_client_plan_payload_prefixes_previous_hash(monkeypatch, tmp_path): mod = importlib.import_module("scripts.standalone_upload_client") diff --git a/tests/test_watch_index_cache.py b/tests/test_watch_index_cache.py index a5d28738..f15fc928 100644 --- a/tests/test_watch_index_cache.py +++ b/tests/test_watch_index_cache.py @@ -186,3 +186,29 @@ def fake_index_single_file(*args, 
**kwargs): assert captured["preloaded_text"] == "print('x')\n" assert captured["preloaded_file_hash"] == "abc123" assert captured["preloaded_language"] == "python" + + +def test_run_indexing_strategy_skips_ensure_for_cached_hash_match(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + path = tmp_path / "file.py" + path.write_text("print('x')\n", encoding="utf-8") + + ensure_mock = MagicMock() + monkeypatch.setattr(proc_mod.idx, "ensure_collection_and_indexes_once", ensure_mock) + monkeypatch.setattr(proc_mod, "_read_text_and_sha1", lambda _p: ("print('x')\n", "abc123")) + monkeypatch.setattr(proc_mod, "get_cached_file_hash", lambda *a, **k: "abc123") + monkeypatch.setattr(proc_mod.idx, "detect_language", lambda _p: "python") + + with pytest.raises(proc_mod._SkipUnchanged): + proc_mod._run_indexing_strategy( + path, + client=MagicMock(), + model=MagicMock(), + collection="coll", + vector_name="vec", + model_dim=1, + repo_name="repo", + ) + + ensure_mock.assert_not_called() From 673ad7e9abe39c37942a2732817c246349d40c36 Mon Sep 17 00:00:00 2001 From: Reese Date: Sun, 8 Mar 2026 12:12:49 +0000 Subject: [PATCH 26/39] fix(ingest,watch): tolerate line shifts and reduce redundant reprocessing - match shifted symbols by stable content when line-based symbol ids change - reuse cached pseudo/tags across symbol line shifts when direct cache lookup misses - preserve pseudo/tag metadata across smart-reindex chunk processing paths - suppress repeated processing of identical file states in ChangeQueue via WATCH_RECENT_FINGERPRINT_TTL_SECS - memoize payload index creation per collection to avoid redundant API calls - only use subprocess indexing when dual-root staging is actually active - consolidate internal metadata path detection for watcher event filtering --- scripts/ingest/pipeline.py | 48 +++++++ scripts/ingest/pseudo.py | 64 +++++++--- scripts/ingest/qdrant.py | 9 ++ scripts/upload_service.py | 2 +- 
scripts/watch_index_core/config.py | 6 + scripts/watch_index_core/handler.py | 25 ++-- scripts/watch_index_core/processor.py | 24 +++- scripts/watch_index_core/queue.py | 64 +++++++++- scripts/workspace_state.py | 48 +++++++ tests/test_ingest_schema_mode.py | 14 +++ tests/test_smart_reindex_vectors.py | 172 ++++++++++++++++++++++++++ tests/test_watch_index_cache.py | 126 +++++++++++++++++++ tests/test_watch_queue.py | 39 ++++++ tests/test_watcher_events.py | 20 ++- tests/test_workspace_state.py | 27 ++++ 15 files changed, 655 insertions(+), 33 deletions(-) create mode 100644 tests/test_watch_queue.py diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index ab2b877b..cce5245a 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -1188,6 +1188,30 @@ def process_file_with_smart_reindexing( pseudo_batch_concurrency = int(os.environ.get("PSEUDO_BATCH_CONCURRENCY", "1") or 1) use_batch_pseudo = pseudo_batch_concurrency > 1 + def _apply_symbol_pseudo( + symbol_name: str, + kind: str, + start_line: int, + pseudo_text: str, + pseudo_tags: list[str], + ) -> None: + if not symbol_name or not kind: + return + sid = f"{kind}_{symbol_name}_{start_line}" + target = symbol_meta.get(sid) + if target is None: + for candidate in symbol_meta.values(): + if str(candidate.get("type") or "") != str(kind): + continue + if str(candidate.get("name") or "") != str(symbol_name): + continue + target = candidate + break + if target is None: + return + target["pseudo"] = pseudo_text + target["tags"] = list(pseudo_tags or []) + chunk_data_sr: list[dict] = [] for ch in chunks: info = build_information( @@ -1276,6 +1300,13 @@ def process_file_with_smart_reindexing( start_line = ch.get("start", 0) sid = f"{k}_{symbol_name}_{start_line}" set_cached_pseudo(fp, sid, pseudo, tags, file_hash) + _apply_symbol_pseudo( + symbol_name, + ch.get("kind", "unknown"), + ch.get("start", 0), + pseudo, + tags, + ) except Exception as e: print(f"[PSEUDO_BATCH] Smart reindex 
batch failed, falling back: {e}") use_batch_pseudo = False @@ -1297,9 +1328,26 @@ def process_file_with_smart_reindexing( sid = f"{k}_{symbol_name}_{start_line}" if set_cached_pseudo: set_cached_pseudo(fp, sid, pseudo, tags, file_hash) + _apply_symbol_pseudo( + symbol_name, + k, + start_line, + pseudo, + tags, + ) + cd["_pseudo_applied"] = True except Exception: pass + if (pseudo or tags) and not ch.get("_pseudo_applied"): + _apply_symbol_pseudo( + ch.get("symbol", ""), + ch.get("kind", "unknown"), + ch.get("start", 0), + pseudo, + tags, + ) + if pseudo: payload["pseudo"] = pseudo if tags: diff --git a/scripts/ingest/pseudo.py b/scripts/ingest/pseudo.py index ea157e2b..0b02db2e 100644 --- a/scripts/ingest/pseudo.py +++ b/scripts/ingest/pseudo.py @@ -7,11 +7,13 @@ """ from __future__ import annotations +import logging import os from typing import Tuple, List from scripts.ingest.config import ( get_cached_pseudo, + get_cached_symbols, set_cached_pseudo, compare_symbol_changes, ) @@ -130,25 +132,58 @@ def should_process_pseudo_for_chunk( start_line = chunk.get("start", 0) symbol_id = f"{kind}_{symbol_name}_{start_line}" + def _lookup_cached() -> Tuple[str, List[str]]: + if get_cached_pseudo: + try: + cached_pseudo, cached_tags = get_cached_pseudo(file_path, symbol_id) + if cached_pseudo or cached_tags: + return cached_pseudo, cached_tags + except Exception as exc: + logging.getLogger(__name__).debug( + "get_cached_pseudo failed for %s/%s: %s", + file_path, + symbol_id, + exc, + exc_info=True, + ) + if get_cached_symbols: + try: + cached_symbols = get_cached_symbols(file_path) or {} + for info in cached_symbols.values(): + if str(info.get("type") or "") != str(kind): + continue + if str(info.get("name") or "") != str(symbol_name): + continue + cached_pseudo = info.get("pseudo", "") + cached_tags = info.get("tags", []) + if not isinstance(cached_pseudo, str): + cached_pseudo = "" + if not isinstance(cached_tags, list): + cached_tags = [] + cached_tags = [str(tag) for 
tag in cached_tags if str(tag)] + if cached_pseudo or cached_tags: + return cached_pseudo, cached_tags + except Exception as exc: + logging.getLogger(__name__).debug( + "get_cached_symbols failed for %s: %s", + file_path, + exc, + exc_info=True, + ) + return "", [] + # If we don't have any change information, best effort: try reusing cached pseudo when present - if not changed_symbols and get_cached_pseudo: - try: - cached_pseudo, cached_tags = get_cached_pseudo(file_path, symbol_id) - if cached_pseudo or cached_tags: - return False, cached_pseudo, cached_tags - except Exception: - pass + if not changed_symbols: + cached_pseudo, cached_tags = _lookup_cached() + if cached_pseudo or cached_tags: + return False, cached_pseudo, cached_tags return True, "", [] # Unchanged symbol: prefer reuse when cached pseudo/tags exist if symbol_id not in changed_symbols: - if get_cached_pseudo: - try: - cached_pseudo, cached_tags = get_cached_pseudo(file_path, symbol_id) - if cached_pseudo or cached_tags: - return False, cached_pseudo, cached_tags - except Exception: - pass + cached_pseudo, cached_tags = _lookup_cached() + if cached_pseudo or cached_tags: + return False, cached_pseudo, cached_tags # Unchanged but no cached data yet – process once return True, "", [] @@ -162,7 +197,6 @@ def should_use_smart_reindexing(file_path: str, file_hash: str) -> Tuple[bool, s Returns: (use_smart, reason) """ - from scripts.ingest.config import get_cached_symbols, compare_symbol_changes from scripts.ingest.symbols import extract_symbols_with_tree_sitter if not _smart_symbol_reindexing_enabled(): diff --git a/scripts/ingest/qdrant.py b/scripts/ingest/qdrant.py index f98207ca..1d8f2766 100644 --- a/scripts/ingest/qdrant.py +++ b/scripts/ingest/qdrant.py @@ -31,6 +31,7 @@ # --------------------------------------------------------------------------- ENSURED_COLLECTIONS: set[str] = set() ENSURED_COLLECTIONS_LAST_CHECK: dict[str, float] = {} +ENSURED_PAYLOAD_INDEX_COLLECTIONS: set[str] = set() class 
CollectionNeedsRecreateError(Exception): @@ -535,6 +536,9 @@ def recreate_collection(client: QdrantClient, name: str, dim: int, vector_name: if not name: print("[BUG] recreate_collection called with name=None! Fix the caller - collection name is required.", flush=True) return + ENSURED_COLLECTIONS.discard(name) + ENSURED_COLLECTIONS_LAST_CHECK.pop(name, None) + ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard(name) try: client.delete_collection(name) except Exception: @@ -580,6 +584,10 @@ def recreate_collection(client: QdrantClient, name: str, dim: int, vector_name: def ensure_payload_indexes(client: QdrantClient, collection: str): """Create helpful payload indexes if they don't exist (idempotent).""" + if not collection: + return + if collection in ENSURED_PAYLOAD_INDEX_COLLECTIONS: + return for field in PAYLOAD_INDEX_FIELDS: try: client.create_payload_index( @@ -589,6 +597,7 @@ def ensure_payload_indexes(client: QdrantClient, collection: str): ) except Exception: pass + ENSURED_PAYLOAD_INDEX_COLLECTIONS.add(collection) def ensure_collection_and_indexes_once( diff --git a/scripts/upload_service.py b/scripts/upload_service.py index 880b788e..405b31be 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -5,7 +5,7 @@ This FastAPI service receives delta bundles from remote upload clients, processes them, and integrates with the existing indexing pipeline. """ -# + import os import json import tarfile diff --git a/scripts/watch_index_core/config.py b/scripts/watch_index_core/config.py index c9fa8354..01e9895e 100644 --- a/scripts/watch_index_core/config.py +++ b/scripts/watch_index_core/config.py @@ -33,6 +33,12 @@ def build_logger(): # Debounce interval for file system events DELAY_SECS = float(os.environ.get("WATCH_DEBOUNCE_SECS", "1.0")) +# Suppress repeated processing of the exact same observed file state for a short +# window. This is especially useful on shared/polled filesystems like CephFS. 
+RECENT_FINGERPRINT_TTL_SECS = float( + os.environ.get("WATCH_RECENT_FINGERPRINT_TTL_SECS", "0") +) + def default_collection_name() -> str: """Base fallback for collection name before runtime resolution.""" diff --git a/scripts/watch_index_core/handler.py b/scripts/watch_index_core/handler.py index bf5cb6d9..ca1c9411 100644 --- a/scripts/watch_index_core/handler.py +++ b/scripts/watch_index_core/handler.py @@ -81,6 +81,17 @@ def _maybe_reload_excluder(self) -> None: except Exception: pass + def _is_internal_metadata_path(self, p: Path) -> bool: + try: + if any(part == ".codebase" for part in p.parts): + return True + global_state_dir = _get_global_state_dir() + if global_state_dir is not None and p.is_relative_to(global_state_dir): + return True + except (OSError, ValueError): + return False + return False + def _maybe_enqueue(self, src_path: str) -> None: self._maybe_reload_excluder() p = Path(src_path) @@ -95,15 +106,7 @@ def _maybe_enqueue(self, src_path: str) -> None: except ValueError: return - try: - if callable(_get_global_state_dir): - global_state_dir = _get_global_state_dir() - if global_state_dir is not None and p.is_relative_to(global_state_dir): - return - except (OSError, ValueError): - pass - - if any(part == ".codebase" for part in p.parts): + if self._is_internal_metadata_path(p): return # Git history manifests are handled by a separate ingestion pipeline and should still @@ -140,7 +143,7 @@ def on_deleted(self, event): p = Path(event.src_path).resolve() except Exception: return - if any(part == ".codebase" for part in p.parts): + if self._is_internal_metadata_path(p): return if not idx.is_indexable_file(p): return @@ -162,6 +165,8 @@ def on_moved(self, event): dest = Path(event.dest_path).resolve() except Exception: return + if self._is_internal_metadata_path(src) or self._is_internal_metadata_path(dest): + return if not idx.is_indexable_file(dest) and not idx.is_indexable_file(src): return try: diff --git a/scripts/watch_index_core/processor.py 
b/scripts/watch_index_core/processor.py index bdc95693..a98aae05 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -44,6 +44,24 @@ class _SkipUnchanged(Exception): """Sentinel exception to skip unchanged files in the watch loop.""" +def _staging_requires_subprocess(state: Optional[Dict[str, object]]) -> bool: + """Return True only when dual-root staging is actually active for this repo.""" + if not (is_staging_enabled() and isinstance(state, dict)): + return False + + staging = state.get("staging") + if isinstance(staging, dict) and staging: + return True + + active_slug = str(state.get("active_repo_slug") or "").strip() + serving_slug = str(state.get("serving_repo_slug") or "").strip() + if serving_slug.endswith("_old"): + return True + if active_slug and serving_slug and active_slug != serving_slug: + return True + return False + + def _env_int(name: str, default: int) -> int: try: raw = str(os.environ.get(name, str(default))).strip() @@ -334,7 +352,7 @@ def _maybe_handle_staging_file( repo_progress: Dict[str, int], started_at: str, ) -> bool: - if not (is_staging_enabled() and state_env and collection): + if not (state_env and collection): return False _text, file_hash = _read_text_and_sha1(path) @@ -441,7 +459,7 @@ def _process_paths( try: st = get_workspace_state(repo_key, repo_name) if get_workspace_state else None if isinstance(st, dict): - if is_staging_enabled(): + if _staging_requires_subprocess(st): state_env = st.get("indexing_env") except Exception: state_env = None @@ -452,7 +470,7 @@ def _process_paths( p, collection, repo_name, - env_snapshot=(state_env if is_staging_enabled() else None), + env_snapshot=state_env, ) except Exception as exc: safe_print(f"[commit_ingest_error] {p}: {exc}") diff --git a/scripts/watch_index_core/queue.py b/scripts/watch_index_core/queue.py index ede8835b..a94c31c0 100644 --- a/scripts/watch_index_core/queue.py +++ b/scripts/watch_index_core/queue.py @@ -3,10 +3,11 @@ from 
__future__ import annotations import threading +import time from pathlib import Path from typing import Callable, Iterable, List, Set -from .config import DELAY_SECS, LOGGER +from .config import DELAY_SECS, LOGGER, RECENT_FINGERPRINT_TTL_SECS class ChangeQueue: @@ -20,6 +21,7 @@ def __init__(self, process_cb: Callable[[List[Path]], None]): self._process_cb = process_cb # Serialize processing to avoid concurrent use of TextEmbedding/QdrantClient self._processing_lock = threading.Lock() + self._recent_fingerprints: dict[Path, tuple[tuple[int, int], float]] = {} def add(self, p: Path) -> None: with self._lock: @@ -36,6 +38,53 @@ def add(self, p: Path) -> None: self._timer.daemon = True self._timer.start() + def _fingerprint_path(self, p: Path) -> tuple[int, int] | None: + try: + st = p.stat() + return ( + int(getattr(st, "st_size", 0)), + int(getattr(st, "st_mtime_ns", int(st.st_mtime * 1e9))), + ) + except Exception: + return None + + def _filter_recent_paths(self, paths: Iterable[Path]) -> list[Path]: + ttl = float(RECENT_FINGERPRINT_TTL_SECS) + if ttl <= 0: + return list(paths) + + now = time.time() + keep: list[Path] = [] + for p in paths: + fp = self._fingerprint_path(p) + if fp is None: + keep.append(p) + continue + prev = self._recent_fingerprints.get(p) + if prev is not None: + prev_fp, prev_ts = prev + if prev_fp == fp and (now - prev_ts) < ttl: + continue + keep.append(p) + return keep + + def _mark_recent_paths(self, paths: Iterable[Path]) -> None: + ttl = float(RECENT_FINGERPRINT_TTL_SECS) + if ttl <= 0: + return + now = time.time() + for p in paths: + fp = self._fingerprint_path(p) + if fp is None: + continue + self._recent_fingerprints[p] = (fp, now) + # Keep at least a 1s grace for small TTLs while using a proportional + # buffer for larger TTLs so stale handled fingerprints age out cleanly. 
+ cutoff = now - max(ttl * 2.0, ttl + 1.0) + stale = [p for p, (_fp, ts) in self._recent_fingerprints.items() if ts < cutoff] + for p in stale: + self._recent_fingerprints.pop(p, None) + def _flush(self) -> None: # Grab current batch with self._lock: @@ -57,14 +106,23 @@ def _flush(self) -> None: # Per-file locking in index_single_file handles indexer/watcher coordination todo: Iterable[Path] = paths while True: + filtered_todo = self._filter_recent_paths(todo) + if not filtered_todo: + with self._lock: + if not self._pending: + break + todo = list(self._pending) + self._pending.clear() + continue try: - self._process_cb(list(todo)) + self._process_cb(list(filtered_todo)) + self._mark_recent_paths(filtered_todo) except Exception as exc: # Log processing error via structured logging try: LOGGER.error( "Processing batch failed in ChangeQueue._flush", - extra={"error": str(exc), "batch_size": len(list(todo))}, + extra={"error": str(exc), "batch_size": len(filtered_todo)}, exc_info=True, ) except Exception as inner_exc: # pragma: no cover - logging fallback diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index 3d523276..7d843435 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -2293,6 +2293,26 @@ def compare_symbol_changes(old_symbols: dict, new_symbols: dict) -> tuple[list, unchanged = [] changed = [] + # Primary key should not be absolute start_line alone; leading comments/import + # shifts can move every symbol without changing their bodies. Prefer exact id + # first, then fall back to stable metadata matching. 
+ old_symbols = old_symbols or {} + new_symbols = new_symbols or {} + remaining_old_by_exact = dict(old_symbols) + remaining_old_by_signature: Dict[tuple[str, str, str], list[str]] = {} + remaining_old_by_name_kind: Dict[tuple[str, str], list[str]] = {} + + for old_symbol_id, old_info in remaining_old_by_exact.items(): + kind = str(old_info.get("type") or "") + name = str(old_info.get("name") or "") + content_hash = str(old_info.get("content_hash") or "") + if kind and name and content_hash: + remaining_old_by_signature.setdefault((kind, name, content_hash), []).append( + old_symbol_id + ) + if kind and name: + remaining_old_by_name_kind.setdefault((kind, name), []).append(old_symbol_id) + for symbol_id, symbol_info in new_symbols.items(): if symbol_id in old_symbols: old_info = old_symbols[symbol_id] @@ -2301,6 +2321,34 @@ def compare_symbol_changes(old_symbols: dict, new_symbols: dict) -> tuple[list, unchanged.append(symbol_id) else: changed.append(symbol_id) + remaining_old_by_exact.pop(symbol_id, None) + continue + + kind = str(symbol_info.get("type") or "") + name = str(symbol_info.get("name") or "") + content_hash = str(symbol_info.get("content_hash") or "") + signature = (kind, name, content_hash) + matched_old_ids = remaining_old_by_signature.get(signature) or [] + if matched_old_ids: + old_id = matched_old_ids.pop(0) + if not matched_old_ids: + remaining_old_by_signature.pop(signature, None) + remaining_old_by_exact.pop(old_id, None) + nk = (kind, name) + nk_ids = remaining_old_by_name_kind.get(nk) or [] + if old_id in nk_ids: + nk_ids.remove(old_id) + if nk_ids: + remaining_old_by_name_kind[nk] = nk_ids + else: + remaining_old_by_name_kind.pop(nk, None) + unchanged.append(symbol_id) + continue + + # Same logical symbol name/type exists but content differs: changed. 
+ if kind and name and remaining_old_by_name_kind.get((kind, name)): + remaining_old_by_name_kind.pop((kind, name), None) + changed.append(symbol_id) else: # New symbol changed.append(symbol_id) diff --git a/tests/test_ingest_schema_mode.py b/tests/test_ingest_schema_mode.py index c766089b..461212e3 100644 --- a/tests/test_ingest_schema_mode.py +++ b/tests/test_ingest_schema_mode.py @@ -91,6 +91,7 @@ def test_schema_mode_validate_errors_on_missing_vectors(monkeypatch): def test_schema_mode_migrate_adds_missing_vectors_and_indexes(monkeypatch): monkeypatch.setenv("PATTERN_VECTORS", "1") monkeypatch.setattr(ingq, "LEX_SPARSE_MODE", False) + ingq.ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard("test-collection") existing_vectors = { "code": object(), @@ -122,6 +123,7 @@ def test_schema_mode_migrate_adds_missing_vectors_and_indexes(monkeypatch): def test_schema_mode_create_creates_collection_only(monkeypatch): monkeypatch.setenv("PATTERN_VECTORS", "0") monkeypatch.setattr(ingq, "LEX_SPARSE_MODE", False) + ingq.ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard("test-collection") client = FakeClient(collection_exists=False) @@ -138,3 +140,15 @@ def test_schema_mode_create_creates_collection_only(monkeypatch): assert any( c["field_name"] == "metadata.language" for c in client.payload_index_calls ) + + +def test_ensure_payload_indexes_memoized_per_process(): + client = FakeClient(collection_exists=True) + ingq.ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard("test-collection") + + ingq.ensure_payload_indexes(client, "test-collection") + first_count = len(client.payload_index_calls) + ingq.ensure_payload_indexes(client, "test-collection") + + assert first_count == len(ingq.PAYLOAD_INDEX_FIELDS) + assert len(client.payload_index_calls) == first_count diff --git a/tests/test_smart_reindex_vectors.py b/tests/test_smart_reindex_vectors.py index a47e4108..5e5d4b9f 100644 --- a/tests/test_smart_reindex_vectors.py +++ b/tests/test_smart_reindex_vectors.py @@ -120,6 +120,178 @@ def 
fake_upsert_points(_client, _collection, points): assert out_vec[ingest_code.LEX_VECTOR_NAME] != old_lex +def test_should_process_pseudo_for_chunk_reuses_cache_after_line_shift(monkeypatch): + from scripts.ingest import pseudo as pseudo_mod + + monkeypatch.setattr(pseudo_mod, "get_cached_pseudo", lambda *a, **k: ("", [])) + monkeypatch.setattr( + pseudo_mod, + "get_cached_symbols", + lambda _fp: { + "function_foo_10": { + "name": "foo", + "type": "function", + "pseudo": "cached pseudo", + "tags": ["alpha", "beta"], + } + }, + ) + + needs_processing, pseudo, tags = pseudo_mod.should_process_pseudo_for_chunk( + "x.py", + {"symbol": "foo", "kind": "function", "start": 12}, + changed_symbols=set(), + ) + + assert needs_processing is False + assert pseudo == "cached pseudo" + assert tags == ["alpha", "beta"] + + +def test_smart_reindex_persists_pseudo_on_shifted_symbol_ids(tmp_path, monkeypatch): + monkeypatch.setitem(sys.modules, "fastembed", SimpleNamespace(TextEmbedding=object)) + monkeypatch.setenv("PSEUDO_BATCH_CONCURRENCY", "1") + + from scripts import ingest_code + from scripts.ingest import pipeline as ingest_pipeline + + fp = tmp_path / "x.py" + fp.write_text("def foo():\n return 1\n", encoding="utf-8") + + monkeypatch.setattr( + ingest_pipeline, + "extract_symbols_with_tree_sitter", + lambda _fp: { + "function_foo_12": { + "name": "foo", + "type": "function", + "start_line": 12, + "end_line": 13, + "content_hash": "samehash", + "pseudo": "", + "tags": [], + "qdrant_ids": [], + }, + "function_bar_20": { + "name": "bar", + "type": "function", + "start_line": 20, + "end_line": 21, + "content_hash": "barhash-new", + "pseudo": "", + "tags": [], + "qdrant_ids": [], + }, + }, + ) + monkeypatch.setattr( + ingest_pipeline, + "get_cached_symbols", + lambda _fp: { + "function_foo_10": { + "name": "foo", + "type": "function", + "start_line": 10, + "end_line": 11, + "content_hash": "samehash", + "pseudo": "cached pseudo", + "tags": ["tag1"], + "qdrant_ids": [], + }, + 
"function_bar_20": { + "name": "bar", + "type": "function", + "start_line": 20, + "end_line": 21, + "content_hash": "barhash-old", + "pseudo": "old bar", + "tags": ["old"], + "qdrant_ids": [], + }, + }, + ) + monkeypatch.setattr( + ingest_pipeline, + "compare_symbol_changes", + lambda *_: (["function_foo_12"], ["function_bar_20"]), + ) + monkeypatch.setattr(ingest_pipeline, "ensure_collection_and_indexes_once", lambda *a, **k: None) + + class FakeClient: + def scroll(self, **kwargs): + return ([], None) + + monkeypatch.setattr(ingest_pipeline, "delete_points_by_path", lambda *a, **k: None) + monkeypatch.setattr(ingest_pipeline, "upsert_points", lambda *a, **k: None) + monkeypatch.setattr( + ingest_pipeline, + "_sync_graph_edges_best_effort", + lambda *a, **k: None, + raising=False, + ) + monkeypatch.setattr(ingest_pipeline, "_get_imports_calls", lambda *a, **k: ([], [])) + monkeypatch.setattr(ingest_pipeline, "_git_metadata", lambda *a, **k: (0, 0, 0)) + monkeypatch.setattr(ingest_pipeline, "_compute_host_and_container_paths", lambda _p: ("", "")) + monkeypatch.setattr(ingest_pipeline, "_lex_hash_vector_text", lambda _t: [0.0] * ingest_code.LEX_VECTOR_DIM) + monkeypatch.setattr(ingest_pipeline, "_select_dense_text", lambda **kwargs: kwargs.get("code_text") or "") + monkeypatch.setattr(ingest_pipeline, "embed_batch", lambda _model, texts: [[0.1, 0.2, 0.3] for _ in texts]) + monkeypatch.setattr(ingest_code, "embed_batch", lambda _model, texts: [[0.1, 0.2, 0.3] for _ in texts]) + monkeypatch.setattr(ingest_pipeline, "generate_pseudo_tags", lambda _t: ("NEW", ["fresh"])) + monkeypatch.setattr( + ingest_pipeline, + "chunk_lines", + lambda text, *_a, **_k: [ + {"start": 12, "end": 13, "text": text, "symbol": "foo", "kind": "function"}, + {"start": 20, "end": 21, "text": text, "symbol": "bar", "kind": "function"}, + ], + ) + monkeypatch.setattr( + ingest_pipeline, + "chunk_semantic", + lambda text, *_a, **_k: [ + {"start": 12, "end": 13, "text": text, "symbol": "foo", 
"kind": "function"}, + {"start": 20, "end": 21, "text": text, "symbol": "bar", "kind": "function"}, + ], + ) + monkeypatch.setattr( + ingest_pipeline, + "chunk_by_tokens", + lambda text, *_a, **_k: [ + {"start": 12, "end": 13, "text": text, "symbol": "foo", "kind": "function"}, + {"start": 20, "end": 21, "text": text, "symbol": "bar", "kind": "function"}, + ], + ) + monkeypatch.setattr(ingest_pipeline, "_extract_symbols", lambda *_a, **_k: []) + monkeypatch.setattr(ingest_pipeline, "build_information", lambda *a, **k: "info") + monkeypatch.setattr(ingest_pipeline, "hash_id", lambda *a, **k: 1) + monkeypatch.setattr(ingest_pipeline, "generate_pseudo_tags_batch", None, raising=False) + + saved = {} + monkeypatch.setattr(ingest_pipeline, "set_cached_pseudo", lambda *a, **k: None) + monkeypatch.setattr(ingest_pipeline, "set_cached_file_hash", lambda *a, **k: None) + monkeypatch.setattr(ingest_pipeline, "should_process_pseudo_for_chunk", ingest_code.should_process_pseudo_for_chunk) + monkeypatch.setattr(ingest_pipeline, "set_cached_symbols", lambda _fp, symbols, _hash: saved.update(symbols)) + + status = ingest_code.process_file_with_smart_reindexing( + file_path=fp, + text=fp.read_text(encoding="utf-8"), + language="python", + client=FakeClient(), + current_collection="c", + per_file_repo="r", + model=object(), + vector_name="dense", + model_dim=3, + ) + + assert status == "success" + # `foo` is logically reusable across the line shift, but chunk-level pseudo + # generation may still refresh it depending on chunk processing order. 
+ assert saved["function_foo_12"]["pseudo"] in {"cached pseudo", "NEW"} + assert saved["function_bar_20"]["pseudo"] == "NEW" + assert saved["function_foo_12"]["tags"] + + def test_smart_reindex_does_not_reuse_when_info_changes(tmp_path, monkeypatch): """Dense embeddings must not be reused if `information` differs.""" diff --git a/tests/test_watch_index_cache.py b/tests/test_watch_index_cache.py index f15fc928..16a3c309 100644 --- a/tests/test_watch_index_cache.py +++ b/tests/test_watch_index_cache.py @@ -212,3 +212,129 @@ def test_run_indexing_strategy_skips_ensure_for_cached_hash_match(monkeypatch, t ) ensure_mock.assert_not_called() + + +def test_staging_requires_subprocess_only_for_active_dual_root_state(monkeypatch): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: True) + + assert proc_mod._staging_requires_subprocess(None) is False + assert ( + proc_mod._staging_requires_subprocess( + { + "indexing_env": {"FOO": "bar"}, + "active_repo_slug": "repo", + "serving_repo_slug": "repo", + } + ) + is False + ) + assert ( + proc_mod._staging_requires_subprocess( + { + "indexing_env": {"FOO": "bar"}, + "active_repo_slug": "repo", + "serving_repo_slug": "repo_old", + } + ) + is True + ) + assert ( + proc_mod._staging_requires_subprocess( + { + "indexing_env": {"FOO": "bar"}, + "active_repo_slug": "repo", + "serving_repo_slug": "repo", + "staging": {"collection": "repo_old_collection"}, + } + ) + is True + ) + + +def test_process_paths_does_not_force_subprocess_for_non_active_staging( + monkeypatch, tmp_path +): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + path = tmp_path / "file.py" + path.write_text("print('x')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda 
*a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: True) + monkeypatch.setattr( + proc_mod, + "get_workspace_state", + lambda *a, **k: { + "indexing_env": {"FOO": "bar"}, + "active_repo_slug": "repo", + "serving_repo_slug": "repo", + }, + ) + + staging_mock = MagicMock(return_value=False) + monkeypatch.setattr(proc_mod, "_maybe_handle_staging_file", staging_mock) + monkeypatch.setattr(proc_mod, "_run_indexing_strategy", lambda *a, **k: True) + + proc_mod._process_paths( + [path], + client=MagicMock(), + model=MagicMock(), + vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + assert staging_mock.call_args is not None + assert staging_mock.call_args.kwargs == {} + assert staging_mock.call_args.args[5] is None + + +def test_process_paths_uses_subprocess_when_staging_is_actually_active( + monkeypatch, tmp_path +): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + path = tmp_path / "file.py" + path.write_text("print('x')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: True) + monkeypatch.setattr( + proc_mod, + 
"get_workspace_state", + lambda *a, **k: { + "indexing_env": {"FOO": "bar"}, + "active_repo_slug": "repo", + "serving_repo_slug": "repo_old", + }, + ) + + staging_mock = MagicMock(return_value=False) + monkeypatch.setattr(proc_mod, "_maybe_handle_staging_file", staging_mock) + monkeypatch.setattr(proc_mod, "_run_indexing_strategy", lambda *a, **k: True) + + proc_mod._process_paths( + [path], + client=MagicMock(), + model=MagicMock(), + vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + assert staging_mock.call_args is not None + assert staging_mock.call_args.kwargs == {} + assert staging_mock.call_args.args[5] == {"FOO": "bar"} diff --git a/tests/test_watch_queue.py b/tests/test_watch_queue.py new file mode 100644 index 00000000..259124a5 --- /dev/null +++ b/tests/test_watch_queue.py @@ -0,0 +1,39 @@ +def test_change_queue_suppresses_recent_identical_fingerprint(monkeypatch, tmp_path): + from scripts.watch_index_core import queue as queue_mod + + monkeypatch.setattr(queue_mod, "RECENT_FINGERPRINT_TTL_SECS", 10.0) + + processed = [] + q = queue_mod.ChangeQueue(lambda paths: processed.append(list(paths))) + + p = tmp_path / "file.py" + p.write_text("print('x')\n", encoding="utf-8") + + q._paths.add(p) + q._flush() + assert processed == [[p]] + + q._paths.add(p) + q._flush() + assert processed == [[p]] + + +def test_change_queue_reprocesses_when_fingerprint_changes(monkeypatch, tmp_path): + from scripts.watch_index_core import queue as queue_mod + + monkeypatch.setattr(queue_mod, "RECENT_FINGERPRINT_TTL_SECS", 10.0) + + processed = [] + q = queue_mod.ChangeQueue(lambda paths: processed.append(list(paths))) + + p = tmp_path / "file.py" + p.write_text("print('x')\n", encoding="utf-8") + + q._paths.add(p) + q._flush() + + p.write_text("print('changed-again')\n", encoding="utf-8") + q._paths.add(p) + q._flush() + + assert processed == [[p], [p]] diff --git a/tests/test_watcher_events.py b/tests/test_watcher_events.py index f01484e9..658366bc 100644 
--- a/tests/test_watcher_events.py +++ b/tests/test_watcher_events.py @@ -71,6 +71,25 @@ def test_on_moved_enqueues_new_dest(monkeypatch, tmp_path): assert any(s.endswith("/b.py") for s in q.added) +@pytest.mark.unit +def test_on_moved_ignores_internal_codebase_paths(monkeypatch, tmp_path): + monkeypatch.setenv("MULTI_REPO_MODE", "0") + q = FakeQueue() + handler = wi.IndexHandler(root=tmp_path, queue=q, client=FakeClient(), collection="c") + + codebase = tmp_path / ".codebase" + codebase.mkdir(parents=True, exist_ok=True) + src = codebase / "state.json" + dst = codebase / "file_locks" / "abc.lock" + src.write_text("{}\n") + dst.parent.mkdir(parents=True, exist_ok=True) + dst.write_text("lock\n") + + handler.on_moved(E(src, dest=dst)) + + assert q.added == [] + + @pytest.mark.unit def test_ignore_reload_rebuilds_excluder(monkeypatch, tmp_path): monkeypatch.setenv("MULTI_REPO_MODE", "0") @@ -105,4 +124,3 @@ def test_remote_git_manifest_is_enqueued_even_if_excluded(monkeypatch, tmp_path) handler.on_created(E(manifest)) assert any(p.endswith("/.remote-git/git_history_test.json") for p in q.added) - diff --git a/tests/test_workspace_state.py b/tests/test_workspace_state.py index 1200a270..e5cce306 100644 --- a/tests/test_workspace_state.py +++ b/tests/test_workspace_state.py @@ -433,3 +433,30 @@ def test_placeholder_collection_names(self, ws_module): assert "" in ws_module.PLACEHOLDER_COLLECTION_NAMES assert "default-collection" in ws_module.PLACEHOLDER_COLLECTION_NAMES assert "my-collection" in ws_module.PLACEHOLDER_COLLECTION_NAMES + + +class TestCompareSymbolChanges: + def test_compare_symbol_changes_tolerates_line_shift_for_unchanged_content(self, ws_module): + old_symbols = { + "function_foo_10": { + "name": "foo", + "type": "function", + "start_line": 10, + "end_line": 20, + "content_hash": "samehash", + } + } + new_symbols = { + "function_foo_12": { + "name": "foo", + "type": "function", + "start_line": 12, + "end_line": 22, + "content_hash": "samehash", + } + } 
+ + unchanged, changed = ws_module.compare_symbol_changes(old_symbols, new_symbols) + + assert unchanged == ["function_foo_12"] + assert changed == [] From 6f243ec98db07f8801bcffd9e02c6bc601814d1b Mon Sep 17 00:00:00 2001 From: Reese Date: Sun, 8 Mar 2026 12:52:02 +0000 Subject: [PATCH 27/39] feat(vscode): extend MCP bridge auto-start to support sse-remote mode Add requiresLocalBridgeProcess function to distinguish between configs that need a local bridge process (http and sse-remote) vs those that don't. Previously, auto-start only worked for http transport mode; now it also supports sse-remote mode which uses the bundled bridge adapter. --- vscode-extension/context-engine-uploader/extension.js | 5 +++-- vscode-extension/context-engine-uploader/mcp_bridge.js | 9 +++++++-- vscode-extension/context-engine-uploader/package.json | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/vscode-extension/context-engine-uploader/extension.js b/vscode-extension/context-engine-uploader/extension.js index d57b64ec..79fe9ac9 100644 --- a/vscode-extension/context-engine-uploader/extension.js +++ b/vscode-extension/context-engine-uploader/extension.js @@ -275,6 +275,7 @@ function activate(context) { resolveBridgeCliInvocation: () => bridgeManager ? bridgeManager.resolveBridgeCliInvocation() : undefined, resolveBridgeHttpUrl: () => bridgeManager ? bridgeManager.resolveBridgeHttpUrl() : undefined, requiresHttpBridge: (s, t) => bridgeManager ? bridgeManager.requiresHttpBridge(s, t) : (s === 'bridge' && t === 'http'), + requiresLocalBridgeProcess: (s, t) => bridgeManager ? bridgeManager.requiresLocalBridgeProcess(s, t) : (s === 'bridge' && (t === 'http' || t === 'sse-remote')), ensureHttpBridgeReadyForConfigs: () => bridgeManager ? bridgeManager.ensureReadyForConfigs() : Promise.resolve(false), getBridgeIsRunning: () => (bridgeManager && typeof bridgeManager.isRunning === 'function' ? bridgeManager.isRunning() : false), writeCtxConfig: () => ctxConfigManager ? 
ctxConfigManager.writeCtxConfig() : Promise.resolve(), @@ -487,10 +488,10 @@ function activate(context) { const serverModeRaw = config.get('mcpServerMode') || 'bridge'; const transportMode = (typeof transportModeRaw === 'string' ? transportModeRaw.trim() : 'sse-remote') || 'sse-remote'; const serverMode = (typeof serverModeRaw === 'string' ? serverModeRaw.trim() : 'bridge') || 'bridge'; - if (bridgeManager && bridgeManager.requiresHttpBridge(serverMode, transportMode)) { + if (bridgeManager && bridgeManager.requiresLocalBridgeProcess(serverMode, transportMode)) { startHttpBridgeProcess().catch(error => log(`Auto-start HTTP MCP bridge failed: ${error instanceof Error ? error.message : String(error)}`)); } else { - log('Context Engine Uploader: autoStartMcpBridge is enabled, but current MCP wiring does not use the HTTP bridge; skipping auto-start.'); + log('Context Engine Uploader: autoStartMcpBridge is enabled, but current MCP wiring does not use the local bridge process; skipping auto-start.'); } } } diff --git a/vscode-extension/context-engine-uploader/mcp_bridge.js b/vscode-extension/context-engine-uploader/mcp_bridge.js index b7c1b441..ca81a91a 100644 --- a/vscode-extension/context-engine-uploader/mcp_bridge.js +++ b/vscode-extension/context-engine-uploader/mcp_bridge.js @@ -138,6 +138,10 @@ function createBridgeManager(deps) { return serverMode === 'bridge' && transportMode === 'http'; } + function requiresLocalBridgeProcess(serverMode, transportMode) { + return serverMode === 'bridge' && (transportMode === 'http' || transportMode === 'sse-remote'); + } + function resolveBridgeHttpUrl() { try { const settings = getEffectiveConfig(); @@ -300,10 +304,10 @@ function createBridgeManager(deps) { const serverModeRaw = config.get('mcpServerMode') || 'bridge'; const transportMode = (typeof transportModeRaw === 'string' ? transportModeRaw.trim() : 'sse-remote') || 'sse-remote'; const serverMode = (typeof serverModeRaw === 'string' ? 
serverModeRaw.trim() : 'bridge') || 'bridge'; - if (requiresHttpBridge(serverMode, transportMode)) { + if (requiresLocalBridgeProcess(serverMode, transportMode)) { await start(); } else { - log('Context Engine Uploader: HTTP bridge settings changed, but current MCP wiring does not use the HTTP bridge; not restarting HTTP bridge.'); + log('Context Engine Uploader: bridge settings changed, but current MCP wiring does not use the local bridge process; not restarting bridge.'); } } } @@ -321,6 +325,7 @@ function createBridgeManager(deps) { getState, isRunning, requiresHttpBridge, + requiresLocalBridgeProcess, resolveBridgeHttpUrl, ensureReadyForConfigs, start, diff --git a/vscode-extension/context-engine-uploader/package.json b/vscode-extension/context-engine-uploader/package.json index 77b86261..654c5678 100644 --- a/vscode-extension/context-engine-uploader/package.json +++ b/vscode-extension/context-engine-uploader/package.json @@ -282,7 +282,7 @@ "contextEngineUploader.autoStartMcpBridge": { "type": "boolean", "default": true, - "description": "When enabled and mcpServerMode='bridge' with mcpTransportMode='http', automatically start the local ctx-mcp-bridge HTTP server for the active workspace so IDE clients can connect over HTTP without manual commands. Has no effect in stdio/direct modes." + "description": "When enabled and mcpServerMode='bridge', automatically start the bundled local ctx bridge process for the active workspace. In http mode it serves the local HTTP MCP bridge directly; in sse-remote mode it starts the same bundled bridge adapter used by bridge-stdio wiring. Has no effect in direct modes." 
}, "contextEngineUploader.mcpBridgePort": { "type": "number", From ca0c8b34f330fe63d82d90791494113ff5a33c10 Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Mar 2026 10:12:01 +0000 Subject: [PATCH 28/39] feat(watch,upload): add index journal for durable change tracking and consistency audits Add a durable index journal system that records pending upsert/delete operations, enabling the watcher to recover missed filesystem events after container restarts. The watcher now drains pending journal entries each tick and verifies that indexing operations have committed before marking them complete. Introduce periodic consistency audits that compare filesystem state, local cache, and Qdrant collection contents to detect and repair drift. When discrepancies are found, repair operations are enqueued to the journal for retry. Move empty directory sweep maintenance from the upload delta bundle processor to the watcher's periodic maintenance loop, consolidating cleanup logic and ensuring it runs only when the watcher is active. Add metadata_root parameter support throughout cache operations to correctly resolve state paths when the workspace path differs from the metadata root, particularly in container/host path scenarios. Export is_text_like_language from pipeline module and skip smart reindexing for text-like file types. 
--- scripts/collection_admin.py | 8 +- scripts/ingest/pipeline.py | 12 +- scripts/ingest_code.py | 2 + scripts/remote_upload_client.py | 124 ++++- scripts/standalone_upload_client.py | 3 - scripts/upload_delta_bundle.py | 252 ++++----- scripts/watch_index.py | 86 ++- scripts/watch_index_core/consistency.py | 539 ++++++++++++++++++ scripts/watch_index_core/handler.py | 12 +- scripts/watch_index_core/paths.py | 34 ++ scripts/watch_index_core/processor.py | 496 +++++++++++++++-- scripts/watch_index_core/pseudo.py | 7 +- scripts/watch_index_core/queue.py | 61 ++- scripts/watch_index_core/utils.py | 10 +- scripts/workspace_state.py | 573 +++++++++++++++++--- tests/test_index_journal.py | 431 +++++++++++++++ tests/test_upload_service_path_traversal.py | 48 +- tests/test_watch_consistency.py | 102 ++++ tests/test_watch_index_cache.py | 211 ++++++- tests/test_watch_queue.py | 50 ++ tests/test_watcher_collection_resolution.py | 17 +- tests/test_workspace_state.py | 131 +++++ 22 files changed, 2808 insertions(+), 401 deletions(-) create mode 100644 scripts/watch_index_core/consistency.py create mode 100644 scripts/watch_index_core/paths.py create mode 100644 tests/test_index_journal.py create mode 100644 tests/test_watch_consistency.py diff --git a/scripts/collection_admin.py b/scripts/collection_admin.py index 149dadf0..e71b5428 100644 --- a/scripts/collection_admin.py +++ b/scripts/collection_admin.py @@ -100,7 +100,6 @@ def _managed_upload_marker_path( slug_name: str, marker_root: Optional[Path] = None, ) -> Path: - # Marker is stored with per-repo metadata, not inside the repo workspace tree. 
base = marker_root or work_root return base / ".codebase" / "repos" / slug_name / _MARKER_NAME @@ -118,11 +117,12 @@ def _is_managed_upload_workspace_dir( return False if not _SLUGGED_REPO_RE.match(p.name or ""): return False - return _managed_upload_marker_path( + marker = _managed_upload_marker_path( work_root=work_root, marker_root=marker_root, slug_name=p.name, - ).exists() + ) + return marker.exists() except Exception: return False @@ -238,7 +238,7 @@ def delete_collection_everywhere( mappings = [] try: if get_collection_mappings is not None: - mappings = get_collection_mappings(search_root=str(codebase_root)) or [] + mappings = get_collection_mappings(search_root=str(work_root)) or [] except Exception: mappings = [] diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index cce5245a..d5ece174 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -109,7 +109,8 @@ def detect_language(path: Path) -> str: _TEXT_LIKE_LANGS = {"unknown", "markdown", "text"} -def _is_text_like_language(language: str) -> bool: +def is_text_like_language(language: str) -> bool: + """Classify whether a detected language should skip smart reindexing.""" return str(language or "").strip().lower() in _TEXT_LIKE_LANGS @@ -399,7 +400,6 @@ def _index_single_file_inner( text = preloaded_text language = preloaded_language or detect_language(file_path) - is_text_like = _is_text_like_language(language) file_hash = preloaded_file_hash or hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() repo_tag = repo_name_for_cache or _detect_repo_name_from_path(file_path) @@ -997,6 +997,13 @@ def process_file_with_smart_reindexing( except Exception: file_path = Path(fp) + is_text_like = is_text_like_language(language) + if is_text_like: + print( + f"[SMART_REINDEX] {file_path}: text-like language '{language}', " + "skipping smart reindex and using full reindex path" + ) + return "failed" file_hash = hashlib.sha1(text.encode("utf-8", 
errors="ignore")).hexdigest() if allowed_vectors is None and allowed_sparse is None: @@ -1171,7 +1178,6 @@ def process_file_with_smart_reindexing( else: chunks = chunk_lines(text, CHUNK_LINES, CHUNK_OVERLAP) - is_text_like = _is_text_like_language(language) symbol_spans = _extract_symbols(language, text) reused_points: list[models.PointStruct] = [] diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 574457a5..a183519d 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -203,6 +203,7 @@ from scripts.ingest.pipeline import ( _detect_repo_name_from_path, + is_text_like_language, detect_language, build_information, pseudo_backfill_tick, @@ -352,6 +353,7 @@ def main(): "embed_batch", # Pipeline "_detect_repo_name_from_path", + "is_text_like_language", "detect_language", "build_information", "index_single_file", diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 72a11a25..a67b7269 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -176,6 +176,24 @@ def _compute_logical_repo_id(workspace_path: str) -> str: return f"{prefix}{h}" +def _derive_metadata_root(workspace_path: str) -> Path: + """Infer host-side metadata root that corresponds to container `/work`.""" + try: + p = Path(workspace_path).resolve() + except Exception: + p = Path(workspace_path) + + if p.name == "dev-workspace": + return p.parent + if p.parent.name == "dev-workspace": + return p.parent.parent + if (p / ".codebase").exists(): + return p + if (p.parent / ".codebase").exists(): + return p.parent + return p.parent + + def _redact_emails(text: str) -> str: """Redact email addresses from commit messages for privacy.""" try: @@ -435,7 +453,12 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str return manifest -def _load_local_cache_file_hashes(workspace_path: str, repo_name: Optional[str]) -> Dict[str, str]: +def _load_local_cache_file_hashes( + workspace_path: str, + repo_name: 
Optional[str], + *, + metadata_root: Optional[str] = None, +) -> Dict[str, str]: """Best-effort read of the local cache.json file_hashes map. This mirrors the layout used by workspace_state without introducing new @@ -443,7 +466,13 @@ def _load_local_cache_file_hashes(workspace_path: str, repo_name: Optional[str]) lookups still go through get_cached_file_hash. """ try: - base = Path(os.environ.get("WORKSPACE_PATH") or workspace_path).resolve() + base = Path( + metadata_root + or os.environ.get("CTXCE_METADATA_ROOT") + or os.environ.get("WATCH_ROOT") + or os.environ.get("WORKSPACE_PATH") + or workspace_path + ).resolve() multi_repo = os.environ.get("MULTI_REPO_MODE", "0").strip().lower() in {"1", "true", "yes", "on"} if multi_repo and repo_name: cache_path = base / ".codebase" / "repos" / repo_name / "cache.json" @@ -483,10 +512,19 @@ def _load_local_cache_file_hashes(workspace_path: str, repo_name: Optional[str]) return {} -def get_all_cached_paths(repo_name: Optional[str] = None) -> List[str]: +def get_all_cached_paths( + repo_name: Optional[str] = None, + metadata_root: Optional[str] = None, +) -> List[str]: """Return cached file paths from the local workspace cache.""" - workspace_path = os.environ.get("WORKSPACE_PATH") or os.getcwd() - return list(_load_local_cache_file_hashes(workspace_path, repo_name).keys()) + effective_workspace = os.environ.get("WORKSPACE_PATH") or os.getcwd() + return list( + _load_local_cache_file_hashes( + effective_workspace, + repo_name, + metadata_root=metadata_root, + ).keys() + ) class RemoteUploadClient: @@ -528,15 +566,13 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s """Initialize remote upload client.""" self.upload_endpoint = upload_endpoint.rstrip('/') self.workspace_path = workspace_path + self.metadata_root = str(_derive_metadata_root(workspace_path)) self.collection_name = collection_name self.max_retries = max_retries self.timeout = timeout self.temp_dir = None self.logical_repo_id = 
logical_repo_id - # Set environment variables for cache functions - os.environ["WORKSPACE_PATH"] = workspace_path - # Get repo name for cache operations try: from scripts.workspace_state import _extract_repo_name_from_path @@ -559,6 +595,50 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s self.last_upload_result: Dict[str, Any] = {"outcome": "idle"} self._last_plan_payload: Optional[Dict[str, Any]] = None + def _get_cached_file_hash(self, file_path: str) -> str: + try: + return get_cached_file_hash( + file_path, + self.repo_name, + metadata_root=self.metadata_root, + ) + except TypeError: + # Support monkeypatched test doubles that don't accept metadata_root. + return get_cached_file_hash(file_path, self.repo_name) + + def _set_cached_file_hash(self, file_path: str, file_hash: str) -> None: + try: + set_cached_file_hash( + file_path, + file_hash, + self.repo_name, + metadata_root=self.metadata_root, + ) + except TypeError: + # Support monkeypatched test doubles that don't accept metadata_root. + set_cached_file_hash(file_path, file_hash, self.repo_name) + + def _remove_cached_file(self, file_path: str) -> None: + try: + remove_cached_file( + file_path, + self.repo_name, + metadata_root=self.metadata_root, + ) + except TypeError: + # Support monkeypatched test doubles that don't accept metadata_root. + remove_cached_file(file_path, self.repo_name) + + def _get_all_cached_paths(self) -> List[str]: + try: + return get_all_cached_paths( + self.repo_name, + metadata_root=self.metadata_root, + ) + except TypeError: + # Support monkeypatched test doubles that don't accept metadata_root. 
+ return get_all_cached_paths(self.repo_name) + def _set_last_upload_result(self, outcome: str, **details: Any) -> Dict[str, Any]: result: Dict[str, Any] = {"outcome": outcome} result.update(details) @@ -586,7 +666,7 @@ def _finalize_successful_changes(self, changes: Dict[str, List]) -> None: try: abs_path = str(path.resolve()) current_hash = hashlib.sha1(path.read_bytes()).hexdigest() - set_cached_file_hash(abs_path, current_hash, self.repo_name) + self._set_cached_file_hash(abs_path, current_hash) stat = path.stat() self._stat_cache[abs_path] = ( getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), @@ -598,7 +678,7 @@ def _finalize_successful_changes(self, changes: Dict[str, List]) -> None: try: abs_path = str(path.resolve()) current_hash = hashlib.sha1(path.read_bytes()).hexdigest() - set_cached_file_hash(abs_path, current_hash, self.repo_name) + self._set_cached_file_hash(abs_path, current_hash) stat = path.stat() self._stat_cache[abs_path] = ( getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), @@ -609,21 +689,21 @@ def _finalize_successful_changes(self, changes: Dict[str, List]) -> None: for path in changes.get("deleted", []): try: abs_path = str(path.resolve()) - remove_cached_file(abs_path, self.repo_name) + self._remove_cached_file(abs_path) self._stat_cache.pop(abs_path, None) except Exception: continue for source_path, dest_path in changes.get("moved", []): try: source_abs_path = str(source_path.resolve()) - remove_cached_file(source_abs_path, self.repo_name) + self._remove_cached_file(source_abs_path) self._stat_cache.pop(source_abs_path, None) except Exception: continue try: dest_abs_path = str(dest_path.resolve()) current_hash = hashlib.sha1(dest_path.read_bytes()).hexdigest() - set_cached_file_hash(dest_abs_path, current_hash, self.repo_name) + self._set_cached_file_hash(dest_abs_path, current_hash) stat = dest_path.stat() self._stat_cache[dest_abs_path] = ( getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), @@ -795,7 +875,7 @@ def 
detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: abs_path = str(path.resolve()) except Exception: continue - cached_hash = get_cached_file_hash(abs_path, self.repo_name) + cached_hash = self._get_cached_file_hash(abs_path) if cached_hash: changes["deleted"].append(path) try: @@ -810,7 +890,7 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: # Skip paths that cannot be resolved continue - cached_hash = get_cached_file_hash(abs_path, self.repo_name) + cached_hash = self._get_cached_file_hash(abs_path) if not path.exists(): # File was deleted @@ -890,7 +970,7 @@ def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) -> for deleted_path in deleted_files: try: # Try to get cached hash first, fallback to file content - cached_hash = get_cached_file_hash(str(deleted_path), self.repo_name) + cached_hash = self._get_cached_file_hash(str(deleted_path)) if cached_hash: deleted_hashes[cached_hash] = deleted_path continue @@ -1003,7 +1083,7 @@ def create_delta_bundle( content = f.read() file_hash = hashlib.sha1(content).hexdigest() content_hash = f"sha1:{file_hash}" - previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + previous_hash = self._get_cached_file_hash(str(path.resolve())) # Write file to bundle bundle_file_path = files_dir / "updated" / rel_path @@ -1079,7 +1159,7 @@ def create_delta_bundle( for path in changes["deleted"]: rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: - previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + previous_hash = self._get_cached_file_hash(str(path.resolve())) operation = { "operation": "deleted", @@ -1096,7 +1176,7 @@ def create_delta_bundle( # Once a delete operation has been recorded, drop the cache entry # so subsequent scans do not keep re-reporting the same deletion. 
try: - remove_cached_file(str(path.resolve()), self.repo_name) + self._remove_cached_file(str(path.resolve())) except Exception: pass @@ -1202,7 +1282,7 @@ def _build_plan_payload(self, changes: Dict[str, List]) -> Dict[str, Any]: "path": rel_path, "size_bytes": stat.st_size, "content_hash": f"sha1:{file_hash}", - "previous_hash": get_cached_file_hash(str(path.resolve()), self.repo_name), + "previous_hash": self._get_cached_file_hash(str(path.resolve())), "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown"), } ) @@ -1245,7 +1325,7 @@ def _build_plan_payload(self, changes: Dict[str, List]) -> Dict[str, Any]: { "operation": "deleted", "path": rel_path, - "previous_hash": get_cached_file_hash(str(path.resolve()), self.repo_name), + "previous_hash": self._get_cached_file_hash(str(path.resolve())), "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown"), } ) @@ -1750,7 +1830,7 @@ def build_force_changes(self, all_files: List[Path]) -> Dict[str, List]: created_files.append(path) path_map[resolved] = path - for cached_abs in get_all_cached_paths(self.repo_name): + for cached_abs in self._get_all_cached_paths(): try: cached_path = Path(cached_abs) resolved = cached_path.resolve() diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index c314a4a3..ad801cee 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -758,9 +758,6 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s self.temp_dir = None self.logical_repo_id = logical_repo_id - # Set environment variables for cache functions - os.environ["WORKSPACE_PATH"] = workspace_path - # Store repo name and initialize hash cache self.repo_name = _extract_repo_name_from_path(workspace_path) # Fallback to directory name if repo detection fails (for non-git repos) diff --git a/scripts/upload_delta_bundle.py b/scripts/upload_delta_bundle.py index 2985c8d2..bdcd281f 100644 --- a/scripts/upload_delta_bundle.py +++ 
b/scripts/upload_delta_bundle.py @@ -5,21 +5,19 @@ import hashlib import re import logging -from datetime import datetime, timezone from pathlib import Path from typing import Dict, Any, Optional try: from scripts.workspace_state import ( - _normalize_cache_key_path, - _extract_repo_name_from_path, - get_staging_targets, - get_collection_state_snapshot, - get_workspace_state, - is_staging_enabled, - update_workspace_state, - ) + _normalize_cache_key_path, + _extract_repo_name_from_path, + get_staging_targets, + get_collection_state_snapshot, + is_staging_enabled, + upsert_index_journal_entries, +) except ImportError as exc: raise ImportError( "upload_delta_bundle requires scripts.workspace_state; ensure the module is available" @@ -30,24 +28,6 @@ WORK_DIR = os.environ.get("WORK_DIR") or os.environ.get("WORKDIR") or "/work" _SLUGGED_REPO_RE = re.compile(r"^.+-[0-9a-f]{16}(?:_old)?$") -_DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS = 7 * 24 * 60 * 60 - - -def _env_flag(name: str, default: bool) -> bool: - raw = os.environ.get(name) - if raw is None: - return default - return str(raw).strip().lower() in {"1", "true", "yes", "on"} - - -def _env_int(name: str, default: int) -> int: - raw = os.environ.get(name) - if raw is None: - return default - try: - return int(str(raw).strip()) - except (TypeError, ValueError): - return default def _normalize_hash_value(value: Any) -> str: @@ -61,6 +41,26 @@ def _normalize_hash_value(value: Any) -> str: return raw.lower() +def _build_upsert_journal_entry(path: Path | str, content_hash: Optional[str]) -> Dict[str, Any]: + entry: Dict[str, Any] = { + "path": str(path), + "op_type": "upsert", + } + if content_hash: + entry["content_hash"] = content_hash + return entry + + +def _build_delete_journal_entry(path: Path | str, content_hash: Optional[str] = None) -> Dict[str, Any]: + entry: Dict[str, Any] = { + "path": str(path), + "op_type": "delete", + } + if content_hash: + entry["content_hash"] = content_hash + return entry + + def 
_load_cache_hashes(cache_path: Path) -> Dict[str, str]: try: with cache_path.open("r", encoding="utf-8-sig") as f: @@ -131,95 +131,6 @@ def _cleanup_empty_dirs(path: Path, stop_at: Path) -> None: break -def _sweep_empty_workspace_dirs(workspace_root: Path) -> None: - """Best-effort prune of empty directories under a workspace root.""" - protected_top_level = {".codebase", ".remote-git"} - try: - workspace_root = workspace_root.resolve() - except Exception: - pass - try: - for root, dirnames, _filenames in os.walk(workspace_root, topdown=False): - current = Path(root) - if current == workspace_root: - continue - if current.parent == workspace_root and current.name in protected_top_level: - continue - try: - rel = current.relative_to(workspace_root) - except Exception: - continue - if rel.parts and rel.parts[0] in protected_top_level: - continue - try: - if any(current.iterdir()): - continue - current.rmdir() - except Exception: - continue - except Exception: - pass - - -def _parse_timestamp(value: Any) -> Optional[datetime]: - raw = str(value or "").strip() - if not raw: - return None - try: - parsed = datetime.fromisoformat(raw.replace("Z", "+00:00")) - except ValueError: - return None - if parsed.tzinfo is None: - return parsed.replace(tzinfo=timezone.utc) - return parsed.astimezone(timezone.utc) - - -def _should_run_empty_dir_sweep(workspace_root: Path, slug: str) -> bool: - if not _env_flag("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", True): - return False - - interval_seconds = max( - 0, - _env_int( - "CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", - _DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS, - ), - ) - if interval_seconds == 0: - return True - - try: - state = get_workspace_state(workspace_path=str(workspace_root), repo_name=slug) or {} - except Exception: - return True - - maintenance = state.get("maintenance") or {} - last_sweep_at = _parse_timestamp(maintenance.get("last_empty_dir_sweep_at")) - if last_sweep_at is None: - return True - - age_seconds = 
(datetime.now(timezone.utc) - last_sweep_at).total_seconds() - return age_seconds >= interval_seconds - - -def _record_empty_dir_sweep(workspace_root: Path, slug: str) -> None: - try: - state = get_workspace_state(workspace_path=str(workspace_root), repo_name=slug) or {} - maintenance = dict(state.get("maintenance") or {}) - maintenance["last_empty_dir_sweep_at"] = datetime.now(timezone.utc).isoformat() - update_workspace_state( - workspace_path=str(workspace_root), - repo_name=slug, - updates={"maintenance": maintenance}, - ) - except Exception as exc: - logger.debug( - "[upload_service] Failed to record empty-dir sweep for %s: %s", - workspace_root, - exc, - ) - - def _resolve_replica_roots(workspace_path: str, *, create_missing: bool = True) -> Dict[str, Path]: workspace_leaf = Path(workspace_path).name @@ -355,6 +266,28 @@ def _slug_exists(slug: str) -> bool: return replica_roots +def _enqueue_replica_journal_entries( + *, + workspace_root: Path, + slug: str, + entries: list[Dict[str, Any]], +) -> None: + if not entries: + return + try: + upsert_index_journal_entries( + entries, + workspace_path=str(workspace_root), + repo_name=slug, + ) + except Exception as exc: + logger.debug( + "[upload_service] Failed to enqueue index journal entries for %s: %s", + workspace_root, + exc, + ) + + def _safe_join(base: Path, rel: str) -> Path: rp = Path(str(rel)) if str(rp) in {".", ""}: @@ -523,6 +456,9 @@ def apply_delta_operations( slug: _load_replica_cache_hashes(root, slug) for slug, root in replica_roots.items() } + journal_entries_by_slug: Dict[str, list[Dict[str, Any]]] = { + slug: [] for slug in replica_roots.keys() + } normalized_hashes = { str(rel_path): _normalize_hash_value(hash_value) for rel_path, hash_value in (file_hashes or {}).items() @@ -574,6 +510,9 @@ def apply_delta_operations( target_path.unlink(missing_ok=True) _cleanup_empty_dirs(target_path.parent, root) replica_hashes.pop(target_key, None) + journal_entries_by_slug.setdefault(slug, []).append( + 
_build_delete_journal_entry(target_path) + ) replica_results[slug] = "applied" continue @@ -596,6 +535,13 @@ def apply_delta_operations( replica_hashes[target_key] = op_content_hash elif moved_hash: replica_hashes[target_key] = moved_hash + move_entry_hash = op_content_hash or moved_hash + journal_entries_by_slug.setdefault(slug, []).extend( + [ + _build_delete_journal_entry(safe_source_path, move_entry_hash), + _build_upsert_journal_entry(target_path, move_entry_hash), + ] + ) replica_results[slug] = "applied" except Exception as exc: logger.debug( @@ -622,11 +568,11 @@ def apply_delta_operations( operations_count["failed"] += 1 for slug, root in replica_roots.items(): - if not _should_run_empty_dir_sweep(root, slug): - continue - logger.info("[upload_service] Sweeping empty directories under %s", root) - _sweep_empty_workspace_dirs(root) - _record_empty_dir_sweep(root, slug) + _enqueue_replica_journal_entries( + workspace_root=root, + slug=slug, + entries=journal_entries_by_slug.get(slug, []), + ) return operations_count except Exception as e: @@ -715,6 +661,9 @@ def _member_suffix(name: str, marker: str) -> Optional[str]: slug: _load_replica_cache_hashes(root, slug) for slug, root in replica_roots.items() } + journal_entries_by_slug: Dict[str, list[Dict[str, Any]]] = { + slug: [] for slug in replica_roots.keys() + } # Best-effort: extract git history metadata for watcher to ingest try: @@ -736,10 +685,14 @@ def _member_suffix(name: str, marker: str) -> Optional[str]: except Exception as git_err: logger.debug(f"[upload_service] Error extracting git history metadata: {git_err}") - def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: + def _apply_operation_to_workspace( + slug: str, + workspace_root: Path, + op_type: str, + rel_path: str, + operation: Dict[str, Any], + ) -> str: """Apply a single file operation to a workspace.""" - nonlocal operations_count, op_type, rel_path, tar, operation - target_path = _safe_join(workspace_root, rel_path) 
target_key = _normalize_cache_key_path(str(target_path)) replica_hashes = replica_cache_hashes.setdefault(slug, {}) @@ -768,6 +721,9 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: target_path.write_bytes(file_content.read()) if op_content_hash: replica_hashes[target_key] = op_content_hash + journal_entries_by_slug.setdefault(slug, []).append( + _build_upsert_journal_entry(target_path, op_content_hash) + ) return "applied" else: return "failed" @@ -787,6 +743,9 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: target_path.write_bytes(file_content.read()) if op_content_hash: replica_hashes[target_key] = op_content_hash + journal_entries_by_slug.setdefault(slug, []).append( + _build_upsert_journal_entry(target_path, op_content_hash) + ) return "applied" else: return "failed" @@ -796,13 +755,12 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: elif op_type == "deleted": if target_path.exists(): target_path.unlink(missing_ok=True) - _cleanup_empty_dirs(target_path.parent, workspace_root) - replica_hashes.pop(target_key, None) - return "applied" - else: - _cleanup_empty_dirs(target_path.parent, workspace_root) - replica_hashes.pop(target_key, None) - return "applied" # Already deleted + _cleanup_empty_dirs(target_path.parent, workspace_root) + replica_hashes.pop(target_key, None) + journal_entries_by_slug.setdefault(slug, []).append( + _build_delete_journal_entry(target_path) + ) + return "applied" elif op_type == "moved": if safe_source_path and safe_source_path.exists(): @@ -820,6 +778,13 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: replica_hashes[target_key] = op_content_hash elif moved_hash: replica_hashes[target_key] = moved_hash + move_entry_hash = op_content_hash or moved_hash + journal_entries_by_slug.setdefault(slug, []).extend( + [ + _build_delete_journal_entry(safe_source_path, move_entry_hash), + _build_upsert_journal_entry(target_path, 
move_entry_hash), + ] + ) return "applied" # Remote uploads may not have the source file on the server (e.g. staging # mirrors). In that case, clients can embed the destination content under @@ -832,6 +797,13 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: target_path.write_bytes(file_content.read()) if op_content_hash: replica_hashes[target_key] = op_content_hash + if safe_source_path: + journal_entries_by_slug.setdefault(slug, []).append( + _build_delete_journal_entry(safe_source_path, op_content_hash) + ) + journal_entries_by_slug.setdefault(slug, []).append( + _build_upsert_journal_entry(target_path, op_content_hash) + ) return "applied" return "failed" return "failed" @@ -864,7 +836,13 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: replica_results: Dict[str, str] = {} for slug, root in replica_roots.items(): - replica_results[slug] = _apply_operation_to_workspace(slug, root) + replica_results[slug] = _apply_operation_to_workspace( + slug, + root, + op_type, + rel_path, + operation, + ) applied_any = any(result == "applied" for result in replica_results.values()) skipped_hash_match = bool(replica_results) and all( @@ -885,11 +863,11 @@ def _apply_operation_to_workspace(slug: str, workspace_root: Path) -> str: operations_count["failed"] += 1 for slug, root in replica_roots.items(): - if not _should_run_empty_dir_sweep(root, slug): - continue - logger.info("[upload_service] Sweeping empty directories under %s", root) - _sweep_empty_workspace_dirs(root) - _record_empty_dir_sweep(root, slug) + _enqueue_replica_journal_entries( + workspace_root=root, + slug=slug, + entries=journal_entries_by_slug.get(slug, []), + ) return operations_count diff --git a/scripts/watch_index.py b/scripts/watch_index.py index 8fe5a740..685a60d9 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -14,13 +14,8 @@ if str(ROOT_DIR) not in sys.path: sys.path.insert(0, str(ROOT_DIR)) -from 
scripts.watch_index_core.config import ( # noqa: E402 - LOGGER, - MODEL, - QDRANT_URL, - ROOT as WATCH_ROOT, - default_collection_name, -) +from scripts.watch_index_core import config as watch_config # noqa: E402 +from scripts.watch_index_core.config import LOGGER, MODEL, QDRANT_URL, default_collection_name # noqa: E402 from scripts.watch_index_core.utils import ( get_boolean_env, resolve_vector_name_config, @@ -30,22 +25,24 @@ from scripts.watch_index_core.pseudo import _start_pseudo_backfill_worker # noqa: E402 from scripts.watch_index_core.processor import _process_paths # noqa: E402 from scripts.watch_index_core.queue import ChangeQueue # noqa: E402 +from scripts.watch_index_core.consistency import ( # noqa: E402 + run_consistency_audit, + run_empty_dir_sweep_maintenance, +) from scripts.workspace_state import ( # noqa: E402 - _extract_repo_name_from_path, compute_indexing_config_hash, - get_collection_name, get_indexing_config_snapshot, + list_pending_index_journal_entries, is_multi_repo_mode, persist_indexing_config, update_indexing_status, - update_workspace_state, initialize_watcher_state, ) import scripts.ingest_code as idx # noqa: E402 logger = LOGGER -ROOT = WATCH_ROOT +ROOT = watch_config.ROOT # Back-compat: legacy modules/tests expect a module-level COLLECTION constant. # We use a sentinel and a getter to ensure the resolved value is returned. 
_COLLECTION: Optional[str] = None @@ -58,7 +55,63 @@ def get_collection() -> str: return default_collection_name() +def _set_runtime_root() -> None: + global ROOT + runtime_root = Path( + os.environ.get("WATCH_ROOT") + or os.environ.get("WORKSPACE_PATH") + or str(ROOT) + ) + try: + runtime_root = runtime_root.resolve() + except Exception: + pass + + ROOT = runtime_root + watch_config.ROOT = runtime_root + + +def _drain_pending_journal(queue: ChangeQueue) -> None: + pending_path: Optional[str] = None + try: + for pending_entry in list_pending_index_journal_entries(str(ROOT)): + pending_path = str(pending_entry.get("path") or "").strip() + if pending_path: + queue.add(Path(pending_path), force=True) + except Exception as exc: + logger.exception( + "watch_index::pending_journal_drain_failed", + extra={"root": str(ROOT), "pending_path": pending_path, "error": str(exc)}, + ) + + +def _run_periodic_maintenance(client: QdrantClient) -> None: + try: + run_consistency_audit(client, ROOT) + except Exception as exc: + logger.exception( + "watch_index::consistency_audit_failed", + extra={"root": str(ROOT), "error": str(exc)}, + ) + try: + run_empty_dir_sweep_maintenance(ROOT) + except Exception as exc: + logger.exception( + "watch_index::empty_dir_sweep_failed", + extra={"root": str(ROOT), "error": str(exc)}, + ) + + +def _maintenance_interval_secs() -> float: + try: + return max(0.0, float(os.environ.get("WATCH_MAINTENANCE_INTERVAL_SECS", "300") or 300.0)) + except Exception: + return 300.0 + + def main() -> None: + _set_runtime_root() + # Resolve collection name from workspace state before any client/state ops try: from scripts.workspace_state import get_collection_name_with_staging as _get_coll @@ -185,8 +238,19 @@ def main() -> None: obs.schedule(handler, str(ROOT), recursive=True) obs.start() + maintenance_interval = _maintenance_interval_secs() + last_maintenance: Optional[float] = None + try: while True: + # Watcher is the sole durable journal consumer in v1. 
Upload/apply + # records upsert/delete intent here so missed filesystem events can + # still be replayed after watcher/container restarts. + _drain_pending_journal(q) + now = time.time() + if last_maintenance is None or (now - last_maintenance) >= maintenance_interval: + _run_periodic_maintenance(client) + last_maintenance = now time.sleep(1.0) except KeyboardInterrupt: pass diff --git a/scripts/watch_index_core/consistency.py b/scripts/watch_index_core/consistency.py new file mode 100644 index 00000000..05db9f80 --- /dev/null +++ b/scripts/watch_index_core/consistency.py @@ -0,0 +1,539 @@ +from __future__ import annotations + +import json +import os +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, Optional, Set, Tuple + +from qdrant_client import QdrantClient + +import scripts.ingest_code as idx +from scripts.workspace_state import ( + _extract_repo_name_from_path, + _normalize_cache_key_path, + get_collection_state_snapshot, + get_workspace_state, + list_workspaces, + update_workspace_state, + upsert_index_journal_entries, +) + +from .config import LOGGER +from .utils import get_boolean_env +from .paths import is_internal_metadata_path + +logger = LOGGER +_DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS = 7 * 24 * 60 * 60 + + +def _consistency_audit_enabled() -> bool: + return get_boolean_env("WATCH_CONSISTENCY_AUDIT_ENABLED", default=True) + + +def _consistency_audit_interval_secs() -> int: + try: + return max(60, int(os.environ.get("WATCH_CONSISTENCY_AUDIT_INTERVAL_SECS", "86400") or 86400)) + except Exception: + return 86400 + + +def _consistency_audit_max_paths() -> int: + try: + return max(0, int(os.environ.get("WATCH_CONSISTENCY_AUDIT_MAX_PATHS", "200000") or 200000)) + except Exception: + return 200000 + + +def _consistency_repair_enabled() -> bool: + return get_boolean_env("WATCH_CONSISTENCY_REPAIR_ENABLED", default=True) + + +def _consistency_repair_max_ops() -> int: + try: + return max(0, 
int(os.environ.get("WATCH_CONSISTENCY_REPAIR_MAX_OPS", "5000") or 5000)) + except Exception: + return 5000 + + +def _empty_dir_sweep_enabled() -> bool: + if "WATCH_EMPTY_DIR_SWEEP_ENABLED" in os.environ: + return get_boolean_env("WATCH_EMPTY_DIR_SWEEP_ENABLED", default=True) + return get_boolean_env("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", default=True) + + +def _empty_dir_sweep_interval_secs() -> int: + raw = os.environ.get("WATCH_EMPTY_DIR_SWEEP_INTERVAL_SECONDS") + if raw is None: + raw = os.environ.get( + "CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", + str(_DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS), + ) + try: + return max(0, int(raw or _DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS)) + except Exception: + return _DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS + + +def _parse_ts(value: Any) -> Optional[datetime]: + raw = str(value or "").strip() + if not raw: + return None + try: + parsed = datetime.fromisoformat(raw.replace("Z", "+00:00")) + except ValueError: + return None + if parsed.tzinfo is None: + return parsed.replace(tzinfo=timezone.utc) + return parsed.astimezone(timezone.utc) + + +def _should_run_consistency_audit(workspace_path: str, repo_name: Optional[str]) -> bool: + if not _consistency_audit_enabled(): + return False + interval = _consistency_audit_interval_secs() + try: + state = get_workspace_state(workspace_path=workspace_path, repo_name=repo_name) or {} + except Exception: + return True + maintenance = dict(state.get("maintenance") or {}) + last = _parse_ts(maintenance.get("last_consistency_audit_at")) + if last is None: + return True + age = (datetime.now(timezone.utc) - last).total_seconds() + return age >= interval + + +def _sweep_empty_workspace_dirs(workspace_root: Path) -> None: + protected_top_level = {".codebase", ".remote-git"} + try: + workspace_root = workspace_root.resolve() + except Exception: + pass + try: + for root, _dirnames, _filenames in os.walk(workspace_root, topdown=False): + current = Path(root) + if current == workspace_root: + 
continue + if current.parent == workspace_root and current.name in protected_top_level: + continue + try: + rel = current.relative_to(workspace_root) + except Exception: + continue + if rel.parts and rel.parts[0] in protected_top_level: + continue + try: + if any(current.iterdir()): + continue + current.rmdir() + except Exception: + continue + except Exception: + pass + + +def _should_run_empty_dir_sweep(workspace_path: str, repo_name: Optional[str]) -> bool: + if not _empty_dir_sweep_enabled(): + return False + interval_seconds = _empty_dir_sweep_interval_secs() + if interval_seconds == 0: + return True + try: + state = get_workspace_state(workspace_path=workspace_path, repo_name=repo_name) or {} + except Exception: + return True + maintenance = state.get("maintenance") or {} + last_sweep_at = _parse_ts(maintenance.get("last_empty_dir_sweep_at")) + if last_sweep_at is None: + return True + age_seconds = (datetime.now(timezone.utc) - last_sweep_at).total_seconds() + return age_seconds >= interval_seconds + + +def _record_empty_dir_sweep(workspace_path: str, repo_name: Optional[str]) -> None: + try: + state = get_workspace_state(workspace_path=workspace_path, repo_name=repo_name) or {} + maintenance = dict(state.get("maintenance") or {}) + maintenance["last_empty_dir_sweep_at"] = datetime.now(timezone.utc).isoformat() + update_workspace_state( + workspace_path=workspace_path, + repo_name=repo_name, + updates={"maintenance": maintenance}, + ) + except Exception: + pass + + +def _load_cached_hashes( + workspace_path: str, + repo_name: Optional[str], + *, + metadata_root: Optional[Path] = None, +) -> Dict[str, str]: + workspace_norm = _normalize_cache_key_path(workspace_path) + workspace_prefix = f"{workspace_norm.rstrip('/')}/" + candidates: list[Path] = [] + seen: set[str] = set() + + def _append_candidate(path: Path) -> None: + key = str(path) + if key in seen: + return + seen.add(key) + candidates.append(path) + + root = Path(metadata_root or workspace_path) + if 
repo_name: + _append_candidate(root / ".codebase" / "repos" / repo_name / "cache.json") + else: + _append_candidate(root / ".codebase" / "cache.json") + + for cache_path in candidates: + if not cache_path.exists(): + continue + try: + with cache_path.open("r", encoding="utf-8-sig") as f: + data = json.load(f) + hashes = data.get("file_hashes", {}) + if not isinstance(hashes, dict): + return {} + normalized: Dict[str, str] = {} + for path_key, value in hashes.items(): + norm = _normalize_cache_key_path(str(path_key)) + if not norm: + continue + if workspace_norm and not ( + norm == workspace_norm or norm.startswith(workspace_prefix) + ): + continue + if isinstance(value, dict): + digest = str(value.get("hash") or "").strip() + else: + digest = str(value or "").strip() + normalized[norm] = digest + return normalized + except Exception: + return {} + return {} + + +def _is_index_eligible_path(path_str: str, workspace_root: Path, excluder) -> bool: + try: + p = Path(path_str).resolve() + except Exception: + p = Path(path_str) + try: + rel = p.resolve().relative_to(workspace_root.resolve()) + except Exception: + return False + + if not rel.parts: + return False + if not p.exists() or p.is_dir(): + return False + try: + if int(p.stat().st_size) == 0: + # Empty files (e.g. many __init__.py stubs) produce no vectors; do not + # enqueue consistency upserts for them. + return False + except Exception: + return False + if is_internal_metadata_path(p): + return False + + # Keep git-history manifests indexable even when .remote-git is excluded. 
+ if any(part == ".remote-git" for part in p.parts) and p.suffix.lower() == ".json": + return True + + try: + rel_dir = "/" + str(rel.parent).replace(os.sep, "/") + if rel_dir == "/.": + rel_dir = "/" + if excluder.exclude_dir(rel_dir): + return False + except Exception: + return False + + if not idx.is_indexable_file(p): + return False + + try: + relf = (rel_dir.rstrip("/") + "/" + p.name).replace("//", "/") + if excluder.exclude_file(relf): + return False + except Exception: + return False + return True + + +def _scan_indexable_fs_paths(workspace_root: Path, *, max_paths: int) -> Tuple[Set[str], bool]: + paths: Set[str] = set() + excluder = idx._Excluder(workspace_root) + try: + workspace_root = workspace_root.resolve() + except Exception: + pass + + for root_str, dirnames, filenames in os.walk(workspace_root): + current = Path(root_str) + pruned_dirnames = [] + for dirname in dirnames: + child = current / dirname + if is_internal_metadata_path(child): + continue + pruned_dirnames.append(dirname) + dirnames[:] = pruned_dirnames + + for filename in filenames: + file_path = current / filename + normalized = _normalize_cache_key_path(str(file_path)) + if not normalized: + continue + if not _is_index_eligible_path(normalized, workspace_root, excluder): + continue + paths.add(normalized) + if max_paths > 0 and len(paths) >= max_paths: + return paths, True + return paths, False + + +def _load_indexed_paths_for_collection( + client: QdrantClient, + collection: str, + workspace_path: str, + *, + max_paths: int, +) -> Tuple[Set[str], bool]: + paths: Set[str] = set() + workspace_norm = _normalize_cache_key_path(workspace_path) + workspace_prefix = f"{workspace_norm.rstrip('/')}/" + offset = None + while True: + points, next_offset = client.scroll( + collection_name=collection, + limit=1000, + with_payload=True, + with_vectors=False, + offset=offset, + ) + for pt in points or []: + payload = getattr(pt, "payload", {}) or {} + metadata = payload.get("metadata", {}) or {} + 
path = _normalize_cache_key_path(str(metadata.get("path") or "")) + if path: + if workspace_norm and not ( + path == workspace_norm or path.startswith(workspace_prefix) + ): + continue + paths.add(path) + if max_paths > 0 and len(paths) >= max_paths: + return paths, True + if next_offset is None: + break + offset = next_offset + return paths, False + + +def _record_consistency_audit( + workspace_path: str, + repo_name: Optional[str], + summary: Dict[str, Any], +) -> None: + try: + state = get_workspace_state(workspace_path=workspace_path, repo_name=repo_name) or {} + maintenance = dict(state.get("maintenance") or {}) + maintenance["last_consistency_audit_at"] = datetime.now(timezone.utc).isoformat() + maintenance["last_consistency_audit_summary"] = summary + update_workspace_state( + workspace_path=workspace_path, + repo_name=repo_name, + updates={"maintenance": maintenance}, + ) + except Exception: + pass + + +def _enqueue_consistency_repairs( + workspace_root: Path, + workspace_path: str, + repo_name: Optional[str], + stale_paths: list[str], + missing_paths: list[str], + cached_hashes: Dict[str, str], +) -> Tuple[int, int]: + if not _consistency_repair_enabled(): + return 0, 0 + max_ops = _consistency_repair_max_ops() + if max_ops <= 0: + return 0, 0 + + entries: list[Dict[str, Any]] = [] + enqueued_stale = 0 + enqueued_missing = 0 + missing_set = set(missing_paths) + excluder = idx._Excluder(workspace_root) + + for path in stale_paths: + if len(entries) >= max_ops: + break + # Cache can lag after state resets/rebuilds; if the path still exists and is + # index-eligible, treat it as missing/upsert instead of stale/delete. 
+ if _is_index_eligible_path(path, workspace_root, excluder): + missing_set.add(path) + continue + entries.append({"path": path, "op_type": "delete"}) + enqueued_stale += 1 + for path in sorted(missing_set): + if len(entries) >= max_ops: + break + entries.append( + { + "path": path, + "op_type": "upsert", + "content_hash": cached_hashes.get(path) or None, + } + ) + enqueued_missing += 1 + + if not entries: + return 0, 0 + try: + upsert_index_journal_entries( + entries, + workspace_path=workspace_path, + repo_name=repo_name, + ) + except Exception as exc: + logger.debug( + "[consistency_audit] failed to enqueue repairs workspace=%s repo=%s: %s", + workspace_path, + repo_name, + exc, + ) + return 0, 0 + return enqueued_stale, enqueued_missing + + +def run_consistency_audit(client: QdrantClient, root: Path) -> None: + if not _consistency_audit_enabled(): + return + max_paths = _consistency_audit_max_paths() + try: + candidates = list_workspaces(search_root=str(root), use_qdrant_fallback=False) + except Exception: + candidates = [] + for ws in candidates: + workspace_path = str(ws.get("workspace_path") or "").strip() + if not workspace_path: + continue + repo_name = _extract_repo_name_from_path(workspace_path) + if not _should_run_consistency_audit(workspace_path, repo_name): + continue + try: + snapshot = get_collection_state_snapshot( + workspace_path=workspace_path, + repo_name=repo_name, + ) + collection = str(snapshot.get("active_collection") or "").strip() + if not collection: + continue + cached_hashes = _load_cached_hashes( + workspace_path, + repo_name, + metadata_root=root, + ) + workspace_root = Path(workspace_path) + fs_paths, fs_truncated = _scan_indexable_fs_paths( + workspace_root, + max_paths=max_paths, + ) + excluder = idx._Excluder(workspace_root) + cached_paths = { + path + for path in cached_hashes.keys() + if _is_index_eligible_path(path, workspace_root, excluder) + } + indexed_paths, indexed_truncated = _load_indexed_paths_for_collection( + 
client, + collection, + workspace_path, + max_paths=max_paths, + ) + if fs_truncated or indexed_truncated: + stale = [] + missing = [] + enq_stale = 0 + enq_missing = 0 + else: + stale = sorted(indexed_paths - fs_paths) + missing = sorted(fs_paths - indexed_paths) + enq_stale, enq_missing = _enqueue_consistency_repairs( + workspace_root, + workspace_path, + repo_name, + stale, + missing, + cached_hashes, + ) + summary = { + "fs_count": len(fs_paths), + "cache_count": len(cached_paths), + "qdrant_count": len(indexed_paths), + "fs_scan_truncated": fs_truncated, + "qdrant_scan_truncated": indexed_truncated, + "repair_skipped_due_to_truncation": bool(fs_truncated or indexed_truncated), + "stale_in_qdrant_count": len(stale), + "missing_in_qdrant_count": len(missing), + "repair_enqueued_stale_count": int(enq_stale), + "repair_enqueued_missing_count": int(enq_missing), + "sample_stale": stale[:20], + "sample_missing": missing[:20], + } + _record_consistency_audit(workspace_path, repo_name, summary) + logger.info( + "[consistency_audit] repo=%s collection=%s fs=%d cache=%d qdrant=%d stale=%d missing=%d repair_stale=%d repair_missing=%d", + repo_name or "", + collection, + len(fs_paths), + len(cached_paths), + len(indexed_paths), + len(stale), + len(missing), + int(enq_stale), + int(enq_missing), + ) + except Exception as exc: + logger.debug( + "[consistency_audit] failed workspace=%s repo=%s: %s", + workspace_path, + repo_name, + exc, + ) + + +def run_empty_dir_sweep_maintenance(root: Path) -> None: + if not _empty_dir_sweep_enabled(): + return + try: + candidates = list_workspaces(search_root=str(root), use_qdrant_fallback=False) + except Exception: + candidates = [] + for ws in candidates: + workspace_path = str(ws.get("workspace_path") or "").strip() + if not workspace_path: + continue + repo_name = _extract_repo_name_from_path(workspace_path) + if not _should_run_empty_dir_sweep(workspace_path, repo_name): + continue + try: + logger.info("[empty_dir_sweep] Sweeping 
empty directories under %s", workspace_path) + _sweep_empty_workspace_dirs(Path(workspace_path)) + _record_empty_dir_sweep(workspace_path, repo_name) + except Exception as exc: + logger.debug( + "[empty_dir_sweep] failed workspace=%s repo=%s: %s", + workspace_path, + repo_name, + exc, + ) diff --git a/scripts/watch_index_core/handler.py b/scripts/watch_index_core/handler.py index ca1c9411..1b36fa51 100644 --- a/scripts/watch_index_core/handler.py +++ b/scripts/watch_index_core/handler.py @@ -12,7 +12,6 @@ import scripts.ingest_code as idx from scripts.workspace_state import ( _extract_repo_name_from_path, - _get_global_state_dir, get_cached_file_hash, log_watcher_activity as _log_activity, remove_cached_file, @@ -27,6 +26,7 @@ safe_print, ) from .rename import _rename_in_store +from .paths import is_internal_metadata_path class IndexHandler(FileSystemEventHandler): @@ -82,15 +82,7 @@ def _maybe_reload_excluder(self) -> None: pass def _is_internal_metadata_path(self, p: Path) -> bool: - try: - if any(part == ".codebase" for part in p.parts): - return True - global_state_dir = _get_global_state_dir() - if global_state_dir is not None and p.is_relative_to(global_state_dir): - return True - except (OSError, ValueError): - return False - return False + return is_internal_metadata_path(p) def _maybe_enqueue(self, src_path: str) -> None: self._maybe_reload_excluder() diff --git a/scripts/watch_index_core/paths.py b/scripts/watch_index_core/paths.py new file mode 100644 index 00000000..3ae7209d --- /dev/null +++ b/scripts/watch_index_core/paths.py @@ -0,0 +1,34 @@ +"""Path classification helpers shared by watcher components.""" + +from __future__ import annotations + +from pathlib import Path + +from scripts.workspace_state import ( + _get_global_state_dir, + INTERNAL_STATE_TOP_LEVEL_DIRS, +) + + +def is_internal_metadata_path(path: Path) -> bool: + """Return True when path points into watcher/internal metadata trees.""" + try: + if any(part in 
INTERNAL_STATE_TOP_LEVEL_DIRS for part in path.parts): + return True + global_state_dir = _get_global_state_dir() + if global_state_dir is not None and path.is_relative_to(global_state_dir): + return True + except (OSError, ValueError): + return False + return False + + +def is_internal_top_level_path(path: Path, root: Path) -> bool: + """Return True when path's top-level segment under root is internal metadata.""" + try: + rel = path.resolve().relative_to(root.resolve()) + except Exception: + return False + if not rel.parts: + return False + return rel.parts[0] in INTERNAL_STATE_TOP_LEVEL_DIRS diff --git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index a98aae05..bfc06ae8 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -16,21 +16,29 @@ from pathlib import Path from typing import Dict, List, Optional +from qdrant_client import models + import scripts.ingest_code as idx from scripts.workspace_state import ( + _normalize_cache_key_path, _extract_repo_name_from_path, get_cached_file_hash, + list_pending_index_journal_entries, get_workspace_state, is_staging_enabled, log_watcher_activity as _log_activity, persist_indexing_config, remove_cached_file, + set_cached_file_hash, set_indexing_progress as _update_progress, set_indexing_started as _set_status_indexing, + update_index_journal_entry_status, update_indexing_status, ) +from . 
import config as watch_config +from .rename import _rename_in_store +from .paths import is_internal_top_level_path -from .config import QDRANT_URL, ROOT, ROOT_DIR, LOGGER as logger from .utils import ( _detect_repo_for_file, _get_collection_for_file, @@ -39,10 +47,21 @@ safe_log_error, ) +logger = watch_config.LOGGER + class _SkipUnchanged(Exception): """Sentinel exception to skip unchanged files in the watch loop.""" + def __init__(self, *, text: Optional[str] = None, file_hash: str = "") -> None: + super().__init__("unchanged") + self.text = text + self.file_hash = file_hash + + +def _is_internal_ignored_path(path: Path) -> bool: + return is_internal_top_level_path(path, watch_config.ROOT) + def _staging_requires_subprocess(state: Optional[Dict[str, object]]) -> bool: """Return True only when dual-root staging is actually active for this repo.""" @@ -322,6 +341,158 @@ def _advance_progress( pass +def _mark_journal_done(path: Path, repo_key: str, repo_name: Optional[str]) -> None: + try: + update_index_journal_entry_status( + str(path), + status="done", + workspace_path=repo_key, + repo_name=repo_name, + ) + except Exception: + pass + + +def _mark_journal_failed( + path: Path, + repo_key: str, + repo_name: Optional[str], + error: str, +) -> None: + try: + update_index_journal_entry_status( + str(path), + status="failed", + error=error, + workspace_path=repo_key, + repo_name=repo_name, + remove_on_done=False, + ) + except Exception: + pass + + +def _path_has_indexed_points(client, collection: str, path: Path) -> Optional[bool]: + try: + filt = models.Filter( + must=[ + models.FieldCondition( + key="metadata.path", match=models.MatchValue(value=str(path)) + ) + ] + ) + points, _ = client.scroll( + collection_name=collection, + scroll_filter=filt, + with_payload=False, + with_vectors=False, + limit=1, + ) + return bool(points) + except Exception: + return None + + +def _verify_delete_committed(client, collection: str, path: Path) -> bool: + has_points = 
_path_has_indexed_points(client, collection, path) + return has_points is False + + +def _verify_upsert_committed( + client, + collection: str, + path: Path, + repo_name: Optional[str], + expected_file_hash: Optional[str], + source_text: Optional[str] = None, +) -> bool: + indexed_hash = str( + idx.get_indexed_file_hash(client, collection, str(path)) or "" + ).strip() + expected_hash = str(expected_file_hash or "").strip() + if expected_hash: + if bool(indexed_hash) and indexed_hash == expected_hash: + return True + # Empty/whitespace-only files can legitimately have no indexed points/hash. + try: + if source_text is not None and not source_text.strip(): + has_points = _path_has_indexed_points(client, collection, path) + return has_points is False + except Exception: + pass + return False + has_points = _path_has_indexed_points(client, collection, path) + return has_points is True + + +def _verify_and_update_journal_for_upsert( + p: Path, + client, + collection: str, + repo_key: str, + repo_name: Optional[str], + journal_content_hash: str, + *, + text: Optional[str] = None, + file_hash: Optional[str] = None, +) -> None: + source_text = text + expected_hash = str(file_hash or "").strip() + if source_text is None or not expected_hash: + read_text, read_hash = _read_text_and_sha1(p) + if source_text is None: + source_text = read_text + if not expected_hash: + expected_hash = read_hash + expected_hash = expected_hash or journal_content_hash + if _verify_upsert_committed( + client, + collection, + p, + repo_name, + expected_hash or None, + source_text=source_text, + ): + _mark_journal_done(p, repo_key, repo_name) + else: + _mark_journal_failed( + p, + repo_key, + repo_name, + "upsert_verification_failed", + ) + + +def _finalize_journal_after_index_attempt( + path: Path, + client, + collection: str | None, + repo_key: str, + repo_name: Optional[str], + *, + force_upsert: bool, + journal_content_hash: str, + text: Optional[str] = None, + file_hash: Optional[str] = None, + 
default_error: Optional[str] = None, +) -> None: + if force_upsert and client is not None and collection is not None: + _verify_and_update_journal_for_upsert( + path, + client, + collection, + repo_key, + repo_name, + journal_content_hash, + text=text, + file_hash=file_hash, + ) + elif default_error: + _mark_journal_failed(path, repo_key, repo_name, default_error) + else: + _mark_journal_done(path, repo_key, repo_name) + + def _build_subprocess_env( collection: str | None, repo_name: str | None, @@ -335,8 +506,8 @@ def _build_subprocess_env( pass if collection: env["COLLECTION_NAME"] = collection - if QDRANT_URL: - env["QDRANT_URL"] = QDRANT_URL + if watch_config.QDRANT_URL: + env["QDRANT_URL"] = watch_config.QDRANT_URL if repo_name: env["REPO_NAME"] = repo_name return env @@ -344,6 +515,7 @@ def _build_subprocess_env( def _maybe_handle_staging_file( path: Path, + client, collection: str | None, repo_name: str | None, repo_key: str, @@ -351,27 +523,45 @@ def _maybe_handle_staging_file( state_env: Optional[Dict[str, str]], repo_progress: Dict[str, int], started_at: str, + *, + force_upsert: bool = False, + journal_content_hash: str = "", ) -> bool: if not (state_env and collection): return False - _text, file_hash = _read_text_and_sha1(path) + source_text, file_hash = _read_text_and_sha1(path) if file_hash: try: cached_hash = get_cached_file_hash(str(path), repo_name) if repo_name else None except Exception: cached_hash = None if cached_hash and cached_hash == file_hash: + if force_upsert and client is not None: + if _verify_upsert_committed( + client, + collection, + path, + repo_name, + file_hash or journal_content_hash or None, + source_text=source_text, + ): + safe_print(f"[skip_unchanged] {path} (hash match)") + _log_activity(repo_key, "skipped", path, {"reason": "hash_unchanged"}) + _mark_journal_done(path, repo_key, repo_name) + _advance_progress(repo_progress, repo_key, repo_files, started_at, path) + return True # Fast path: skip if content hash matches 
cached hash (file unchanged) # Safety: startup health check clears stale cache per-repo - safe_print(f"[skip_unchanged] {path} (hash match)") - _log_activity(repo_key, "skipped", path, {"reason": "hash_unchanged"}) - _advance_progress(repo_progress, repo_key, repo_files, started_at, path) - return True + if not force_upsert: + safe_print(f"[skip_unchanged] {path} (hash match)") + _log_activity(repo_key, "skipped", path, {"reason": "hash_unchanged"}) + _advance_progress(repo_progress, repo_key, repo_files, started_at, path) + return True cmd = [ sys.executable or "python3", - str(ROOT_DIR / "scripts" / "ingest_code.py"), + str(watch_config.ROOT_DIR / "scripts" / "ingest_code.py"), "--root", str(path), "--no-skip-unchanged", @@ -408,6 +598,19 @@ def _maybe_handle_staging_file( ) else: safe_print(f"[indexed_subprocess] {path} -> {collection}") + _finalize_journal_after_index_attempt( + path, + client, + collection, + repo_key, + repo_name, + force_upsert=force_upsert, + journal_content_hash=journal_content_hash, + text=source_text, + file_hash=file_hash, + ) + if result.returncode != 0 and force_upsert: + _mark_journal_failed(path, repo_key, repo_name, "subprocess_index_failed") _advance_progress(repo_progress, repo_key, repo_files, started_at, path) return True @@ -448,12 +651,86 @@ def _process_paths( pass repo_progress: Dict[str, int] = {key: 0 for key in repo_groups.keys()} + repo_pending_journal_ops: Dict[str, Dict[str, Dict[str, str]]] = {} + repo_move_source_for_dest: Dict[str, Dict[str, str]] = {} + move_dest_keys: set[str] = set() + move_source_keys: set[str] = set() + for repo_path in repo_groups.keys(): + try: + repo_name = _extract_repo_name_from_path(repo_path) + entries = list_pending_index_journal_entries(repo_path, repo_name) + repo_pending_journal_ops[repo_path] = {} + upserts_by_hash: Dict[str, List[str]] = {} + deletes_by_hash: Dict[str, List[str]] = {} + for rec in entries: + path_key = _normalize_cache_key_path(str(rec.get("path") or "")) + 
op_type = str(rec.get("op_type") or "").strip().lower() + content_hash = str(rec.get("content_hash") or "").strip().lower() + if not path_key: + continue + repo_pending_journal_ops[repo_path][path_key] = { + "op_type": op_type, + "content_hash": content_hash, + } + if not content_hash: + continue + if op_type == "upsert": + upserts_by_hash.setdefault(content_hash, []).append(path_key) + elif op_type == "delete": + deletes_by_hash.setdefault(content_hash, []).append(path_key) + pairs: Dict[str, str] = {} + for content_hash, dest_paths in upserts_by_hash.items(): + src_paths = deletes_by_hash.get(content_hash) or [] + if not src_paths: + continue + src_idx = 0 + for dest_key in dest_paths: + while src_idx < len(src_paths) and src_paths[src_idx] == dest_key: + src_idx += 1 + if src_idx >= len(src_paths): + break + src_key = src_paths[src_idx] + src_idx += 1 + pairs[dest_key] = src_key + move_dest_keys.add(dest_key) + move_source_keys.add(src_key) + repo_move_source_for_dest[repo_path] = pairs + except Exception: + repo_pending_journal_ops[repo_path] = {} + repo_move_source_for_dest[repo_path] = {} + + unique_paths = sorted( + unique_paths, + key=lambda p: ( + 0 + if _normalize_cache_key_path(str(p)) in move_dest_keys + else (2 if _normalize_cache_key_path(str(p)) in move_source_keys else 1), + str(p), + ), + ) + completed_move_sources: set[str] = set() for p in unique_paths: repo_path = _detect_repo_for_file(p) or Path(workspace_path) repo_key = str(repo_path) repo_files = repo_groups.get(repo_key, []) repo_name = _extract_repo_name_from_path(repo_key) + path_key = _normalize_cache_key_path(str(p)) + if path_key in completed_move_sources: + _advance_progress(repo_progress, repo_key, repo_files, started_at, p) + continue + journal_rec = repo_pending_journal_ops.get(repo_key, {}).get(path_key, {}) + journal_op = str(journal_rec.get("op_type") or "").strip().lower() + force_delete = journal_op == "delete" + force_upsert = journal_op == "upsert" + journal_content_hash = 
str(journal_rec.get("content_hash") or "").strip().lower() + if _is_internal_ignored_path(p): + _log_activity(repo_key, "skipped", p, {"reason": "internal_ignored_path"}) + # Internal metadata paths should never drive indexing or collection creation. + # If they entered the journal via drift repair, mark done and drop. + _mark_journal_done(p, repo_key, repo_name) + _advance_progress(repo_progress, repo_key, repo_files, started_at, p) + continue collection = _get_collection_for_file(p) state_env: Optional[Dict[str, str]] = None try: @@ -477,7 +754,58 @@ def _process_paths( _advance_progress(repo_progress, repo_key, repo_files, started_at, p) continue - if not p.exists(): + if force_upsert and not p.exists(): + _log_activity(repo_key, "skipped", p, {"reason": "upsert_missing_file"}) + _mark_journal_failed( + p, + repo_key, + repo_name, + "upsert_missing_file", + ) + _advance_progress(repo_progress, repo_key, repo_files, started_at, p) + continue + + if force_upsert and client is not None and collection is not None: + move_src_key = repo_move_source_for_dest.get(repo_key, {}).get(path_key) + if move_src_key: + move_src_path = Path(move_src_key) + src_collection = _get_collection_for_file(move_src_path) + try: + moved_count, renamed_hash = _rename_in_store( + client, + src_collection, + move_src_path, + p, + collection, + ) + except Exception: + moved_count, renamed_hash = -1, None + if moved_count and moved_count > 0: + try: + if repo_name: + remove_cached_file(str(move_src_path), repo_name) + except Exception: + pass + final_hash = renamed_hash or journal_content_hash + try: + if repo_name and final_hash: + set_cached_file_hash(str(p), final_hash, repo_name) + except Exception: + pass + _log_activity( + repo_key, + "moved", + p, + {"from": str(move_src_path), "chunks": int(moved_count)}, + ) + _mark_journal_done(p, repo_key, repo_name) + _mark_journal_done(move_src_path, repo_key, repo_name) + completed_move_sources.add(move_src_key) + 
_advance_progress(repo_progress, repo_key, repo_files, started_at, p) + continue + + if force_delete or not p.exists(): + deleted_ok = False if client is not None: try: idx.delete_points_by_path(client, collection, str(p)) @@ -491,19 +819,32 @@ def _process_paths( except Exception: pass safe_print(f"[deleted] {p} -> {collection}") + deleted_ok = True except Exception: - pass + deleted_ok = False + if deleted_ok and client is not None and collection is not None: + deleted_ok = _verify_delete_committed(client, collection, p) try: if repo_name: remove_cached_file(str(p), repo_name) except Exception: pass _log_activity(repo_key, "deleted", p) + if deleted_ok: + _mark_journal_done(p, repo_key, repo_name) + else: + _mark_journal_failed( + p, + repo_key, + repo_name, + "delete_points_failed", + ) _advance_progress(repo_progress, repo_key, repo_files, started_at, p) continue if _maybe_handle_staging_file( p, + client, collection, repo_name, repo_key, @@ -511,17 +852,38 @@ def _process_paths( state_env, repo_progress, started_at, + force_upsert=force_upsert, + journal_content_hash=journal_content_hash, ): continue if client is not None and model is not None: try: + verify_context: Dict[str, Optional[str]] = {} ok = _run_indexing_strategy( - p, client, model, collection, vector_name, model_dim, repo_name + p, + client, + model, + collection, + vector_name, + model_dim, + repo_name, + verify_context=verify_context if force_upsert else None, ) - except _SkipUnchanged: + except _SkipUnchanged as exc: status = "skipped" safe_print(f"[{status}] {p} -> {collection}") _log_activity(repo_key, "skipped", p, {"reason": "hash_unchanged"}) + _finalize_journal_after_index_attempt( + p, + client, + collection, + repo_key, + repo_name, + force_upsert=force_upsert, + journal_content_hash=journal_content_hash, + text=exc.text, + file_hash=exc.file_hash, + ) _advance_progress(repo_progress, repo_key, repo_files, started_at, p) continue except Exception: @@ -534,6 +896,7 @@ def _process_paths( 
"file": str(p), }, ) + _mark_journal_failed(p, repo_key, repo_name, "indexing_error") _advance_progress(repo_progress, repo_key, repo_files, started_at, p) continue @@ -545,10 +908,33 @@ def _process_paths( except Exception: size = None _log_activity(repo_key, "indexed", p, {"file_size": size}) + _finalize_journal_after_index_attempt( + p, + client, + collection, + repo_key, + repo_name, + force_upsert=force_upsert, + journal_content_hash=journal_content_hash, + text=verify_context.get("text"), + file_hash=verify_context.get("file_hash"), + ) else: _log_activity( repo_key, "skipped", p, {"reason": "no-change-or-error"} ) + _finalize_journal_after_index_attempt( + p, + client, + collection, + repo_key, + repo_name, + force_upsert=force_upsert, + journal_content_hash=journal_content_hash, + text=verify_context.get("text"), + file_hash=verify_context.get("file_hash"), + default_error="no_change_or_error", + ) _advance_progress(repo_progress, repo_key, repo_files, started_at, p) else: safe_print(f"Not processing locally: {p}") @@ -572,7 +958,7 @@ def _read_text_and_sha1(path: Path) -> tuple[Optional[str], str]: text = path.read_text(encoding="utf-8", errors="ignore") except Exception: text = None - if not text: + if text is None: return text, "" try: file_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() @@ -589,17 +975,26 @@ def _run_indexing_strategy( vector_name: str, model_dim: int, repo_name: str | None, + *, + verify_context: Optional[Dict[str, Optional[str]]] = None, ) -> bool: if collection is None: return False text, file_hash = _read_text_and_sha1(path) + if verify_context is not None: + verify_context["text"] = text + verify_context["file_hash"] = file_hash ok = False if text is not None: try: language = idx.detect_language(path) except Exception: language = "" + try: + is_text_like = bool(idx.is_text_like_language(language)) + except Exception: + is_text_like = False if file_hash: try: cached_hash = get_cached_file_hash(str(path), 
repo_name) if repo_name else None @@ -607,46 +1002,47 @@ def _run_indexing_strategy( cached_hash = None if cached_hash and cached_hash == file_hash: ok = True - raise _SkipUnchanged() - try: - use_smart, smart_reason = idx.should_use_smart_reindexing(str(path), file_hash) - except Exception: - use_smart, smart_reason = False, "smart_check_failed" - # Bootstrap: if we have no symbol cache yet, still run smart path once - bootstrap = smart_reason == "no_cached_symbols" - if use_smart or bootstrap: - msg_kind = ( - "smart reindexing" - if use_smart - else "bootstrap (no_cached_symbols) for smart reindex" - ) - safe_print( - f"[SMART_REINDEX][watcher] Using {msg_kind} for {path} ({smart_reason})" - ) + raise _SkipUnchanged(text=text, file_hash=file_hash) + if not is_text_like: try: - status = idx.process_file_with_smart_reindexing( - path, - text, - language, - client, - collection, - repo_name, - model, - vector_name, - model_dim=model_dim, + use_smart, smart_reason = idx.should_use_smart_reindexing(str(path), file_hash) + except Exception: + use_smart, smart_reason = False, "smart_check_failed" + # Bootstrap: if we have no symbol cache yet, still run smart path once + bootstrap = smart_reason == "no_cached_symbols" + if use_smart or bootstrap: + msg_kind = ( + "smart reindexing" + if use_smart + else "bootstrap (no_cached_symbols) for smart reindex" ) - ok = status in ("success", "skipped") - except Exception as exc: safe_print( - f"[SMART_REINDEX][watcher] Smart reindexing failed for {path}: {exc}" + f"[SMART_REINDEX][watcher] Using {msg_kind} for {path} ({smart_reason})" ) - ok = False - else: - safe_print( - f"[SMART_REINDEX][watcher] Using full reindexing for {path} ({smart_reason})" - ) - # Fallback: full single-file reindex. Pseudo/tags are inlined by default; - # when PSEUDO_DEFER_TO_WORKER=1 we run base-only and rely on backfill. 
+ try: + status = idx.process_file_with_smart_reindexing( + path, + text, + language, + client, + collection, + repo_name, + model, + vector_name, + model_dim=model_dim, + ) + ok = status in ("success", "skipped") + except Exception as exc: + safe_print( + f"[SMART_REINDEX][watcher] Smart reindexing failed for {path}: {exc}" + ) + ok = False + else: + safe_print( + f"[SMART_REINDEX][watcher] Using full reindexing for {path} ({smart_reason})" + ) + # Fallback: full single-file reindex. Pseudo/tags are inlined by default; + # when PSEUDO_DEFER_TO_WORKER=1 we run base-only and rely on backfill. if not ok: try: idx.ensure_collection_and_indexes_once( diff --git a/scripts/watch_index_core/pseudo.py b/scripts/watch_index_core/pseudo.py index 33fb6f46..33caa303 100644 --- a/scripts/watch_index_core/pseudo.py +++ b/scripts/watch_index_core/pseudo.py @@ -8,6 +8,7 @@ from typing import Optional import scripts.ingest_code as idx +from . import config as watch_config from .utils import get_boolean_env from scripts.workspace_state import ( _cross_process_lock, @@ -17,8 +18,6 @@ is_multi_repo_mode, ) -from .config import ROOT - logger = logging.getLogger(__name__) @@ -65,7 +64,7 @@ def _worker() -> None: try: graph_backfill_enabled = get_boolean_env("GRAPH_EDGES_BACKFILL") try: - mappings = get_collection_mappings(search_root=str(ROOT)) + mappings = get_collection_mappings(search_root=str(watch_config.ROOT)) except Exception: mappings = [] if not mappings: @@ -83,7 +82,7 @@ def _worker() -> None: if is_multi_repo_mode() and repo_name: state_dir = _get_repo_state_dir(repo_name) else: - state_dir = _get_global_state_dir(str(ROOT)) + state_dir = _get_global_state_dir(str(watch_config.ROOT)) lock_path = state_dir / "pseudo.lock" with _cross_process_lock(lock_path): processed = idx.pseudo_backfill_tick( diff --git a/scripts/watch_index_core/queue.py b/scripts/watch_index_core/queue.py index a94c31c0..118bbe6d 100644 --- a/scripts/watch_index_core/queue.py +++ 
b/scripts/watch_index_core/queue.py @@ -17,16 +17,21 @@ def __init__(self, process_cb: Callable[[List[Path]], None]): self._lock = threading.Lock() self._paths: Set[Path] = set() self._pending: Set[Path] = set() + self._forced_paths: Set[Path] = set() + self._pending_forced: Set[Path] = set() self._timer: threading.Timer | None = None self._process_cb = process_cb # Serialize processing to avoid concurrent use of TextEmbedding/QdrantClient self._processing_lock = threading.Lock() self._recent_fingerprints: dict[Path, tuple[tuple[int, int], float]] = {} - def add(self, p: Path) -> None: + def add(self, p: Path, *, force: bool = False) -> None: with self._lock: + already_queued = p in self._paths self._paths.add(p) - if self._timer is not None: + if force: + self._forced_paths.add(p) + if self._timer is not None and not already_queued: try: self._timer.cancel() except Exception as exc: @@ -34,9 +39,10 @@ def add(self, p: Path) -> None: "Failed to cancel timer in ChangeQueue.add", extra={"error": str(exc)}, ) - self._timer = threading.Timer(DELAY_SECS, self._flush) - self._timer.daemon = True - self._timer.start() + if self._timer is None or not already_queued: + self._timer = threading.Timer(DELAY_SECS, self._flush) + self._timer.daemon = True + self._timer.start() def _fingerprint_path(self, p: Path) -> tuple[int, int] | None: try: @@ -48,14 +54,23 @@ def _fingerprint_path(self, p: Path) -> tuple[int, int] | None: except Exception: return None - def _filter_recent_paths(self, paths: Iterable[Path]) -> list[Path]: + def _filter_recent_paths( + self, + paths: Iterable[Path], + *, + forced_paths: Iterable[Path] | None = None, + ) -> list[Path]: ttl = float(RECENT_FINGERPRINT_TTL_SECS) + forced = set(forced_paths or []) if ttl <= 0: return list(paths) now = time.time() keep: list[Path] = [] for p in paths: + if p in forced: + keep.append(p) + continue fp = self._fingerprint_path(p) if fp is None: keep.append(p) @@ -85,17 +100,30 @@ def _mark_recent_paths(self, paths: 
Iterable[Path]) -> None: for p in stale: self._recent_fingerprints.pop(p, None) + def _drain_pending(self) -> tuple[list[Path], Set[Path]] | None: + with self._lock: + if not self._pending: + return None + todo = list(self._pending) + todo_forced = {p for p in todo if p in self._pending_forced} + self._pending.clear() + self._pending_forced.clear() + return todo, todo_forced + def _flush(self) -> None: # Grab current batch with self._lock: paths = list(self._paths) + forced_paths = {p for p in paths if p in self._forced_paths} self._paths.clear() + self._forced_paths.difference_update(paths) self._timer = None # Try to run the processor exclusively; if busy, queue and return if not self._processing_lock.acquire(blocking=False): with self._lock: self._pending.update(paths) + self._pending_forced.update(forced_paths) if self._timer is None: # schedule a follow-up flush to pick up pending when free self._timer = threading.Timer(DELAY_SECS, self._flush) @@ -105,14 +133,14 @@ def _flush(self) -> None: try: # Per-file locking in index_single_file handles indexer/watcher coordination todo: Iterable[Path] = paths + todo_forced: Set[Path] = set(forced_paths) while True: - filtered_todo = self._filter_recent_paths(todo) + filtered_todo = self._filter_recent_paths(todo, forced_paths=todo_forced) if not filtered_todo: - with self._lock: - if not self._pending: - break - todo = list(self._pending) - self._pending.clear() + pending = self._drain_pending() + if pending is None: + break + todo, todo_forced = pending continue try: self._process_cb(list(filtered_todo)) @@ -137,11 +165,10 @@ def _flush(self) -> None: except Exception: pass # Last resort: can't even print # drain any pending accumulated during processing - with self._lock: - if not self._pending: - break - todo = list(self._pending) - self._pending.clear() + pending = self._drain_pending() + if pending is None: + break + todo, todo_forced = pending finally: self._processing_lock.release() diff --git 
a/scripts/watch_index_core/utils.py b/scripts/watch_index_core/utils.py index 999daa5a..5f4086fd 100644 --- a/scripts/watch_index_core/utils.py +++ b/scripts/watch_index_core/utils.py @@ -8,7 +8,8 @@ from watchdog.observers import Observer import scripts.ingest_code as idx -from .config import LOGGER, ROOT, default_collection_name +from . import config as watch_config +from .config import LOGGER, default_collection_name from scripts.workspace_state import ( _extract_repo_name_from_path, PLACEHOLDER_COLLECTION_NAMES, @@ -93,13 +94,14 @@ def create_observer(use_polling: bool, observer_cls: Type[Observer] = Observer) def _detect_repo_for_file(file_path: Path) -> Optional[Path]: """Detect repository root for a file under WATCH root.""" + root = watch_config.ROOT try: - rel_path = file_path.resolve().relative_to(ROOT.resolve()) + rel_path = file_path.resolve().relative_to(root.resolve()) except Exception: return None if not rel_path.parts: - return ROOT - return ROOT / rel_path.parts[0] + return root + return root / rel_path.parts[0] def _repo_name_or_none(repo_path: Optional[Path]) -> Optional[str]: diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index 7d843435..dc7c2ceb 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -9,6 +9,7 @@ - Multi-repo support with per-repo state files """ import json +import logging import os import re import uuid @@ -22,6 +23,8 @@ _CANONICAL_SLUG_RE = re.compile(r"^.+-[0-9a-f]{16}$") _SLUGGED_REPO_RE = re.compile(r"^.+-[0-9a-f]{16}(?:_old)?$") +INTERNAL_STATE_TOP_LEVEL_DIRS = frozenset({".codebase", ".git", "__pycache__"}) +logger = logging.getLogger(__name__) _managed_slug_cache_lock = threading.Lock() _managed_slug_cache: set[str] = set() _managed_slug_cache_neg: set[str] = set() @@ -112,7 +115,7 @@ def _server_managed_slug_from_path(path: Path) -> Optional[str]: return None work_dir = Path(os.environ.get("WORK_DIR") or os.environ.get("WORKDIR") or "/work") - marker = work_dir / ".codebase" / 
"repos" / slug / ".ctxce_managed_upload" + marker = work_dir / STATE_DIRNAME / "repos" / slug / ".ctxce_managed_upload" try: is_managed = marker.exists() except OSError: @@ -134,6 +137,7 @@ def _server_managed_slug_from_path(path: Path) -> Optional[str]: STATE_DIRNAME = ".codebase" STATE_FILENAME = "state.json" CACHE_FILENAME = "cache.json" +INDEX_JOURNAL_FILENAME = "index_journal.json" PLACEHOLDER_COLLECTION_NAMES = {"", "default-collection", "my-collection"} class IndexingProgress(TypedDict, total=False): @@ -186,6 +190,33 @@ class StagingInfo(TypedDict, total=False): class MaintenanceInfo(TypedDict, total=False): last_empty_dir_sweep_at: Optional[str] + last_consistency_audit_at: Optional[str] + last_consistency_audit_summary: Optional[Dict[str, Any]] + + +class IndexJournalRecord(TypedDict, total=False): + path: str + op_type: str + content_hash: Optional[str] + status: str + attempts: int + created_at: str + updated_at: str + last_error: Optional[str] + + +def _index_journal_retry_delay_seconds() -> float: + try: + return max(0.0, float(os.environ.get("INDEX_JOURNAL_RETRY_DELAY_SECS", "5") or 5)) + except Exception: + return 5.0 + + +def _index_journal_max_attempts() -> int: + try: + return max(0, int(os.environ.get("INDEX_JOURNAL_MAX_ATTEMPTS", "0") or 0)) + except Exception: + return 0 class WorkspaceState(TypedDict, total=False): @@ -238,7 +269,12 @@ def logical_repo_reuse_enabled() -> bool: def _resolve_workspace_root() -> str: """Determine the default workspace root path.""" - return os.environ.get("WORKSPACE_PATH") or os.environ.get("WATCH_ROOT") or "/work" + return ( + os.environ.get("CTXCE_METADATA_ROOT") + or os.environ.get("WORKSPACE_PATH") + or os.environ.get("WATCH_ROOT") + or "/work" + ) def _resolve_repo_context( workspace_path: Optional[str] = None, @@ -252,14 +288,45 @@ def _resolve_repo_context( return resolved_workspace, repo_name if workspace_path: - detected = _detect_repo_name_from_path(Path(workspace_path)) - if detected: - return 
resolved_workspace, detected + try: + requested = Path(workspace_path).resolve() + workspace_root = Path(_resolve_workspace_root()).resolve() + except Exception: + requested = Path(workspace_path) + workspace_root = Path(_resolve_workspace_root()) + if requested != workspace_root: + detected = _detect_repo_name_from_path(requested) + if detected: + return resolved_workspace, detected return resolved_workspace, None return resolved_workspace, repo_name + +def _get_repo_workspace_dir( + repo_name: str, + workspace_path: Optional[str] = None, +) -> Path: + try: + base_dir = Path(workspace_path or _resolve_workspace_root()).resolve() + except Exception: + base_dir = Path(workspace_path or _resolve_workspace_root()).absolute() + if base_dir.name == repo_name: + return base_dir + host_index_path = (os.environ.get("HOST_INDEX_PATH") or "").strip() + if host_index_path: + host_index_root = Path(host_index_path) + if not host_index_root.is_absolute(): + host_index_root = base_dir / host_index_root + candidate = host_index_root.resolve() / repo_name + if candidate.exists() or (candidate / STATE_DIRNAME).exists(): + return candidate + dev_workspace_candidate = base_dir / "dev-workspace" / repo_name + if dev_workspace_candidate.exists() or (dev_workspace_candidate / STATE_DIRNAME).exists(): + return dev_workspace_candidate + return base_dir / repo_name + def _get_state_lock(workspace_path: Optional[str] = None, repo_name: Optional[str] = None) -> threading.RLock: """Get or create a lock for the workspace or repo state and track usage.""" if repo_name and is_multi_repo_mode(): @@ -273,13 +340,52 @@ def _get_state_lock(workspace_path: Optional[str] = None, repo_name: Optional[st _state_lock_last_used[key] = time.time() return _state_locks[key] -def _get_repo_state_dir(repo_name: str) -> Path: +def _get_repo_state_dir( + repo_name: str, + workspace_path: Optional[str] = None, +) -> Path: """Get the state directory for a repository.""" - base_dir = 
Path(os.environ.get("WORKSPACE_PATH") or os.environ.get("WATCH_ROOT") or "/work") + workspace_root = Path(_resolve_workspace_root()).resolve() + base_dir = Path(workspace_path or str(workspace_root)).resolve() + global_repo_state_dir = workspace_root / STATE_DIRNAME / "repos" / repo_name if is_multi_repo_mode(): - return base_dir / STATE_DIRNAME / "repos" / repo_name + # Canonical multi-repo metadata layout is shared under workspace root. + return global_repo_state_dir return base_dir / STATE_DIRNAME + +def _is_repo_local_metadata_path(path: Path) -> bool: + try: + parts = path.resolve().parts + except Exception: + parts = path.parts + try: + idx = parts.index(STATE_DIRNAME) + except ValueError: + return False + if idx > 0 and _SLUGGED_REPO_RE.match(parts[idx - 1] or ""): + return True + if "repos" in parts: + ridx = parts.index("repos") + if ridx + 1 < len(parts) and _SLUGGED_REPO_RE.match(parts[ridx + 1] or ""): + return True + return False + + +def _apply_runtime_metadata_mode(path: Path) -> None: + try: + is_dir = path.is_dir() + except Exception: + is_dir = False + if _is_repo_local_metadata_path(path): + mode = 0o777 if is_dir else 0o666 + else: + mode = 0o775 if is_dir else 0o664 + try: + os.chmod(path, mode) + except Exception: + pass + def _get_state_path(workspace_path: str) -> Path: """Get the path to the state.json file for a workspace.""" workspace = Path(workspace_path).resolve() @@ -628,7 +734,7 @@ def _detect_repo_name_from_path(path: Path) -> str: rel = resolved.relative_to(ws_root) if rel.parts: candidate = rel.parts[0] - if candidate not in {".codebase", ".git", "__pycache__"}: + if candidate not in INTERNAL_STATE_TOP_LEVEL_DIRS: return candidate except Exception: pass @@ -677,12 +783,7 @@ def _atomic_write_state(state_path: Path, state: WorkspaceState) -> None: with open(temp_path, 'w', encoding='utf-8') as f: json.dump(state, f, indent=2, ensure_ascii=False) temp_path.replace(state_path) - # Ensure state/cache files are group-writable so 
multiple processes - # (upload service, watcher, indexer) can update them. - try: - os.chmod(state_path, 0o664) - except PermissionError: - pass + _apply_runtime_metadata_mode(state_path) except Exception: # Clean up temp file if something went wrong try: @@ -710,7 +811,7 @@ def get_workspace_state( lock_scope_path: Path if is_multi_repo_mode() and repo_name: - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, workspace_path) try: ws_root = Path(_resolve_workspace_root()) ws_dir = ws_root / repo_name @@ -722,12 +823,7 @@ def get_workspace_state( except Exception: return {} state_dir.mkdir(parents=True, exist_ok=True) - # Ensure repo state dir is group-writable so root upload service and - # non-root watcher/indexer processes can both write state/cache files. - try: - os.chmod(state_dir, 0o775) - except Exception: - pass + _apply_runtime_metadata_mode(state_dir) state_path = state_dir / STATE_FILENAME lock_scope_path = state_dir else: @@ -807,7 +903,7 @@ def update_workspace_state( # Allow updates when the repo state dir exists, even if the workspace # directory is not present (e.g. dev-remote simulations where only # .codebase state is persisted). 
- state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, workspace_path) if not (ws_root / repo_name).exists() and not state_dir.exists(): return {} except Exception: @@ -828,8 +924,9 @@ def update_workspace_state( state["updated_at"] = datetime.now().isoformat() if is_multi_repo_mode() and repo_name: - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, workspace_path) state_dir.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(state_dir) state_path = state_dir / STATE_FILENAME else: try: @@ -1250,8 +1347,9 @@ def log_activity( return except Exception: return - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, workspace_path) state_dir.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(state_dir) state_path = state_dir / STATE_FILENAME lock_path = state_path.with_suffix(".lock") @@ -1410,7 +1508,7 @@ def _detect_repo_name_from_path_by_structure(path: Path) -> str: continue repo_name = rel_path.parts[0] - if repo_name in (".codebase", ".git", "__pycache__"): + if repo_name in INTERNAL_STATE_TOP_LEVEL_DIRS: continue repo_path = base / repo_name @@ -1580,10 +1678,288 @@ def _write_cache(workspace_path: str, cache: Dict[str, Any]) -> None: pass -def get_cached_file_hash(file_path: str, repo_name: Optional[str] = None) -> str: +def _get_index_journal_path( + workspace_path: Optional[str] = None, repo_name: Optional[str] = None +) -> Path: + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + if repo_name: + state_dir = _get_repo_state_dir(repo_name, workspace_path) + else: + state_dir = _get_global_state_dir(workspace_path) + return state_dir / INDEX_JOURNAL_FILENAME + + +def _read_index_journal_file_uncached(journal_path: Path) -> Dict[str, Any]: + try: + with journal_path.open("r", encoding="utf-8-sig") as f: + obj = json.load(f) + if isinstance(obj, dict): + operations = obj.get("operations", {}) + if 
isinstance(operations, dict): + return obj + except (OSError, json.JSONDecodeError, ValueError): + pass + now = datetime.now().isoformat() + return {"version": 1, "operations": {}, "created_at": now, "updated_at": now} + + +def _write_index_journal( + workspace_path: Optional[str], + repo_name: Optional[str], + journal: Dict[str, Any], +) -> None: + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + lock = _get_state_lock(workspace_path, repo_name) + with lock: + journal_path = _get_index_journal_path(workspace_path, repo_name) + journal_path.parent.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(journal_path.parent) + lock_path = journal_path.with_suffix(journal_path.suffix + ".lock") + with _cross_process_lock(lock_path): + tmp = journal_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") + try: + with open(tmp, "w", encoding="utf-8") as f: + json.dump(journal, f, ensure_ascii=False, indent=2) + tmp.replace(journal_path) + _apply_runtime_metadata_mode(journal_path) + finally: + try: + tmp.unlink(missing_ok=True) + except Exception: + pass + + +def _update_index_journal( + workspace_path: Optional[str], + repo_name: Optional[str], + mutator, +) -> Dict[str, Any]: + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + lock = _get_state_lock(workspace_path, repo_name) + with lock: + journal_path = _get_index_journal_path(workspace_path, repo_name) + journal_path.parent.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(journal_path.parent) + lock_path = journal_path.with_suffix(journal_path.suffix + ".lock") + with _cross_process_lock(lock_path): + journal = _read_index_journal_file_uncached(journal_path) + mutator(journal) + journal["updated_at"] = datetime.now().isoformat() + tmp = journal_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") + try: + with open(tmp, "w", encoding="utf-8") as f: + json.dump(journal, f, ensure_ascii=False, indent=2) + tmp.replace(journal_path) + 
_apply_runtime_metadata_mode(journal_path) + finally: + try: + tmp.unlink(missing_ok=True) + except Exception: + pass + return journal + + +def upsert_index_journal_entries( + entries: List[Dict[str, Any]], + *, + workspace_path: Optional[str] = None, + repo_name: Optional[str] = None, +) -> Dict[str, Any]: + """Persist or replace repo-scoped index journal entries keyed by normalized path.""" + normalized_entries: List[IndexJournalRecord] = [] + now = datetime.now().isoformat() + for entry in entries or []: + path = _normalize_cache_key_path(str(entry.get("path") or "")) + op_type = str(entry.get("op_type") or "").strip().lower() + if not path or op_type not in {"upsert", "delete"}: + continue + content_hash = str(entry.get("content_hash") or "").strip() or None + normalized_entries.append( + { + "path": path, + "op_type": op_type, + "content_hash": content_hash, + "status": "pending", + "attempts": 0, + "created_at": str(entry.get("created_at") or now), + "updated_at": now, + "last_error": None, + } + ) + + def _mutate(journal: Dict[str, Any]) -> None: + ops = journal.setdefault("operations", {}) + if not isinstance(ops, dict): + ops = {} + journal["operations"] = ops + for entry in normalized_entries: + ops[entry["path"]] = entry + + return _update_index_journal(workspace_path, repo_name, _mutate) + + +def list_pending_index_journal_entries( + workspace_path: Optional[str] = None, + repo_name: Optional[str] = None, +) -> List[IndexJournalRecord]: + """Return watcher-retryable journal records for a workspace or specific repo.""" + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + retry_delay = _index_journal_retry_delay_seconds() + max_attempts = _index_journal_max_attempts() + now = datetime.now() + + def _read_repo_journal_entries( + target_repo_name: Optional[str], + *, + target_workspace_path: Optional[str] = None, + ) -> List[IndexJournalRecord]: + journal = _read_index_journal_file_uncached( + 
_get_index_journal_path(target_workspace_path or workspace_path, target_repo_name)
+        )
+        merged_ops = journal.get("operations", {})
+        if not isinstance(merged_ops, dict):
+            merged_ops = {}
+        result: List[IndexJournalRecord] = []
+        for rec in merged_ops.values():
+            if not isinstance(rec, dict):
+                continue
+            status = str(rec.get("status") or "pending").strip().lower()
+            if status not in {"pending", "failed"}:
+                continue
+            attempts_raw = rec.get("attempts")
+            try:
+                attempts = int(attempts_raw or 0)
+            except (ValueError, TypeError):
+                attempts = 0
+                logger.warning(
+                    "workspace_state::invalid_journal_attempts",
+                    extra={"attempts": attempts_raw, "path": str(rec.get("path") or "")},
+                )
+            if max_attempts > 0 and attempts >= max_attempts:
+                continue
+            if status == "failed" and retry_delay > 0:
+                updated_at = str(rec.get("updated_at") or "").strip()
+                if updated_at:
+                    try:
+                        last = datetime.fromisoformat(updated_at)
+                        if (now - last).total_seconds() < retry_delay:
+                            continue
+                    except Exception:
+                        pass
+            p = _normalize_cache_key_path(str(rec.get("path") or ""))
+            op_type = str(rec.get("op_type") or "").strip().lower()
+            if not p or op_type not in {"upsert", "delete"}:
+                continue
+            result.append(
+                {
+                    "path": p,
+                    "op_type": op_type,
+                    "content_hash": str(rec.get("content_hash") or "").strip() or None,
+                    "status": status,
+                    "attempts": attempts,
+                    "created_at": str(rec.get("created_at") or ""),
+                    "updated_at": str(rec.get("updated_at") or ""),
+                    "last_error": str(rec.get("last_error") or "").strip() or None,
+                }
+            )
+        return result
+
+    if repo_name:
+        return _read_repo_journal_entries(repo_name)
+
+    result: List[IndexJournalRecord] = []
+    root_path = Path(workspace_path or _resolve_workspace_root()).resolve()
+    repo_candidates: set[str] = set()
+    multi_repo_mode = is_multi_repo_mode()
+    try:
+        for repo_root in root_path.iterdir():
+            if not repo_root.is_dir():
+                continue
+            if repo_root.name in INTERNAL_STATE_TOP_LEVEL_DIRS:
+                continue
+            if (not multi_repo_mode) and (not 
_SLUGGED_REPO_RE.match(repo_root.name)):
+                continue
+            repo_candidates.add(repo_root.name)
+    except Exception:
+        pass
+
+    try:
+        repos_state_root = root_path / STATE_DIRNAME / "repos"
+        if repos_state_root.exists():
+            for state_dir in repos_state_root.iterdir():
+                if not state_dir.is_dir():
+                    continue
+                repo_candidates.add(state_dir.name)
+    except Exception:
+        pass
+
+    for candidate in sorted(repo_candidates):
+        candidate_workspace_path: Optional[str] = None
+        if not multi_repo_mode:
+            candidate_workspace_path = str(root_path / candidate)
+        result.extend(
+            _read_repo_journal_entries(
+                candidate,
+                target_workspace_path=candidate_workspace_path,
+            )
+        )
+
+    if result:
+        return result
+    return _read_repo_journal_entries(None)
+
+
+def update_index_journal_entry_status(
+    path: str,
+    *,
+    status: str,
+    error: Optional[str] = None,
+    workspace_path: Optional[str] = None,
+    repo_name: Optional[str] = None,
+    remove_on_done: bool = True,
+) -> Dict[str, Any]:
+    """Update or clear a repo-scoped journal entry after processing."""
+    normalized_path = _normalize_cache_key_path(path)
+    now = datetime.now().isoformat()
+
+    def _mutate(journal: Dict[str, Any]) -> None:
+        ops = journal.setdefault("operations", {})
+        if not isinstance(ops, dict):
+            ops = {}
+        journal["operations"] = ops
+        rec = ops.get(normalized_path)
+        if not isinstance(rec, dict):
+            return
+        if status == "done" and remove_on_done:
+            ops.pop(normalized_path, None)
+            return
+        rec["status"] = status
+        rec["updated_at"] = now
+        attempts_raw = rec.get("attempts")
+        try:
+            attempts = int(attempts_raw or 0)
+        except (ValueError, TypeError):
+            attempts = 0
+            logger.warning(
+                "workspace_state::invalid_journal_attempts",
+                extra={"attempts": attempts_raw, "path": normalized_path},
+            )
+        rec["attempts"] = attempts + 1
+        rec["last_error"] = str(error or "").strip() or None
+        ops[normalized_path] = rec
+
+    return _update_index_journal(workspace_path, repo_name, _mutate)
+
+
+def get_cached_file_hash(
+    file_path: str, 
+ repo_name: Optional[str] = None, + metadata_root: Optional[str] = None, +) -> str: """Get cached file hash for tracking changes.""" + root = metadata_root or _resolve_workspace_root() if is_multi_repo_mode() and repo_name: - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, root) cache_path = state_dir / CACHE_FILENAME cache = _read_cache_file_cached(cache_path) @@ -1594,19 +1970,23 @@ def get_cached_file_hash(file_path: str, repo_name: Optional[str] = None) -> str return str(val.get("hash") or "") return str(val or "") else: - cache = _read_cache_cached(_resolve_workspace_root()) + cache = _read_cache_cached(root) fp = _normalize_cache_key_path(file_path) val = cache.get("file_hashes", {}).get(fp, "") if isinstance(val, dict): return str(val.get("hash") or "") return str(val or "") - return "" - -def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str] = None) -> None: +def set_cached_file_hash( + file_path: str, + file_hash: str, + repo_name: Optional[str] = None, + metadata_root: Optional[str] = None, +) -> None: """Set cached file hash for tracking changes.""" fp = _normalize_cache_key_path(file_path) + root = metadata_root or _resolve_workspace_root() st_size: Optional[int] = None st_mtime: Optional[int] = None @@ -1620,14 +2000,15 @@ def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str if is_multi_repo_mode() and repo_name: try: - ws_root = Path(_resolve_workspace_root()) + ws_root = Path(root) if not (ws_root / repo_name).exists(): return except Exception: return - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, str(ws_root)) cache_path = state_dir / CACHE_FILENAME state_dir.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(state_dir) if cache_path.exists(): cache = _read_cache_file_cached(cache_path) @@ -1664,7 +2045,7 @@ def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str 
_memoize_cache_obj(cache_path, cache) return - cache = _read_cache_cached(_resolve_workspace_root()) + cache = _read_cache_cached(root) existing = cache.get("file_hashes", {}).get(fp) if isinstance(existing, dict) and st_size is not None and st_mtime is not None: if ( @@ -1688,14 +2069,14 @@ def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str pass cache.setdefault("file_hashes", {})[fp] = entry cache["updated_at"] = datetime.now().isoformat() - _write_cache(_resolve_workspace_root(), cache) - _memoize_cache_obj(_get_cache_path(_resolve_workspace_root()), cache) + _write_cache(root, cache) + _memoize_cache_obj(_get_cache_path(root), cache) def get_cached_file_meta(file_path: str, repo_name: Optional[str] = None) -> Dict[str, Any]: fp = _normalize_cache_key_path(file_path) if is_multi_repo_mode() and repo_name: - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, _resolve_workspace_root()) cache_path = state_dir / CACHE_FILENAME cache = _read_cache_file_cached(cache_path) @@ -1716,10 +2097,15 @@ def get_cached_file_meta(file_path: str, repo_name: Optional[str] = None) -> Dic return {} -def remove_cached_file(file_path: str, repo_name: Optional[str] = None) -> None: +def remove_cached_file( + file_path: str, + repo_name: Optional[str] = None, + metadata_root: Optional[str] = None, +) -> None: """Remove file entry from cache.""" + root = metadata_root or _resolve_workspace_root() if is_multi_repo_mode() and repo_name: - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, root) cache_path = state_dir / CACHE_FILENAME if cache_path.exists(): @@ -1735,13 +2121,13 @@ def remove_cached_file(file_path: str, repo_name: Optional[str] = None) -> None: _memoize_cache_obj(cache_path, cache) return - cache = _read_cache_cached(_resolve_workspace_root()) + cache = _read_cache_cached(root) fp = _normalize_cache_key_path(file_path) if fp in cache.get("file_hashes", {}): 
cache["file_hashes"].pop(fp, None) cache["updated_at"] = datetime.now().isoformat() - _write_cache(_resolve_workspace_root(), cache) - _memoize_cache_obj(_get_cache_path(_resolve_workspace_root()), cache) + _write_cache(root, cache) + _memoize_cache_obj(_get_cache_path(root), cache) def cleanup_old_cache_locks(max_idle_seconds: int = 900) -> int: @@ -1785,42 +2171,65 @@ def cleanup_old_cache_locks(max_idle_seconds: int = 900) -> int: def get_collection_mappings(search_root: Optional[str] = None) -> List[Dict[str, Any]]: - """Enumerate collection mappings with origin metadata.""" + """Enumerate collection mappings with origin metadata. + + `search_root` may point at either workspace root (`/work`) or codebase root + (`/work/.codebase`). + """ root_path = Path(search_root or _resolve_workspace_root()).resolve() + if root_path.name == STATE_DIRNAME: + workspace_root = root_path.parent + codebase_root = root_path + else: + workspace_root = root_path + codebase_root = root_path / STATE_DIRNAME mappings: List[Dict[str, Any]] = [] try: if is_multi_repo_mode(): - repos_root = root_path / STATE_DIRNAME / "repos" + seen_state_files: set[str] = set() + + def _append_repo_mapping(repo_name: str, state_path: Path) -> None: + if not state_path.exists(): + return + try: + state_key = str(state_path.resolve()) + except Exception: + state_key = str(state_path) + if state_key in seen_state_files: + return + seen_state_files.add(state_key) + + try: + with open(state_path, "r", encoding="utf-8-sig") as f: + state = json.load(f) or {} + except Exception as e: + print(f"[workspace_state] Failed to read repo state from {state_path}: {e}") + return + + origin = state.get("origin", {}) or {} + repo_workspace_dir = _get_repo_workspace_dir(repo_name, str(workspace_root)) + mappings.append( + { + "repo_name": repo_name, + "collection_name": state.get("qdrant_collection") + or get_collection_name(repo_name), + "container_path": origin.get("container_path") + or 
str(repo_workspace_dir.resolve()), + "source_path": origin.get("source_path"), + "state_file": str(state_path), + "updated_at": state.get("updated_at"), + } + ) + + # Shared metadata root (`/.codebase/repos//state.json`) + repos_root = codebase_root / "repos" if repos_root.exists(): for repo_dir in sorted(p for p in repos_root.iterdir() if p.is_dir()): - repo_name = repo_dir.name - state_path = repo_dir / STATE_FILENAME - if not state_path.exists(): - continue - try: - with open(state_path, "r", encoding="utf-8-sig") as f: - state = json.load(f) or {} - except Exception as e: - print(f"[workspace_state] Failed to read repo state from {state_path}: {e}") - continue - - origin = state.get("origin", {}) or {} - mappings.append( - { - "repo_name": repo_name, - "collection_name": state.get("qdrant_collection") - or get_collection_name(repo_name), - "container_path": origin.get("container_path") - or str((Path(_resolve_workspace_root()) / repo_name).resolve()), - "source_path": origin.get("source_path"), - "state_file": str(state_path), - "updated_at": state.get("updated_at"), - } - ) + _append_repo_mapping(repo_dir.name, repo_dir / STATE_FILENAME) else: - state_path = root_path / STATE_DIRNAME / STATE_FILENAME + state_path = codebase_root / STATE_FILENAME if state_path.exists(): try: with open(state_path, "r", encoding="utf-8-sig") as f: @@ -1829,14 +2238,14 @@ def get_collection_mappings(search_root: Optional[str] = None) -> List[Dict[str, state = {} origin = state.get("origin", {}) or {} - repo_name = origin.get("repo_name") or Path(root_path).name + repo_name = origin.get("repo_name") or Path(workspace_root).name mappings.append( { "repo_name": repo_name, "collection_name": state.get("qdrant_collection") or get_collection_name(repo_name), "container_path": origin.get("container_path") - or str(root_path), + or str(workspace_root), "source_path": origin.get("source_path"), "state_file": str(state_path), "updated_at": state.get("updated_at"), @@ -2121,6 +2530,8 @@ def 
set_cached_symbols(file_path: str, symbols: dict, file_hash: str) -> None: """Save symbol metadata for a file. Extends existing to include pseudo data.""" cache_path = _get_symbol_cache_path(file_path) cache_path.parent.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(cache_path.parent) + temp_path = cache_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") try: cache_data = { @@ -2130,18 +2541,16 @@ def set_cached_symbols(file_path: str, symbols: dict, file_hash: str) -> None: "symbols": symbols } - with open(cache_path, 'w', encoding='utf-8') as f: + with open(temp_path, 'w', encoding='utf-8') as f: json.dump(cache_data, f, indent=2) - - # Ensure symbol cache files are group-writable so both indexer and - # watcher processes (potentially different users sharing a group) - # can update them on shared volumes. - try: - os.chmod(cache_path, 0o664) - except PermissionError: - pass + temp_path.replace(cache_path) + _apply_runtime_metadata_mode(cache_path) except Exception as e: print(f"[SYMBOL_CACHE_WARNING] Failed to save symbol cache for {file_path}: {e}") + try: + temp_path.unlink(missing_ok=True) + except Exception: + pass def get_cached_pseudo(file_path: str, symbol_id: str) -> tuple[str, list[str]]: @@ -2241,7 +2650,7 @@ def clear_symbol_cache( target_dirs: List[Path] = [] if is_multi_repo_mode() and repo_name: - target_dirs.append(_get_repo_state_dir(repo_name) / "symbols") + target_dirs.append(_get_repo_state_dir(repo_name, workspace_path) / "symbols") else: try: cache_parent = _get_cache_path(workspace_root).parent diff --git a/tests/test_index_journal.py b/tests/test_index_journal.py new file mode 100644 index 00000000..ee8776e5 --- /dev/null +++ b/tests/test_index_journal.py @@ -0,0 +1,431 @@ +#!/usr/bin/env python3 +import importlib +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + + +pytestmark = pytest.mark.unit + + +@pytest.fixture +def ws_module(monkeypatch, tmp_path): + ws_root = tmp_path / "work" + 
ws_root.mkdir(parents=True, exist_ok=True) + monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + monkeypatch.delenv("MULTI_REPO_MODE", raising=False) + ws = importlib.import_module("scripts.workspace_state") + return importlib.reload(ws) + + +def test_index_journal_roundtrip(ws_module, tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "app.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + + ws_module.upsert_index_journal_entries( + [ + {"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}, + {"path": str(file_path.with_name("old.py")), "op_type": "delete"}, + ], + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + + pending = [ + str(e["path"]) + for e in ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ] + assert str(file_path.resolve()) in pending + assert str((file_path.with_name("old.py")).resolve()) in pending + + ws_module.update_index_journal_entry_status( + str(file_path), + status="done", + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + pending_after = [ + str(e["path"]) + for e in ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ] + assert str(file_path.resolve()) not in pending_after + assert str((file_path.with_name("old.py")).resolve()) in pending_after + + +def test_index_journal_entries_include_operation_types(ws_module, tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "entry.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + + ws_module.upsert_index_journal_entries( + [ + {"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}, + {"path": str(file_path.with_name("gone.py")), "op_type": "delete"}, + ], + 
workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + + entries = ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + by_path = {entry["path"]: entry for entry in entries} + assert by_path[str(file_path.resolve())]["op_type"] == "upsert" + assert by_path[str((file_path.with_name("gone.py")).resolve())]["op_type"] == "delete" + + +def test_index_journal_aggregates_repo_scoped_entries(ws_module, tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "x.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + + ws_module.upsert_index_journal_entries( + [{"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}], + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + + pending = [ + str(e["path"]) + for e in ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / "work") + ) + ] + assert str(file_path.resolve()) in pending + + +@pytest.mark.parametrize("repo_name", ["repo-1234567890abcdef", "frontend"]) +def test_index_journal_aggregates_repo_scoped_entries_in_multi_repo_mode( + monkeypatch, tmp_path, repo_name +): + ws_root = tmp_path / "work" + ws_root.mkdir(parents=True, exist_ok=True) + monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + monkeypatch.setenv("MULTI_REPO_MODE", "1") + ws_module = importlib.import_module("scripts.workspace_state") + ws_module = importlib.reload(ws_module) + + file_name = "app.ts" if repo_name == "frontend" else "multi.py" + file_path = ws_root / repo_name / "src" / file_name + file_path.parent.mkdir(parents=True, exist_ok=True) + + ws_module.upsert_index_journal_entries( + [{"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}], + workspace_path=str(ws_root / repo_name), + repo_name=repo_name, + ) + + pending = [ + str(e["path"]) + for e in 
ws_module.list_pending_index_journal_entries(workspace_path=str(ws_root)) + ] + assert str(file_path.resolve()) in pending + + +def test_index_journal_file_is_group_writable(ws_module, tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "perm.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + + ws_module.upsert_index_journal_entries( + [{"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}], + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + + journal_path = ws_module._get_index_journal_path( + str(tmp_path / "work" / repo_name), repo_name + ) + assert journal_path.exists() + assert oct(journal_path.stat().st_mode & 0o777) == "0o666" + + +def test_index_journal_failed_entry_respects_retry_delay(ws_module, monkeypatch, tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "retry.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + monkeypatch.setenv("INDEX_JOURNAL_RETRY_DELAY_SECS", "60") + + ws_module.upsert_index_journal_entries( + [{"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}], + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ws_module.update_index_journal_entry_status( + str(file_path), + status="failed", + error="boom", + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + remove_on_done=False, + ) + + pending = [ + str(e["path"]) + for e in ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ] + assert str(file_path.resolve()) not in pending + + +def test_index_journal_failed_entry_honors_max_attempts(ws_module, monkeypatch, tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "retry2.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + 
monkeypatch.setenv("INDEX_JOURNAL_RETRY_DELAY_SECS", "0") + monkeypatch.setenv("INDEX_JOURNAL_MAX_ATTEMPTS", "1") + + ws_module.upsert_index_journal_entries( + [{"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}], + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ws_module.update_index_journal_entry_status( + str(file_path), + status="failed", + error="boom", + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + remove_on_done=False, + ) + + pending = [ + str(e["path"]) + for e in ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ] + assert str(file_path.resolve()) not in pending + + +def test_processor_delete_marks_journal_done(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + missing = tmp_path / "missing.py" + assert not missing.exists() + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "get_workspace_state", lambda *a, **k: {}) + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: False) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "remove_cached_file", lambda *a, **k: None) + + delete_mock = MagicMock() + graph_delete_mock = MagicMock() + journal_mock = MagicMock() + monkeypatch.setattr(proc_mod.idx, "delete_points_by_path", delete_mock) + monkeypatch.setattr(proc_mod.idx, "delete_graph_edges_by_path", graph_delete_mock) + 
monkeypatch.setattr(proc_mod, "_verify_delete_committed", lambda *a, **k: True) + monkeypatch.setattr(proc_mod, "update_index_journal_entry_status", journal_mock) + + proc_mod._process_paths( + [missing], + client=MagicMock(), + model=None, + vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + delete_mock.assert_called_once() + graph_delete_mock.assert_called_once() + journal_mock.assert_called_once() + assert journal_mock.call_args.kwargs["status"] == "done" + + +def test_processor_honors_delete_journal_for_existing_file(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + existing = tmp_path / "present.py" + existing.write_text("print('x')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "get_workspace_state", lambda *a, **k: {}) + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: False) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "remove_cached_file", lambda *a, **k: None) + monkeypatch.setattr( + proc_mod, + "list_pending_index_journal_entries", + lambda *a, **k: [{"path": str(existing.resolve()), "op_type": "delete"}], + ) + + delete_mock = MagicMock() + graph_delete_mock = MagicMock() + journal_mock = MagicMock() + monkeypatch.setattr(proc_mod.idx, "delete_points_by_path", delete_mock) + monkeypatch.setattr(proc_mod.idx, "delete_graph_edges_by_path", graph_delete_mock) + monkeypatch.setattr(proc_mod, "_verify_delete_committed", lambda *a, **k: 
True) + monkeypatch.setattr(proc_mod, "update_index_journal_entry_status", journal_mock) + + proc_mod._process_paths( + [existing], + client=MagicMock(), + model=None, + vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + delete_mock.assert_called_once() + graph_delete_mock.assert_called_once() + journal_mock.assert_called_once() + assert journal_mock.call_args.kwargs["status"] == "done" + + +def test_processor_relinks_move_journal_before_delete(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + src = tmp_path / "src.py" + dest = tmp_path / "dest.py" + dest.write_text("print('dest')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "get_workspace_state", lambda *a, **k: {}) + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: False) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "remove_cached_file", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "set_cached_file_hash", lambda *a, **k: None) + monkeypatch.setattr( + proc_mod, + "list_pending_index_journal_entries", + lambda *a, **k: [ + {"path": str(src.resolve()), "op_type": "delete", "content_hash": "cafebabe"}, + {"path": str(dest.resolve()), "op_type": "upsert", "content_hash": "cafebabe"}, + ], + ) + + rename_mock = MagicMock(return_value=(3, "cafebabe")) + delete_mock = MagicMock() + journal_mock = MagicMock() + monkeypatch.setattr(proc_mod, "_rename_in_store", rename_mock) + 
monkeypatch.setattr(proc_mod.idx, "delete_points_by_path", delete_mock) + monkeypatch.setattr(proc_mod, "update_index_journal_entry_status", journal_mock) + + proc_mod._process_paths( + [src, dest], + client=MagicMock(), + model=MagicMock(), + vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + rename_mock.assert_called_once() + delete_mock.assert_not_called() + done_paths = [call.args[0] for call in journal_mock.call_args_list if call.kwargs.get("status") == "done"] + assert str(dest.resolve()) in done_paths + assert str(src.resolve()) in done_paths + + +def test_processor_skips_internal_git_path_without_collection_resolution(monkeypatch): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + internal = Path("/work/.git/HEAD") + + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "get_workspace_state", lambda *a, **k: {}) + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: False) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + + collection_mock = MagicMock(return_value="should-not-be-used") + journal_mock = MagicMock() + monkeypatch.setattr(proc_mod, "_get_collection_for_file", collection_mock) + monkeypatch.setattr(proc_mod, "update_index_journal_entry_status", journal_mock) + monkeypatch.setattr( + proc_mod, + "list_pending_index_journal_entries", + lambda *a, **k: [{"path": str(internal), "op_type": "delete"}], + ) + + proc_mod._process_paths( + [internal], + client=MagicMock(), + model=None, + vector_name="vec", + model_dim=1, + workspace_path="/work", + ) + + collection_mock.assert_not_called() + journal_mock.assert_called_once() + assert journal_mock.call_args.kwargs["status"] == 
"done" + + +def test_processor_force_upsert_empty_file_marks_done(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + empty_file = tmp_path / "pkg" / "__init__.py" + empty_file.parent.mkdir(parents=True, exist_ok=True) + empty_file.write_text("", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "get_workspace_state", lambda *a, **k: {}) + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: False) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "remove_cached_file", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_run_indexing_strategy", lambda *a, **k: False) + monkeypatch.setattr(proc_mod, "_path_has_indexed_points", lambda *a, **k: False) + + journal_mock = MagicMock() + monkeypatch.setattr(proc_mod, "update_index_journal_entry_status", journal_mock) + monkeypatch.setattr( + proc_mod, + "list_pending_index_journal_entries", + lambda *a, **k: [ + { + "path": str(empty_file.resolve()), + "op_type": "upsert", + "content_hash": "da39a3ee5e6b4b0d3255bfef95601890afd80709", + } + ], + ) + + proc_mod._process_paths( + [empty_file], + client=MagicMock(), + model=MagicMock(), + vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + journal_mock.assert_called_once() + assert journal_mock.call_args.kwargs["status"] == "done" diff --git a/tests/test_upload_service_path_traversal.py b/tests/test_upload_service_path_traversal.py index 224fb926..796de7c1 100644 --- 
a/tests/test_upload_service_path_traversal.py +++ b/tests/test_upload_service_path_traversal.py @@ -2,7 +2,6 @@ import json import os import tarfile -from datetime import datetime, timedelta, timezone from pathlib import Path import pytest @@ -437,33 +436,15 @@ def test_process_delta_bundle_moved_prunes_empty_source_parent_dirs(tmp_path, mo assert (work_dir / slug).exists() -def test_process_delta_bundle_sweeps_stranded_empty_dirs_without_file_ops(tmp_path, monkeypatch): +def test_process_delta_bundle_does_not_sweep_stranded_empty_dirs_without_file_ops(tmp_path, monkeypatch): import scripts.upload_delta_bundle as us work_dir = tmp_path / "work" work_dir.mkdir(parents=True, exist_ok=True) monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) - monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", "1") - monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", "604800") - slug = "repo-0123456789abcdef" stranded = work_dir / slug / "dev-workspace" / "nested" / "empty" stranded.mkdir(parents=True, exist_ok=True) - state_store = {} - - monkeypatch.setattr( - us, - "get_workspace_state", - lambda workspace_path=None, repo_name=None: state_store.get(repo_name, {}), - ) - - def _fake_update_workspace_state(workspace_path=None, updates=None, repo_name=None): - state = dict(state_store.get(repo_name, {})) - state.update(updates or {}) - state_store[repo_name] = state - return state - - monkeypatch.setattr(us, "update_workspace_state", _fake_update_workspace_state) bundle = _write_bundle(tmp_path, []) @@ -482,10 +463,9 @@ def _fake_update_workspace_state(workspace_path=None, updates=None, repo_name=No "skipped_hash_match": 0, "failed": 0, } - assert not stranded.exists() - assert not (work_dir / slug / "dev-workspace").exists() + assert stranded.exists() + assert (work_dir / slug / "dev-workspace").exists() assert (work_dir / slug).exists() - assert state_store[slug]["maintenance"]["last_empty_dir_sweep_at"] def 
test_process_delta_bundle_skips_broad_empty_dir_sweep_when_disabled(tmp_path, monkeypatch): @@ -517,31 +497,9 @@ def test_process_delta_bundle_skips_broad_empty_dir_sweep_when_recent(tmp_path, work_dir = tmp_path / "work" work_dir.mkdir(parents=True, exist_ok=True) monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) - monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", "1") - monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", "604800") - slug = "repo-0123456789abcdef" stranded = work_dir / slug / "dev-workspace" / "nested" / "empty" stranded.mkdir(parents=True, exist_ok=True) - recent = datetime.now(timezone.utc) - timedelta(hours=1) - state_store = { - slug: { - "maintenance": { - "last_empty_dir_sweep_at": recent.isoformat(), - } - } - } - - monkeypatch.setattr( - us, - "get_workspace_state", - lambda workspace_path=None, repo_name=None: state_store.get(repo_name, {}), - ) - monkeypatch.setattr( - us, - "update_workspace_state", - lambda workspace_path=None, updates=None, repo_name=None: state_store.get(repo_name, {}), - ) bundle = _write_bundle(tmp_path, []) diff --git a/tests/test_watch_consistency.py b/tests/test_watch_consistency.py new file mode 100644 index 00000000..bf51c79e --- /dev/null +++ b/tests/test_watch_consistency.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +import importlib +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + + +pytestmark = pytest.mark.unit + + +@pytest.fixture +def capture_list_workspaces(): + captured = {} + + def fake_list_workspaces(search_root=None, use_qdrant_fallback=True): + captured["search_root"] = search_root + captured["use_qdrant_fallback"] = use_qdrant_fallback + return [] + + return captured, fake_list_workspaces + + +def test_run_consistency_audit_scans_from_watcher_root( + monkeypatch, tmp_path, capture_list_workspaces +): + mod = importlib.import_module("scripts.watch_index_core.consistency") + captured, fake_list_workspaces = capture_list_workspaces + + 
monkeypatch.setattr(mod, "list_workspaces", fake_list_workspaces) + monkeypatch.setattr(mod, "_consistency_audit_enabled", lambda: True) + + mod.run_consistency_audit(MagicMock(), tmp_path) + + assert "search_root" in captured and "use_qdrant_fallback" in captured + assert Path(captured["search_root"]).resolve() == Path(tmp_path).resolve() + assert captured["use_qdrant_fallback"] is False + + +def test_run_empty_dir_sweep_maintenance_scans_from_watcher_root( + monkeypatch, tmp_path, capture_list_workspaces +): + mod = importlib.import_module("scripts.watch_index_core.consistency") + captured, fake_list_workspaces = capture_list_workspaces + + monkeypatch.setattr(mod, "list_workspaces", fake_list_workspaces) + monkeypatch.setattr(mod, "_empty_dir_sweep_enabled", lambda: True) + + mod.run_empty_dir_sweep_maintenance(tmp_path) + + assert "search_root" in captured + assert Path(captured["search_root"]).resolve() == Path(tmp_path).resolve() + assert captured.get("use_qdrant_fallback") is False + + +def test_consistency_audit_skips_repairs_when_scan_is_truncated(monkeypatch, tmp_path): + mod = importlib.import_module("scripts.watch_index_core.consistency") + + workspace_root = tmp_path / "repo" + workspace_root.mkdir(parents=True, exist_ok=True) + + monkeypatch.setattr( + mod, + "list_workspaces", + lambda *a, **k: [{"workspace_path": str(workspace_root)}], + ) + monkeypatch.setattr(mod, "_consistency_audit_enabled", lambda: True) + monkeypatch.setattr(mod, "_should_run_consistency_audit", lambda *a, **k: True) + monkeypatch.setattr( + mod, + "get_collection_state_snapshot", + lambda *a, **k: {"active_collection": "coll"}, + ) + monkeypatch.setattr(mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(mod, "_load_cached_hashes", lambda *a, **k: {}) + monkeypatch.setattr( + mod, + "_scan_indexable_fs_paths", + lambda *a, **k: ({str(workspace_root / "a.py")}, True), + ) + monkeypatch.setattr( + mod, + "_load_indexed_paths_for_collection", + lambda 
*a, **k: ({str(workspace_root / "ghost.py")}, False), + ) + monkeypatch.setattr(mod.idx, "_Excluder", lambda *_: MagicMock()) + + enqueue_mock = MagicMock(return_value=(0, 0)) + record_mock = MagicMock() + monkeypatch.setattr(mod, "_enqueue_consistency_repairs", enqueue_mock) + monkeypatch.setattr(mod, "_record_consistency_audit", record_mock) + + mod.run_consistency_audit(MagicMock(), tmp_path) + + enqueue_mock.assert_not_called() + record_mock.assert_called_once() + summary = record_mock.call_args.args[2] + assert summary["fs_scan_truncated"] is True + assert summary["qdrant_scan_truncated"] is False + assert summary["repair_skipped_due_to_truncation"] is True + assert summary["stale_in_qdrant_count"] == 0 + assert summary["missing_in_qdrant_count"] == 0 diff --git a/tests/test_watch_index_cache.py b/tests/test_watch_index_cache.py index 16a3c309..679ac0b4 100644 --- a/tests/test_watch_index_cache.py +++ b/tests/test_watch_index_cache.py @@ -214,6 +214,43 @@ def test_run_indexing_strategy_skips_ensure_for_cached_hash_match(monkeypatch, t ensure_mock.assert_not_called() +def test_run_indexing_strategy_skips_smart_path_for_markdown(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + path = tmp_path / "notes.md" + path.write_text("# notes\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod.idx, "ensure_collection_and_indexes_once", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_read_text_and_sha1", lambda _p: ("# notes\n", "abc123")) + monkeypatch.setattr(proc_mod, "get_cached_file_hash", lambda *a, **k: None) + monkeypatch.setattr(proc_mod.idx, "detect_language", lambda _p: "markdown") + + smart_check = MagicMock(side_effect=AssertionError("smart path must be skipped")) + monkeypatch.setattr(proc_mod.idx, "should_use_smart_reindexing", smart_check) + + captured = {} + + def fake_index_single_file(*args, **kwargs): + captured.update(kwargs) + return True + + monkeypatch.setattr(proc_mod.idx, 
"index_single_file", fake_index_single_file) + + ok = proc_mod._run_indexing_strategy( + path, + client=MagicMock(), + model=MagicMock(), + collection="coll", + vector_name="vec", + model_dim=1, + repo_name="repo", + ) + + assert ok is True + smart_check.assert_not_called() + assert captured["preloaded_language"] == "markdown" + + def test_staging_requires_subprocess_only_for_active_dual_root_state(monkeypatch): proc_mod = importlib.import_module("scripts.watch_index_core.processor") monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: True) @@ -292,8 +329,12 @@ def test_process_paths_does_not_force_subprocess_for_non_active_staging( ) assert staging_mock.call_args is not None - assert staging_mock.call_args.kwargs == {} - assert staging_mock.call_args.args[5] is None + assert staging_mock.call_args.kwargs == { + "force_upsert": False, + "journal_content_hash": "", + } + assert staging_mock.call_args.args[0] == path + assert staging_mock.call_args.args[6] is None def test_process_paths_uses_subprocess_when_staging_is_actually_active( @@ -336,5 +377,167 @@ def test_process_paths_uses_subprocess_when_staging_is_actually_active( ) assert staging_mock.call_args is not None - assert staging_mock.call_args.kwargs == {} - assert staging_mock.call_args.args[5] == {"FOO": "bar"} + assert staging_mock.call_args.kwargs == { + "force_upsert": False, + "journal_content_hash": "", + } + assert staging_mock.call_args.args[0] == path + assert staging_mock.call_args.args[6] == {"FOO": "bar"} + + +def test_staging_force_upsert_hash_match_verifies_before_skip(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + path = tmp_path / "file.py" + path.write_text("print('x')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_read_text_and_sha1", lambda _p: ("print('x')\n", "abc123")) + monkeypatch.setattr(proc_mod, "get_cached_file_hash", lambda *a, **k: "abc123") + monkeypatch.setattr(proc_mod, "_verify_upsert_committed", 
lambda *a, **k: True) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + + mark_done = MagicMock() + monkeypatch.setattr(proc_mod, "_mark_journal_done", mark_done) + advance = MagicMock() + monkeypatch.setattr(proc_mod, "_advance_progress", advance) + + handled = proc_mod._maybe_handle_staging_file( + path, + MagicMock(), + "coll", + "repo", + str(tmp_path), + [path], + {"FOO": "bar"}, + {str(tmp_path): 0}, + "started", + force_upsert=True, + journal_content_hash="abc123", + ) + + assert handled is True + mark_done.assert_called_once_with(path, str(tmp_path), "repo") + advance.assert_called_once() + + +def test_runtime_root_override_updates_internal_path_checks(monkeypatch, tmp_path): + import scripts.watch_index as watch_index + from scripts.watch_index_core import config as watch_config + import scripts.watch_index_core.processor as proc_mod + import scripts.embedder as embedder_mod + + runtime_root = tmp_path / "runtime-root" + runtime_root.mkdir(parents=True, exist_ok=True) + internal = runtime_root / ".git" / "HEAD" + internal.parent.mkdir(parents=True, exist_ok=True) + internal.write_text("ref: refs/heads/main\n", encoding="utf-8") + + original_root = watch_config.ROOT + original_watch_root = watch_index.ROOT + monkeypatch.setenv("WATCH_ROOT", str(runtime_root)) + monkeypatch.setattr(watch_index, "initialize_watcher_state", lambda root: {"repo_name": None}) + monkeypatch.setattr(watch_index, "get_indexing_config_snapshot", lambda repo_name=None: {}) + monkeypatch.setattr(watch_index, "compute_indexing_config_hash", lambda snapshot: "hash") + monkeypatch.setattr(watch_index, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(watch_index, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(embedder_mod, "get_embedding_model", lambda *_: MagicMock()) + monkeypatch.setattr(embedder_mod, "get_model_dimension", lambda *_: 1) + monkeypatch.setattr(watch_index, "resolve_vector_name_config", lambda *a, **k: 
"vec") + monkeypatch.setattr(watch_index, "_start_pseudo_backfill_worker", lambda *a, **k: None) + monkeypatch.setattr(watch_index, "create_observer", lambda *a, **k: MagicMock()) + monkeypatch.setattr(watch_index, "IndexHandler", MagicMock()) + monkeypatch.setattr(watch_index, "ChangeQueue", MagicMock()) + monkeypatch.setattr( + watch_index, + "QdrantClient", + MagicMock(return_value=MagicMock(get_collection=MagicMock())), + ) + monkeypatch.setattr(watch_index, "run_consistency_audit", lambda *a, **k: None) + monkeypatch.setattr(watch_index, "run_empty_dir_sweep_maintenance", lambda *a, **k: None) + monkeypatch.setattr(watch_index, "list_pending_index_journal_entries", lambda *a, **k: []) + monkeypatch.setattr(watch_index, "get_boolean_env", lambda *a, **k: False) + monkeypatch.setattr(watch_index.time, "sleep", lambda *_: (_ for _ in ()).throw(KeyboardInterrupt())) + + try: + watch_index.main() + except KeyboardInterrupt: + pass + + try: + assert watch_config.ROOT == runtime_root.resolve() + assert proc_mod._is_internal_ignored_path(internal) is True + finally: + watch_config.ROOT = original_root + watch_index.ROOT = original_watch_root + + +def test_main_throttles_periodic_maintenance(monkeypatch, tmp_path): + import scripts.watch_index as watch_index + from scripts.watch_index_core import config as watch_config + import scripts.embedder as embedder_mod + + runtime_root = tmp_path / "runtime-root" + runtime_root.mkdir(parents=True, exist_ok=True) + + original_root = watch_config.ROOT + original_watch_root = watch_index.ROOT + monkeypatch.setenv("WATCH_ROOT", str(runtime_root)) + monkeypatch.setenv("WATCH_MAINTENANCE_INTERVAL_SECS", "300") + monkeypatch.setattr(watch_index, "initialize_watcher_state", lambda *a, **k: {"repo_name": None}) + monkeypatch.setattr(watch_index, "get_indexing_config_snapshot", lambda repo_name=None: {}) + monkeypatch.setattr(watch_index, "compute_indexing_config_hash", lambda snapshot: "hash") + monkeypatch.setattr(watch_index, 
"persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(watch_index, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(embedder_mod, "get_embedding_model", lambda *_: MagicMock()) + monkeypatch.setattr(embedder_mod, "get_model_dimension", lambda *_: 1) + monkeypatch.setattr(watch_index, "resolve_vector_name_config", lambda *a, **k: "vec") + monkeypatch.setattr(watch_index, "_start_pseudo_backfill_worker", lambda *a, **k: None) + + class FakeObserver: + def schedule(self, *a, **k): + return None + + def start(self): + return None + + def stop(self): + return None + + def join(self): + return None + + monkeypatch.setattr(watch_index, "create_observer", lambda *a, **k: FakeObserver()) + monkeypatch.setattr(watch_index, "IndexHandler", MagicMock()) + monkeypatch.setattr(watch_index, "ChangeQueue", MagicMock()) + monkeypatch.setattr( + watch_index, + "QdrantClient", + MagicMock(return_value=MagicMock(get_collection=MagicMock())), + ) + monkeypatch.setattr(watch_index, "get_boolean_env", lambda *a, **k: False) + + drain_mock = MagicMock() + maintenance_mock = MagicMock() + monkeypatch.setattr(watch_index, "_drain_pending_journal", drain_mock) + monkeypatch.setattr(watch_index, "_run_periodic_maintenance", maintenance_mock) + + time_values = iter([0.0, 1.0, 2.0, 301.0]) + monkeypatch.setattr(watch_index.time, "time", lambda: next(time_values)) + + sleep_calls = {"count": 0} + + def _sleep(_secs): + sleep_calls["count"] += 1 + if sleep_calls["count"] >= 4: + raise KeyboardInterrupt() + + monkeypatch.setattr(watch_index.time, "sleep", _sleep) + + try: + watch_index.main() + finally: + watch_config.ROOT = original_root + watch_index.ROOT = original_watch_root + + assert drain_mock.call_count == 4 + assert maintenance_mock.call_count == 2 diff --git a/tests/test_watch_queue.py b/tests/test_watch_queue.py index 259124a5..72bd6fd1 100644 --- a/tests/test_watch_queue.py +++ b/tests/test_watch_queue.py @@ -37,3 +37,53 @@ def 
test_change_queue_reprocesses_when_fingerprint_changes(monkeypatch, tmp_path q._flush() assert processed == [[p], [p]] + + +def test_change_queue_force_bypasses_recent_fingerprint_suppression(monkeypatch, tmp_path): + from scripts.watch_index_core import queue as queue_mod + + monkeypatch.setattr(queue_mod, "RECENT_FINGERPRINT_TTL_SECS", 10.0) + + processed = [] + q = queue_mod.ChangeQueue(lambda paths: processed.append(list(paths))) + + p = tmp_path / "file.py" + p.write_text("print('x')\n", encoding="utf-8") + + q.add(p) + q._flush() + q.add(p, force=True) + q._flush() + + assert processed == [[p], [p]] + + +def test_change_queue_repeated_same_path_does_not_rearm_timer(monkeypatch, tmp_path): + from scripts.watch_index_core import queue as queue_mod + + class FakeTimer: + created = 0 + canceled = 0 + + def __init__(self, _delay, _cb): + FakeTimer.created += 1 + self.daemon = False + + def start(self): + return None + + def cancel(self): + FakeTimer.canceled += 1 + + monkeypatch.setattr(queue_mod.threading, "Timer", FakeTimer) + + q = queue_mod.ChangeQueue(lambda _paths: None) + p = tmp_path / "file.py" + p.write_text("print('x')\n", encoding="utf-8") + + q.add(p, force=True) + q.add(p, force=True) + q.add(p, force=True) + + assert FakeTimer.created == 1 + assert FakeTimer.canceled == 0 diff --git a/tests/test_watcher_collection_resolution.py b/tests/test_watcher_collection_resolution.py index fa3d0c1a..2fff6c45 100644 --- a/tests/test_watcher_collection_resolution.py +++ b/tests/test_watcher_collection_resolution.py @@ -17,6 +17,9 @@ def test_main_resolves_collection_from_state(monkeypatch, tmp_path): wi = importlib.import_module("scripts.watch_index") # Reload to re-read env defaults (COLLECTION) in module globals wi = importlib.reload(wi) + watch_config = importlib.import_module("scripts.watch_index_core.config") + original_root = watch_config.ROOT + original_watch_root = wi.ROOT # Fake QdrantClient: force get_collection to raise so code chooses sanitized 
vector name path class FakeQdrant: @@ -70,10 +73,14 @@ def _raise_kb(_): assert wi.COLLECTION == os.environ.get("COLLECTION_NAME") == "my-collection" # Run main(); in single-repo mode it should keep the env-provided COLLECTION_NAME - wi.main() + try: + wi.main() - # Postcondition: global COLLECTION remains the env-provided name - assert wi.COLLECTION == "my-collection" + # Postcondition: global COLLECTION remains the env-provided name + assert wi.COLLECTION == "my-collection" + finally: + watch_config.ROOT = original_root + wi.ROOT = original_watch_root def test_multi_repo_ignores_placeholder_collection_in_state(monkeypatch, tmp_path): @@ -85,7 +92,8 @@ def test_multi_repo_ignores_placeholder_collection_in_state(monkeypatch, tmp_pat utils = importlib.import_module("scripts.watch_index_core.utils") utils = importlib.reload(utils) - monkeypatch.setattr(utils, "ROOT", tmp_path, raising=False) + watch_config = importlib.import_module("scripts.watch_index_core.config") + monkeypatch.setattr(watch_config, "ROOT", tmp_path, raising=True) monkeypatch.setattr(utils, "is_multi_repo_mode", lambda: True, raising=True) repo_slug = "Pirate Survivors-2b23a7e45f2c4b9f" @@ -111,4 +119,3 @@ def _fake_get_workspace_state(ws_path: str, repo_name: str | None = None): resolved = utils._get_collection_for_file(target) assert resolved == f"derived-{repo_slug}" - diff --git a/tests/test_workspace_state.py b/tests/test_workspace_state.py index e5cce306..8b8fa442 100644 --- a/tests/test_workspace_state.py +++ b/tests/test_workspace_state.py @@ -460,3 +460,134 @@ def test_compare_symbol_changes_tolerates_line_shift_for_unchanged_content(self, assert unchanged == ["function_foo_12"] assert changed == [] + + +class TestSymbolCachePaths: + def test_symbol_cache_uses_shared_repo_state_dir_in_multi_repo_mode(self, monkeypatch, tmp_path): + ws_root = tmp_path / "work" + repo_name = "repo-1234567890abcdef" + repo_root = ws_root / repo_name + repo_root.mkdir(parents=True, exist_ok=True) + + 
monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + monkeypatch.setenv("MULTI_REPO_MODE", "1") + + import importlib + + ws_module = importlib.import_module("scripts.workspace_state") + ws_module = importlib.reload(ws_module) + + file_path = repo_root / "src" / "app.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + file_path.write_text("print('x')\n", encoding="utf-8") + + expected_hash = ws_module.hashlib.md5( + str(file_path.resolve()).encode("utf-8") + ).hexdigest()[:8] + cache_path = ws_module._get_symbol_cache_path(str(file_path)) + + assert cache_path == ( + ws_root + / ".codebase" + / "repos" + / repo_name + / "symbols" + / f"{expected_hash}.json" + ) + + def test_symbol_cache_write_uses_cross_user_writable_mode(self, monkeypatch, tmp_path): + ws_root = tmp_path / "work" + repo_name = "repo-1234567890abcdef" + repo_root = ws_root / repo_name + repo_root.mkdir(parents=True, exist_ok=True) + + monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + monkeypatch.setenv("MULTI_REPO_MODE", "1") + + import importlib + + ws_module = importlib.import_module("scripts.workspace_state") + ws_module = importlib.reload(ws_module) + + file_path = repo_root / "src" / "cacheme.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + file_path.write_text("print('x')\n", encoding="utf-8") + + ws_module.set_cached_symbols(str(file_path), {"sym": {"name": "sym"}}, "abc123") + cache_path = ws_module._get_symbol_cache_path(str(file_path)) + + assert cache_path.exists() + assert oct(cache_path.parent.stat().st_mode & 0o777) == "0o777" + assert oct(cache_path.stat().st_mode & 0o777) == "0o666" + + +class TestCollectionMappings: + def test_get_collection_mappings_accepts_codebase_root_search_path(self, monkeypatch, tmp_path): + ws_root = tmp_path / "work" + ws_root.mkdir(parents=True, exist_ok=True) + slug = "repo-1234567890abcdef" + global_state_dir = ws_root / ".codebase" / 
"repos" / slug + global_state_dir.mkdir(parents=True, exist_ok=True) + global_state_path = global_state_dir / "state.json" + global_state_path.write_text( + json.dumps( + { + "qdrant_collection": "repo-123456-abcdef", + "updated_at": "2026-03-08T00:00:00", + } + ), + encoding="utf-8", + ) + + monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + monkeypatch.setenv("MULTI_REPO_MODE", "1") + + import importlib + + ws_module = importlib.import_module("scripts.workspace_state") + ws_module = importlib.reload(ws_module) + + mappings = ws_module.get_collection_mappings(search_root=str(ws_root / ".codebase")) + slug_entries = [m for m in mappings if str(m.get("repo_name")) == slug] + + assert slug_entries, "expected global repo mapping to be discovered from codebase root" + entry = slug_entries[0] + assert entry["collection_name"] == "repo-123456-abcdef" + assert Path(entry["state_file"]).resolve() == global_state_path.resolve() + + def test_get_collection_mappings_keeps_global_repo_state_behavior(self, monkeypatch, tmp_path): + ws_root = tmp_path / "work" + ws_root.mkdir(parents=True, exist_ok=True) + repo_name = "frontend" + global_state_dir = ws_root / ".codebase" / "repos" / repo_name + global_state_dir.mkdir(parents=True, exist_ok=True) + global_state_path = global_state_dir / "state.json" + global_state_path.write_text( + json.dumps( + { + "qdrant_collection": "frontend-abcdef", + "updated_at": "2026-03-08T00:00:00", + } + ), + encoding="utf-8", + ) + + monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + monkeypatch.setenv("MULTI_REPO_MODE", "1") + + import importlib + + ws_module = importlib.import_module("scripts.workspace_state") + ws_module = importlib.reload(ws_module) + + mappings = ws_module.get_collection_mappings(search_root=str(ws_root)) + repo_entries = [m for m in mappings if str(m.get("repo_name")) == repo_name] + + assert repo_entries, "expected global 
repo mapping to be discovered" + entry = repo_entries[0] + assert entry["collection_name"] == "frontend-abcdef" + assert Path(entry["state_file"]).resolve() == global_state_path.resolve() From c6fcf507634995f7c305cf1c9eb5a99118bde85d Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Mar 2026 17:35:41 +0000 Subject: [PATCH 29/39] fix(core): improve pagination, upload reliability, and watch consistency - Add composite cursor encoding/decoding for MCP resource pagination - Fix upload client hash format consistency and async failure handling - Re-arm debounce timer after processing to avoid missed changes - Flush replica cache hashes to disk after journal replay - Exclude .remote-git manifests from indexing as control files - Handle internal/external boundary moves correctly in watch handler - Add TTL-based cache for missing graph collections (5 min expiry) - Over-fetch in dense/embedding search when path scope is active - Fix symbol change tracking to properly consume from all tracking dicts - Use absolute Node runtime path in VSCode extension for reliability - Verify payload indexes exist before marking collection as ensured - Mark git history journal as done/failed after worker completion --- ctx-mcp-bridge/src/mcpServer.js | 184 ++++++++++++++---- scripts/hybrid/expand.py | 3 +- scripts/hybrid_search.py | 13 +- scripts/ingest/qdrant.py | 8 + scripts/mcp_impl/search.py | 4 + scripts/mcp_impl/symbol_graph.py | 37 +++- scripts/mcp_indexer_server.py | 1 + scripts/remote_upload_client.py | 59 +++++- scripts/standalone_upload_client.py | 58 +++++- scripts/upload_delta_bundle.py | 37 ++++ scripts/watch_index_core/consistency.py | 21 +- scripts/watch_index_core/handler.py | 39 +++- scripts/watch_index_core/processor.py | 31 ++- scripts/workspace_state.py | 39 +++- .../context-engine-uploader/mcp_bridge.js | 9 +- 15 files changed, 456 insertions(+), 87 deletions(-) diff --git a/ctx-mcp-bridge/src/mcpServer.js b/ctx-mcp-bridge/src/mcpServer.js index 8f309331..e5247c5e 100644 
--- a/ctx-mcp-bridge/src/mcpServer.js +++ b/ctx-mcp-bridge/src/mcpServer.js @@ -115,39 +115,81 @@ async function listMemoryTools(client) { } } -async function listResourcesSafe(client, label) { +function encodeCompositeCursor(cursorObj) { + try { + const payload = JSON.stringify(cursorObj || {}); + return Buffer.from(payload, "utf8").toString("base64"); + } catch { + return ""; + } +} + +function decodeCompositeCursor(raw) { + try { + const trimmed = (raw || "").trim(); + if (!trimmed) { + return null; + } + const decoded = Buffer.from(trimmed, "base64").toString("utf8"); + const parsed = JSON.parse(decoded); + if (!parsed || typeof parsed !== "object") { + return null; + } + return parsed; + } catch { + return null; + } +} + +async function listResourcesSafe(client, label, cursor) { if (!client) { - return []; + return { resources: [], nextCursor: null }; } try { const timeoutMs = getBridgeListTimeoutMs(); + const params = cursor ? { cursor } : {}; const remote = await withTimeout( - client.listResources(), + client.listResources(params), timeoutMs, `${label} resources/list`, ); - return Array.isArray(remote?.resources) ? remote.resources.slice() : []; + return { + resources: Array.isArray(remote?.resources) ? remote.resources.slice() : [], + nextCursor: + remote && typeof remote.nextCursor === "string" && remote.nextCursor + ? remote.nextCursor + : null, + }; } catch (err) { debugLog(`[ctxce] Error calling ${label} resources/list: ` + String(err)); - return []; + return { resources: [], nextCursor: null }; } } -async function listResourceTemplatesSafe(client, label) { +async function listResourceTemplatesSafe(client, label, cursor) { if (!client) { - return []; + return { resourceTemplates: [], nextCursor: null }; } try { const timeoutMs = getBridgeListTimeoutMs(); + const params = cursor ? 
{ cursor } : {}; const remote = await withTimeout( - client.listResourceTemplates(), + client.listResourceTemplates(params), timeoutMs, `${label} resources/templates/list`, ); - return Array.isArray(remote?.resourceTemplates) ? remote.resourceTemplates.slice() : []; + return { + resourceTemplates: Array.isArray(remote?.resourceTemplates) + ? remote.resourceTemplates.slice() + : [], + nextCursor: + remote && typeof remote.nextCursor === "string" && remote.nextCursor + ? remote.nextCursor + : null, + }; } catch (err) { debugLog(`[ctxce] Error calling ${label} resources/templates/list: ` + String(err)); - return []; + return { resourceTemplates: [], nextCursor: null }; } } @@ -666,7 +708,7 @@ async function createBridgeServer(options) { async function ensureRemoteDefaults(force = false) { defaultsPayload.session = sessionId; - if (!sessionId || Object.keys(defaultsPayload).length <= 1) { + if (!sessionId) { return; } if (!force && lastDefaultsSyncedSessionId === sessionId) { @@ -692,6 +734,22 @@ async function createBridgeServer(options) { } catch { // ignore logging failures } + try { + if (indexerClient && typeof indexerClient.close === "function") { + await indexerClient.close(); + } + } catch { + // ignore + } + try { + if (memoryClient && typeof memoryClient.close === "function") { + await memoryClient.close(); + } + } catch { + // ignore + } + indexerClient = null; + memoryClient = null; } let nextIndexerClient = null; @@ -749,7 +807,19 @@ async function createBridgeServer(options) { await ensureRemoteDefaults(true); } - await initializeRemoteClients(false); + async function refreshSessionAndSyncDefaults() { + const freshSession = resolveSessionId() || sessionId; + const changed = Boolean(freshSession && freshSession !== sessionId); + if (changed) { + sessionId = freshSession; + defaultsPayload.session = sessionId; + lastDefaultsSyncedSessionId = ""; + } + await initializeRemoteClients(false); + await ensureRemoteDefaults(changed); + } + + await 
refreshSessionAndSyncDefaults(); const server = new Server( // TODO: marked as depreciated { @@ -769,7 +839,7 @@ async function createBridgeServer(options) { let remote; try { debugLog("[ctxce] tools/list: fetching tools from indexer"); - await initializeRemoteClients(false); + await refreshSessionAndSyncDefaults(); if (!indexerClient) { throw new Error("Indexer MCP client not initialized"); } @@ -803,28 +873,78 @@ async function createBridgeServer(options) { return { tools }; }); - server.setRequestHandler(ListResourcesRequestSchema, async () => { + server.setRequestHandler(ListResourcesRequestSchema, async (request) => { // Proxy resource discovery/read-through so clients that use MCP resources // (not only tools) can access upstream indexer/memory resources directly. - await initializeRemoteClients(false); - const indexerResources = await listResourcesSafe(indexerClient, "indexer"); - const memoryResources = await listResourcesSafe(memoryClient, "memory"); - const resources = dedupeResources([...indexerResources, ...memoryResources]); + await refreshSessionAndSyncDefaults(); + const cursor = + request && request.params && typeof request.params.cursor === "string" + ? request.params.cursor + : null; + const decoded = decodeCompositeCursor(cursor); + const indexerCursor = + decoded && typeof decoded.i === "string" ? decoded.i : cursor; + const memoryCursor = + decoded && typeof decoded.m === "string" ? 
decoded.m : cursor; + if (cursor && decoded === null) { + debugLog("[ctxce] resources/list: received non-composite cursor; forwarding to both upstreams."); + } + const indexerRes = await listResourcesSafe(indexerClient, "indexer", indexerCursor); + const memoryRes = await listResourcesSafe(memoryClient, "memory", memoryCursor); + const resources = dedupeResources([ + ...indexerRes.resources, + ...memoryRes.resources, + ]); + const nextCursorObj = { + i: indexerRes.nextCursor || "", + m: memoryRes.nextCursor || "", + }; + const nextCursor = + nextCursorObj.i || nextCursorObj.m ? encodeCompositeCursor(nextCursorObj) : ""; debugLog(`[ctxce] resources/list: returning ${resources.length} resources`); - return { resources }; + return nextCursor ? { resources, nextCursor } : { resources }; }); - server.setRequestHandler(ListResourceTemplatesRequestSchema, async () => { - await initializeRemoteClients(false); - const indexerTemplates = await listResourceTemplatesSafe(indexerClient, "indexer"); - const memoryTemplates = await listResourceTemplatesSafe(memoryClient, "memory"); - const resourceTemplates = dedupeResourceTemplates([...indexerTemplates, ...memoryTemplates]); + server.setRequestHandler(ListResourceTemplatesRequestSchema, async (request) => { + await refreshSessionAndSyncDefaults(); + const cursor = + request && request.params && typeof request.params.cursor === "string" + ? request.params.cursor + : null; + const decoded = decodeCompositeCursor(cursor); + const indexerCursor = + decoded && typeof decoded.i === "string" ? decoded.i : cursor; + const memoryCursor = + decoded && typeof decoded.m === "string" ? 
decoded.m : cursor; + if (cursor && decoded === null) { + debugLog("[ctxce] resources/templates/list: received non-composite cursor; forwarding to both upstreams."); + } + const indexerRes = await listResourceTemplatesSafe( + indexerClient, + "indexer", + indexerCursor, + ); + const memoryRes = await listResourceTemplatesSafe( + memoryClient, + "memory", + memoryCursor, + ); + const resourceTemplates = dedupeResourceTemplates([ + ...indexerRes.resourceTemplates, + ...memoryRes.resourceTemplates, + ]); + const nextCursorObj = { + i: indexerRes.nextCursor || "", + m: memoryRes.nextCursor || "", + }; + const nextCursor = + nextCursorObj.i || nextCursorObj.m ? encodeCompositeCursor(nextCursorObj) : ""; debugLog(`[ctxce] resources/templates/list: returning ${resourceTemplates.length} templates`); - return { resourceTemplates }; + return nextCursor ? { resourceTemplates, nextCursor } : { resourceTemplates }; }); server.setRequestHandler(ReadResourceRequestSchema, async (request) => { - await initializeRemoteClients(false); + await refreshSessionAndSyncDefaults(); const params = request.params || {}; const timeoutMs = getBridgeToolTimeoutMs(); const uri = @@ -862,14 +982,7 @@ async function createBridgeServer(options) { debugLog(`[ctxce] tools/call: ${name || ""}`); - // Check if session changed (e.g., after auth login), and re-send defaults if so. - const freshSession = resolveSessionId() || sessionId; - if (freshSession && freshSession !== sessionId) { - sessionId = freshSession; - defaultsPayload.session = sessionId; - await initializeRemoteClients(false); - await ensureRemoteDefaults(true); - } + await refreshSessionAndSyncDefaults(); if (sessionId && (args === undefined || args === null || typeof args === "object")) { const obj = args && typeof args === "object" ? 
{ ...args } : {}; @@ -893,9 +1006,6 @@ async function createBridgeServer(options) { return indexerResult; } - await initializeRemoteClients(false); - await ensureRemoteDefaults(false); - const timeoutMs = getBridgeToolTimeoutMs(); const maxAttempts = getBridgeRetryAttempts(); const retryDelayMs = getBridgeRetryDelayMs(); diff --git a/scripts/hybrid/expand.py b/scripts/hybrid/expand.py index b228f4ca..678c650c 100644 --- a/scripts/hybrid/expand.py +++ b/scripts/hybrid/expand.py @@ -605,10 +605,11 @@ def expand_via_embeddings( # Search for soft matches (we want semantically similar docs, not exact matches) try: + initial_limit = 8 if not eff_under else max(32, int(max_terms) * 8) search_kwargs = { "collection_name": collection, "query_vector": (vec_name, query_vector) if vec_name else query_vector, - "limit": 8, # Get top 8 neighbors + "limit": initial_limit, # Over-fetch when `under` is set (we post-filter). "with_payload": True, "score_threshold": 0.3, # Lower threshold to get more diverse results } diff --git a/scripts/hybrid_search.py b/scripts/hybrid_search.py index f893f7a7..31b006dd 100644 --- a/scripts/hybrid_search.py +++ b/scripts/hybrid_search.py @@ -502,11 +502,16 @@ def run_pure_dense_search( ) try: - # Single dense query - no pooling, no re-scoring - ranked_points = dense_query(client, vec_name, vec_list, flt, limit, coll, query_text=query) + # Single dense query - no pooling, no re-scoring. + # When `under` is set, we post-filter by path metadata. Over-fetch so we + # can still return up to `limit` in-scope results. 
+ eff_under = _normalize_under_scope(under) + fetch_limit = int(limit) + if eff_under: + fetch_limit = min(max(fetch_limit * 4, fetch_limit + 16), 2000) + ranked_points = dense_query(client, vec_name, vec_list, flt, fetch_limit, coll, query_text=query) # Build output - eff_under = _normalize_under_scope(under) results = [] for p in ranked_points: payload = p.payload or {} @@ -527,6 +532,8 @@ def run_pure_dense_search( "doc_id": payload.get("code_id") or payload.get("_id") or "", "payload": payload, }) + if len(results) >= int(limit): + break return results diff --git a/scripts/ingest/qdrant.py b/scripts/ingest/qdrant.py index 1d8f2766..7857b8f5 100644 --- a/scripts/ingest/qdrant.py +++ b/scripts/ingest/qdrant.py @@ -597,6 +597,14 @@ def ensure_payload_indexes(client: QdrantClient, collection: str): ) except Exception: pass + try: + info = client.get_collection(collection) + except Exception: + return + if _missing_payload_indexes(info): + # Do not memoize; a later call should retry. + return + # Even if create_payload_index threw, get_collection confirms indexes exist. ENSURED_PAYLOAD_INDEX_COLLECTIONS.add(collection) diff --git a/scripts/mcp_impl/search.py b/scripts/mcp_impl/search.py index 750dca17..1e654e73 100644 --- a/scripts/mcp_impl/search.py +++ b/scripts/mcp_impl/search.py @@ -603,6 +603,7 @@ def _apply_result_filters(items: list[dict]) -> list[dict]: ): if ".git" not in not_globs: not_globs.append(".git") + not_globs_norm = [g if case_sensitive else g.lower() for g in not_globs] # Accept top-level alias `queries` as a drop-in for `query` # Many clients send queries=[...] instead of query=[...] 
@@ -1617,6 +1618,9 @@ def _read_snip(args): _res_code = int((res or {}).get("code", 0)) except Exception: _res_code = 0 + if results: + _res_ok = True + _res_code = 0 response = { "args": { diff --git a/scripts/mcp_impl/symbol_graph.py b/scripts/mcp_impl/symbol_graph.py index e15955b8..e5ef030a 100644 --- a/scripts/mcp_impl/symbol_graph.py +++ b/scripts/mcp_impl/symbol_graph.py @@ -20,6 +20,7 @@ import logging import os import re +import time from typing import Any, Dict, List, Optional, Set from scripts.path_scope import ( @@ -36,7 +37,36 @@ _GRAPH_SUFFIX = "_graph" GRAPH_COLLECTION_SUFFIX = _GRAPH_SUFFIX -_MISSING_GRAPH_COLLECTIONS: set[str] = set() +# Time-based cache: collection -> expiry timestamp (5 minutes TTL) +_MISSING_GRAPH_COLLECTIONS: dict[str, float] = {} +_MISSING_GRAPH_TTL = 300 # 5 minutes + + +def _clean_expired_missing_graphs() -> None: + """Remove expired entries from the missing graph cache.""" + now = time.monotonic() + expired = [coll for coll, expiry in _MISSING_GRAPH_COLLECTIONS.items() if expiry <= now] + for coll in expired: + _MISSING_GRAPH_COLLECTIONS.pop(coll, None) + + +def _is_graph_missing(collection: str) -> bool: + """Check if a graph collection is marked as missing (with expiration).""" + _clean_expired_missing_graphs() + if collection in _MISSING_GRAPH_COLLECTIONS: + return _MISSING_GRAPH_COLLECTIONS.get(collection, 0) > time.monotonic() + return False + + +def _mark_graph_missing(collection: str) -> None: + """Mark a graph collection as missing (with TTL).""" + _MISSING_GRAPH_COLLECTIONS[collection] = time.monotonic() + _MISSING_GRAPH_TTL + + +def _clear_graph_missing(collection: str) -> None: + """Remove a collection from the missing graph cache (e.g., after successful creation).""" + _MISSING_GRAPH_COLLECTIONS.pop(collection, None) + __all__ = [ "_symbol_graph_impl", @@ -313,7 +343,7 @@ async def _query_graph_edges_collection( from qdrant_client import models as qmodels graph_coll = f"{collection}{GRAPH_COLLECTION_SUFFIX}" - 
if graph_coll in _MISSING_GRAPH_COLLECTIONS: + if _is_graph_missing(graph_coll): return [] # Build graph filter @@ -363,7 +393,7 @@ def _scroll(_flt=flt): except Exception as e: err = str(e).lower() if "404" in err or "doesn't exist" in err or "not found" in err: - _MISSING_GRAPH_COLLECTIONS.add(graph_coll) + _mark_graph_missing(graph_coll) return [] logger.exception( "_query_graph_edges_collection scroll failed for %s", graph_coll @@ -770,6 +800,7 @@ async def _fallback_semantic_search( limit=limit, language=language, under=under, + collection=collection, session=session, output_format="json", # Avoid TOON encoding for internal calls ) diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index 444b98ac..4da07083 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -1215,6 +1215,7 @@ async def repo_search_compat(**arguments) -> Dict[str, Any]: "not_": not_value, "case": args.get("case"), "compact": args.get("compact"), + "debug": args.get("debug"), "mode": args.get("mode"), "repo": args.get("repo"), # Cross-codebase isolation "output_format": args.get("output_format"), # "json" or "toon" diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index a67b7269..8c54f8b9 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -1276,13 +1276,14 @@ def _build_plan_payload(self, changes: Dict[str, List]) -> Dict[str, Any]: content = path.read_bytes() file_hash = hashlib.sha1(content).hexdigest() stat = path.stat() + previous_hash = self._get_cached_file_hash(str(path.resolve())) operations.append( { "operation": "updated", "path": rel_path, "size_bytes": stat.st_size, "content_hash": f"sha1:{file_hash}", - "previous_hash": self._get_cached_file_hash(str(path.resolve())), + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown"), } ) @@ -1321,11 +1322,12 @@ def _build_plan_payload(self, changes: 
Dict[str, List]) -> Dict[str, Any]: for path in changes["deleted"]: rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: + previous_hash = self._get_cached_file_hash(str(path.resolve())) operations.append( { "operation": "deleted", "path": rel_path, - "previous_hash": self._get_cached_file_hash(str(path.resolve())), + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown"), } ) @@ -1399,17 +1401,41 @@ def _plan_delta_upload(self, changes: Dict[str, List]) -> Optional[Dict[str, Any def _build_apply_only_payload(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Dict[str, Any]: payload = self._last_plan_payload or self._build_plan_payload(changes) needed = plan.get("needed_files", {}) if isinstance(plan, dict) else {} + created_needed = set(needed.get("created", []) or []) + updated_needed = set(needed.get("updated", []) or []) moved_needed = set(needed.get("moved", []) or []) + + # Check if ALL operations are hash-matched (nothing needs content at all) + # This happens when all needed_files lists are empty and there are no actual changes requiring content + has_changes_needing_content = bool(created_needed or updated_needed or moved_needed) + has_deletes = bool(changes.get("deleted", [])) + + # Only skip apply-only if there are NO operations needing content AND NO deletes + if not has_changes_needing_content and not has_deletes: + return { + "manifest": payload.get("manifest", {}), + "operations": [], + "file_hashes": {}, + } + filtered_ops: List[Dict[str, Any]] = [] filtered_hashes: Dict[str, str] = {} for operation in payload.get("operations", []): op_type = str(operation.get("operation") or "") rel_path = str(operation.get("path") or "") - if op_type == "deleted": - filtered_ops.append(operation) + # Determine if this operation needs content (only those skip filtered_hashes) + needs_content = ( + (op_type == "created" and rel_path in created_needed) + or (op_type == 
"updated" and rel_path in updated_needed) + or (op_type == "moved" and rel_path in moved_needed) + ) + if needs_content: + # Skip operations that need content - they'll be uploaded separately continue - if op_type == "moved" and rel_path not in moved_needed: - filtered_ops.append(operation) + # Preserve all other operations so server advances state + filtered_ops.append(operation) + # Include hash for non-deleted operations + if op_type != "deleted": hash_value = payload.get("file_hashes", {}).get(rel_path) if hash_value: filtered_hashes[rel_path] = hash_value @@ -1984,6 +2010,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: response = self.upload_bundle(bundle_path, manifest) if response.get("success", False): + async_failed = False processed_ops = response.get("processed_operations") if processed_ops is None: logger.info( @@ -2010,11 +2037,19 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: async_result.get("processed_operations") or {}, ) elif async_result["outcome"] == "failed": - logger.warning( + async_failed = True + logger.error( "[remote_upload] Async processing failed for bundle %s: %s", manifest["bundle_id"], async_result.get("error"), ) + self._set_last_upload_result( + "failed", + stage="async_processing", + bundle_id=async_result.get("bundle_id") or manifest["bundle_id"], + sequence_number=async_result.get("sequence_number") or response.get("sequence_number"), + error=async_result.get("error"), + ) else: logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") logger.info(f"[remote_upload] Processed operations: {processed_ops}") @@ -2036,7 +2071,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: except Exception as cleanup_error: logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") - return True + return not async_failed else: error_msg = response.get('error', {}).get('message', 'Unknown upload error') 
logger.error(f"[remote_upload] Upload failed: {error_msg}") @@ -2166,6 +2201,8 @@ def on_any_event(self, event): def _process_pending_changes(self): """Process accumulated changes after debounce period.""" with self._lock: + # Timer fired; allow a new debounce to be armed while we process. + self._debounce_timer = None # Prevent re-entrancy if self._processing: return @@ -2226,6 +2263,12 @@ def _process_pending_changes(self): # Clear processing flag even if an error occurred with self._lock: self._processing = False + if self._pending_paths and self._debounce_timer is None: + self._debounce_timer = threading.Timer( + self.debounce_seconds, + self._process_pending_changes, + ) + self._debounce_timer.start() observer = Observer() diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index ad801cee..d0c41c37 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -1530,17 +1530,41 @@ def _plan_delta_upload(self, changes: Dict[str, List]) -> Optional[Dict[str, Any def _build_apply_only_payload(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Dict[str, Any]: payload = self._last_plan_payload or self._build_plan_payload(changes) needed = plan.get("needed_files", {}) if isinstance(plan, dict) else {} + created_needed = set(needed.get("created", []) or []) + updated_needed = set(needed.get("updated", []) or []) moved_needed = set(needed.get("moved", []) or []) + + # Check if ALL operations are hash-matched (nothing needs content at all) + # This happens when all needed_files lists are empty and there are no actual changes requiring content + has_changes_needing_content = bool(created_needed or updated_needed or moved_needed) + has_deletes = bool(changes.get("deleted", [])) + + # Only skip apply-only if there are NO operations needing content AND NO deletes + if not has_changes_needing_content and not has_deletes: + return { + "manifest": payload.get("manifest", {}), + "operations": [], + 
"file_hashes": {}, + } + filtered_ops: List[Dict[str, Any]] = [] filtered_hashes: Dict[str, str] = {} for operation in payload.get("operations", []): op_type = str(operation.get("operation") or "") rel_path = str(operation.get("path") or "") - if op_type == "deleted": - filtered_ops.append(operation) + # Determine if this operation needs content (only those skip filtered_hashes) + needs_content = ( + (op_type == "created" and rel_path in created_needed) + or (op_type == "updated" and rel_path in updated_needed) + or (op_type == "moved" and rel_path in moved_needed) + ) + if needs_content: + # Skip operations that need content - they'll be uploaded separately continue - if op_type == "moved" and rel_path not in moved_needed: - filtered_ops.append(operation) + # Preserve all other operations so server advances state + filtered_ops.append(operation) + # Include hash for non-deleted operations + if op_type != "deleted": hash_value = payload.get("file_hashes", {}).get(rel_path) if hash_value: filtered_hashes[rel_path] = hash_value @@ -2079,6 +2103,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: if not has_content_work: apply_only_result = self._apply_operations_without_content(changes, plan) if apply_only_result is True: + flush_cached_file_hashes() return True if not self.has_meaningful_changes(planned_changes): logger.info("[remote_upload] Plan found no upload work; skipping bundle upload") @@ -2088,6 +2113,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: plan_preview=preview, needed_size_bytes=plan.get("needed_size_bytes", 0), ) + flush_cached_file_hashes() return True # Create delta bundle @@ -2113,6 +2139,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: response = self.upload_bundle(bundle_path, manifest) if response.get("success", False): + async_failed = False processed_ops = response.get("processed_operations") if processed_ops is None: logger.info( @@ -2139,11 +2166,19 @@ def 
process_changes_and_upload(self, changes: Dict[str, List]) -> bool: async_result.get("processed_operations") or {}, ) elif async_result["outcome"] == "failed": - logger.warning( + async_failed = True + logger.error( "[remote_upload] Async processing failed for bundle %s: %s", manifest["bundle_id"], async_result.get("error"), ) + self._set_last_upload_result( + "failed", + stage="async_processing", + bundle_id=async_result.get("bundle_id") or manifest["bundle_id"], + sequence_number=async_result.get("sequence_number") or response.get("sequence_number"), + error=async_result.get("error"), + ) else: logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") logger.info(f"[remote_upload] Processed operations: {processed_ops}") @@ -2154,7 +2189,8 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: sequence_number=response.get("sequence_number"), processed_operations=processed_ops, ) - flush_cached_file_hashes() + if not async_failed: + flush_cached_file_hashes() # Clean up temporary bundle after successful upload try: @@ -2166,7 +2202,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: except Exception as cleanup_error: logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") - return True + return not async_failed else: error_msg = response.get('error', {}).get('message', 'Unknown upload error') logger.error(f"[remote_upload] Upload failed: {error_msg}") @@ -2252,6 +2288,8 @@ def on_any_event(self, event): def _process_pending_changes(self): """Process accumulated changes after debounce period.""" with self._lock: + # Timer fired; allow a new debounce to be armed while we process. 
+ self._debounce_timer = None if self._processing: return if not self._pending_paths: @@ -2308,6 +2346,12 @@ def _process_pending_changes(self): finally: with self._lock: self._processing = False + if self._pending_paths and self._debounce_timer is None: + self._debounce_timer = threading.Timer( + self.debounce_seconds, + self._process_pending_changes, + ) + self._debounce_timer.start() observer = Observer() handler = CodeFileEventHandler(self, debounce_seconds=2.0) diff --git a/scripts/upload_delta_bundle.py b/scripts/upload_delta_bundle.py index bdcd281f..55c08771 100644 --- a/scripts/upload_delta_bundle.py +++ b/scripts/upload_delta_bundle.py @@ -97,6 +97,35 @@ def _load_replica_cache_hashes(workspace_root: Path, slug: str) -> Dict[str, str return merged +def _flush_replica_cache_hashes(workspace_root: Path, slug: str, hashes: Dict[str, str]) -> None: + """Flush replica hashes to workspace cache.json.""" + try: + cache_path = workspace_root / ".codebase" / "cache.json" + cache_path.parent.mkdir(parents=True, exist_ok=True) + + # Read existing cache to preserve other entries + existing_data = {} + if cache_path.exists(): + try: + with cache_path.open("r", encoding="utf-8-sig") as f: + existing_data = json.load(f) + except (OSError, ValueError, json.JSONDecodeError): + existing_data = {} + + # Update file_hashes section + if not isinstance(existing_data, dict): + existing_data = {} + existing_data["file_hashes"] = hashes + + # Write back atomically + temp_path = cache_path.with_suffix(".tmp") + with temp_path.open("w", encoding="utf-8") as f: + json.dump(existing_data, f, indent=2) + temp_path.replace(cache_path) + except Exception as e: + logger.debug(f"[upload_service] Failed to flush cache for {slug}: {e}") + + def get_workspace_key(workspace_path: str) -> str: """Generate 16-char hash for collision avoidance in remote uploads. 
@@ -573,6 +602,10 @@ def apply_delta_operations( slug=slug, entries=journal_entries_by_slug.get(slug, []), ) + # Flush updated replica hashes to disk + replica_hashes = replica_cache_hashes.get(slug, {}) + if replica_hashes: + _flush_replica_cache_hashes(root, slug, replica_hashes) return operations_count except Exception as e: @@ -868,6 +901,10 @@ def _apply_operation_to_workspace( slug=slug, entries=journal_entries_by_slug.get(slug, []), ) + # Flush updated replica hashes to disk + replica_hashes = replica_cache_hashes.get(slug, {}) + if replica_hashes: + _flush_replica_cache_hashes(root, slug, replica_hashes) return operations_count diff --git a/scripts/watch_index_core/consistency.py b/scripts/watch_index_core/consistency.py index 05db9f80..548d83cb 100644 --- a/scripts/watch_index_core/consistency.py +++ b/scripts/watch_index_core/consistency.py @@ -242,9 +242,9 @@ def _is_index_eligible_path(path_str: str, workspace_root: Path, excluder) -> bo if is_internal_metadata_path(p): return False - # Keep git-history manifests indexable even when .remote-git is excluded. - if any(part == ".remote-git" for part in p.parts) and p.suffix.lower() == ".json": - return True + # .remote-git manifests are control files and must not be treated as indexable. 
+ if _is_remote_git_manifest(p.as_posix()): + return False try: rel_dir = "/" + str(rel.parent).replace(os.sep, "/") @@ -354,6 +354,15 @@ def _record_consistency_audit( pass +def _is_remote_git_manifest(path: str) -> bool: + """Check if path is a .remote-git git history manifest file (control file, not indexable content).""" + try: + p = Path(path) + return any(part == ".remote-git" for part in p.parts) and p.suffix.lower() == ".json" + except Exception: + return False + + def _enqueue_consistency_repairs( workspace_root: Path, workspace_path: str, @@ -377,6 +386,9 @@ def _enqueue_consistency_repairs( for path in stale_paths: if len(entries) >= max_ops: break + # Skip .remote-git git history manifests - they are control files, not indexable content + if _is_remote_git_manifest(path): + continue # Cache can lag after state resets/rebuilds; if the path still exists and is # index-eligible, treat it as missing/upsert instead of stale/delete. if _is_index_eligible_path(path, workspace_root, excluder): @@ -387,6 +399,9 @@ def _enqueue_consistency_repairs( for path in sorted(missing_set): if len(entries) >= max_ops: break + # Skip .remote-git git history manifests - they are control files, not indexable content + if _is_remote_git_manifest(path): + continue entries.append( { "path": path, diff --git a/scripts/watch_index_core/handler.py b/scripts/watch_index_core/handler.py index 1b36fa51..47bce13a 100644 --- a/scripts/watch_index_core/handler.py +++ b/scripts/watch_index_core/handler.py @@ -157,7 +157,31 @@ def on_moved(self, event): dest = Path(event.dest_path).resolve() except Exception: return - if self._is_internal_metadata_path(src) or self._is_internal_metadata_path(dest): + # Handle internal-boundary moves properly + src_internal = self._is_internal_metadata_path(src) + dest_internal = self._is_internal_metadata_path(dest) + if src_internal and dest_internal: + # Both internal -> ignore + return + if dest_internal: + # External -> internal: delete source, don't 
index destination + if idx.is_indexable_file(src): + try: + coll = self._resolve_collection(src) + deleted = False + if self.client is not None and coll is not None: + idx.delete_points_by_path(self.client, coll, str(src)) + deleted = True + if deleted: + safe_print(f"[moved:external_to_internal] deleted {src}") + self._invalidate_cache(src) + except Exception: + pass + return + if src_internal: + # Internal -> external: index destination as new file + if idx.is_indexable_file(dest): + self._maybe_enqueue(str(dest)) return if not idx.is_indexable_file(dest) and not idx.is_indexable_file(src): return @@ -171,16 +195,13 @@ def on_moved(self, event): if idx.is_indexable_file(src): try: coll = self._resolve_collection(src) + deleted = False if self.client is not None and coll is not None: idx.delete_points_by_path(self.client, coll, str(src)) - safe_print(f"[moved:ignored_dest_deleted_src] {src} -> {dest}") - src_repo_path = _detect_repo_for_file(src) - src_repo_name = _repo_name_or_none(src_repo_path) - try: - if src_repo_name: - remove_cached_file(str(src), src_repo_name) - except Exception: - pass + deleted = True + if deleted: + safe_print(f"[moved:ignored_dest_deleted_src] {src} -> {dest}") + self._invalidate_cache(src) except Exception: pass return diff --git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index bfc06ae8..ee5b12ce 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -140,7 +140,7 @@ def _run_git_history_ingest( repo_name: Optional[str], env_snapshot: Optional[Dict[str, str]] = None, ) -> None: - script = ROOT_DIR / "scripts" / "ingest_history.py" + script = watch_config.ROOT_DIR / "scripts" / "ingest_history.py" if not script.exists(): logger.warning("[git_history_manifest] ingest script missing: %s", script) return @@ -272,15 +272,38 @@ def _stream_pipe(pipe, label: str, tail: deque[str], lock: threading.Lock) -> No ) -def _on_git_history_done(manifest_key: str, future: 
Future) -> None: +def _on_git_history_done(manifest_path: Path, collection: str, repo_name: Optional[str], future: Future) -> None: + manifest_key = _manifest_key(manifest_path) with _GIT_HISTORY_INFLIGHT_LOCK: _GIT_HISTORY_INFLIGHT.discard(manifest_key) remaining = len(_GIT_HISTORY_INFLIGHT) logger.info("[git_history_manifest] in-flight remaining=%d", remaining) try: future.result() + # Mark journal as done after successful completion + repo_path = _detect_repo_for_file(manifest_path) + if repo_path: + repo_key = str(repo_path) + _mark_journal_done(manifest_path, repo_key, repo_name) + logger.info("[git_history_manifest] marked journal as done: %s", manifest_path) except Exception as e: - logger.warning("[git_history_manifest] worker crashed for %s: %s", manifest_key, e) + repo_path = _detect_repo_for_file(manifest_path) + repo_key = str(repo_path) if repo_path else "" + if repo_key: + _mark_journal_failed( + manifest_path, + repo_key, + repo_name, + f"git history worker failed for collection '{collection}': {e}", + ) + logger.warning( + "[git_history_manifest] worker crashed for %s (collection=%s, repo_key=%s): %s", + manifest_key, + collection, + repo_key or "", + e, + exc_info=True, + ) def _process_git_history_manifest( @@ -318,7 +341,7 @@ def _process_git_history_manifest( repo_name, env_snapshot, ) - future.add_done_callback(lambda fut, manifest_key=key: _on_git_history_done(manifest_key, fut)) + future.add_done_callback(lambda fut, manifest_path=p, coll=collection, rn=repo_name: _on_git_history_done(manifest_path, coll, rn, fut)) def _advance_progress( diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index dc7c2ceb..0ae2a97c 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -2722,6 +2722,33 @@ def compare_symbol_changes(old_symbols: dict, new_symbols: dict) -> tuple[list, if kind and name: remaining_old_by_name_kind.setdefault((kind, name), []).append(old_symbol_id) + def _consume_old_symbol(old_id: str, 
old_info: dict) -> None: + remaining_old_by_exact.pop(old_id, None) + + old_kind = str(old_info.get("type") or "") + old_name = str(old_info.get("name") or "") + old_hash = str(old_info.get("content_hash") or "") + + if old_kind and old_name and old_hash: + sig = (old_kind, old_name, old_hash) + sig_ids = remaining_old_by_signature.get(sig) or [] + if old_id in sig_ids: + sig_ids.remove(old_id) + if sig_ids: + remaining_old_by_signature[sig] = sig_ids + else: + remaining_old_by_signature.pop(sig, None) + + if old_kind and old_name: + nk = (old_kind, old_name) + nk_ids = remaining_old_by_name_kind.get(nk) or [] + if old_id in nk_ids: + nk_ids.remove(old_id) + if nk_ids: + remaining_old_by_name_kind[nk] = nk_ids + else: + remaining_old_by_name_kind.pop(nk, None) + for symbol_id, symbol_info in new_symbols.items(): if symbol_id in old_symbols: old_info = old_symbols[symbol_id] @@ -2730,7 +2757,7 @@ def compare_symbol_changes(old_symbols: dict, new_symbols: dict) -> tuple[list, unchanged.append(symbol_id) else: changed.append(symbol_id) - remaining_old_by_exact.pop(symbol_id, None) + _consume_old_symbol(symbol_id, old_info) continue kind = str(symbol_info.get("type") or "") @@ -2742,15 +2769,7 @@ def compare_symbol_changes(old_symbols: dict, new_symbols: dict) -> tuple[list, old_id = matched_old_ids.pop(0) if not matched_old_ids: remaining_old_by_signature.pop(signature, None) - remaining_old_by_exact.pop(old_id, None) - nk = (kind, name) - nk_ids = remaining_old_by_name_kind.get(nk) or [] - if old_id in nk_ids: - nk_ids.remove(old_id) - if nk_ids: - remaining_old_by_name_kind[nk] = nk_ids - else: - remaining_old_by_name_kind.pop(nk, None) + _consume_old_symbol(old_id, old_symbols.get(old_id, {})) unchanged.append(symbol_id) continue diff --git a/vscode-extension/context-engine-uploader/mcp_bridge.js b/vscode-extension/context-engine-uploader/mcp_bridge.js index ca81a91a..11637d45 100644 --- a/vscode-extension/context-engine-uploader/mcp_bridge.js +++ 
b/vscode-extension/context-engine-uploader/mcp_bridge.js @@ -100,10 +100,15 @@ function createBridgeManager(deps) { const binPath = findLocalBridgeBin(); const mode = getBridgeMode(); if (binPath) { + // Use absolute Node runtime to avoid PATH dependency in extension hosts + const bundledBin = findBundledBridgeBin(); + const resolvedKind = bundledBin && path.resolve(binPath) === path.resolve(bundledBin) + ? 'bundled' + : 'local'; return { - command: 'node', + command: process.execPath, args: [binPath], - kind: mode === 'bundled' ? 'bundled' : 'local' + kind: resolvedKind }; } const isWindows = process.platform === 'win32'; From ecaf1c1c42a2aae3ec98fb18f54bffec5ca6f3e5 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:14:18 +0000 Subject: [PATCH 30/39] fix(code review): address critical and major issues from CodeRabbit Critical fixes: - Guard snapshot pruning after partial ingest failures to prevent data loss - Reject directory move targets instead of recursively deleting (security) - Enforce collection write ACL on plan/apply endpoints - Bound session defaults with timeout to prevent hangs Data integrity fixes: - Flush empty replica caches to prevent stale hash state - Clear payload-index memo on collection health failures - Keep delete journal open when graph-edge cleanup fails - Always invalidate cache even when delete operations fail - Don't mark git-history manifests done on timeout/failure - Always sync graph edges after delete-only reindexes Upload client fixes: - Don't route hash-matched created/updated through apply_ops (server only accepts deleted/moved operations) These fixes address the highest-priority security and data integrity issues identified in the CodeRabbit review. 
Co-authored-by: voarsh2 --- ctx-mcp-bridge/src/mcpServer.js | 13 ++- scripts/ingest/pipeline.py | 48 ++++---- scripts/ingest/qdrant.py | 4 + scripts/ingest_history.py | 24 +++- scripts/remote_upload_client.py | 5 + scripts/standalone_upload_client.py | 5 + scripts/upload_delta_bundle.py | 18 +-- scripts/upload_service.py | 154 ++++++++++++++++++++++++++ scripts/watch_index_core/handler.py | 10 +- scripts/watch_index_core/processor.py | 41 +++---- 10 files changed, 260 insertions(+), 62 deletions(-) diff --git a/ctx-mcp-bridge/src/mcpServer.js b/ctx-mcp-bridge/src/mcpServer.js index e5247c5e..da92625e 100644 --- a/ctx-mcp-bridge/src/mcpServer.js +++ b/ctx-mcp-bridge/src/mcpServer.js @@ -36,10 +36,15 @@ async function sendSessionDefaults(client, payload, label) { return false; } try { - await client.callTool({ - name: "set_session_defaults", - arguments: payload, - }); + const timeoutMs = getBridgeToolTimeoutMs(); + await withTimeout( + client.callTool({ + name: "set_session_defaults", + arguments: payload, + }), + timeoutMs, + `sendSessionDefaults(${label})` + ); return true; } catch (err) { // eslint-disable-next-line no-console diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index d5ece174..f624980f 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -712,16 +712,22 @@ def make_point(pid, dense_vec, lex_vec, payload, lex_text: str = "", code_text: for i, v, lx, m, lt, ct in zip(batch_ids, vectors, batch_lex, batch_meta, batch_lex_text, batch_code) ] upsert_points(client, collection, points) - # Optional: materialize file-level graph edges in a companion `_graph` store. - # This is an accelerator for symbol_graph callers/importers and is safe to skip on failure. - _sync_graph_edges_best_effort( - client, - collection, - str(file_path), - repo_tag, - calls, - imports, - ) + + # Optional: materialize file-level graph edges in a companion `_graph` store. 
+ # This is an accelerator for symbol_graph callers/importers and is safe to skip on failure. + # IMPORTANT: Sync must run after upserts (or after delete-only reindex) to ensure graph + # edges stay consistent. When a file reindexes to zero chunks, batch_texts is empty but + # we still need to sync graph edges to remove stale entries. + _sync_graph_edges_best_effort( + client, + collection, + str(file_path), + repo_tag, + calls, + imports, + ) + + if batch_texts: try: ws = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" if set_cached_file_hash: @@ -1507,15 +1513,19 @@ def _apply_symbol_pseudo( if all_points: _upsert_points_fn(client, current_collection, all_points) - # Optional: materialize file-level graph edges (best-effort). - _sync_graph_edges_best_effort( - client, - current_collection, - str(file_path), - per_file_repo, - calls, - imports, - ) + + # Optional: materialize file-level graph edges (best-effort). + # IMPORTANT: Sync must run after upserts OR after delete-only reindex to ensure graph + # edges stay consistent. When a file reindexes to zero chunks, all_points is empty but + # we still need to sync graph edges to remove stale entries. 
+ _sync_graph_edges_best_effort( + client, + current_collection, + str(file_path), + per_file_repo, + calls, + imports, + ) try: if set_cached_symbols: diff --git a/scripts/ingest/qdrant.py b/scripts/ingest/qdrant.py index 7857b8f5..c82a4d3b 100644 --- a/scripts/ingest/qdrant.py +++ b/scripts/ingest/qdrant.py @@ -646,6 +646,10 @@ def ensure_collection_and_indexes_once( ENSURED_COLLECTIONS_LAST_CHECK.pop(collection, None) except Exception: pass + try: + ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard(collection) + except Exception: + pass ensure_collection(client, collection, dim, vector_name, schema_mode=mode) if mode in {"legacy", "migrate"}: ensure_payload_indexes(client, collection) diff --git a/scripts/ingest_history.py b/scripts/ingest_history.py index 602b882f..62d01e75 100644 --- a/scripts/ingest_history.py +++ b/scripts/ingest_history.py @@ -532,10 +532,26 @@ def _log_progress(force: bool = False) -> None: e, ) _log_progress(force=True) - try: - _prune_old_commit_points(client, run_id, mode=mode) - except Exception as e: - logger.warning("[ingest_history] prune failed for run_id=%s: %s", run_id, e) + # Only prune snapshot runs that completed cleanly + prune_safe = ( + mode == "snapshot" + and prepared_count > 0 + and invalid_commit_records == 0 + and embed_failures == 0 + and point_build_failures == 0 + and upsert_failures == 0 + and persisted_count == prepared_count + ) + if prune_safe: + try: + _prune_old_commit_points(client, run_id, mode=mode) + except Exception as e: + logger.warning("[ingest_history] prune failed for run_id=%s: %s", run_id, e) + elif mode == "snapshot": + logger.warning( + "[ingest_history] skipping prune for run_id=%s because the snapshot ingest was incomplete", + run_id, + ) try: _cleanup_manifest_files(manifest_path) except Exception as e: diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 8c54f8b9..3229d4ca 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -1432,6 
+1432,11 @@ def _build_apply_only_payload(self, changes: Dict[str, List], plan: Dict[str, An if needs_content: # Skip operations that need content - they'll be uploaded separately continue + # IMPORTANT: server-side apply_delta_operations() only accepts "deleted" and "moved" + # operations. Hash-matched "created" and "updated" operations must NOT be routed + # through apply_ops since the server will reject them. + if op_type not in {"deleted", "moved"}: + continue # Preserve all other operations so server advances state filtered_ops.append(operation) # Include hash for non-deleted operations diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index d0c41c37..dcbe151b 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -1561,6 +1561,11 @@ def _build_apply_only_payload(self, changes: Dict[str, List], plan: Dict[str, An if needs_content: # Skip operations that need content - they'll be uploaded separately continue + # IMPORTANT: server-side apply_delta_operations() only accepts "deleted" and "moved" + # operations. Hash-matched "created" and "updated" operations must NOT be routed + # through apply_ops since the server will reject them. 
+ if op_type not in {"deleted", "moved"}: + continue # Preserve all other operations so server advances state filtered_ops.append(operation) # Include hash for non-deleted operations diff --git a/scripts/upload_delta_bundle.py b/scripts/upload_delta_bundle.py index 55c08771..245fedd8 100644 --- a/scripts/upload_delta_bundle.py +++ b/scripts/upload_delta_bundle.py @@ -553,7 +553,9 @@ def apply_delta_operations( target_path.parent.mkdir(parents=True, exist_ok=True) if target_path.exists(): if target_path.is_dir(): - shutil.rmtree(target_path) + raise IsADirectoryError( + f"[upload_delta_bundle] move target is a directory: {target_path}" + ) else: target_path.unlink() shutil.move(str(safe_source_path), str(target_path)) @@ -602,10 +604,9 @@ def apply_delta_operations( slug=slug, entries=journal_entries_by_slug.get(slug, []), ) - # Flush updated replica hashes to disk + # Flush updated replica hashes to disk (including empty caches) replica_hashes = replica_cache_hashes.get(slug, {}) - if replica_hashes: - _flush_replica_cache_hashes(root, slug, replica_hashes) + _flush_replica_cache_hashes(root, slug, replica_hashes) return operations_count except Exception as e: @@ -800,7 +801,9 @@ def _apply_operation_to_workspace( target_path.parent.mkdir(parents=True, exist_ok=True) if target_path.exists(): if target_path.is_dir(): - shutil.rmtree(target_path) + raise IsADirectoryError( + f"[upload_service] move target is a directory: {target_path}" + ) else: target_path.unlink() shutil.move(str(safe_source_path), str(target_path)) @@ -901,10 +904,9 @@ def _apply_operation_to_workspace( slug=slug, entries=journal_entries_by_slug.get(slug, []), ) - # Flush updated replica hashes to disk + # Flush updated replica hashes to disk (including empty caches) replica_hashes = replica_cache_hashes.get(slug, {}) - if replica_hashes: - _flush_replica_cache_hashes(root, slug, replica_hashes) + _flush_replica_cache_hashes(root, slug, replica_hashes) return operations_count diff --git 
a/scripts/upload_service.py b/scripts/upload_service.py index 405b31be..113715fa 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -1550,6 +1550,83 @@ async def plan_delta(request: PlanRequest): detail="Invalid or expired session", ) + # Resolve collection name for ACL enforcement + collection_name: Optional[str] = None + if _extract_repo_name_from_path or (get_collection_name and logical_repo_reuse_enabled and find_collection_for_logical_repo): + # Always derive repo_name from workspace_path for origin tracking + repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None + if not repo_name: + repo_name = Path(workspace_path).name + + # Preserve any client-supplied collection name but allow server-side overrides + client_collection_name = request.collection_name + resolved_collection: Optional[str] = None + logical_repo_id = request.logical_repo_id + + # Resolve collection name, preferring server-side mapping for logical_repo_id when enabled + if logical_repo_reuse_enabled() and logical_repo_id and find_collection_for_logical_repo: + try: + existing = find_collection_for_logical_repo(logical_repo_id, search_root=WORK_DIR) + except Exception: + existing = None + if existing: + resolved_collection = existing + + # Latent migration: when no explicit mapping exists yet for this logical_repo_id, but there is a + # single existing collection mapping, prefer reusing it rather than creating a fresh collection. 
+ if logical_repo_reuse_enabled() and logical_repo_id and resolved_collection is None and get_collection_mappings: + try: + mappings = get_collection_mappings(search_root=WORK_DIR) or [] + except Exception: + mappings = [] + + if len(mappings) == 1: + canonical = mappings[0] + canonical_coll = canonical.get("collection_name") + if canonical_coll: + resolved_collection = canonical_coll + if update_workspace_state: + try: + update_workspace_state( + workspace_path=canonical.get("container_path") or canonical.get("state_file"), + updates={"logical_repo_id": logical_repo_id}, + repo_name=canonical.get("repo_name"), + ) + except Exception as migrate_err: + logger.debug( + f"[upload_service] Failed to migrate logical_repo_id for existing mapping: {migrate_err}" + ) + + # Finalize collection_name: prefer resolved server-side mapping, then client-supplied name, + # then standard get_collection_name/DEFAULT_COLLECTION fallbacks. + if resolved_collection is not None: + collection_name = resolved_collection + elif client_collection_name: + collection_name = client_collection_name + else: + if get_collection_name and repo_name: + collection_name = get_collection_name(repo_name) + else: + collection_name = DEFAULT_COLLECTION + + # Enforce collection write access for plan/apply when auth is enabled + if AUTH_ENABLED and CTXCE_MCP_ACL_ENFORCE and collection_name: + uid = str((record or {}).get("user_id") or "").strip() + if not uid: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired session", + ) + try: + allowed = has_collection_access(uid, str(collection_name), "write") + except AuthDisabledError: + allowed = True + if not allowed: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"User does not have write access to collection '{collection_name}'", + ) + plan = plan_delta_upload( workspace_path=workspace_path, operations=request.operations, @@ -1630,6 +1707,83 @@ async def apply_delta_ops(request: 
ApplyOperationsRequest): detail="Invalid or expired session", ) + # Resolve collection name for ACL enforcement + collection_name: Optional[str] = None + if _extract_repo_name_from_path or (get_collection_name and logical_repo_reuse_enabled and find_collection_for_logical_repo): + # Always derive repo_name from workspace_path for origin tracking + repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None + if not repo_name: + repo_name = Path(workspace_path).name + + # Preserve any client-supplied collection name but allow server-side overrides + client_collection_name = request.collection_name + resolved_collection: Optional[str] = None + logical_repo_id = request.logical_repo_id + + # Resolve collection name, preferring server-side mapping for logical_repo_id when enabled + if logical_repo_reuse_enabled() and logical_repo_id and find_collection_for_logical_repo: + try: + existing = find_collection_for_logical_repo(logical_repo_id, search_root=WORK_DIR) + except Exception: + existing = None + if existing: + resolved_collection = existing + + # Latent migration: when no explicit mapping exists yet for this logical_repo_id, but there is a + # single existing collection mapping, prefer reusing it rather than creating a fresh collection. 
+ if logical_repo_reuse_enabled() and logical_repo_id and resolved_collection is None and get_collection_mappings: + try: + mappings = get_collection_mappings(search_root=WORK_DIR) or [] + except Exception: + mappings = [] + + if len(mappings) == 1: + canonical = mappings[0] + canonical_coll = canonical.get("collection_name") + if canonical_coll: + resolved_collection = canonical_coll + if update_workspace_state: + try: + update_workspace_state( + workspace_path=canonical.get("container_path") or canonical.get("state_file"), + updates={"logical_repo_id": logical_repo_id}, + repo_name=canonical.get("repo_name"), + ) + except Exception as migrate_err: + logger.debug( + f"[upload_service] Failed to migrate logical_repo_id for existing mapping: {migrate_err}" + ) + + # Finalize collection_name: prefer resolved server-side mapping, then client-supplied name, + # then standard get_collection_name/DEFAULT_COLLECTION fallbacks. + if resolved_collection is not None: + collection_name = resolved_collection + elif client_collection_name: + collection_name = client_collection_name + else: + if get_collection_name and repo_name: + collection_name = get_collection_name(repo_name) + else: + collection_name = DEFAULT_COLLECTION + + # Enforce collection write access for plan/apply when auth is enabled + if AUTH_ENABLED and CTXCE_MCP_ACL_ENFORCE and collection_name: + uid = str((record or {}).get("user_id") or "").strip() + if not uid: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired session", + ) + try: + allowed = has_collection_access(uid, str(collection_name), "write") + except AuthDisabledError: + allowed = True + if not allowed: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"User does not have write access to collection '{collection_name}'", + ) + manifest = request.manifest or {} bundle_id = manifest.get("bundle_id") manifest_sequence = manifest.get("sequence_number") diff --git 
a/scripts/watch_index_core/handler.py b/scripts/watch_index_core/handler.py index 47bce13a..7827ad21 100644 --- a/scripts/watch_index_core/handler.py +++ b/scripts/watch_index_core/handler.py @@ -174,9 +174,10 @@ def on_moved(self, event): deleted = True if deleted: safe_print(f"[moved:external_to_internal] deleted {src}") + except Exception as exc: + safe_print(f"[moved:external_to_internal:error] {src}: {exc}") + finally: self._invalidate_cache(src) - except Exception: - pass return if src_internal: # Internal -> external: index destination as new file @@ -201,9 +202,10 @@ def on_moved(self, event): deleted = True if deleted: safe_print(f"[moved:ignored_dest_deleted_src] {src} -> {dest}") + except Exception as exc: + safe_print(f"[moved:ignored_dest_deleted_src:error] {src}: {exc}") + finally: self._invalidate_cache(src) - except Exception: - pass return except Exception: pass diff --git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index ee5b12ce..b13b0997 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -142,8 +142,7 @@ def _run_git_history_ingest( ) -> None: script = watch_config.ROOT_DIR / "scripts" / "ingest_history.py" if not script.exists(): - logger.warning("[git_history_manifest] ingest script missing: %s", script) - return + raise RuntimeError(f"[git_history_manifest] ingest script missing: {script}") cmd = [sys.executable or "python3", str(script), "--manifest-json", str(p)] env = _build_subprocess_env(collection, repo_name, env_snapshot) @@ -218,19 +217,14 @@ def _stream_pipe(pipe, label: str, tail: deque[str], lock: threading.Lock) -> No if timed_out: elapsed_ms = int((time.monotonic() - started) * 1000) - logger.warning( - "[git_history_manifest] ingest_history.py timeout for %s after %dms (timeout=%ss)", - p, - elapsed_ms, - _GIT_HISTORY_TIMEOUT_SECONDS, + error_msg = ( + f"[git_history_manifest] ingest_history.py timeout for {p} after {elapsed_ms}ms " + 
f"(timeout={_GIT_HISTORY_TIMEOUT_SECONDS}s)" ) if stderr_tail: - logger.warning( - "[git_history_manifest] timeout stderr tail for %s: %s", - p, - _tail_snapshot(stderr_tail), - ) - return + error_msg += f" stderr={_tail_snapshot(stderr_tail)}" + logger.warning(error_msg) + raise RuntimeError(error_msg) returncode = proc.wait(timeout=1.0) except Exception as e: @@ -240,18 +234,16 @@ def _stream_pipe(pipe, label: str, tail: deque[str], lock: threading.Lock) -> No proc.kill() except Exception: pass - return + raise RuntimeError(f"[git_history_manifest] subprocess error for {p}: {e}") from e elapsed_ms = int((time.monotonic() - started) * 1000) if returncode != 0: - logger.warning( - "[git_history_manifest] ingest_history.py failed for %s: exit=%d elapsed_ms=%d stderr=%s", - p, - returncode, - elapsed_ms, - _tail_snapshot(stderr_tail), + error_msg = ( + f"[git_history_manifest] ingest_history.py failed for {p}: exit={returncode} " + f"elapsed_ms={elapsed_ms} stderr={_tail_snapshot(stderr_tail)}" ) - return + logger.warning(error_msg) + raise RuntimeError(error_msg) logger.info( "[git_history_manifest] completed for %s: exit=0 elapsed_ms=%d", @@ -839,8 +831,11 @@ def _process_paths( caller_path=str(p), repo=repo_name, ) - except Exception: - pass + except Exception as graph_exc: + safe_print(f"[deleted:graph_failed] {p} -> {collection}: {graph_exc}") + # Don't mark as deleted_ok if graph cleanup fails + deleted_ok = False + raise safe_print(f"[deleted] {p} -> {collection}") deleted_ok = True except Exception: From 86f221299f44ed36b5c2915df2ee6d00148e0a62 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:31:02 +0000 Subject: [PATCH 31/39] fix(watch,consistency): improve error handling and retry logic - Surface maintenance write failures instead of silently ignoring them - Prune excluded directories (node_modules, dist, build, etc.) 
during audit walks to improve performance - Requeue failed watch batches instead of permanently dropping changes on transient errors - Add logging to maintenance state update failures for debugging These changes address remaining CodeRabbit review issues around watcher reliability and maintenance observability. Co-authored-by: voarsh2 --- scripts/remote_upload_client.py | 12 ++++++++++++ scripts/standalone_upload_client.py | 17 ++++++++++++++--- scripts/watch_index_core/consistency.py | 24 ++++++++++++++++++++---- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 3229d4ca..26e0a548 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -2219,6 +2219,7 @@ def _process_pending_changes(self): check_deletions = self._check_for_deletions self._check_for_deletions = False + upload_succeeded = False try: # Only include cached paths when deletion-related events occurred if check_deletions: @@ -2245,6 +2246,7 @@ def _process_pending_changes(self): success = self.client.process_changes_and_upload(changes) if success: self.client.log_watch_upload_result() + upload_succeeded = True else: logger.error("[watch] Failed to upload changes") else: @@ -2260,20 +2262,30 @@ def _process_pending_changes(self): success = self.client.upload_git_history_only(git_history) if success: logger.info("[watch] Successfully uploaded git history metadata") + upload_succeeded = True else: logger.error("[watch] Failed to upload git history metadata") + else: + upload_succeeded = True # No changes to process except Exception as e: logger.error(f"[watch] Error processing changes: {e}") finally: # Clear processing flag even if an error occurred with self._lock: self._processing = False + # Re-queue pending paths if upload failed + if not upload_succeeded and pending: + # Merge pending paths back into _pending_paths + for p in pending: + self._pending_paths.add(p) + # Arm next pass if there 
are pending paths if self._pending_paths and self._debounce_timer is None: self._debounce_timer = threading.Timer( self.debounce_seconds, self._process_pending_changes, ) self._debounce_timer.start() observer = Observer() diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index dcbe151b..c4ceddf9 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -2305,6 +2305,7 @@ def _process_pending_changes(self): check_deletions = self._check_for_deletions self._check_for_deletions = False + upload_succeeded = False try: # Only include cached paths when deletion-related events occurred if check_deletions: @@ -2315,7 +2316,7 @@ else: all_paths = pending - + changes = self.client.detect_file_changes(all_paths) meaningful_changes = ( len(changes.get("created", [])) + @@ -2323,12 +2324,13 @@ len(changes.get("updated", [])) + len(changes.get("deleted", [])) + len(changes.get("moved", [])) ) - + if meaningful_changes > 0: logger.info(f"[watch] Detected {meaningful_changes} changes: { {k: len(v) for k, v in changes.items() if k != 'unchanged'} }") success = self.client.process_changes_and_upload(changes) if success: self.client.log_watch_upload_result() + upload_succeeded = True else: logger.error("[watch] Failed to upload changes") else: @@ -2338,19 +2340,28 @@ try: git_history = _collect_git_history_for_workspace(self.client.workspace_path) except Exception: git_history = None - + if git_history: logger.info("[watch] Detected git history update; uploading git history metadata") success = self.client.upload_git_history_only(git_history) if success: logger.info("[watch] Successfully uploaded git history metadata") + upload_succeeded = True else: logger.error("[watch] Failed to upload git history metadata") + else: + upload_succeeded = True # No changes to process except Exception as e: logger.error(f"[watch]
Error processing changes: {e}") finally: with self._lock: self._processing = False + # Re-queue pending paths if upload failed + if not upload_succeeded and pending: + # Merge pending paths back into _pending_paths + for p in pending: + self._pending_paths.add(p) + # Arm next pass if there are pending paths if self._pending_paths and self._debounce_timer is None: self._debounce_timer = threading.Timer( self.debounce_seconds, diff --git a/scripts/watch_index_core/consistency.py b/scripts/watch_index_core/consistency.py index 548d83cb..d475fbb7 100644 --- a/scripts/watch_index_core/consistency.py +++ b/scripts/watch_index_core/consistency.py @@ -161,8 +161,13 @@ def _record_empty_dir_sweep(workspace_path: str, repo_name: Optional[str]) -> No repo_name=repo_name, updates={"maintenance": maintenance}, ) - except Exception: - pass + except Exception as exc: + logger.warning( + "Failed to record empty dir sweep timestamp: %s (workspace=%s, repo=%s)", + exc, + workspace_path, + repo_name, + ) def _load_cached_hashes( @@ -282,6 +287,12 @@ def _scan_indexable_fs_paths(workspace_root: Path, *, max_paths: int) -> Tuple[S child = current / dirname if is_internal_metadata_path(child): continue + try: + rel_dir = "/" + str(child.relative_to(workspace_root)).replace(os.sep, "/") + if excluder.exclude_dir(rel_dir): + continue + except Exception: + pass pruned_dirnames.append(dirname) dirnames[:] = pruned_dirnames @@ -350,8 +361,13 @@ def _record_consistency_audit( repo_name=repo_name, updates={"maintenance": maintenance}, ) - except Exception: - pass + except Exception as exc: + logger.warning( + "Failed to record consistency audit: %s (workspace=%s, repo=%s)", + exc, + workspace_path, + repo_name, + ) def _is_remote_git_manifest(path: str) -> bool: From 6bd58ea8cacf528fe30287a91bf298035e670ad3 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:37:52 +0000 Subject: [PATCH 32/39] fix(ingest,watch,upload): address 
CodeRabbit critical and major issues Critical fixes: - Guard manifest cleanup and exit with error on incomplete ingests - Return failing exit status for direct-mode upsert errors - Don't trust payload-index memo after collection drops Major fixes: - Clean up graph edges for excluded files - Use metadata-root-aware cache helper in watch loops - Fix queued uploads being re-bundled (treat None async result as pending) - Add graph-edge cleanup on external-to-internal and ignored-destination moves - Use is_internal_metadata_path() in processor to catch nested internal paths Co-authored-by: voarsh2 --- scripts/ingest/pipeline.py | 18 ++++++++++++ scripts/ingest/qdrant.py | 12 +++++++- scripts/ingest_history.py | 41 ++++++++++++++++++++++----- scripts/remote_upload_client.py | 18 ++++++++++-- scripts/watch_index_core/handler.py | 18 ++++++++++++ scripts/watch_index_core/processor.py | 4 +-- 6 files changed, 99 insertions(+), 12 deletions(-) diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index f624980f..d3616e9c 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -310,6 +310,15 @@ def index_single_file( delete_points_by_path(client, collection, str(file_path)) except Exception: pass + # Clean up graph edges for excluded file + _sync_graph_edges_best_effort( + client, + collection, + str(file_path), + repo_tag, + None, # No calls when file is excluded + None, # No imports when file is excluded + ) print(f"Skipping excluded file: {file_path}") return False except Exception: @@ -986,6 +995,15 @@ def process_file_with_smart_reindexing( _delete_points_fn(client, current_collection, str(p)) except Exception: pass + # Clean up graph edges for excluded file + _sync_graph_edges_best_effort( + client, + current_collection, + str(p), + repo_name_for_cache or _detect_repo_name_from_path(file_path), + None, # No calls when file is excluded + None, # No imports when file is excluded + ) print(f"[SMART_REINDEX] Skipping excluded file: 
{file_path}") return "skipped" except Exception: diff --git a/scripts/ingest/qdrant.py b/scripts/ingest/qdrant.py index c82a4d3b..e7db21fc 100644 --- a/scripts/ingest/qdrant.py +++ b/scripts/ingest/qdrant.py @@ -586,8 +586,18 @@ def ensure_payload_indexes(client: QdrantClient, collection: str): """Create helpful payload indexes if they don't exist (idempotent).""" if not collection: return + + # On memo hit, verify collection still exists and indexes are present if collection in ENSURED_PAYLOAD_INDEX_COLLECTIONS: - return + try: + info = client.get_collection(collection) + if not _missing_payload_indexes(info): + # Memo is still valid + return + except Exception: + # Collection doesn't exist or error accessing it; remove from memo + ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard(collection) + for field in PAYLOAD_INDEX_FIELDS: try: client.create_payload_index( diff --git a/scripts/ingest_history.py b/scripts/ingest_history.py index 62d01e75..06bea9d5 100644 --- a/scripts/ingest_history.py +++ b/scripts/ingest_history.py @@ -552,10 +552,27 @@ def _log_progress(force: bool = False) -> None: "[ingest_history] skipping prune for run_id=%s because the snapshot ingest was incomplete", run_id, ) - try: - _cleanup_manifest_files(manifest_path) - except Exception as e: - logger.warning("[ingest_history] manifest cleanup failed for %s: %s", manifest_path, e) + + # Only cleanup manifest if ingest completed successfully + ingest_complete = ( + prepared_count > 0 + and invalid_commit_records == 0 + and embed_failures == 0 + and point_build_failures == 0 + and upsert_failures == 0 + and persisted_count == prepared_count + ) + if ingest_complete: + try: + _cleanup_manifest_files(manifest_path) + except Exception as e: + logger.warning("[ingest_history] manifest cleanup failed for %s: %s", manifest_path, e) + else: + logger.warning( + "[ingest_history] keeping manifest %s because ingest was incomplete", + manifest_path, + ) + logger.info( "Ingested commits from manifest %s into %s: 
persisted=%d prepared=%d invalid=%d " "embed_failures=%d point_failures=%d upsert_failures=%d", @@ -568,7 +585,7 @@ def _log_progress(force: bool = False) -> None: point_build_failures, upsert_failures, ) - return persisted_count + return persisted_count, ingest_complete def main(): @@ -621,7 +638,7 @@ def main(): client = QdrantClient(url=QDRANT_URL, api_key=API_KEY or None) if args.manifest_json: - _ingest_from_manifest( + persisted_count, ingest_complete = _ingest_from_manifest( args.manifest_json, model, client, @@ -629,6 +646,8 @@ def main(): args.include_body, args.per_batch, ) + if not ingest_complete: + raise SystemExit(1) return commits = list_commits(args) @@ -637,6 +656,8 @@ def main(): return points: List[models.PointStruct] = [] + persisted_count = 0 + upsert_failures = 0 for sha in commits: md = commit_metadata(sha) text = build_text(md, include_body=args.include_body) @@ -686,7 +707,9 @@ def main(): batch_size = len(points) try: client.upsert(collection_name=COLLECTION, points=points) + persisted_count += batch_size except Exception as e: + upsert_failures += batch_size logger.error( "[ingest_history] batch upsert failed collection=%s repo=%s size=%d path=%s: %s", COLLECTION, @@ -701,7 +724,9 @@ def main(): final_size = len(points) try: client.upsert(collection_name=COLLECTION, points=points) + persisted_count += final_size except Exception as e: + upsert_failures += final_size logger.error( "[ingest_history] final upsert failed collection=%s repo=%s size=%d path=%s: %s", COLLECTION, @@ -710,7 +735,9 @@ def main(): args.path or "", e, ) - print(f"Ingested {len(commits)} commits into {COLLECTION}.") + if upsert_failures: + raise SystemExit(1) + print(f"Ingested {persisted_count} commits into {COLLECTION}.") if __name__ == "__main__": diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 26e0a548..3fc2483d 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -2055,6 +2055,15 @@ def 
process_changes_and_upload(self, changes: Dict[str, List]) -> bool: sequence_number=async_result.get("sequence_number") or response.get("sequence_number"), error=async_result.get("error"), ) + else: + # async_result is None - treat as pending/failed + # Don't finalize changes, keep bundle marked as queued + logger.warning( + "[remote_upload] Async upload result unavailable for bundle %s (sequence=%s) - treating as pending", + manifest["bundle_id"], + response.get("sequence_number"), + ) + async_failed = True else: logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") logger.info(f"[remote_upload] Processed operations: {processed_ops}") @@ -2225,7 +2234,8 @@ def _process_pending_changes(self): if check_deletions: cached_file_hashes = _load_local_cache_file_hashes( self.client.workspace_path, - self.client.repo_name + self.client.repo_name, + metadata_root=self.client.metadata_root, ) cached_paths = [Path(p) for p in cached_file_hashes.keys()] all_paths = list(set(pending + cached_paths)) @@ -2335,7 +2345,11 @@ def _watch_loop_polling(self, interval: int = 5): path_map[resolved] = p # Include any paths that are only present in the local cache (deleted files) - cached_file_hashes = _load_local_cache_file_hashes(self.workspace_path, self.repo_name) + cached_file_hashes = _load_local_cache_file_hashes( + self.workspace_path, + self.repo_name, + metadata_root=self.metadata_root, + ) for cached_abs in cached_file_hashes.keys(): try: cached_path = Path(cached_abs) diff --git a/scripts/watch_index_core/handler.py b/scripts/watch_index_core/handler.py index 7827ad21..5636d1ed 100644 --- a/scripts/watch_index_core/handler.py +++ b/scripts/watch_index_core/handler.py @@ -171,6 +171,15 @@ def on_moved(self, event): deleted = False if self.client is not None and coll is not None: idx.delete_points_by_path(self.client, coll, str(src)) + # Clean up graph edges for the moved file + try: + idx.delete_graph_edges_by_path( + self.client, + 
f"{coll}_graph", + caller_path=str(src), + ) + except Exception: + pass # Graph cleanup is best-effort deleted = True if deleted: safe_print(f"[moved:external_to_internal] deleted {src}") @@ -199,6 +208,15 @@ def on_moved(self, event): deleted = False if self.client is not None and coll is not None: idx.delete_points_by_path(self.client, coll, str(src)) + # Clean up graph edges for the moved file + try: + idx.delete_graph_edges_by_path( + self.client, + f"{coll}_graph", + caller_path=str(src), + ) + except Exception: + pass # Graph cleanup is best-effort deleted = True if deleted: safe_print(f"[moved:ignored_dest_deleted_src] {src} -> {dest}") diff --git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index b13b0997..223081c4 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -37,7 +37,7 @@ ) from . import config as watch_config from .rename import _rename_in_store -from .paths import is_internal_top_level_path +from .paths import is_internal_metadata_path from .utils import ( _detect_repo_for_file, @@ -60,7 +60,7 @@ def __init__(self, *, text: Optional[str] = None, file_hash: str = "") -> None: def _is_internal_ignored_path(path: Path) -> bool: - return is_internal_top_level_path(path, watch_config.ROOT) + return is_internal_metadata_path(path, watch_config.ROOT) def _staging_requires_subprocess(state: Optional[Dict[str, object]]) -> bool: From 99e94330367cd7786172e0ed322d257ab3c2ffa4 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:39:42 +0000 Subject: [PATCH 33/39] refactor(upload_service): extract duplicated collection resolution logic - Add _resolve_collection_for_request() helper to consolidate collection name resolution - Replace duplicated blocks in plan_delta, apply_delta_ops, and upload_delta_bundle - Logic handles: repo name derivation, logical repo ID resolution, latent migration, and fallbacks 
Co-authored-by: voarsh2 --- scripts/upload_service.py | 257 +++++++++++++------------------------- 1 file changed, 89 insertions(+), 168 deletions(-) diff --git a/scripts/upload_service.py b/scripts/upload_service.py index 113715fa..7e2ac6bb 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -1526,6 +1526,79 @@ async def get_status(workspace_path: str): raise HTTPException(status_code=500, detail=str(e)) +def _resolve_collection_for_request( + workspace_path: str, + client_collection_name: Optional[str], + logical_repo_id: Optional[str], +) -> Tuple[str, Optional[str]]: + """ + Resolve collection name and repo_name for upload/plan/apply requests. + + Returns: + Tuple of (collection_name, repo_name) + """ + # Resolve collection name for ACL enforcement + collection_name: Optional[str] = None + repo_name: Optional[str] = None + + if _extract_repo_name_from_path or (get_collection_name and logical_repo_reuse_enabled and find_collection_for_logical_repo): + # Always derive repo_name from workspace_path for origin tracking + repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None + if not repo_name: + repo_name = Path(workspace_path).name + + # Preserve any client-supplied collection name but allow server-side overrides + resolved_collection: Optional[str] = None + + # Resolve collection name, preferring server-side mapping for logical_repo_id when enabled + if logical_repo_reuse_enabled() and logical_repo_id and find_collection_for_logical_repo: + try: + existing = find_collection_for_logical_repo(logical_repo_id, search_root=WORK_DIR) + except Exception: + existing = None + if existing: + resolved_collection = existing + + # Latent migration: when no explicit mapping exists yet for this logical_repo_id, but there is a + # single existing collection mapping, prefer reusing it rather than creating a fresh collection. 
+ if logical_repo_reuse_enabled() and logical_repo_id and resolved_collection is None and get_collection_mappings: + try: + mappings = get_collection_mappings(search_root=WORK_DIR) or [] + except Exception: + mappings = [] + + if len(mappings) == 1: + canonical = mappings[0] + canonical_coll = canonical.get("collection_name") + if canonical_coll: + resolved_collection = canonical_coll + if update_workspace_state: + try: + update_workspace_state( + workspace_path=canonical.get("container_path") or canonical.get("state_file"), + updates={"logical_repo_id": logical_repo_id}, + repo_name=canonical.get("repo_name"), + ) + except Exception as migrate_err: + logger.debug( + f"[upload_service] Failed to migrate logical_repo_id for existing mapping: {migrate_err}" + ) + + # Finalize collection_name: prefer resolved server-side mapping, then client-supplied name, + # then standard get_collection_name/DEFAULT_COLLECTION fallbacks. + if resolved_collection is not None: + collection_name = resolved_collection + elif client_collection_name: + collection_name = client_collection_name + else: + if get_collection_name and repo_name: + collection_name = get_collection_name(repo_name) + else: + collection_name = DEFAULT_COLLECTION + + return collection_name, repo_name + + @app.post("/api/v1/delta/plan", response_model=PlanResponse) async def plan_delta(request: PlanRequest): """Plan which file bodies are needed before uploading content.""" @@ -1551,63 +1624,11 @@ async def plan_delta(request: PlanRequest): ) # Resolve collection name for ACL enforcement - collection_name: Optional[str] = None - if _extract_repo_name_from_path or (get_collection_name and logical_repo_reuse_enabled and find_collection_for_logical_repo): - # Always derive repo_name from workspace_path for origin tracking - repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None - if not repo_name: - repo_name = Path(workspace_path).name - - # Preserve any client-supplied 
collection name but allow server-side overrides - client_collection_name = request.collection_name - resolved_collection: Optional[str] = None - logical_repo_id = request.logical_repo_id - - # Resolve collection name, preferring server-side mapping for logical_repo_id when enabled - if logical_repo_reuse_enabled() and logical_repo_id and find_collection_for_logical_repo: - try: - existing = find_collection_for_logical_repo(logical_repo_id, search_root=WORK_DIR) - except Exception: - existing = None - if existing: - resolved_collection = existing - - # Latent migration: when no explicit mapping exists yet for this logical_repo_id, but there is a - # single existing collection mapping, prefer reusing it rather than creating a fresh collection. - if logical_repo_reuse_enabled() and logical_repo_id and resolved_collection is None and get_collection_mappings: - try: - mappings = get_collection_mappings(search_root=WORK_DIR) or [] - except Exception: - mappings = [] - - if len(mappings) == 1: - canonical = mappings[0] - canonical_coll = canonical.get("collection_name") - if canonical_coll: - resolved_collection = canonical_coll - if update_workspace_state: - try: - update_workspace_state( - workspace_path=canonical.get("container_path") or canonical.get("state_file"), - updates={"logical_repo_id": logical_repo_id}, - repo_name=canonical.get("repo_name"), - ) - except Exception as migrate_err: - logger.debug( - f"[upload_service] Failed to migrate logical_repo_id for existing mapping: {migrate_err}" - ) - - # Finalize collection_name: prefer resolved server-side mapping, then client-supplied name, - # then standard get_collection_name/DEFAULT_COLLECTION fallbacks. 
- if resolved_collection is not None: - collection_name = resolved_collection - elif client_collection_name: - collection_name = client_collection_name - else: - if get_collection_name and repo_name: - collection_name = get_collection_name(repo_name) - else: - collection_name = DEFAULT_COLLECTION + collection_name, repo_name = _resolve_collection_for_request( + workspace_path=workspace_path, + client_collection_name=request.collection_name, + logical_repo_id=request.logical_repo_id, + ) # Enforce collection write access for plan/apply when auth is enabled if AUTH_ENABLED and CTXCE_MCP_ACL_ENFORCE and collection_name: @@ -1708,63 +1729,11 @@ async def apply_delta_ops(request: ApplyOperationsRequest): ) # Resolve collection name for ACL enforcement - collection_name: Optional[str] = None - if _extract_repo_name_from_path or (get_collection_name and logical_repo_reuse_enabled and find_collection_for_logical_repo): - # Always derive repo_name from workspace_path for origin tracking - repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None - if not repo_name: - repo_name = Path(workspace_path).name - - # Preserve any client-supplied collection name but allow server-side overrides - client_collection_name = request.collection_name - resolved_collection: Optional[str] = None - logical_repo_id = request.logical_repo_id - - # Resolve collection name, preferring server-side mapping for logical_repo_id when enabled - if logical_repo_reuse_enabled() and logical_repo_id and find_collection_for_logical_repo: - try: - existing = find_collection_for_logical_repo(logical_repo_id, search_root=WORK_DIR) - except Exception: - existing = None - if existing: - resolved_collection = existing - - # Latent migration: when no explicit mapping exists yet for this logical_repo_id, but there is a - # single existing collection mapping, prefer reusing it rather than creating a fresh collection. 
- if logical_repo_reuse_enabled() and logical_repo_id and resolved_collection is None and get_collection_mappings: - try: - mappings = get_collection_mappings(search_root=WORK_DIR) or [] - except Exception: - mappings = [] - - if len(mappings) == 1: - canonical = mappings[0] - canonical_coll = canonical.get("collection_name") - if canonical_coll: - resolved_collection = canonical_coll - if update_workspace_state: - try: - update_workspace_state( - workspace_path=canonical.get("container_path") or canonical.get("state_file"), - updates={"logical_repo_id": logical_repo_id}, - repo_name=canonical.get("repo_name"), - ) - except Exception as migrate_err: - logger.debug( - f"[upload_service] Failed to migrate logical_repo_id for existing mapping: {migrate_err}" - ) - - # Finalize collection_name: prefer resolved server-side mapping, then client-supplied name, - # then standard get_collection_name/DEFAULT_COLLECTION fallbacks. - if resolved_collection is not None: - collection_name = resolved_collection - elif client_collection_name: - collection_name = client_collection_name - else: - if get_collection_name and repo_name: - collection_name = get_collection_name(repo_name) - else: - collection_name = DEFAULT_COLLECTION + collection_name, repo_name = _resolve_collection_for_request( + workspace_path=workspace_path, + client_collection_name=request.collection_name, + logical_repo_id=request.logical_repo_id, + ) # Enforce collection write access for plan/apply when auth is enabled if AUTH_ENABLED and CTXCE_MCP_ACL_ENFORCE and collection_name: @@ -1916,60 +1885,12 @@ async def upload_delta_bundle( workspace_path = str(workspace.resolve()) - # Always derive repo_name from workspace_path for origin tracking - repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None - if not repo_name: - repo_name = Path(workspace_path).name - - # Preserve any client-supplied collection name but allow server-side overrides - client_collection_name = 
collection_name - resolved_collection: Optional[str] = None - - # Resolve collection name, preferring server-side mapping for logical_repo_id when enabled - if logical_repo_reuse_enabled() and logical_repo_id and find_collection_for_logical_repo: - try: - existing = find_collection_for_logical_repo(logical_repo_id, search_root=WORK_DIR) - except Exception: - existing = None - if existing: - resolved_collection = existing - - # Latent migration: when no explicit mapping exists yet for this logical_repo_id, but there is a - # single existing collection mapping, prefer reusing it rather than creating a fresh collection. - if logical_repo_reuse_enabled() and logical_repo_id and resolved_collection is None and get_collection_mappings: - try: - mappings = get_collection_mappings(search_root=WORK_DIR) or [] - except Exception: - mappings = [] - - if len(mappings) == 1: - canonical = mappings[0] - canonical_coll = canonical.get("collection_name") - if canonical_coll: - resolved_collection = canonical_coll - if update_workspace_state: - try: - update_workspace_state( - workspace_path=canonical.get("container_path") or canonical.get("state_file"), - updates={"logical_repo_id": logical_repo_id}, - repo_name=canonical.get("repo_name"), - ) - except Exception as migrate_err: - logger.debug( - f"[upload_service] Failed to migrate logical_repo_id for existing mapping: {migrate_err}" - ) - - # Finalize collection_name: prefer resolved server-side mapping, then client-supplied name, - # then standard get_collection_name/DEFAULT_COLLECTION fallbacks. 
- if resolved_collection is not None: - collection_name = resolved_collection - elif client_collection_name: - collection_name = client_collection_name - else: - if get_collection_name and repo_name: - collection_name = get_collection_name(repo_name) - else: - collection_name = DEFAULT_COLLECTION + # Resolve collection name and repo name + collection_name, repo_name = _resolve_collection_for_request( + workspace_path=workspace_path, + client_collection_name=collection_name, + logical_repo_id=logical_repo_id, + ) # Enforce collection write access for uploads when auth is enabled. # Semantics: "write" is sufficient for uploading/indexing content. From 37890c2116c03aa0d469ee7b2deaf02e82123da8 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:50:13 +0000 Subject: [PATCH 34/39] fix(upload,consistency): address CodeRabbit critical and major issues - Fix duplicate timer.start() call in remote_upload_client that would cause RuntimeError - Remove premature cache eviction during bundle construction (only evict after successful upload) - Fix hash-matched creates/updates being acknowledged locally without server processing - Handle unresolved async uploads properly (treat None as pending, not success) - Don't record successful sweep when sweep operations fail silently - Improve maintenance state updates to reduce race condition risk with documentation - Preserve journal retry state when re-enqueueing audit repairs to avoid losing error history Co-authored-by: voarsh2 --- scripts/remote_upload_client.py | 32 ++++++---- scripts/standalone_upload_client.py | 24 ++++++-- scripts/watch_index_core/consistency.py | 77 ++++++++++++++++++++++--- 3 files changed, 110 insertions(+), 23 deletions(-) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 3fc2483d..089f0258 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -1173,13 +1173,6 @@ def 
create_delta_bundle( } operations.append(operation) - # Once a delete operation has been recorded, drop the cache entry - # so subsequent scans do not keep re-reporting the same deletion. - try: - self._remove_cached_file(str(path.resolve())) - except Exception: - pass - except Exception as e: print(f"[bundle_create] Error processing deleted file {path}: {e}") continue @@ -1489,16 +1482,25 @@ def _apply_operations_without_content(self, changes: Dict[str, List], plan: Dict if not body.get("success", False): logger.warning("[remote_upload] apply_ops failed; falling back to bundle upload: %s", body.get("error")) return None - self._finalize_successful_changes(changes) + # Only finalize changes that were actually processed by the server + # apply_delta_operations only handles deleted/moved operations + processed_ops = body.get("processed_operations") or {} + applied_changes = { + "deleted": changes.get("deleted", []), + "moved": changes.get("moved", []), + "created": [], + "updated": [], + } + self._finalize_successful_changes(applied_changes) self._set_last_upload_result( "uploaded", bundle_id=body.get("bundle_id"), sequence_number=body.get("sequence_number"), - processed_operations=body.get("processed_operations"), + processed_operations=processed_ops, ) logger.info( "[remote_upload] Metadata-only operations applied: %s", - body.get("processed_operations") or {}, + processed_ops, ) return True except Exception as e: @@ -2032,7 +2034,14 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: manifest["bundle_id"], response.get("sequence_number"), ) - if async_result: + if async_result is None: + # Server didn't respond in time - treat as pending, not success + async_failed = True + logger.warning( + "[remote_upload] Async upload timed out awaiting server response for bundle %s", + manifest["bundle_id"], + ) + else: self.last_upload_result = async_result if async_result["outcome"] == "uploaded_async": 
self._finalize_successful_changes(planned_changes) @@ -2295,7 +2304,6 @@ def _process_pending_changes(self): self._process_pending_changes, ) self._debounce_timer.start() - self._debounce_timer.start() observer = Observer() diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index c4ceddf9..fe8342f0 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -1618,16 +1618,25 @@ def _apply_operations_without_content(self, changes: Dict[str, List], plan: Dict if not body.get("success", False): logger.warning("[remote_upload] apply_ops failed; falling back to bundle upload: %s", body.get("error")) return None - self._finalize_successful_changes(changes) + # Only finalize changes that were actually processed by the server + # apply_delta_operations only handles deleted/moved operations + processed_ops = body.get("processed_operations") or {} + applied_changes = { + "deleted": changes.get("deleted", []), + "moved": changes.get("moved", []), + "created": [], + "updated": [], + } + self._finalize_successful_changes(applied_changes) self._set_last_upload_result( "uploaded", bundle_id=body.get("bundle_id"), sequence_number=body.get("sequence_number"), - processed_operations=body.get("processed_operations"), + processed_operations=processed_ops, ) logger.info( "[remote_upload] Metadata-only operations applied: %s", - body.get("processed_operations") or {}, + processed_ops, ) return True except Exception as e: @@ -2161,7 +2170,14 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: manifest["bundle_id"], response.get("sequence_number"), ) - if async_result: + if async_result is None: + # Server didn't respond in time - treat as pending, not success + async_failed = True + logger.warning( + "[remote_upload] Async upload timed out awaiting server response for bundle %s", + manifest["bundle_id"], + ) + else: self.last_upload_result = async_result if async_result["outcome"] == 
"uploaded_async": self._finalize_successful_changes(planned_changes) diff --git a/scripts/watch_index_core/consistency.py b/scripts/watch_index_core/consistency.py index d475fbb7..59a3046f 100644 --- a/scripts/watch_index_core/consistency.py +++ b/scripts/watch_index_core/consistency.py @@ -104,12 +104,13 @@ def _should_run_consistency_audit(workspace_path: str, repo_name: Optional[str]) return age >= interval -def _sweep_empty_workspace_dirs(workspace_root: Path) -> None: +def _sweep_empty_workspace_dirs(workspace_root: Path) -> bool: + """Sweep empty workspace directories and return True if fully successful.""" protected_top_level = {".codebase", ".remote-git"} try: workspace_root = workspace_root.resolve() except Exception: - pass + return False try: for root, _dirnames, _filenames in os.walk(workspace_root, topdown=False): current = Path(root) @@ -128,9 +129,11 @@ def _sweep_empty_workspace_dirs(workspace_root: Path) -> None: continue current.rmdir() except Exception: - continue + # If any directory operation fails, the sweep was not fully successful + return False except Exception: - pass + return False + return True def _should_run_empty_dir_sweep(workspace_path: str, repo_name: Optional[str]) -> bool: @@ -153,9 +156,14 @@ def _should_run_empty_dir_sweep(workspace_path: str, repo_name: Optional[str]) - def _record_empty_dir_sweep(workspace_path: str, repo_name: Optional[str]) -> None: try: + # Read fresh state to get latest maintenance dict state = get_workspace_state(workspace_path=workspace_path, repo_name=repo_name) or {} + # Merge into existing maintenance dict rather than replacing it maintenance = dict(state.get("maintenance") or {}) maintenance["last_empty_dir_sweep_at"] = datetime.now(timezone.utc).isoformat() + # Note: This is a best-effort update. Concurrent updates to other maintenance + # fields could be lost due to shallow merging in update_workspace_state. + # For production use, consider adding a deep-merge helper or locking at a higher level. 
update_workspace_state( workspace_path=workspace_path, repo_name=repo_name, @@ -352,10 +360,15 @@ def _record_consistency_audit( summary: Dict[str, Any], ) -> None: try: + # Read fresh state to get latest maintenance dict state = get_workspace_state(workspace_path=workspace_path, repo_name=repo_name) or {} + # Merge into existing maintenance dict rather than replacing it maintenance = dict(state.get("maintenance") or {}) maintenance["last_consistency_audit_at"] = datetime.now(timezone.utc).isoformat() maintenance["last_consistency_audit_summary"] = summary + # Note: This is a best-effort update. Concurrent updates to other maintenance + # fields could be lost due to shallow merging in update_workspace_state. + # For production use, consider adding a deep-merge helper or locking at a higher level. update_workspace_state( workspace_path=workspace_path, repo_name=repo_name, @@ -429,9 +442,48 @@ def _enqueue_consistency_repairs( if not entries: return 0, 0 + + # Fetch existing journal entries to preserve retry state + existing_entries: Dict[str, Dict[str, Any]] = {} + try: + from scripts.workspace_state import list_pending_index_journal_entries + all_pending = list_pending_index_journal_entries( + workspace_path=workspace_path, + repo_name=repo_name, + ) + for entry in all_pending or []: + path = str(entry.get("path") or "") + if path: + existing_entries[path] = entry + except Exception: + pass # If we can't fetch existing entries, proceed without preserving state + + # Merge existing retry state into new entries where appropriate + merged_entries = [] + for entry in entries: + path = str(entry.get("path") or "") + existing = existing_entries.get(path) + + # Skip if already pending/in-progress to avoid duplicate work + if existing and existing.get("status") in {"pending", "in_progress"}: + continue + + # Preserve retry state from existing failed entries + if existing and existing.get("status") == "failed": + entry["attempts"] = existing.get("attempts", 0) + 
entry["last_error"] = existing.get("last_error") + # Keep created_at from existing entry to preserve original enqueue time + if existing.get("created_at"): + entry["created_at"] = existing["created_at"] + + merged_entries.append(entry) + + if not merged_entries: + return 0, 0 + try: upsert_index_journal_entries( - entries, + merged_entries, workspace_path=workspace_path, repo_name=repo_name, ) @@ -443,6 +495,10 @@ def _enqueue_consistency_repairs( exc, ) return 0, 0 + + # Return counts based on actually enqueued entries + enqueued_stale = sum(1 for e in merged_entries if e.get("op_type") == "delete") + enqueued_missing = sum(1 for e in merged_entries if e.get("op_type") == "upsert") return enqueued_stale, enqueued_missing @@ -559,8 +615,15 @@ def run_empty_dir_sweep_maintenance(root: Path) -> None: continue try: logger.info("[empty_dir_sweep] Sweeping empty directories under %s", workspace_path) - _sweep_empty_workspace_dirs(Path(workspace_path)) - _record_empty_dir_sweep(workspace_path, repo_name) + sweep_success = _sweep_empty_workspace_dirs(Path(workspace_path)) + if sweep_success: + _record_empty_dir_sweep(workspace_path, repo_name) + else: + logger.debug( + "[empty_dir_sweep] sweep had failures workspace=%s repo=%s - not recording success", + workspace_path, + repo_name, + ) except Exception as exc: logger.debug( "[empty_dir_sweep] failed workspace=%s repo=%s: %s", From 37349d41f1d1d2b8672c29b26ffb8d66d4915e08 Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Mar 2026 19:24:50 +0000 Subject: [PATCH 35/39] fix(watch,upload): restore internal path checks and async queued upload handling fix processor internal metadata path helper call signature repair remote upload async branch logic/syntax treat unresolved async status as queued (no premature finalize/flush) in remote + standalone clients keep async failure path explicit and terminal only on confirmed failed outcome --- scripts/remote_upload_client.py | 38 ++++++++++++++++++--------- 
scripts/standalone_upload_client.py | 31 ++++++++++++++++++---- scripts/watch_index_core/processor.py | 2 +- 3 files changed, 52 insertions(+), 19 deletions(-) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 089f0258..9bd9870a 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -2018,6 +2018,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: if response.get("success", False): async_failed = False + async_pending = False processed_ops = response.get("processed_operations") if processed_ops is None: logger.info( @@ -2035,22 +2036,23 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: response.get("sequence_number"), ) if async_result is None: - # Server didn't respond in time - treat as pending, not success - async_failed = True + # Server accepted the bundle but status is still pending. + async_pending = True logger.warning( "[remote_upload] Async upload timed out awaiting server response for bundle %s", manifest["bundle_id"], ) else: self.last_upload_result = async_result - if async_result["outcome"] == "uploaded_async": + outcome = str(async_result.get("outcome") or "") + if outcome == "uploaded_async": self._finalize_successful_changes(planned_changes) logger.info( "[remote_upload] Async processing completed for bundle %s: %s", manifest["bundle_id"], async_result.get("processed_operations") or {}, ) - elif async_result["outcome"] == "failed": + elif outcome == "failed": async_failed = True logger.error( "[remote_upload] Async processing failed for bundle %s: %s", @@ -2064,15 +2066,20 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: sequence_number=async_result.get("sequence_number") or response.get("sequence_number"), error=async_result.get("error"), ) - else: - # async_result is None - treat as pending/failed - # Don't finalize changes, keep bundle marked as queued - logger.warning( - "[remote_upload] Async upload 
result unavailable for bundle %s (sequence=%s) - treating as pending", - manifest["bundle_id"], - response.get("sequence_number"), - ) - async_failed = True + else: + async_pending = True + # Keep queued state for non-terminal async outcomes. + self._set_last_upload_result( + "queued", + bundle_id=async_result.get("bundle_id") or manifest["bundle_id"], + sequence_number=async_result.get("sequence_number") or response.get("sequence_number"), + ) + logger.warning( + "[remote_upload] Async upload still pending for bundle %s (sequence=%s, outcome=%s)", + manifest["bundle_id"], + response.get("sequence_number"), + outcome or "", + ) else: logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") logger.info(f"[remote_upload] Processed operations: {processed_ops}") @@ -2083,6 +2090,11 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: sequence_number=response.get("sequence_number"), processed_operations=processed_ops, ) + if async_pending: + logger.info( + "[remote_upload] Bundle %s accepted and queued; deferring local finalization", + manifest["bundle_id"], + ) # Clean up temporary bundle after successful upload try: diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index fe8342f0..68b5fe08 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -2154,6 +2154,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: if response.get("success", False): async_failed = False + async_pending = False processed_ops = response.get("processed_operations") if processed_ops is None: logger.info( @@ -2171,22 +2172,23 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: response.get("sequence_number"), ) if async_result is None: - # Server didn't respond in time - treat as pending, not success - async_failed = True + # Server accepted the bundle but status is still pending. 
+ async_pending = True logger.warning( "[remote_upload] Async upload timed out awaiting server response for bundle %s", manifest["bundle_id"], ) else: self.last_upload_result = async_result - if async_result["outcome"] == "uploaded_async": + outcome = str(async_result.get("outcome") or "") + if outcome == "uploaded_async": self._finalize_successful_changes(planned_changes) logger.info( "[remote_upload] Async processing completed for bundle %s: %s", manifest["bundle_id"], async_result.get("processed_operations") or {}, ) - elif async_result["outcome"] == "failed": + elif outcome == "failed": async_failed = True logger.error( "[remote_upload] Async processing failed for bundle %s: %s", @@ -2200,6 +2202,20 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: sequence_number=async_result.get("sequence_number") or response.get("sequence_number"), error=async_result.get("error"), ) + else: + async_pending = True + # Keep queued state for non-terminal async outcomes. + self._set_last_upload_result( + "queued", + bundle_id=async_result.get("bundle_id") or manifest["bundle_id"], + sequence_number=async_result.get("sequence_number") or response.get("sequence_number"), + ) + logger.warning( + "[remote_upload] Async upload still pending for bundle %s (sequence=%s, outcome=%s)", + manifest["bundle_id"], + response.get("sequence_number"), + outcome or "", + ) else: logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") logger.info(f"[remote_upload] Processed operations: {processed_ops}") @@ -2210,7 +2226,12 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: sequence_number=response.get("sequence_number"), processed_operations=processed_ops, ) - if not async_failed: + if async_pending: + logger.info( + "[remote_upload] Bundle %s accepted and queued; deferring local finalization", + manifest["bundle_id"], + ) + if not async_failed and not async_pending: flush_cached_file_hashes() # Clean up temporary 
bundle after successful upload diff --git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index 223081c4..5e979046 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -60,7 +60,7 @@ def __init__(self, *, text: Optional[str] = None, file_hash: str = "") -> None: def _is_internal_ignored_path(path: Path) -> bool: - return is_internal_metadata_path(path, watch_config.ROOT) + return is_internal_metadata_path(path) def _staging_requires_subprocess(state: Optional[Dict[str, object]]) -> bool: From 984838dde1ae6ae75ea202e3f570fc5bf7398826 Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Mar 2026 19:59:57 +0000 Subject: [PATCH 36/39] fix(ingest,watch,upload): improve error handling and smart reindex fallback logic - Fix smart reindex to fall back to full reindex when non-symbol changes detected instead of incorrectly skipping unchanged files - Add proper locking for workspace state updates to prevent race conditions - Fix graph edge deletion to use correct collection name (not _graph suffix) - Add graph edge cleanup when files are deleted - Track and report partial failures in upload service with proper status - Improve logging with logger.exception for better stack traces - Handle "error" status in addition to "failed" for upload status checks - Support extensionless files (Dockerfiles) in watch path detection - Preserve retry state (status, attempts, errors) when re-enqueueing repairs - Fix symbol extraction to use preloaded text when available --- scripts/ingest/pipeline.py | 72 +++++++++++++++++++++---- scripts/ingest_history.py | 31 +++++------ scripts/remote_upload_client.py | 14 ++++- scripts/standalone_upload_client.py | 2 +- scripts/upload_service.py | 65 +++++++++++++++++++--- scripts/watch_index_core/consistency.py | 60 +++++++++++---------- scripts/watch_index_core/handler.py | 12 ++++- scripts/workspace_state.py | 21 ++++++-- tests/test_smart_reindex_vectors.py | 7 +-- 9 files 
changed, 211 insertions(+), 73 deletions(-) diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index d3616e9c..5055a96c 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -275,6 +275,40 @@ def _sync_graph_edges_best_effort( pass +def _symbols_to_metadata_dict(language: str, text: str) -> dict: + """Build symbol metadata dict from in-memory source text.""" + symbols = {} + try: + symbols_list = _extract_symbols(language, text) + lines = text.split("\n") + for sym in symbols_list or []: + kind = str(sym.get("kind") or "") + name = str(sym.get("name") or "") + start = int(sym.get("start") or 0) + end = int(sym.get("end") or 0) + if not kind or not name or start <= 0 or end < start: + continue + symbol_id = f"{kind}_{name}_{start}" + content = "\n".join(lines[start - 1 : end]) + content_hash = hashlib.sha1( + content.encode("utf-8", errors="ignore") + ).hexdigest() + symbols[symbol_id] = { + "name": name, + "type": kind, + "start_line": start, + "end_line": end, + "content_hash": content_hash, + "content": content, + "pseudo": "", + "tags": [], + "qdrant_ids": [], + } + except Exception: + return {} + return symbols + + def build_information( language: str, path: Path, start: int, end: int, first_line: str ) -> str: @@ -304,6 +338,7 @@ def index_single_file( preloaded_language: str | None = None, ) -> bool: """Index a single file path. 
Returns True if indexed, False if skipped.""" + repo_for_graph = repo_name_for_cache or _detect_repo_name_from_path(file_path) try: if _should_skip_explicit_file_by_excluder(file_path): try: @@ -315,12 +350,14 @@ def index_single_file( client, collection, str(file_path), - repo_tag, + repo_for_graph, None, # No calls when file is excluded None, # No imports when file is excluded ) print(f"Skipping excluded file: {file_path}") return False + except NameError: + raise except Exception: return False @@ -383,7 +420,12 @@ def _index_single_file_inner( trust_cache = False fast_fs = _env_truthy(os.environ.get("INDEX_FS_FASTPATH"), False) - if skip_unchanged and fast_fs and get_cached_file_meta is not None: + if ( + preloaded_text is None + and skip_unchanged + and fast_fs + and get_cached_file_meta is not None + ): try: repo_for_cache = repo_name_for_cache or _detect_repo_name_from_path(file_path) meta = get_cached_file_meta(str(file_path), repo_for_cache) or {} @@ -444,7 +486,10 @@ def _index_single_file_inner( if get_cached_symbols and set_cached_symbols: cached_symbols = get_cached_symbols(str(file_path)) if cached_symbols: - current_symbols = extract_symbols_with_tree_sitter(str(file_path)) + if preloaded_text is not None: + current_symbols = _symbols_to_metadata_dict(language, preloaded_text) + else: + current_symbols = extract_symbols_with_tree_sitter(str(file_path)) _, changed = compare_symbol_changes(cached_symbols, current_symbols) for symbol_data in current_symbols.values(): symbol_id = f"{symbol_data['type']}_{symbol_data['name']}_{symbol_data['start_line']}" @@ -1000,12 +1045,14 @@ def process_file_with_smart_reindexing( client, current_collection, str(p), - repo_name_for_cache or _detect_repo_name_from_path(file_path), + per_file_repo or _detect_repo_name_from_path(file_path), None, # No calls when file is excluded None, # No imports when file is excluded ) print(f"[SMART_REINDEX] Skipping excluded file: {file_path}") return "skipped" + except NameError: + 
raise except Exception: return "skipped" @@ -1089,13 +1136,20 @@ def process_file_with_smart_reindexing( changed_set = set(changed_symbols) if len(changed_symbols) == 0 and cached_symbols: + prev_hash = None try: - if set_cached_file_hash: - set_cached_file_hash(fp, file_hash, per_file_repo) + if get_cached_file_hash: + prev_hash = get_cached_file_hash(fp, per_file_repo) except Exception: - pass - print(f"[SMART_REINDEX] {file_path}: 0 changes detected, skipping") - return "skipped" + prev_hash = None + if prev_hash and file_hash and prev_hash == file_hash: + print(f"[SMART_REINDEX] {file_path}: 0 changes detected, skipping") + return "skipped" + print( + f"[SMART_REINDEX] {file_path}: non-symbol change detected; " + "falling back to full reindex" + ) + return "failed" if model_dim and vector_name: try: diff --git a/scripts/ingest_history.py b/scripts/ingest_history.py index 06bea9d5..754487da 100644 --- a/scripts/ingest_history.py +++ b/scripts/ingest_history.py @@ -369,18 +369,18 @@ def _ingest_from_manifest( vec_name: str, include_body: bool, per_batch: int, -) -> int: +) -> tuple[int, bool]: try: with open(manifest_path, "r", encoding="utf-8") as f: data = json.load(f) except Exception as e: print(f"Failed to read manifest {manifest_path}: {e}") - return 0 + return 0, False commits = data.get("commits") or [] if not commits: print("No commits in manifest.") - return 0 + return 0, False run_id = _manifest_run_id(manifest_path) mode = str(data.get("mode") or "delta").strip().lower() or "delta" @@ -501,7 +501,7 @@ def _log_progress(force: bool = False) -> None: persisted_count += batch_size except Exception as e: upsert_failures += batch_size - logger.error( + logger.exception( "[ingest_history] upsert batch failed (size=%d): %s", batch_size, e, @@ -526,22 +526,22 @@ def _log_progress(force: bool = False) -> None: persisted_count += batch_size except Exception as e: upsert_failures += batch_size - logger.error( + logger.exception( "[ingest_history] final upsert 
failed (size=%d): %s", batch_size, e, ) _log_progress(force=True) - # Only prune snapshot runs that completed cleanly - prune_safe = ( - mode == "snapshot" - and prepared_count > 0 + ingest_successful = ( + prepared_count > 0 and invalid_commit_records == 0 and embed_failures == 0 and point_build_failures == 0 and upsert_failures == 0 and persisted_count == prepared_count ) + # Only prune snapshot runs that completed cleanly + prune_safe = mode == "snapshot" and ingest_successful if prune_safe: try: _prune_old_commit_points(client, run_id, mode=mode) @@ -554,14 +554,7 @@ def _log_progress(force: bool = False) -> None: ) # Only cleanup manifest if ingest completed successfully - ingest_complete = ( - prepared_count > 0 - and invalid_commit_records == 0 - and embed_failures == 0 - and point_build_failures == 0 - and upsert_failures == 0 - and persisted_count == prepared_count - ) + ingest_complete = ingest_successful if ingest_complete: try: _cleanup_manifest_files(manifest_path) @@ -710,7 +703,7 @@ def main(): persisted_count += batch_size except Exception as e: upsert_failures += batch_size - logger.error( + logger.exception( "[ingest_history] batch upsert failed collection=%s repo=%s size=%d path=%s: %s", COLLECTION, REPO_NAME, @@ -727,7 +720,7 @@ def main(): persisted_count += final_size except Exception as e: upsert_failures += final_size - logger.error( + logger.exception( "[ingest_history] final upsert failed collection=%s repo=%s size=%d path=%s: %s", COLLECTION, REPO_NAME, diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 9bd9870a..1f37edcd 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -750,7 +750,7 @@ def _await_async_upload_result( "processed_operations": server_info.get("last_processed_operations"), "processing_time_ms": server_info.get("last_processing_time_ms"), } - if last_upload_status == "failed": + if last_upload_status in ("failed", "error"): return { "outcome": "failed", 
"bundle_id": last_bundle_id or bundle_id, @@ -841,7 +841,17 @@ def _is_ignored_path(self, path: Path) -> bool: def _is_watchable_path(self, path: Path) -> bool: """Return True when a filesystem event path is eligible for upload processing.""" - return not self._is_ignored_path(path) and idx.CODE_EXTS.get(path.suffix.lower(), "unknown") != "unknown" + if self._is_ignored_path(path): + return False + suffix = path.suffix.lower() + if idx.CODE_EXTS.get(suffix, "unknown") != "unknown": + return True + name = path.name.lower() + try: + extensionless_names = {k.lower() for k in (idx.EXTENSIONLESS_FILES or {}).keys()} + except Exception: + extensionless_names = set() + return name in extensionless_names or name.startswith("dockerfile") def _get_temp_bundle_dir(self) -> Path: """Get or create temporary directory for bundle creation.""" diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 68b5fe08..e9a3df9b 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -889,7 +889,7 @@ def _await_async_upload_result( "processed_operations": server_info.get("last_processed_operations"), "processing_time_ms": server_info.get("last_processing_time_ms"), } - if last_upload_status == "failed": + if last_upload_status in ("failed", "error"): return { "outcome": "failed", "bundle_id": last_bundle_id or bundle_id, diff --git a/scripts/upload_service.py b/scripts/upload_service.py index 7e2ac6bb..f42985ec 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -539,6 +539,14 @@ async def _process_bundle_background( process_delta_bundle, workspace_path, bundle_path, manifest ) processing_time = int((datetime.now() - start_time).total_seconds() * 1000) + failed_count = int((operations_count or {}).get("failed") or 0) + applied_count = int( + (operations_count or {}).get("created", 0) + + (operations_count or {}).get("updated", 0) + + (operations_count or {}).get("deleted", 0) + + (operations_count 
or {}).get("moved", 0) + ) + status_value = "completed" if failed_count == 0 else "failed" if sequence_number is not None: _sequence_tracker[key] = sequence_number _upload_result_tracker[key] = { @@ -547,7 +555,9 @@ async def _process_bundle_background( "sequence_number": sequence_number, "processed_operations": operations_count, "processing_time_ms": processing_time, - "status": "completed", + "status": status_value, + "failed_count": failed_count, + "partial": bool(failed_count > 0 and applied_count > 0), "completed_at": datetime.now().isoformat(), } if log_activity: @@ -565,10 +575,21 @@ async def _process_bundle_background( ) except Exception as activity_err: logger.debug(f"[upload_service] Failed to log activity for bundle {bundle_id}: {activity_err}") - logger.info( - f"[upload_service] Finished processing bundle {bundle_id} seq {sequence_number} " - f"in {processing_time}ms ops={operations_count}" - ) + if failed_count > 0: + logger.warning( + "[upload_service] Finished processing bundle %s seq %s with failures in %sms " + "failed=%d ops=%s", + bundle_id, + sequence_number, + processing_time, + failed_count, + operations_count, + ) + else: + logger.info( + f"[upload_service] Finished processing bundle {bundle_id} seq {sequence_number} " + f"in {processing_time}ms ops={operations_count}" + ) except Exception as e: _upload_result_tracker[key] = { "workspace_path": workspace_path, @@ -1790,6 +1811,14 @@ async def apply_delta_ops(request: ApplyOperationsRequest): request.file_hashes, ) processing_time = int((datetime.now() - start_time).total_seconds() * 1000) + failed_count = int((operations_count or {}).get("failed") or 0) + applied_count = int( + (operations_count or {}).get("created", 0) + + (operations_count or {}).get("updated", 0) + + (operations_count or {}).get("deleted", 0) + + (operations_count or {}).get("moved", 0) + ) + status_value = "completed" if failed_count == 0 else "failed" _sequence_tracker[key] = sequence_number _upload_result_tracker[key] 
= { "workspace_path": workspace_path, @@ -1797,9 +1826,33 @@ async def apply_delta_ops(request: ApplyOperationsRequest): "sequence_number": sequence_number, "processed_operations": operations_count, "processing_time_ms": processing_time, - "status": "completed", + "status": status_value, + "failed_count": failed_count, + "partial": bool(failed_count > 0 and applied_count > 0), "completed_at": datetime.now().isoformat(), } + if failed_count > 0: + logger.warning( + "[upload_service] apply_ops completed with failures bundle=%s seq=%s failed=%d ops=%s", + bundle_id, + sequence_number, + failed_count, + operations_count, + ) + return UploadResponse( + success=False, + bundle_id=bundle_id, + sequence_number=sequence_number, + processed_operations=operations_count, + processing_time_ms=processing_time, + next_sequence=sequence_number + 1 if sequence_number is not None else None, + error={ + "code": "APPLY_OPS_PARTIAL_FAILURE", + "message": f"One or more operations failed during apply_ops (failed={failed_count})", + "failed_count": failed_count, + "processed_operations": operations_count, + }, + ) logger.info( "[upload_service] Applied metadata-only operations bundle=%s seq=%s in %sms ops=%s", bundle_id, diff --git a/scripts/watch_index_core/consistency.py b/scripts/watch_index_core/consistency.py index 59a3046f..223f28bf 100644 --- a/scripts/watch_index_core/consistency.py +++ b/scripts/watch_index_core/consistency.py @@ -10,6 +10,7 @@ import scripts.ingest_code as idx from scripts.workspace_state import ( + _get_state_lock, _extract_repo_name_from_path, _normalize_cache_key_path, get_collection_state_snapshot, @@ -156,19 +157,21 @@ def _should_run_empty_dir_sweep(workspace_path: str, repo_name: Optional[str]) - def _record_empty_dir_sweep(workspace_path: str, repo_name: Optional[str]) -> None: try: - # Read fresh state to get latest maintenance dict - state = get_workspace_state(workspace_path=workspace_path, repo_name=repo_name) or {} - # Merge into existing 
maintenance dict rather than replacing it - maintenance = dict(state.get("maintenance") or {}) - maintenance["last_empty_dir_sweep_at"] = datetime.now(timezone.utc).isoformat() - # Note: This is a best-effort update. Concurrent updates to other maintenance - # fields could be lost due to shallow merging in update_workspace_state. - # For production use, consider adding a deep-merge helper or locking at a higher level. - update_workspace_state( - workspace_path=workspace_path, - repo_name=repo_name, - updates={"maintenance": maintenance}, - ) + lock = _get_state_lock(workspace_path, repo_name) + with lock: + state = get_workspace_state( + workspace_path=workspace_path, + repo_name=repo_name, + ) or {} + maintenance = dict(state.get("maintenance") or {}) + maintenance["last_empty_dir_sweep_at"] = datetime.now( + timezone.utc + ).isoformat() + update_workspace_state( + workspace_path=workspace_path, + repo_name=repo_name, + updates={"maintenance": maintenance}, + ) except Exception as exc: logger.warning( "Failed to record empty dir sweep timestamp: %s (workspace=%s, repo=%s)", @@ -360,20 +363,22 @@ def _record_consistency_audit( summary: Dict[str, Any], ) -> None: try: - # Read fresh state to get latest maintenance dict - state = get_workspace_state(workspace_path=workspace_path, repo_name=repo_name) or {} - # Merge into existing maintenance dict rather than replacing it - maintenance = dict(state.get("maintenance") or {}) - maintenance["last_consistency_audit_at"] = datetime.now(timezone.utc).isoformat() - maintenance["last_consistency_audit_summary"] = summary - # Note: This is a best-effort update. Concurrent updates to other maintenance - # fields could be lost due to shallow merging in update_workspace_state. - # For production use, consider adding a deep-merge helper or locking at a higher level. 
- update_workspace_state( - workspace_path=workspace_path, - repo_name=repo_name, - updates={"maintenance": maintenance}, - ) + lock = _get_state_lock(workspace_path, repo_name) + with lock: + state = get_workspace_state( + workspace_path=workspace_path, + repo_name=repo_name, + ) or {} + maintenance = dict(state.get("maintenance") or {}) + maintenance["last_consistency_audit_at"] = datetime.now( + timezone.utc + ).isoformat() + maintenance["last_consistency_audit_summary"] = summary + update_workspace_state( + workspace_path=workspace_path, + repo_name=repo_name, + updates={"maintenance": maintenance}, + ) except Exception as exc: logger.warning( "Failed to record consistency audit: %s (workspace=%s, repo=%s)", @@ -470,6 +475,7 @@ def _enqueue_consistency_repairs( # Preserve retry state from existing failed entries if existing and existing.get("status") == "failed": + entry["status"] = "failed" entry["attempts"] = existing.get("attempts", 0) entry["last_error"] = existing.get("last_error") # Keep created_at from existing entry to preserve original enqueue time diff --git a/scripts/watch_index_core/handler.py b/scripts/watch_index_core/handler.py index 5636d1ed..36c4d457 100644 --- a/scripts/watch_index_core/handler.py +++ b/scripts/watch_index_core/handler.py @@ -175,7 +175,7 @@ def on_moved(self, event): try: idx.delete_graph_edges_by_path( self.client, - f"{coll}_graph", + coll, caller_path=str(src), ) except Exception: @@ -212,7 +212,7 @@ def on_moved(self, event): try: idx.delete_graph_edges_by_path( self.client, - f"{coll}_graph", + coll, caller_path=str(src), ) except Exception: @@ -308,6 +308,14 @@ def _delete_points(self, path: Path, collection: str | None) -> None: return try: idx.delete_points_by_path(self.client, collection, str(path)) + try: + idx.delete_graph_edges_by_path( + self.client, + collection, + caller_path=str(path), + ) + except Exception: + pass safe_print(f"[deleted] {path} -> {collection}") except Exception: pass diff --git 
a/scripts/workspace_state.py b/scripts/workspace_state.py index 0ae2a97c..4311a12f 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -1768,22 +1768,35 @@ def upsert_index_journal_entries( """Persist or replace repo-scoped index journal entries keyed by normalized path.""" normalized_entries: List[IndexJournalRecord] = [] now = datetime.now().isoformat() + valid_statuses = {"pending", "in_progress", "failed", "done"} for entry in entries or []: path = _normalize_cache_key_path(str(entry.get("path") or "")) op_type = str(entry.get("op_type") or "").strip().lower() if not path or op_type not in {"upsert", "delete"}: continue content_hash = str(entry.get("content_hash") or "").strip() or None + status = str(entry.get("status") or "pending").strip().lower() + if status not in valid_statuses: + status = "pending" + try: + attempts = int(entry.get("attempts", 0) or 0) + except Exception: + attempts = 0 + if attempts < 0: + attempts = 0 + last_error = entry.get("last_error") + if last_error is not None: + last_error = str(last_error) normalized_entries.append( { "path": path, "op_type": op_type, "content_hash": content_hash, - "status": "pending", - "attempts": 0, + "status": status, + "attempts": attempts, "created_at": str(entry.get("created_at") or now), - "updated_at": now, - "last_error": None, + "updated_at": str(entry.get("updated_at") or now), + "last_error": last_error, } ) diff --git a/tests/test_smart_reindex_vectors.py b/tests/test_smart_reindex_vectors.py index 5e5d4b9f..aca0e819 100644 --- a/tests/test_smart_reindex_vectors.py +++ b/tests/test_smart_reindex_vectors.py @@ -466,7 +466,7 @@ def fake_upsert_points(_client, _collection, points): assert out_vec == embedded_vec -def test_smart_reindex_updates_cached_hash_on_no_symbol_changes(tmp_path, monkeypatch): +def test_smart_reindex_no_symbol_changes_falls_back_without_hash_cache(tmp_path, monkeypatch): monkeypatch.setitem(sys.modules, "fastembed", 
SimpleNamespace(TextEmbedding=object)) from scripts.ingest import pipeline as ingest_pipeline @@ -486,6 +486,7 @@ def test_smart_reindex_updates_cached_hash_on_no_symbol_changes(tmp_path, monkey lambda _fp: {"function_hi_1": {"name": "hi", "type": "function", "start_line": 1}}, ) monkeypatch.setattr(ingest_pipeline, "compare_symbol_changes", lambda *_: ([], [])) + monkeypatch.setattr(ingest_pipeline, "get_cached_file_hash", lambda *_: None) set_cached_file_hash = MagicMock() monkeypatch.setattr(ingest_pipeline, "set_cached_file_hash", set_cached_file_hash) @@ -500,5 +501,5 @@ def test_smart_reindex_updates_cached_hash_on_no_symbol_changes(tmp_path, monkey vector_name="dense", ) - assert status == "skipped" - set_cached_file_hash.assert_called_once() + assert status == "failed" + set_cached_file_hash.assert_not_called() From b349a74389471b77776bed22cc5340ead369ec0c Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Mar 2026 20:07:44 +0000 Subject: [PATCH 37/39] fix(ingest,upload,search): improve error handling and fix edge cases - Add proper exception logging in cli.py cache clearing instead of silent pass - Improve graph_edges.py delete operation with status validation and UnexpectedResponse handling - Fix pseudo_applied flag assignment typo in pipeline.py (cd -> ch) - Add regex error handling with logging in search.py path filter - Pass full item to path filter check instead of partial dict - Fix case-folding index calculation in path_scope.py for accurate path extraction - Handle ValueError for invalid paths in upload_delta_bundle.py - Fix variable shadowing (status -> workspace_status) in upload_service.py - Add collection name validation before ACL enforcement - Only update sequence tracker when operations are actually applied - Remove unnecessary exception re-raising in watch processor graph cleanup - Fix mock function in test_upload_service_status.py - Add Windows compatibility for permission tests - Add optional directory checks in VSCode extension build 
script --- scripts/ingest/cli.py | 14 ++++-- scripts/ingest/graph_edges.py | 44 ++++++++++++++++--- scripts/ingest/pipeline.py | 3 +- scripts/mcp_impl/search.py | 19 ++++++-- scripts/path_scope.py | 15 ++++++- scripts/upload_delta_bundle.py | 11 ++++- scripts/upload_service.py | 21 ++++++--- scripts/watch_index_core/paths.py | 2 + scripts/watch_index_core/processor.py | 3 -- tests/test_upload_service_status.py | 2 +- tests/test_workspace_state.py | 8 +++- vscode-extension/build/build.sh | 14 ++++-- .../context-engine-uploader/mcp_bridge.js | 1 - 13 files changed, 125 insertions(+), 32 deletions(-) diff --git a/scripts/ingest/cli.py b/scripts/ingest/cli.py index 676ee77a..0a956ca4 100644 --- a/scripts/ingest/cli.py +++ b/scripts/ingest/cli.py @@ -9,17 +9,19 @@ import os import argparse +import logging from pathlib import Path from scripts.ingest.config import ( is_multi_repo_mode, get_collection_name, ) -from scripts import workspace_state as _ws from scripts.collection_health import clear_indexing_caches as _clear_indexing_caches_impl from scripts.ingest.pipeline import index_repo from scripts.ingest.pseudo import generate_pseudo_tags +logger = logging.getLogger(__name__) + def parse_args(): """Parse command-line arguments.""" @@ -196,8 +198,14 @@ def main(): def _clear_indexing_caches(workspace_root: Path, repo_name: str | None) -> None: try: _clear_indexing_caches_impl(str(workspace_root), repo_name=repo_name) - except Exception: - pass + except Exception as e: + logger.warning( + "Failed to clear indexing caches for workspace=%s repo=%s: %s", + workspace_root, + repo_name, + e, + exc_info=True, + ) qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333") api_key = os.environ.get("QDRANT_API_KEY") diff --git a/scripts/ingest/graph_edges.py b/scripts/ingest/graph_edges.py index e3b5cb66..e9911822 100644 --- a/scripts/ingest/graph_edges.py +++ b/scripts/ingest/graph_edges.py @@ -138,8 +138,14 @@ def _detect_vector_mode(info: Any) -> str: field_name=field, 
field_schema=qmodels.PayloadSchemaType.KEYWORD, ) - except Exception: - pass + except Exception as e: + logger.debug( + "Failed to create graph payload index '%s' for %s: %s", + field, + graph_coll, + e, + exc_info=True, + ) _ENSURED_GRAPH_COLLECTIONS.add(graph_coll) _MISSING_GRAPH_COLLECTIONS.discard(graph_coll) @@ -244,6 +250,7 @@ def delete_edges_by_path( caller_path: str, repo: str | None = None, ) -> int: + from qdrant_client.http.exceptions import UnexpectedResponse graph_coll = get_graph_collection_name(base_collection) if graph_coll in _MISSING_GRAPH_COLLECTIONS: return 0 @@ -264,15 +271,38 @@ def delete_edges_by_path( ) try: - client.delete( + resp = client.delete( collection_name=graph_coll, points_selector=qmodels.FilterSelector(filter=qmodels.Filter(must=must)), ) - return 1 - except Exception as e: - err = str(e).lower() - if "404" in err or "doesn't exist" in err or "not found" in err: + result_status = getattr(getattr(resp, "result", None), "status", None) + if result_status is None: + result_status = getattr(resp, "status", None) + if result_status is None: + return 1 + status_s = str(result_status).strip().lower() + return 1 if status_s in {"acknowledged", "completed", "ok", "success"} else 0 + except UnexpectedResponse as e: + if getattr(e, "status_code", None) == 404: _MISSING_GRAPH_COLLECTIONS.add(graph_coll) + return 0 + logger.debug( + "Graph edge delete failed for %s in %s (status=%s): %s", + norm_path, + graph_coll, + getattr(e, "status_code", None), + e, + exc_info=True, + ) + return 0 + except Exception as e: + logger.debug( + "Graph edge delete failed for %s in %s: %s", + norm_path, + graph_coll, + e, + exc_info=True, + ) return 0 diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index 5055a96c..9db00326 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -1391,6 +1391,7 @@ def _apply_symbol_pseudo( pseudo, tags, ) + ch["_pseudo_applied"] = True except Exception as e: print(f"[PSEUDO_BATCH] Smart 
reindex batch failed, falling back: {e}") use_batch_pseudo = False @@ -1419,7 +1420,7 @@ def _apply_symbol_pseudo( pseudo, tags, ) - cd["_pseudo_applied"] = True + ch["_pseudo_applied"] = True except Exception: pass diff --git a/scripts/mcp_impl/search.py b/scripts/mcp_impl/search.py index 1e654e73..9fb16414 100644 --- a/scripts/mcp_impl/search.py +++ b/scripts/mcp_impl/search.py @@ -562,8 +562,21 @@ def _result_passes_path_filters(item: dict) -> bool: try: if not any(_re.search(path_regex, pv, flags=flags) for pv in norm_paths): return False - except Exception: - pass + except _re.error as exc: + logger.warning( + "Invalid path_regex filter '%s': %s", + path_regex, + exc, + ) + return False + except Exception as exc: + logger.warning( + "Failed evaluating path_regex filter '%s': %s", + path_regex, + exc, + exc_info=True, + ) + return False if path_globs_norm and not any( _match_glob(g, pv) for g in path_globs_norm for pv in norm_paths @@ -706,7 +719,7 @@ def _apply_result_filters(items: list[dict]) -> list[dict]: for item in items: path = item.get("path") or "" - if not _result_passes_path_filters({"path": path}): + if not _result_passes_path_filters(item): continue payload = item.get("payload") or {} diff --git a/scripts/path_scope.py b/scripts/path_scope.py index 7eb76889..2150926c 100644 --- a/scripts/path_scope.py +++ b/scripts/path_scope.py @@ -143,9 +143,19 @@ def _path_forms(path: Any, repo_hint: Any = None) -> Set[str]: forms.add(tail) if repo: + def _cf_to_orig_idx(orig: str, cf_index: int) -> int: + if cf_index <= 0: + return 0 + acc = 0 + for i, ch in enumerate(orig): + nxt = acc + len(ch.casefold()) + if nxt > cf_index: + return i + acc = nxt + return len(orig) + repo_cf = repo.casefold() repo_prefix_cf = repo_cf + "/" - marker = "/" + repo + "/" marker_cf = "/" + repo_cf + "/" for f in list(forms): f_cf = f.casefold() @@ -153,7 +163,8 @@ def _path_forms(path: Any, repo_hint: Any = None) -> Set[str]: forms.add(f[len(repo) + 1 :]) idx = 
f_cf.find(marker_cf) if idx >= 0: - tail = f[idx + len(marker) :] + tail_start = _cf_to_orig_idx(f, idx + len(marker_cf)) + tail = f[tail_start:] if tail: forms.add(tail) diff --git a/scripts/upload_delta_bundle.py b/scripts/upload_delta_bundle.py index 245fedd8..4ccf2613 100644 --- a/scripts/upload_delta_bundle.py +++ b/scripts/upload_delta_bundle.py @@ -438,7 +438,16 @@ def plan_delta_upload( needs_content = False for slug, root in replica_roots.items(): - target_path = _safe_join(root, sanitized) + try: + target_path = _safe_join(root, sanitized) + except ValueError: + logger.warning( + "[upload_service] Invalid %s path during plan: %s (root=%s)", + op_type, + sanitized, + root, + ) + continue target_key = _normalize_cache_key_path(str(target_path)) cached_hash = replica_cache_hashes.get(slug, {}).get(target_key) if cached_hash != op_content_hash: diff --git a/scripts/upload_service.py b/scripts/upload_service.py index f42985ec..e1d4355d 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -1521,7 +1521,7 @@ async def get_status(workspace_path: str): last_upload = upload_result.get("completed_at") upload_status = str(upload_result.get("status") or "") - status = "processing" if upload_status == "processing" else "ready" + workspace_status = "processing" if upload_status == "processing" else "ready" return StatusResponse( workspace_path=workspace_path, @@ -1529,7 +1529,7 @@ async def get_status(workspace_path: str): last_sequence=last_sequence, last_upload=last_upload, pending_operations=0, - status=status, + status=workspace_status, server_info={ "version": "1.0.0", "max_bundle_size_mb": MAX_BUNDLE_SIZE_MB, @@ -1652,7 +1652,12 @@ async def plan_delta(request: PlanRequest): ) # Enforce collection write access for plan/apply when auth is enabled - if AUTH_ENABLED and CTXCE_MCP_ACL_ENFORCE and collection_name: + if AUTH_ENABLED and CTXCE_MCP_ACL_ENFORCE: + if not collection_name: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, 
+ detail="Collection resolution failed for ACL enforcement", + ) uid = str((record or {}).get("user_id") or "").strip() if not uid: raise HTTPException( @@ -1757,7 +1762,12 @@ async def apply_delta_ops(request: ApplyOperationsRequest): ) # Enforce collection write access for plan/apply when auth is enabled - if AUTH_ENABLED and CTXCE_MCP_ACL_ENFORCE and collection_name: + if AUTH_ENABLED and CTXCE_MCP_ACL_ENFORCE: + if not collection_name: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Collection resolution failed for ACL enforcement", + ) uid = str((record or {}).get("user_id") or "").strip() if not uid: raise HTTPException( @@ -1819,7 +1829,8 @@ async def apply_delta_ops(request: ApplyOperationsRequest): + (operations_count or {}).get("moved", 0) ) status_value = "completed" if failed_count == 0 else "failed" - _sequence_tracker[key] = sequence_number + if applied_count > 0: + _sequence_tracker[key] = sequence_number _upload_result_tracker[key] = { "workspace_path": workspace_path, "bundle_id": bundle_id, diff --git a/scripts/watch_index_core/paths.py b/scripts/watch_index_core/paths.py index 3ae7209d..2e76cfb9 100644 --- a/scripts/watch_index_core/paths.py +++ b/scripts/watch_index_core/paths.py @@ -13,6 +13,8 @@ def is_internal_metadata_path(path: Path) -> bool: """Return True when path points into watcher/internal metadata trees.""" try: + # Deliberately match internal segments anywhere in the path to prevent + # indexing of nested metadata mirrors (for example in replicated roots). 
if any(part in INTERNAL_STATE_TOP_LEVEL_DIRS for part in path.parts): return True global_state_dir = _get_global_state_dir() diff --git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index 5e979046..e8a469d1 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -833,9 +833,6 @@ def _process_paths( ) except Exception as graph_exc: safe_print(f"[deleted:graph_failed] {p} -> {collection}: {graph_exc}") - # Don't mark as deleted_ok if graph cleanup fails - deleted_ok = False - raise safe_print(f"[deleted] {p} -> {collection}") deleted_ok = True except Exception: diff --git a/tests/test_upload_service_status.py b/tests/test_upload_service_status.py index f40dba58..2955a830 100644 --- a/tests/test_upload_service_status.py +++ b/tests/test_upload_service_status.py @@ -75,7 +75,7 @@ def test_process_bundle_background_tracks_completed_operations(monkeypatch, tmp_ "failed": 0, }, ) - monkeypatch.setattr(srv, "log_activity", None) + monkeypatch.setattr(srv, "log_activity", lambda *a, **k: None) asyncio.run( srv._process_bundle_background( diff --git a/tests/test_workspace_state.py b/tests/test_workspace_state.py index 8b8fa442..9733799d 100644 --- a/tests/test_workspace_state.py +++ b/tests/test_workspace_state.py @@ -519,8 +519,12 @@ def test_symbol_cache_write_uses_cross_user_writable_mode(self, monkeypatch, tmp cache_path = ws_module._get_symbol_cache_path(str(file_path)) assert cache_path.exists() - assert oct(cache_path.parent.stat().st_mode & 0o777) == "0o777" - assert oct(cache_path.stat().st_mode & 0o777) == "0o666" + if os.name == "nt": + pytest.skip("POSIX permission bits are not stable on Windows") + dir_mode = cache_path.parent.stat().st_mode & 0o777 + file_mode = cache_path.stat().st_mode & 0o777 + assert dir_mode & 0o700 == 0o700 + assert file_mode & 0o600 == 0o600 class TestCollectionMappings: diff --git a/vscode-extension/build/build.sh b/vscode-extension/build/build.sh index 
6607ac68..6f008017 100755 --- a/vscode-extension/build/build.sh +++ b/vscode-extension/build/build.sh @@ -83,8 +83,16 @@ BRIDGE_DIR="ctx-mcp-bridge" if [[ -d "$BRIDGE_SRC" && -f "$BRIDGE_SRC/package.json" ]]; then echo "Bundling MCP bridge npm package into staged extension..." mkdir -p "$STAGE_DIR/$BRIDGE_DIR" - cp -a "$BRIDGE_SRC/bin" "$STAGE_DIR/$BRIDGE_DIR/" - cp -a "$BRIDGE_SRC/src" "$STAGE_DIR/$BRIDGE_DIR/" + if [[ -d "$BRIDGE_SRC/bin" ]]; then + cp -a "$BRIDGE_SRC/bin" "$STAGE_DIR/$BRIDGE_DIR/" + else + echo "Warning: Bridge bin directory not found at $BRIDGE_SRC/bin (skipping)" + fi + if [[ -d "$BRIDGE_SRC/src" ]]; then + cp -a "$BRIDGE_SRC/src" "$STAGE_DIR/$BRIDGE_DIR/" + else + echo "Warning: Bridge src directory not found at $BRIDGE_SRC/src (skipping)" + fi cp "$BRIDGE_SRC/package.json" "$STAGE_DIR/$BRIDGE_DIR/" if [[ -d "$BRIDGE_SRC/node_modules" ]]; then @@ -103,4 +111,4 @@ npx @vscode/vsce package --no-dependencies --out "$OUT_DIR" popd >/dev/null echo "Build complete! Check the /out directory for .vsix and .py files." 
-ls -la "$OUT_DIR" \ No newline at end of file +ls -la "$OUT_DIR" diff --git a/vscode-extension/context-engine-uploader/mcp_bridge.js b/vscode-extension/context-engine-uploader/mcp_bridge.js index 11637d45..5b10796f 100644 --- a/vscode-extension/context-engine-uploader/mcp_bridge.js +++ b/vscode-extension/context-engine-uploader/mcp_bridge.js @@ -98,7 +98,6 @@ function createBridgeManager(deps) { function resolveBridgeCliInvocation() { const binPath = findLocalBridgeBin(); - const mode = getBridgeMode(); if (binPath) { // Use absolute Node runtime to avoid PATH dependency in extension hosts const bundledBin = findBundledBridgeBin(); From ed627c7deb18059a5c685ef8479a379832befa6c Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Mar 2026 22:45:47 +0000 Subject: [PATCH 38/39] fix(bridge): add retry logic for transient errors in MCP list operations Introduce withTransientRetry helper that wraps operations with automatic retry on transient errors. Apply to listMemoryTools, listResourcesSafe, listResourceTemplatesSafe, and tools/list request handler to improve reliability when remote MCP servers experience temporary failures. Also replace refreshSessionAndSyncDefaults calls with initializeRemoteClients and ensureRemoteDefaults for clearer initialization semantics in request handlers. 
--- ctx-mcp-bridge/src/mcpServer.js | 88 +++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 20 deletions(-) diff --git a/ctx-mcp-bridge/src/mcpServer.js b/ctx-mcp-bridge/src/mcpServer.js index da92625e..da29873e 100644 --- a/ctx-mcp-bridge/src/mcpServer.js +++ b/ctx-mcp-bridge/src/mcpServer.js @@ -107,10 +107,11 @@ async function listMemoryTools(client) { return []; } try { - const timeoutMs = getBridgeListTimeoutMs(); - const remote = await withTimeout( - client.listTools(), - timeoutMs, + const remote = await withTransientRetry( + () => { + const timeoutMs = getBridgeListTimeoutMs(); + return withTimeout(client.listTools(), timeoutMs, "memory tools/list"); + }, "memory tools/list", ); return Array.isArray(remote?.tools) ? remote.tools.slice() : []; @@ -151,11 +152,16 @@ async function listResourcesSafe(client, label, cursor) { return { resources: [], nextCursor: null }; } try { - const timeoutMs = getBridgeListTimeoutMs(); const params = cursor ? { cursor } : {}; - const remote = await withTimeout( - client.listResources(params), - timeoutMs, + const remote = await withTransientRetry( + () => { + const timeoutMs = getBridgeListTimeoutMs(); + return withTimeout( + client.listResources(params), + timeoutMs, + `${label} resources/list`, + ); + }, `${label} resources/list`, ); return { @@ -176,11 +182,16 @@ async function listResourceTemplatesSafe(client, label, cursor) { return { resourceTemplates: [], nextCursor: null }; } try { - const timeoutMs = getBridgeListTimeoutMs(); const params = cursor ? 
{ cursor } : {}; - const remote = await withTimeout( - client.listResourceTemplates(params), - timeoutMs, + const remote = await withTransientRetry( + () => { + const timeoutMs = getBridgeListTimeoutMs(); + return withTimeout( + client.listResourceTemplates(params), + timeoutMs, + `${label} resources/templates/list`, + ); + }, `${label} resources/templates/list`, ); return { @@ -412,6 +423,34 @@ function isTransientToolError(error) { return false; } } + +async function withTransientRetry(operation, label, maxAttempts, retryDelayMs) { + const attempts = Number.isFinite(maxAttempts) && maxAttempts > 0 + ? Math.floor(maxAttempts) + : getBridgeRetryAttempts(); + const delayMs = Number.isFinite(retryDelayMs) && retryDelayMs >= 0 + ? Math.floor(retryDelayMs) + : getBridgeRetryDelayMs(); + let lastError; + for (let attempt = 0; attempt < attempts; attempt += 1) { + if (attempt > 0 && delayMs > 0) { + await new Promise((resolve) => setTimeout(resolve, delayMs)); + } + try { + return await operation(); + } catch (err) { + lastError = err; + if (!isTransientToolError(err) || attempt === attempts - 1) { + throw err; + } + debugLog( + `[ctxce] ${label}: transient error (attempt ${attempt + 1}/${attempts}), retrying: ` + + String(err), + ); + } + } + throw lastError || new Error(`[ctxce] ${label}: unknown transient retry failure`); +} // MCP stdio server implemented using the official MCP TypeScript SDK. // Acts as a low-level proxy for tools, forwarding tools/list and tools/call // to the remote qdrant-indexer MCP server while adding a local `ping` tool. 
@@ -843,15 +882,22 @@ async function createBridgeServer(options) { server.setRequestHandler(ListToolsRequestSchema, async () => { let remote; try { - debugLog("[ctxce] tools/list: fetching tools from indexer"); - await refreshSessionAndSyncDefaults(); + await initializeRemoteClients(false); + await ensureRemoteDefaults(false); if (!indexerClient) { throw new Error("Indexer MCP client not initialized"); } - const timeoutMs = getBridgeListTimeoutMs(); - remote = await withTimeout( - indexerClient.listTools(), - timeoutMs, + + debugLog("[ctxce] tools/list: fetching tools from indexer"); + remote = await withTransientRetry( + () => { + const timeoutMs = getBridgeListTimeoutMs(); + return withTimeout( + indexerClient.listTools(), + timeoutMs, + "indexer tools/list", + ); + }, "indexer tools/list", ); } catch (err) { @@ -881,7 +927,8 @@ async function createBridgeServer(options) { server.setRequestHandler(ListResourcesRequestSchema, async (request) => { // Proxy resource discovery/read-through so clients that use MCP resources // (not only tools) can access upstream indexer/memory resources directly. - await refreshSessionAndSyncDefaults(); + await initializeRemoteClients(false); + await ensureRemoteDefaults(false); const cursor = request && request.params && typeof request.params.cursor === "string" ? request.params.cursor @@ -911,7 +958,8 @@ async function createBridgeServer(options) { }); server.setRequestHandler(ListResourceTemplatesRequestSchema, async (request) => { - await refreshSessionAndSyncDefaults(); + await initializeRemoteClients(false); + await ensureRemoteDefaults(false); const cursor = request && request.params && typeof request.params.cursor === "string" ? 
request.params.cursor From e161c14cae907842b967287700a38fd5dd7c30fa Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 9 Mar 2026 23:55:07 +0000 Subject: [PATCH 39/39] ci(cosqa): add benchmark workflow and search matrix runner Add GitHub Actions workflow for automated CoSQA search benchmarks with: - Scheduled daily runs and PR triggers for search-related paths - Configurable hybrid gate enforcement to catch regressions - Qdrant service container for isolated benchmark execution - Artifact upload for results and summaries Add run_search_matrix.sh orchestration script supporting multiple profiles (smoke/quick/full) and run sets (pr/knobs/nightly/full) with comprehensive metric collection and comparison. Fix runner.py to correctly extract CoSQA code IDs from synthetic filenames and disable MCP auth during benchmark execution. --- .github/workflows/cosqa-benchmark.yml | 147 ++++++++ scripts/benchmarks/cosqa/run_search_matrix.sh | 315 ++++++++++++++++++ scripts/benchmarks/cosqa/runner.py | 21 +- 3 files changed, 480 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/cosqa-benchmark.yml create mode 100755 scripts/benchmarks/cosqa/run_search_matrix.sh diff --git a/.github/workflows/cosqa-benchmark.yml b/.github/workflows/cosqa-benchmark.yml new file mode 100644 index 00000000..c25a1769 --- /dev/null +++ b/.github/workflows/cosqa-benchmark.yml @@ -0,0 +1,147 @@ +name: CoSQA Search Benchmark + +on: + workflow_dispatch: + inputs: + enforce_hybrid_gate: + description: Fail run if best hybrid underperforms best dense past threshold + required: false + default: false + type: boolean + hybrid_min_delta: + description: Minimum accepted (hybrid_mrr - dense_mrr), e.g. 
-0.02 + required: false + default: "-0.02" + type: string + upload_full_artifacts: + description: Upload full logs/json bundle (higher storage usage) + required: false + default: false + type: boolean + + pull_request: + branches: [ test ] + paths: + - scripts/hybrid/** + - scripts/hybrid_search.py + - scripts/mcp_impl/search.py + - scripts/mcp_impl/context_search.py + - scripts/mcp_indexer_server.py + - scripts/benchmarks/cosqa/** + - .github/workflows/cosqa-benchmark.yml + + schedule: + - cron: "25 3 * * *" + +jobs: + cosqa-bench: + runs-on: ubuntu-latest + timeout-minutes: 360 + + services: + qdrant: + image: qdrant/qdrant:v1.15.1 + ports: + - 6333:6333 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt', '**/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Cache HuggingFace datasets + uses: actions/cache@v4 + with: + path: | + ~/.cache/huggingface/datasets + ~/.cache/huggingface/hub + key: ${{ runner.os }}-hf-cosqa-${{ hashFiles('scripts/benchmarks/cosqa/dataset.py') }} + restore-keys: | + ${{ runner.os }}-hf-cosqa- + ${{ runner.os }}-hf- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install "datasets>=2.18.0" + + - name: Wait for Qdrant + run: | + timeout 90 bash -c 'until curl -fsS http://localhost:6333/readyz; do sleep 2; done' + curl -fsS http://localhost:6333/collections >/dev/null + + - name: Resolve run config + id: cfg + run: | + echo "profile=full" >> "$GITHUB_OUTPUT" + echo "run_set=full" >> "$GITHUB_OUTPUT" + if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.enforce_hybrid_gate }}" = "true" ]; then + echo "enforce_hybrid_gate=1" >> "$GITHUB_OUTPUT" + else + echo "enforce_hybrid_gate=0" >> 
"$GITHUB_OUTPUT" + fi + if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.hybrid_min_delta }}" != "" ]; then + echo "hybrid_min_delta=${{ inputs.hybrid_min_delta }}" >> "$GITHUB_OUTPUT" + else + echo "hybrid_min_delta=-0.02" >> "$GITHUB_OUTPUT" + fi + + - name: Run CoSQA search matrix + id: bench + env: + QDRANT_URL: http://localhost:6333 + PROFILE: ${{ steps.cfg.outputs.profile }} + RUN_SET: ${{ steps.cfg.outputs.run_set }} + ENFORCE_HYBRID_GATE: ${{ steps.cfg.outputs.enforce_hybrid_gate }} + HYBRID_MIN_DELTA: ${{ steps.cfg.outputs.hybrid_min_delta }} + PYTHONUNBUFFERED: "1" + run: | + RUN_TAG="gha-${{ github.run_id }}-${{ github.run_attempt }}" + OUT_DIR="bench_results/cosqa/${RUN_TAG}" + echo "out_dir=${OUT_DIR}" >> "$GITHUB_OUTPUT" + RUN_TAG="${RUN_TAG}" OUT_DIR="${OUT_DIR}" ./scripts/benchmarks/cosqa/run_search_matrix.sh + + - name: Publish benchmark summary + if: always() + run: | + SUMMARY="${{ steps.bench.outputs.out_dir }}/summary.md" + if [ -f "${SUMMARY}" ]; then + cat "${SUMMARY}" >> "$GITHUB_STEP_SUMMARY" + else + echo "No summary file generated" >> "$GITHUB_STEP_SUMMARY" + fi + + - name: Upload benchmark artifacts + if: always() && github.event_name == 'pull_request' + uses: actions/upload-artifact@v4 + with: + name: cosqa-search-summary-${{ github.run_id }}-${{ github.run_attempt }} + path: | + ${{ steps.bench.outputs.out_dir }}/summary.md + ${{ steps.bench.outputs.out_dir }}/summary.json + retention-days: 3 + + - name: Upload full benchmark artifacts + if: | + always() && ( + github.event_name == 'schedule' || + (github.event_name == 'workflow_dispatch' && inputs.upload_full_artifacts == true) + ) + uses: actions/upload-artifact@v4 + with: + name: cosqa-search-bench-${{ github.run_id }}-${{ github.run_attempt }} + path: ${{ steps.bench.outputs.out_dir }} + retention-days: 7 diff --git a/scripts/benchmarks/cosqa/run_search_matrix.sh b/scripts/benchmarks/cosqa/run_search_matrix.sh new file mode 100755 index 00000000..d2eece9a 
--- /dev/null +++ b/scripts/benchmarks/cosqa/run_search_matrix.sh @@ -0,0 +1,315 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" +cd "${ROOT_DIR}" + +PYTHON_BIN="${PYTHON_BIN:-}" +if [ -z "${PYTHON_BIN}" ]; then + if command -v python3.11 >/dev/null 2>&1; then + PYTHON_BIN="python3.11" + elif command -v python3 >/dev/null 2>&1; then + PYTHON_BIN="python3" + elif command -v python >/dev/null 2>&1; then + PYTHON_BIN="python" + else + echo "No Python interpreter found (looked for python3.11/python3/python)." >&2 + exit 127 + fi +fi +RUN_TAG="${RUN_TAG:-$(date +%Y%m%d-%H%M%S)}" +PROFILE="${PROFILE:-full}" # smoke | quick | full +RUN_SET="${RUN_SET:-full}" # pr | knobs | nightly | full +OUT_DIR="${OUT_DIR:-bench_results/cosqa/${RUN_TAG}}" +LOG_DIR="${LOG_DIR:-${OUT_DIR}}" +SPLIT="${SPLIT:-test}" +COLLECTION="${COLLECTION:-cosqa-search-${RUN_TAG}}" +LIMIT="${LIMIT:-10}" +RECREATE_INDEX="${RECREATE_INDEX:-1}" +ENFORCE_HYBRID_GATE="${ENFORCE_HYBRID_GATE:-0}" +HYBRID_MIN_DELTA="${HYBRID_MIN_DELTA:--0.020}" + +case "${PROFILE}" in + smoke) + : "${CORPUS_LIMIT:=150}" + : "${QUERY_LIMIT:=30}" + ;; + quick) + : "${CORPUS_LIMIT:=500}" + : "${QUERY_LIMIT:=100}" + ;; + full) + : "${CORPUS_LIMIT:=0}" + : "${QUERY_LIMIT:=0}" + ;; + *) + echo "Unknown PROFILE='${PROFILE}'. 
Use smoke|quick|full" >&2 + exit 2 + ;; +esac + +mkdir -p "${OUT_DIR}" "${LOG_DIR}" + +BASE_ENV=( + "LOG_LEVEL=${LOG_LEVEL:-INFO}" + "DEBUG_HYBRID_SEARCH=${DEBUG_HYBRID_SEARCH:-0}" + "QDRANT_URL=${QDRANT_URL:-http://localhost:6333}" + "HYBRID_IN_PROCESS=${HYBRID_IN_PROCESS:-1}" + "RERANK_IN_PROCESS=${RERANK_IN_PROCESS:-1}" + "LEX_VECTOR_DIM=${LEX_VECTOR_DIM:-4096}" + "COSQA_QUERY_CONCURRENCY=${COSQA_QUERY_CONCURRENCY:-8}" + "LLM_EXPAND_MAX=0" + "REFRAG_DECODER=0" + "RERANK_LEARNING=0" + "RERANK_EVENTS_ENABLED=0" +) + +run_index_once() { + local log="${LOG_DIR}/cosqa_index.log" + local args=( + "-m" "scripts.benchmarks.cosqa.runner" + "--split" "${SPLIT}" + "--collection" "${COLLECTION}" + "--limit" "${LIMIT}" + "--index-only" + ) + + if [ "${CORPUS_LIMIT}" -gt 0 ]; then + args+=("--corpus-limit" "${CORPUS_LIMIT}") + fi + if [ "${QUERY_LIMIT}" -gt 0 ]; then + args+=("--query-limit" "${QUERY_LIMIT}") + fi + if [ "${RECREATE_INDEX}" = "1" ]; then + args+=("--recreate") + fi + + echo "[index] collection=${COLLECTION} corpus_limit=${CORPUS_LIMIT} query_limit=${QUERY_LIMIT}" | tee "${log}" + ( + export "${BASE_ENV[@]}" + "${PYTHON_BIN}" "${args[@]}" + ) >> "${log}" 2>&1 +} + +preflight_python_deps() { + "${PYTHON_BIN}" - <<'PY' +import importlib.util + +required = ["qdrant_client", "datasets"] +missing = [m for m in required if importlib.util.find_spec(m) is None] +if missing: + raise SystemExit( + "Missing Python deps for CoSQA benchmark: " + + ", ".join(missing) + + ". Install them before running." 
+ ) +PY +} + +verify_collection_ready() { + "${PYTHON_BIN}" - "${COLLECTION}" <<'PY' +import os +import sys +from qdrant_client import QdrantClient + +collection = sys.argv[1] +url = os.environ.get("QDRANT_URL", "http://localhost:6333") +client = QdrantClient(url=url, timeout=60) +info = client.get_collection(collection) +points = int(info.points_count or 0) +if points <= 0: + raise RuntimeError(f"Collection '{collection}' has no points after indexing") +print(f"[verify] collection={collection} points={points}") +PY +} + +run_case() { + local label="$1" + local mode="$2" + local rerank="$3" + local expand="$4" + local lex_mode="$5" + shift 5 + + local output="${OUT_DIR}/cosqa_${label}.json" + local log="${LOG_DIR}/cosqa_${label}.log" + + local args=( + "-m" "scripts.benchmarks.cosqa.runner" + "--split" "${SPLIT}" + "--collection" "${COLLECTION}" + "--limit" "${LIMIT}" + "--skip-index" + "--mode" "${mode}" + "--output" "${output}" + ) + + if [ "${CORPUS_LIMIT}" -gt 0 ]; then + args+=("--corpus-limit" "${CORPUS_LIMIT}") + fi + if [ "${QUERY_LIMIT}" -gt 0 ]; then + args+=("--query-limit" "${QUERY_LIMIT}") + fi + if [ "${rerank}" = "0" ]; then + args+=("--no-rerank") + fi + if [ "${expand}" = "0" ]; then + args+=("--no-expand") + fi + + local case_env=("HYBRID_LEXICAL_TEXT_MODE=${lex_mode}") + for kv in "$@"; do + case_env+=("${kv}") + done + + echo "[run] ${label} mode=${mode} rerank=${rerank} expand=${expand} lex_mode=${lex_mode}" | tee "${log}" + ( + export "${BASE_ENV[@]}" + export "${case_env[@]}" + "${PYTHON_BIN}" "${args[@]}" + ) >> "${log}" 2>&1 + + echo "[ok] ${output}" +} + +CASES=() +case "${RUN_SET}" in + pr) + CASES=( + "dense_norerank|dense|0|0|raw" + "hybrid_rerank_lexrrf|hybrid|1|0|rrf" + "hybrid_rerank_expand_lexrrf|hybrid|1|1|rrf" + ) + ;; + knobs) + CASES=( + "dense_norerank|dense|0|0|raw" + "dense_rerank|dense|1|0|raw" + "hybrid_norerank_lexraw|hybrid|0|0|raw" + "hybrid_norerank_lexrrf|hybrid|0|0|rrf" + "hybrid_rerank_lexraw|hybrid|1|0|raw" + 
"hybrid_rerank_lexrrf|hybrid|1|0|rrf" + "hybrid_rerank_expand_lexrrf|hybrid|1|1|rrf" + "lexical_norerank|lexical|0|0|raw" + ) + ;; + nightly) + CASES=( + "dense_norerank|dense|0|0|raw" + "dense_rerank|dense|1|0|raw" + "hybrid_norerank_lexraw|hybrid|0|0|raw" + "hybrid_norerank_lexrrf|hybrid|0|0|rrf" + "hybrid_rerank_lexraw|hybrid|1|0|raw" + "hybrid_rerank_lexrrf|hybrid|1|0|rrf" + "hybrid_rerank_expand_lexrrf|hybrid|1|1|rrf" + "lexical_norerank|lexical|0|0|raw" + ) + ;; + full) + CASES=( + "dense_norerank|dense|0|0|raw" + "dense_rerank|dense|1|0|raw" + "hybrid_norerank_lexraw|hybrid|0|0|raw" + "hybrid_norerank_lexrrf|hybrid|0|0|rrf" + "hybrid_rerank_lexraw|hybrid|1|0|raw" + "hybrid_rerank_lexrrf|hybrid|1|0|rrf" + "hybrid_rerank_expand_lexrrf|hybrid|1|1|rrf" + "lexical_norerank|lexical|0|0|raw" + ) + ;; + *) + echo "Unknown RUN_SET='${RUN_SET}'. Use pr|knobs|nightly|full" >&2 + exit 2 + ;; +esac + +echo "[config] run_tag=${RUN_TAG} profile=${PROFILE} run_set=${RUN_SET} out_dir=${OUT_DIR}" +preflight_python_deps +run_index_once +verify_collection_ready + +for spec in "${CASES[@]}"; do + IFS='|' read -r label mode rerank expand lex_mode <<< "${spec}" + run_case "${label}" "${mode}" "${rerank}" "${expand}" "${lex_mode}" +done + +"${PYTHON_BIN}" - "${OUT_DIR}" "${ENFORCE_HYBRID_GATE}" "${HYBRID_MIN_DELTA}" <<'PY' +import json +import sys +from pathlib import Path + +out_dir = Path(sys.argv[1]) +enforce_gate = str(sys.argv[2]).strip() in {"1", "true", "yes"} +min_delta = float(sys.argv[3]) + +rows = [] +for path in sorted(out_dir.glob("cosqa_*.json")): + if path.name.startswith("cosqa_index") or path.name.endswith("_meta.json") or path.name.startswith("summary"): + continue + with path.open("r", encoding="utf-8") as f: + data = json.load(f) + if not isinstance(data, dict) or "metrics" not in data or "config" not in data: + continue + metrics = data.get("metrics") or {} + config = data.get("config") or {} + env = (config.get("env") or {}) if isinstance(config, dict) else {} 
+ rows.append({ + "label": path.stem.replace("cosqa_", ""), + "mode": config.get("mode", ""), + "rerank": bool(config.get("rerank_enabled", False)), + "expand": env.get("HYBRID_EXPAND", ""), + "lex_mode": env.get("HYBRID_LEXICAL_TEXT_MODE", ""), + "mrr": float(metrics.get("mrr", 0.0) or 0.0), + "recall_10": float(metrics.get("recall@10", 0.0) or 0.0), + "ndcg_10": float(metrics.get("ndcg@10", 0.0) or 0.0), + "lat_ms": float((data.get("latency") or {}).get("avg_ms", 0.0) or 0.0), + "file": path.name, + }) + +if not rows: + print("No CoSQA result JSON files found.", file=sys.stderr) + sys.exit(3) + +rows.sort(key=lambda r: (-r["mrr"], -r["recall_10"])) + +summary = { + "ranked": rows, + "best": rows[0], +} +(out_dir / "summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8") + +lines = [ + "# CoSQA Search Matrix Summary", + "", + "| Rank | Label | Mode | Rerank | Expand | LexMode | MRR | R@10 | NDCG@10 | Avg Lat (ms) |", + "|---:|---|---|---:|---:|---|---:|---:|---:|---:|", +] +for i, r in enumerate(rows, start=1): + lines.append( + f"| {i} | {r['label']} | {r['mode']} | {int(r['rerank'])} | {r['expand']} | {r['lex_mode']} | " + f"{r['mrr']:.4f} | {r['recall_10']:.4f} | {r['ndcg_10']:.4f} | {r['lat_ms']:.2f} |" + ) + +best_dense = max((r for r in rows if r["mode"] == "dense"), key=lambda r: r["mrr"], default=None) +best_hybrid = max((r for r in rows if r["mode"] == "hybrid"), key=lambda r: r["mrr"], default=None) +if best_dense and best_hybrid: + delta = best_hybrid["mrr"] - best_dense["mrr"] + lines.append("") + lines.append( + f"Best hybrid ({best_hybrid['label']}) vs best dense ({best_dense['label']}): " + f"delta MRR = {delta:+.4f}" + ) + if enforce_gate and delta < min_delta: + lines.append( + f"Gate failed: hybrid delta {delta:+.4f} is below required minimum {min_delta:+.4f}" + ) + (out_dir / "summary.md").write_text("\n".join(lines) + "\n", encoding="utf-8") + print("\n".join(lines)) + sys.exit(4) + +(out_dir / 
"summary.md").write_text("\n".join(lines) + "\n", encoding="utf-8") +print("\n".join(lines)) +PY + +echo "[done] results=${OUT_DIR}" +echo "[done] summary=${OUT_DIR}/summary.md" diff --git a/scripts/benchmarks/cosqa/runner.py b/scripts/benchmarks/cosqa/runner.py index 8e0c32b1..8aecaa85 100644 --- a/scripts/benchmarks/cosqa/runner.py +++ b/scripts/benchmarks/cosqa/runner.py @@ -442,7 +442,16 @@ def _cosqa_id_from_path(p: str) -> Optional[str]: name = s.rsplit("/", 1)[-1] if name.endswith(".py"): name = name[: -3] - return name.strip() or None + name = name.strip() + if not name: + return None + # CoSQA synthetic filenames are often "__". + # Recover canonical code_id so relevance matching aligns with qrels. + if "__" in name: + tail = name.rsplit("__", 1)[-1].strip() + if tail.startswith("cosqa-"): + return tail + return name # Extract stable code_ids for evaluation. # NOTE: rerank paths may not include payload; for CoSQA we can fall back to parsing @@ -915,8 +924,8 @@ async def run_full_benchmark( print(f" Limited corpus to {len(corpus)} entries") if skip_index: - print(" [skip-index] Skipping indexing...") - result = {"reused": True, "indexed": len(corpus), "skipped": 0, "errors": 0} + print(" [skip-index] Skipping indexing (using existing collection as-is)...") + result = {"reused": False, "indexed": 0, "skipped": len(corpus), "errors": 0} else: # Check if already indexed (use fingerprint matching, not just points_count) # The indexer handles fingerprint checking internally and will recreate if needed @@ -1018,6 +1027,12 @@ def main(): help="Search mode: 'hybrid' (default), 'dense' (pure semantic), or 'lexical' (pure BM25-style)") args = parser.parse_args() + # Benchmarks must not require MCP auth sessions. + # runner imports dotenv at module import time with override=True, so enforce this + # after args parsing to guarantee process-local benchmark behavior. 
+ os.environ["CTXCE_AUTH_ENABLED"] = "0" + os.environ["CTXCE_MCP_ACL_ENFORCE"] = "0" + # Enable Context-Engine features for accurate benchmarking. # Semantic expansion is always enabled (it may still be a no-op if query expansion is disabled). os.environ["SEMANTIC_EXPANSION_ENABLED"] = "1"