diff --git a/.github/workflows/claude.yaml b/.github/workflows/claude.yaml
new file mode 100644
index 00000000..732de78c
--- /dev/null
+++ b/.github/workflows/claude.yaml
@@ -0,0 +1,72 @@
+name: Claude Code
+
+on:
+  issue_comment:
+    types: [created]
+  pull_request_review_comment:
+    types: [created]
+  issues:
+    types: [opened]
+  pull_request_review:
+    types: [submitted]
+  # SECURITY: use pull_request, NOT pull_request_target. pull_request_target runs
+  # with this workflow's write token and secrets while the checkout step below
+  # fetches PR-author-controlled code — a secret-exfiltration ("pwn request") risk.
+  pull_request:
+    types: [opened, synchronize]
+
+jobs:
+  claude:
+    # Require an '@claude' mention AND a trusted author, covering every supported
+    # event payload (comment, review, issue, pull request) — not just comments.
+    if: >
+      (contains(github.event.comment.body, '@claude') ||
+      contains(github.event.review.body, '@claude') ||
+      contains(github.event.issue.body, '@claude') ||
+      contains(github.event.pull_request.body, '@claude')) &&
+      (github.event.sender.type == 'User' && (
+      contains(fromJSON('["OWNER","MEMBER","COLLABORATOR"]'), github.event.comment.author_association) ||
+      contains(fromJSON('["OWNER","MEMBER","COLLABORATOR"]'), github.event.review.author_association) ||
+      contains(fromJSON('["OWNER","MEMBER","COLLABORATOR"]'), github.event.issue.author_association) ||
+      contains(fromJSON('["OWNER","MEMBER","COLLABORATOR"]'), github.event.pull_request.author_association)
+      ))
+    runs-on: ubuntu-latest
+    permissions:
+      # CRITICAL: Write permissions are required for the action to push branches and update issues/PRs.
+      contents: write
+      pull-requests: write
+      issues: write
+      id-token: write # Required for OIDC token exchange
+      actions: read # Required for Claude to read CI results on PRs
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          # For pull_request events this checks out the PR head commit (the default is the merge ref).
+ ref: ${{ github.event.pull_request.head.sha }} + + - name: Create Claude settings file + run: | + mkdir -p /home/runner/.claude + cat > /home/runner/.claude/settings.json << 'EOF' + { + "env": { + "ANTHROPIC_BASE_URL": "https://api.z.ai/api/anthropic", + "ANTHROPIC_AUTH_TOKEN": "${{ secrets.CUSTOM_ENDPOINT_API_KEY }}" + } + } + EOF + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@v1 + with: + # Still need this to satisfy the action's validation + anthropic_api_key: ${{ secrets.CUSTOM_ENDPOINT_API_KEY }} + + # Use the same variable names as your local setup + settings: '{"env": {"ANTHROPIC_BASE_URL": "https://api.z.ai/api/anthropic", "ANTHROPIC_AUTH_TOKEN": "${{ secrets.CUSTOM_ENDPOINT_API_KEY }}"}}' + + track_progress: true + claude_args: | + --allowedTools "Bash,Edit,Read,Write,Glob,Grep" diff --git a/.github/workflows/cosqa-benchmark.yml b/.github/workflows/cosqa-benchmark.yml new file mode 100644 index 00000000..c25a1769 --- /dev/null +++ b/.github/workflows/cosqa-benchmark.yml @@ -0,0 +1,147 @@ +name: CoSQA Search Benchmark + +on: + workflow_dispatch: + inputs: + enforce_hybrid_gate: + description: Fail run if best hybrid underperforms best dense past threshold + required: false + default: false + type: boolean + hybrid_min_delta: + description: Minimum accepted (hybrid_mrr - dense_mrr), e.g. 
-0.02 + required: false + default: "-0.02" + type: string + upload_full_artifacts: + description: Upload full logs/json bundle (higher storage usage) + required: false + default: false + type: boolean + + pull_request: + branches: [ test ] + paths: + - scripts/hybrid/** + - scripts/hybrid_search.py + - scripts/mcp_impl/search.py + - scripts/mcp_impl/context_search.py + - scripts/mcp_indexer_server.py + - scripts/benchmarks/cosqa/** + - .github/workflows/cosqa-benchmark.yml + + schedule: + - cron: "25 3 * * *" + +jobs: + cosqa-bench: + runs-on: ubuntu-latest + timeout-minutes: 360 + + services: + qdrant: + image: qdrant/qdrant:v1.15.1 + ports: + - 6333:6333 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt', '**/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Cache HuggingFace datasets + uses: actions/cache@v4 + with: + path: | + ~/.cache/huggingface/datasets + ~/.cache/huggingface/hub + key: ${{ runner.os }}-hf-cosqa-${{ hashFiles('scripts/benchmarks/cosqa/dataset.py') }} + restore-keys: | + ${{ runner.os }}-hf-cosqa- + ${{ runner.os }}-hf- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install "datasets>=2.18.0" + + - name: Wait for Qdrant + run: | + timeout 90 bash -c 'until curl -fsS http://localhost:6333/readyz; do sleep 2; done' + curl -fsS http://localhost:6333/collections >/dev/null + + - name: Resolve run config + id: cfg + run: | + echo "profile=full" >> "$GITHUB_OUTPUT" + echo "run_set=full" >> "$GITHUB_OUTPUT" + if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.enforce_hybrid_gate }}" = "true" ]; then + echo "enforce_hybrid_gate=1" >> "$GITHUB_OUTPUT" + else + echo "enforce_hybrid_gate=0" >> 
"$GITHUB_OUTPUT" + fi + if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.hybrid_min_delta }}" != "" ]; then + echo "hybrid_min_delta=${{ inputs.hybrid_min_delta }}" >> "$GITHUB_OUTPUT" + else + echo "hybrid_min_delta=-0.02" >> "$GITHUB_OUTPUT" + fi + + - name: Run CoSQA search matrix + id: bench + env: + QDRANT_URL: http://localhost:6333 + PROFILE: ${{ steps.cfg.outputs.profile }} + RUN_SET: ${{ steps.cfg.outputs.run_set }} + ENFORCE_HYBRID_GATE: ${{ steps.cfg.outputs.enforce_hybrid_gate }} + HYBRID_MIN_DELTA: ${{ steps.cfg.outputs.hybrid_min_delta }} + PYTHONUNBUFFERED: "1" + run: | + RUN_TAG="gha-${{ github.run_id }}-${{ github.run_attempt }}" + OUT_DIR="bench_results/cosqa/${RUN_TAG}" + echo "out_dir=${OUT_DIR}" >> "$GITHUB_OUTPUT" + RUN_TAG="${RUN_TAG}" OUT_DIR="${OUT_DIR}" ./scripts/benchmarks/cosqa/run_search_matrix.sh + + - name: Publish benchmark summary + if: always() + run: | + SUMMARY="${{ steps.bench.outputs.out_dir }}/summary.md" + if [ -f "${SUMMARY}" ]; then + cat "${SUMMARY}" >> "$GITHUB_STEP_SUMMARY" + else + echo "No summary file generated" >> "$GITHUB_STEP_SUMMARY" + fi + + - name: Upload benchmark artifacts + if: always() && github.event_name == 'pull_request' + uses: actions/upload-artifact@v4 + with: + name: cosqa-search-summary-${{ github.run_id }}-${{ github.run_attempt }} + path: | + ${{ steps.bench.outputs.out_dir }}/summary.md + ${{ steps.bench.outputs.out_dir }}/summary.json + retention-days: 3 + + - name: Upload full benchmark artifacts + if: | + always() && ( + github.event_name == 'schedule' || + (github.event_name == 'workflow_dispatch' && inputs.upload_full_artifacts == true) + ) + uses: actions/upload-artifact@v4 + with: + name: cosqa-search-bench-${{ github.run_id }}-${{ github.run_attempt }} + path: ${{ steps.bench.outputs.out_dir }} + retention-days: 7 diff --git a/ctx-mcp-bridge/src/mcpServer.js b/ctx-mcp-bridge/src/mcpServer.js index 53cb05b7..da29873e 100644 --- a/ctx-mcp-bridge/src/mcpServer.js +++ 
b/ctx-mcp-bridge/src/mcpServer.js @@ -8,7 +8,13 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js" import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js"; import { Client } from "@modelcontextprotocol/sdk/client/index.js"; import { StreamableHTTPClientTransport } from "@modelcontextprotocol/sdk/client/streamableHttp.js"; -import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js"; +import { + CallToolRequestSchema, + ListToolsRequestSchema, + ListResourcesRequestSchema, + ListResourceTemplatesRequestSchema, + ReadResourceRequestSchema, +} from "@modelcontextprotocol/sdk/types.js"; import { loadAnyAuthEntry, loadAuthEntry, readConfig, saveAuthEntry } from "./authConfig.js"; import { maybeRemapToolArgs, maybeRemapToolResult } from "./resultPathMapping.js"; import * as oauthHandler from "./oauthHandler.js"; @@ -27,16 +33,23 @@ function debugLog(message) { async function sendSessionDefaults(client, payload, label) { if (!client) { - return; + return false; } try { - await client.callTool({ - name: "set_session_defaults", - arguments: payload, - }); + const timeoutMs = getBridgeToolTimeoutMs(); + await withTimeout( + client.callTool({ + name: "set_session_defaults", + arguments: payload, + }), + timeoutMs, + `sendSessionDefaults(${label})` + ); + return true; } catch (err) { // eslint-disable-next-line no-console console.error(`[ctxce] Failed to call set_session_defaults on ${label}:`, err); + return false; } } function dedupeTools(tools) { @@ -58,14 +71,47 @@ function dedupeTools(tools) { return out; } +function dedupeResources(resources) { + const seen = new Set(); + const out = []; + for (const resource of resources) { + const uri = resource && typeof resource.uri === "string" ? 
resource.uri : ""; + if (!uri || seen.has(uri)) { + continue; + } + seen.add(uri); + out.push(resource); + } + return out; +} + +function dedupeResourceTemplates(templates) { + const seen = new Set(); + const out = []; + for (const template of templates) { + const uri = + template && typeof template.uriTemplate === "string" + ? template.uriTemplate + : ""; + if (!uri || seen.has(uri)) { + continue; + } + seen.add(uri); + out.push(template); + } + return out; +} + async function listMemoryTools(client) { if (!client) { return []; } try { - const remote = await withTimeout( - client.listTools(), - 5000, + const remote = await withTransientRetry( + () => { + const timeoutMs = getBridgeListTimeoutMs(); + return withTimeout(client.listTools(), timeoutMs, "memory tools/list"); + }, "memory tools/list", ); return Array.isArray(remote?.tools) ? remote.tools.slice() : []; @@ -75,6 +121,94 @@ async function listMemoryTools(client) { } } +function encodeCompositeCursor(cursorObj) { + try { + const payload = JSON.stringify(cursorObj || {}); + return Buffer.from(payload, "utf8").toString("base64"); + } catch { + return ""; + } +} + +function decodeCompositeCursor(raw) { + try { + const trimmed = (raw || "").trim(); + if (!trimmed) { + return null; + } + const decoded = Buffer.from(trimmed, "base64").toString("utf8"); + const parsed = JSON.parse(decoded); + if (!parsed || typeof parsed !== "object") { + return null; + } + return parsed; + } catch { + return null; + } +} + +async function listResourcesSafe(client, label, cursor) { + if (!client) { + return { resources: [], nextCursor: null }; + } + try { + const params = cursor ? { cursor } : {}; + const remote = await withTransientRetry( + () => { + const timeoutMs = getBridgeListTimeoutMs(); + return withTimeout( + client.listResources(params), + timeoutMs, + `${label} resources/list`, + ); + }, + `${label} resources/list`, + ); + return { + resources: Array.isArray(remote?.resources) ? 
remote.resources.slice() : [], + nextCursor: + remote && typeof remote.nextCursor === "string" && remote.nextCursor + ? remote.nextCursor + : null, + }; + } catch (err) { + debugLog(`[ctxce] Error calling ${label} resources/list: ` + String(err)); + return { resources: [], nextCursor: null }; + } +} + +async function listResourceTemplatesSafe(client, label, cursor) { + if (!client) { + return { resourceTemplates: [], nextCursor: null }; + } + try { + const params = cursor ? { cursor } : {}; + const remote = await withTransientRetry( + () => { + const timeoutMs = getBridgeListTimeoutMs(); + return withTimeout( + client.listResourceTemplates(params), + timeoutMs, + `${label} resources/templates/list`, + ); + }, + `${label} resources/templates/list`, + ); + return { + resourceTemplates: Array.isArray(remote?.resourceTemplates) + ? remote.resourceTemplates.slice() + : [], + nextCursor: + remote && typeof remote.nextCursor === "string" && remote.nextCursor + ? remote.nextCursor + : null, + }; + } catch (err) { + debugLog(`[ctxce] Error calling ${label} resources/templates/list: ` + String(err)); + return { resourceTemplates: [], nextCursor: null }; + } +} + function withTimeout(promise, ms, label) { return new Promise((resolve, reject) => { let settled = false; @@ -125,6 +259,25 @@ function getBridgeToolTimeoutMs() { } } +function getBridgeListTimeoutMs() { + try { + // Keep list operations on a separate budget from tools/call. + // Some streamable-http clients (including Codex) probe tools/resources early, + // and a short timeout here can make the bridge appear unavailable. 
+ const raw = process.env.CTXCE_LIST_TIMEOUT_MSEC; + if (!raw) { + return 60000; + } + const parsed = Number.parseInt(String(raw), 10); + if (!Number.isFinite(parsed) || parsed <= 0) { + return 60000; + } + return parsed; + } catch { + return 60000; + } +} + function selectClientForTool(name, indexerClient, memoryClient) { if (!name) { return indexerClient; @@ -270,6 +423,34 @@ function isTransientToolError(error) { return false; } } + +async function withTransientRetry(operation, label, maxAttempts, retryDelayMs) { + const attempts = Number.isFinite(maxAttempts) && maxAttempts > 0 + ? Math.floor(maxAttempts) + : getBridgeRetryAttempts(); + const delayMs = Number.isFinite(retryDelayMs) && retryDelayMs >= 0 + ? Math.floor(retryDelayMs) + : getBridgeRetryDelayMs(); + let lastError; + for (let attempt = 0; attempt < attempts; attempt += 1) { + if (attempt > 0 && delayMs > 0) { + await new Promise((resolve) => setTimeout(resolve, delayMs)); + } + try { + return await operation(); + } catch (err) { + lastError = err; + if (!isTransientToolError(err) || attempt === attempts - 1) { + throw err; + } + debugLog( + `[ctxce] ${label}: transient error (attempt ${attempt + 1}/${attempts}), retrying: ` + + String(err), + ); + } + } + throw lastError || new Error(`[ctxce] ${label}: unknown transient retry failure`); +} // MCP stdio server implemented using the official MCP TypeScript SDK. // Acts as a low-level proxy for tools, forwarding tools/list and tools/call // to the remote qdrant-indexer MCP server while adding a local `ping` tool. @@ -440,6 +621,7 @@ async function createBridgeServer(options) { let indexerClient = null; let memoryClient = null; + let lastDefaultsSyncedSessionId = ""; // Derive a simple session identifier for this bridge process. In the // future this can be made user-aware (e.g. 
from auth), but for now we @@ -568,6 +750,23 @@ async function createBridgeServer(options) { defaultsPayload.under = defaultUnder; } + async function ensureRemoteDefaults(force = false) { + defaultsPayload.session = sessionId; + if (!sessionId) { + return; + } + if (!force && lastDefaultsSyncedSessionId === sessionId) { + return; + } + const indexerOk = await sendSessionDefaults(indexerClient, defaultsPayload, "indexer"); + if (memoryClient) { + await sendSessionDefaults(memoryClient, defaultsPayload, "memory"); + } + if (indexerOk) { + lastDefaultsSyncedSessionId = sessionId; + } + } + async function initializeRemoteClients(forceRecreate = false) { if (!forceRecreate && indexerClient) { return; @@ -579,6 +778,22 @@ async function createBridgeServer(options) { } catch { // ignore logging failures } + try { + if (indexerClient && typeof indexerClient.close === "function") { + await indexerClient.close(); + } + } catch { + // ignore + } + try { + if (memoryClient && typeof memoryClient.close === "function") { + await memoryClient.close(); + } + } catch { + // ignore + } + indexerClient = null; + memoryClient = null; } let nextIndexerClient = null; @@ -633,15 +848,22 @@ async function createBridgeServer(options) { indexerClient = nextIndexerClient; memoryClient = nextMemoryClient; - if (Object.keys(defaultsPayload).length > 1 && indexerClient) { - await sendSessionDefaults(indexerClient, defaultsPayload, "indexer"); - if (memoryClient) { - await sendSessionDefaults(memoryClient, defaultsPayload, "memory"); - } + await ensureRemoteDefaults(true); + } + + async function refreshSessionAndSyncDefaults() { + const freshSession = resolveSessionId() || sessionId; + const changed = Boolean(freshSession && freshSession !== sessionId); + if (changed) { + sessionId = freshSession; + defaultsPayload.session = sessionId; + lastDefaultsSyncedSessionId = ""; } + await initializeRemoteClients(false); + await ensureRemoteDefaults(changed); } - await initializeRemoteClients(false); + 
await refreshSessionAndSyncDefaults(); const server = new Server( // TODO: marked as depreciated { @@ -651,6 +873,7 @@ async function createBridgeServer(options) { { capabilities: { tools: {}, + resources: {}, }, }, ); @@ -659,14 +882,22 @@ async function createBridgeServer(options) { server.setRequestHandler(ListToolsRequestSchema, async () => { let remote; try { - debugLog("[ctxce] tools/list: fetching tools from indexer"); await initializeRemoteClients(false); + await ensureRemoteDefaults(false); if (!indexerClient) { throw new Error("Indexer MCP client not initialized"); } - remote = await withTimeout( - indexerClient.listTools(), - 10000, + + debugLog("[ctxce] tools/list: fetching tools from indexer"); + remote = await withTransientRetry( + () => { + const timeoutMs = getBridgeListTimeoutMs(); + return withTimeout( + indexerClient.listTools(), + timeoutMs, + "indexer tools/list", + ); + }, "indexer tools/list", ); } catch (err) { @@ -693,6 +924,109 @@ async function createBridgeServer(options) { return { tools }; }); + server.setRequestHandler(ListResourcesRequestSchema, async (request) => { + // Proxy resource discovery/read-through so clients that use MCP resources + // (not only tools) can access upstream indexer/memory resources directly. + await initializeRemoteClients(false); + await ensureRemoteDefaults(false); + const cursor = + request && request.params && typeof request.params.cursor === "string" + ? request.params.cursor + : null; + const decoded = decodeCompositeCursor(cursor); + const indexerCursor = + decoded && typeof decoded.i === "string" ? decoded.i : cursor; + const memoryCursor = + decoded && typeof decoded.m === "string" ? 
decoded.m : cursor; + if (cursor && decoded === null) { + debugLog("[ctxce] resources/list: received non-composite cursor; forwarding to both upstreams."); + } + const indexerRes = await listResourcesSafe(indexerClient, "indexer", indexerCursor); + const memoryRes = await listResourcesSafe(memoryClient, "memory", memoryCursor); + const resources = dedupeResources([ + ...indexerRes.resources, + ...memoryRes.resources, + ]); + const nextCursorObj = { + i: indexerRes.nextCursor || "", + m: memoryRes.nextCursor || "", + }; + const nextCursor = + nextCursorObj.i || nextCursorObj.m ? encodeCompositeCursor(nextCursorObj) : ""; + debugLog(`[ctxce] resources/list: returning ${resources.length} resources`); + return nextCursor ? { resources, nextCursor } : { resources }; + }); + + server.setRequestHandler(ListResourceTemplatesRequestSchema, async (request) => { + await initializeRemoteClients(false); + await ensureRemoteDefaults(false); + const cursor = + request && request.params && typeof request.params.cursor === "string" + ? request.params.cursor + : null; + const decoded = decodeCompositeCursor(cursor); + const indexerCursor = + decoded && typeof decoded.i === "string" ? decoded.i : cursor; + const memoryCursor = + decoded && typeof decoded.m === "string" ? decoded.m : cursor; + if (cursor && decoded === null) { + debugLog("[ctxce] resources/templates/list: received non-composite cursor; forwarding to both upstreams."); + } + const indexerRes = await listResourceTemplatesSafe( + indexerClient, + "indexer", + indexerCursor, + ); + const memoryRes = await listResourceTemplatesSafe( + memoryClient, + "memory", + memoryCursor, + ); + const resourceTemplates = dedupeResourceTemplates([ + ...indexerRes.resourceTemplates, + ...memoryRes.resourceTemplates, + ]); + const nextCursorObj = { + i: indexerRes.nextCursor || "", + m: memoryRes.nextCursor || "", + }; + const nextCursor = + nextCursorObj.i || nextCursorObj.m ? 
encodeCompositeCursor(nextCursorObj) : ""; + debugLog(`[ctxce] resources/templates/list: returning ${resourceTemplates.length} templates`); + return nextCursor ? { resourceTemplates, nextCursor } : { resourceTemplates }; + }); + + server.setRequestHandler(ReadResourceRequestSchema, async (request) => { + await refreshSessionAndSyncDefaults(); + const params = request.params || {}; + const timeoutMs = getBridgeToolTimeoutMs(); + const uri = + params && typeof params.uri === "string" ? params.uri : ""; + debugLog(`[ctxce] resources/read: ${uri}`); + + const tryRead = async (client, label) => { + if (!client) { + return null; + } + try { + return await client.readResource(params, { timeout: timeoutMs }); + } catch (err) { + debugLog(`[ctxce] resources/read failed on ${label}: ` + String(err)); + return null; + } + }; + + const indexerResult = await tryRead(indexerClient, "indexer"); + if (indexerResult) { + return indexerResult; + } + const memoryResult = await tryRead(memoryClient, "memory"); + if (memoryResult) { + return memoryResult; + } + throw new Error(`Resource ${uri} not available on any configured MCP server`); + }); + // tools/call → proxied to indexer or memory server server.setRequestHandler(CallToolRequestSchema, async (request) => { const params = request.params || {}; @@ -701,16 +1035,8 @@ async function createBridgeServer(options) { debugLog(`[ctxce] tools/call: ${name || ""}`); - // Refresh session before each call; re-init clients if session changes. - const freshSession = resolveSessionId() || sessionId; - if (freshSession && freshSession !== sessionId) { - sessionId = freshSession; - try { - await initializeRemoteClients(true); - } catch (err) { - debugLog("[ctxce] Failed to reinitialize clients after session refresh: " + String(err)); - } - } + await refreshSessionAndSyncDefaults(); + if (sessionId && (args === undefined || args === null || typeof args === "object")) { const obj = args && typeof args === "object" ? 
{ ...args } : {}; if (!Object.prototype.hasOwnProperty.call(obj, "session")) { @@ -733,8 +1059,6 @@ async function createBridgeServer(options) { return indexerResult; } - await initializeRemoteClients(false); - const timeoutMs = getBridgeToolTimeoutMs(); const maxAttempts = getBridgeRetryAttempts(); const retryDelayMs = getBridgeRetryDelayMs(); @@ -770,6 +1094,7 @@ async function createBridgeServer(options) { String(err), ); await initializeRemoteClients(true); + await ensureRemoteDefaults(true); sessionRetried = true; continue; } @@ -843,6 +1168,13 @@ export async function runHttpMcpServer(options) { typeof options.port === "number" ? options.port : Number.parseInt(process.env.CTXCE_HTTP_PORT || "30810", 10) || 30810; + // TODO(auth): replace this boolean toggle with explicit auth modes (none|required). + // In required mode, enforce Bearer auth on /mcp with consistent 401 challenges and + // only advertise OAuth metadata/endpoints when authentication is mandatory. + // In local/dev mode, leaving OAuth discovery off avoids clients entering an + // unnecessary OAuth path for otherwise unauthenticated bridge usage. 
+ const oauthEnabled = String(process.env.CTXCE_ENABLE_OAUTH || "").trim().toLowerCase(); + const oauthEndpointsEnabled = oauthEnabled === "1" || oauthEnabled === "true" || oauthEnabled === "yes"; const transport = new StreamableHTTPServerTransport({ sessionIdGenerator: undefined, @@ -865,34 +1197,36 @@ export async function runHttpMcpServer(options) { // OAuth 2.0 Endpoints (RFC9728 Protected Resource Metadata + RFC7591) // ================================================================ - // OAuth metadata endpoint (RFC9728) - if (parsedUrl.pathname === "/.well-known/oauth-authorization-server") { - oauthHandler.handleOAuthMetadata(req, res, issuerUrl); - return; - } + if (oauthEndpointsEnabled) { + // OAuth metadata endpoint (RFC9728) + if (parsedUrl.pathname === "/.well-known/oauth-authorization-server") { + oauthHandler.handleOAuthMetadata(req, res, issuerUrl); + return; + } - // OAuth Dynamic Client Registration endpoint (RFC7591) - if (parsedUrl.pathname === "/oauth/register" && req.method === "POST") { - oauthHandler.handleOAuthRegister(req, res); - return; - } + // OAuth Dynamic Client Registration endpoint (RFC7591) + if (parsedUrl.pathname === "/oauth/register" && req.method === "POST") { + oauthHandler.handleOAuthRegister(req, res); + return; + } - // OAuth authorize endpoint - if (parsedUrl.pathname === "/oauth/authorize") { - oauthHandler.handleOAuthAuthorize(req, res, parsedUrl.searchParams); - return; - } + // OAuth authorize endpoint + if (parsedUrl.pathname === "/oauth/authorize") { + oauthHandler.handleOAuthAuthorize(req, res, parsedUrl.searchParams); + return; + } - // Store session endpoint (helper for login page) - if (parsedUrl.pathname === "/oauth/store-session" && req.method === "POST") { - oauthHandler.handleOAuthStoreSession(req, res); - return; - } + // Store session endpoint (helper for login page) + if (parsedUrl.pathname === "/oauth/store-session" && req.method === "POST") { + oauthHandler.handleOAuthStoreSession(req, res); + return; 
+ } - // OAuth token endpoint - if (parsedUrl.pathname === "/oauth/token" && req.method === "POST") { - oauthHandler.handleOAuthToken(req, res); - return; + // OAuth token endpoint + if (parsedUrl.pathname === "/oauth/token" && req.method === "POST") { + oauthHandler.handleOAuthToken(req, res); + return; + } } // ================================================================ @@ -1058,4 +1392,3 @@ function detectRepoName(workspace, config) { const leaf = workspace ? path.basename(workspace) : ""; return leaf && SLUGGED_REPO_RE.test(leaf) ? leaf : null; } - diff --git a/scripts/benchmarks/cosqa/run_search_matrix.sh b/scripts/benchmarks/cosqa/run_search_matrix.sh new file mode 100755 index 00000000..d2eece9a --- /dev/null +++ b/scripts/benchmarks/cosqa/run_search_matrix.sh @@ -0,0 +1,315 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" +cd "${ROOT_DIR}" + +PYTHON_BIN="${PYTHON_BIN:-}" +if [ -z "${PYTHON_BIN}" ]; then + if command -v python3.11 >/dev/null 2>&1; then + PYTHON_BIN="python3.11" + elif command -v python3 >/dev/null 2>&1; then + PYTHON_BIN="python3" + elif command -v python >/dev/null 2>&1; then + PYTHON_BIN="python" + else + echo "No Python interpreter found (looked for python3.11/python3/python)." 
>&2 + exit 127 + fi +fi +RUN_TAG="${RUN_TAG:-$(date +%Y%m%d-%H%M%S)}" +PROFILE="${PROFILE:-full}" # smoke | quick | full +RUN_SET="${RUN_SET:-full}" # pr | knobs | nightly | full +OUT_DIR="${OUT_DIR:-bench_results/cosqa/${RUN_TAG}}" +LOG_DIR="${LOG_DIR:-${OUT_DIR}}" +SPLIT="${SPLIT:-test}" +COLLECTION="${COLLECTION:-cosqa-search-${RUN_TAG}}" +LIMIT="${LIMIT:-10}" +RECREATE_INDEX="${RECREATE_INDEX:-1}" +ENFORCE_HYBRID_GATE="${ENFORCE_HYBRID_GATE:-0}" +HYBRID_MIN_DELTA="${HYBRID_MIN_DELTA:--0.020}" + +case "${PROFILE}" in + smoke) + : "${CORPUS_LIMIT:=150}" + : "${QUERY_LIMIT:=30}" + ;; + quick) + : "${CORPUS_LIMIT:=500}" + : "${QUERY_LIMIT:=100}" + ;; + full) + : "${CORPUS_LIMIT:=0}" + : "${QUERY_LIMIT:=0}" + ;; + *) + echo "Unknown PROFILE='${PROFILE}'. Use smoke|quick|full" >&2 + exit 2 + ;; +esac + +mkdir -p "${OUT_DIR}" "${LOG_DIR}" + +BASE_ENV=( + "LOG_LEVEL=${LOG_LEVEL:-INFO}" + "DEBUG_HYBRID_SEARCH=${DEBUG_HYBRID_SEARCH:-0}" + "QDRANT_URL=${QDRANT_URL:-http://localhost:6333}" + "HYBRID_IN_PROCESS=${HYBRID_IN_PROCESS:-1}" + "RERANK_IN_PROCESS=${RERANK_IN_PROCESS:-1}" + "LEX_VECTOR_DIM=${LEX_VECTOR_DIM:-4096}" + "COSQA_QUERY_CONCURRENCY=${COSQA_QUERY_CONCURRENCY:-8}" + "LLM_EXPAND_MAX=0" + "REFRAG_DECODER=0" + "RERANK_LEARNING=0" + "RERANK_EVENTS_ENABLED=0" +) + +run_index_once() { + local log="${LOG_DIR}/cosqa_index.log" + local args=( + "-m" "scripts.benchmarks.cosqa.runner" + "--split" "${SPLIT}" + "--collection" "${COLLECTION}" + "--limit" "${LIMIT}" + "--index-only" + ) + + if [ "${CORPUS_LIMIT}" -gt 0 ]; then + args+=("--corpus-limit" "${CORPUS_LIMIT}") + fi + if [ "${QUERY_LIMIT}" -gt 0 ]; then + args+=("--query-limit" "${QUERY_LIMIT}") + fi + if [ "${RECREATE_INDEX}" = "1" ]; then + args+=("--recreate") + fi + + echo "[index] collection=${COLLECTION} corpus_limit=${CORPUS_LIMIT} query_limit=${QUERY_LIMIT}" | tee "${log}" + ( + export "${BASE_ENV[@]}" + "${PYTHON_BIN}" "${args[@]}" + ) >> "${log}" 2>&1 +} + +preflight_python_deps() { + "${PYTHON_BIN}" - 
<<'PY' +import importlib.util + +required = ["qdrant_client", "datasets"] +missing = [m for m in required if importlib.util.find_spec(m) is None] +if missing: + raise SystemExit( + "Missing Python deps for CoSQA benchmark: " + + ", ".join(missing) + + ". Install them before running." + ) +PY +} + +verify_collection_ready() { + "${PYTHON_BIN}" - "${COLLECTION}" <<'PY' +import os +import sys +from qdrant_client import QdrantClient + +collection = sys.argv[1] +url = os.environ.get("QDRANT_URL", "http://localhost:6333") +client = QdrantClient(url=url, timeout=60) +info = client.get_collection(collection) +points = int(info.points_count or 0) +if points <= 0: + raise RuntimeError(f"Collection '{collection}' has no points after indexing") +print(f"[verify] collection={collection} points={points}") +PY +} + +run_case() { + local label="$1" + local mode="$2" + local rerank="$3" + local expand="$4" + local lex_mode="$5" + shift 5 + + local output="${OUT_DIR}/cosqa_${label}.json" + local log="${LOG_DIR}/cosqa_${label}.log" + + local args=( + "-m" "scripts.benchmarks.cosqa.runner" + "--split" "${SPLIT}" + "--collection" "${COLLECTION}" + "--limit" "${LIMIT}" + "--skip-index" + "--mode" "${mode}" + "--output" "${output}" + ) + + if [ "${CORPUS_LIMIT}" -gt 0 ]; then + args+=("--corpus-limit" "${CORPUS_LIMIT}") + fi + if [ "${QUERY_LIMIT}" -gt 0 ]; then + args+=("--query-limit" "${QUERY_LIMIT}") + fi + if [ "${rerank}" = "0" ]; then + args+=("--no-rerank") + fi + if [ "${expand}" = "0" ]; then + args+=("--no-expand") + fi + + local case_env=("HYBRID_LEXICAL_TEXT_MODE=${lex_mode}") + for kv in "$@"; do + case_env+=("${kv}") + done + + echo "[run] ${label} mode=${mode} rerank=${rerank} expand=${expand} lex_mode=${lex_mode}" | tee "${log}" + ( + export "${BASE_ENV[@]}" + export "${case_env[@]}" + "${PYTHON_BIN}" "${args[@]}" + ) >> "${log}" 2>&1 + + echo "[ok] ${output}" +} + +CASES=() +case "${RUN_SET}" in + pr) + CASES=( + "dense_norerank|dense|0|0|raw" + 
"hybrid_rerank_lexrrf|hybrid|1|0|rrf" + "hybrid_rerank_expand_lexrrf|hybrid|1|1|rrf" + ) + ;; + knobs) + CASES=( + "dense_norerank|dense|0|0|raw" + "dense_rerank|dense|1|0|raw" + "hybrid_norerank_lexraw|hybrid|0|0|raw" + "hybrid_norerank_lexrrf|hybrid|0|0|rrf" + "hybrid_rerank_lexraw|hybrid|1|0|raw" + "hybrid_rerank_lexrrf|hybrid|1|0|rrf" + "hybrid_rerank_expand_lexrrf|hybrid|1|1|rrf" + "lexical_norerank|lexical|0|0|raw" + ) + ;; + nightly) + CASES=( + "dense_norerank|dense|0|0|raw" + "dense_rerank|dense|1|0|raw" + "hybrid_norerank_lexraw|hybrid|0|0|raw" + "hybrid_norerank_lexrrf|hybrid|0|0|rrf" + "hybrid_rerank_lexraw|hybrid|1|0|raw" + "hybrid_rerank_lexrrf|hybrid|1|0|rrf" + "hybrid_rerank_expand_lexrrf|hybrid|1|1|rrf" + "lexical_norerank|lexical|0|0|raw" + ) + ;; + full) + CASES=( + "dense_norerank|dense|0|0|raw" + "dense_rerank|dense|1|0|raw" + "hybrid_norerank_lexraw|hybrid|0|0|raw" + "hybrid_norerank_lexrrf|hybrid|0|0|rrf" + "hybrid_rerank_lexraw|hybrid|1|0|raw" + "hybrid_rerank_lexrrf|hybrid|1|0|rrf" + "hybrid_rerank_expand_lexrrf|hybrid|1|1|rrf" + "lexical_norerank|lexical|0|0|raw" + ) + ;; + *) + echo "Unknown RUN_SET='${RUN_SET}'. 
Use pr|knobs|nightly|full" >&2 + exit 2 + ;; +esac + +echo "[config] run_tag=${RUN_TAG} profile=${PROFILE} run_set=${RUN_SET} out_dir=${OUT_DIR}" +preflight_python_deps +run_index_once +verify_collection_ready + +for spec in "${CASES[@]}"; do + IFS='|' read -r label mode rerank expand lex_mode <<< "${spec}" + run_case "${label}" "${mode}" "${rerank}" "${expand}" "${lex_mode}" +done + +"${PYTHON_BIN}" - "${OUT_DIR}" "${ENFORCE_HYBRID_GATE}" "${HYBRID_MIN_DELTA}" <<'PY' +import json +import sys +from pathlib import Path + +out_dir = Path(sys.argv[1]) +enforce_gate = str(sys.argv[2]).strip() in {"1", "true", "yes"} +min_delta = float(sys.argv[3]) + +rows = [] +for path in sorted(out_dir.glob("cosqa_*.json")): + if path.name.startswith("cosqa_index") or path.name.endswith("_meta.json") or path.name.startswith("summary"): + continue + with path.open("r", encoding="utf-8") as f: + data = json.load(f) + if not isinstance(data, dict) or "metrics" not in data or "config" not in data: + continue + metrics = data.get("metrics") or {} + config = data.get("config") or {} + env = (config.get("env") or {}) if isinstance(config, dict) else {} + rows.append({ + "label": path.stem.replace("cosqa_", ""), + "mode": config.get("mode", ""), + "rerank": bool(config.get("rerank_enabled", False)), + "expand": env.get("HYBRID_EXPAND", ""), + "lex_mode": env.get("HYBRID_LEXICAL_TEXT_MODE", ""), + "mrr": float(metrics.get("mrr", 0.0) or 0.0), + "recall_10": float(metrics.get("recall@10", 0.0) or 0.0), + "ndcg_10": float(metrics.get("ndcg@10", 0.0) or 0.0), + "lat_ms": float((data.get("latency") or {}).get("avg_ms", 0.0) or 0.0), + "file": path.name, + }) + +if not rows: + print("No CoSQA result JSON files found.", file=sys.stderr) + sys.exit(3) + +rows.sort(key=lambda r: (-r["mrr"], -r["recall_10"])) + +summary = { + "ranked": rows, + "best": rows[0], +} +(out_dir / "summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8") + +lines = [ + "# CoSQA Search Matrix Summary", + 
"", + "| Rank | Label | Mode | Rerank | Expand | LexMode | MRR | R@10 | NDCG@10 | Avg Lat (ms) |", + "|---:|---|---|---:|---:|---|---:|---:|---:|---:|", +] +for i, r in enumerate(rows, start=1): + lines.append( + f"| {i} | {r['label']} | {r['mode']} | {int(r['rerank'])} | {r['expand']} | {r['lex_mode']} | " + f"{r['mrr']:.4f} | {r['recall_10']:.4f} | {r['ndcg_10']:.4f} | {r['lat_ms']:.2f} |" + ) + +best_dense = max((r for r in rows if r["mode"] == "dense"), key=lambda r: r["mrr"], default=None) +best_hybrid = max((r for r in rows if r["mode"] == "hybrid"), key=lambda r: r["mrr"], default=None) +if best_dense and best_hybrid: + delta = best_hybrid["mrr"] - best_dense["mrr"] + lines.append("") + lines.append( + f"Best hybrid ({best_hybrid['label']}) vs best dense ({best_dense['label']}): " + f"delta MRR = {delta:+.4f}" + ) + if enforce_gate and delta < min_delta: + lines.append( + f"Gate failed: hybrid delta {delta:+.4f} is below required minimum {min_delta:+.4f}" + ) + (out_dir / "summary.md").write_text("\n".join(lines) + "\n", encoding="utf-8") + print("\n".join(lines)) + sys.exit(4) + +(out_dir / "summary.md").write_text("\n".join(lines) + "\n", encoding="utf-8") +print("\n".join(lines)) +PY + +echo "[done] results=${OUT_DIR}" +echo "[done] summary=${OUT_DIR}/summary.md" diff --git a/scripts/benchmarks/cosqa/runner.py b/scripts/benchmarks/cosqa/runner.py index 8e0c32b1..8aecaa85 100644 --- a/scripts/benchmarks/cosqa/runner.py +++ b/scripts/benchmarks/cosqa/runner.py @@ -442,7 +442,16 @@ def _cosqa_id_from_path(p: str) -> Optional[str]: name = s.rsplit("/", 1)[-1] if name.endswith(".py"): name = name[: -3] - return name.strip() or None + name = name.strip() + if not name: + return None + # CoSQA synthetic filenames are often "__". + # Recover canonical code_id so relevance matching aligns with qrels. + if "__" in name: + tail = name.rsplit("__", 1)[-1].strip() + if tail.startswith("cosqa-"): + return tail + return name # Extract stable code_ids for evaluation. 
# NOTE: rerank paths may not include payload; for CoSQA we can fall back to parsing @@ -915,8 +924,8 @@ async def run_full_benchmark( print(f" Limited corpus to {len(corpus)} entries") if skip_index: - print(" [skip-index] Skipping indexing...") - result = {"reused": True, "indexed": len(corpus), "skipped": 0, "errors": 0} + print(" [skip-index] Skipping indexing (using existing collection as-is)...") + result = {"reused": False, "indexed": 0, "skipped": len(corpus), "errors": 0} else: # Check if already indexed (use fingerprint matching, not just points_count) # The indexer handles fingerprint checking internally and will recreate if needed @@ -1018,6 +1027,12 @@ def main(): help="Search mode: 'hybrid' (default), 'dense' (pure semantic), or 'lexical' (pure BM25-style)") args = parser.parse_args() + # Benchmarks must not require MCP auth sessions. + # runner imports dotenv at module import time with override=True, so enforce this + # after args parsing to guarantee process-local benchmark behavior. + os.environ["CTXCE_AUTH_ENABLED"] = "0" + os.environ["CTXCE_MCP_ACL_ENFORCE"] = "0" + # Enable Context-Engine features for accurate benchmarking. # Semantic expansion is always enabled (it may still be a no-op if query expansion is disabled). 
os.environ["SEMANTIC_EXPANSION_ENABLED"] = "1" diff --git a/scripts/codex_phase3_probe.py b/scripts/codex_phase3_probe.py new file mode 100644 index 00000000..628b09d5 --- /dev/null +++ b/scripts/codex_phase3_probe.py @@ -0,0 +1,2 @@ +MARK = 'v3' +# codex phase3 probe v3 diff --git a/scripts/collection_admin.py b/scripts/collection_admin.py index d970e941..e71b5428 100644 --- a/scripts/collection_admin.py +++ b/scripts/collection_admin.py @@ -1,12 +1,15 @@ +import logging import os import json import re import shutil import time -from pathlib import Path from datetime import datetime +from pathlib import Path from typing import Any, Dict, Optional, List +logger = logging.getLogger(__name__) + from scripts.auth_backend import mark_collection_deleted try: @@ -97,7 +100,6 @@ def _managed_upload_marker_path( slug_name: str, marker_root: Optional[Path] = None, ) -> Path: - # Marker is stored with per-repo metadata, not inside the repo workspace tree. base = marker_root or work_root return base / ".codebase" / "repos" / slug_name / _MARKER_NAME @@ -115,11 +117,12 @@ def _is_managed_upload_workspace_dir( return False if not _SLUGGED_REPO_RE.match(p.name or ""): return False - return _managed_upload_marker_path( + marker = _managed_upload_marker_path( work_root=work_root, marker_root=marker_root, slug_name=p.name, - ).exists() + ) + return marker.exists() except Exception: return False @@ -193,6 +196,7 @@ def delete_collection_everywhere( out: Dict[str, Any] = { "collection": name, "qdrant_deleted": False, + "qdrant_graph_deleted": False, "registry_marked_deleted": False, "deleted_state_files": 0, "deleted_managed_workspaces": 0, @@ -209,6 +213,14 @@ def delete_collection_everywhere( out["qdrant_deleted"] = True except Exception: out["qdrant_deleted"] = False + # Best-effort: also delete companion graph edges collection when present. + # This branch stores file-level edges in `_graph`. 
+ if not name.endswith("_graph"): + try: + cli.delete_collection(collection_name=f"{name}_graph") + out["qdrant_graph_deleted"] = True + except Exception: + out["qdrant_graph_deleted"] = False except Exception: out["qdrant_deleted"] = False @@ -226,7 +238,7 @@ def delete_collection_everywhere( mappings = [] try: if get_collection_mappings is not None: - mappings = get_collection_mappings(search_root=str(codebase_root)) or [] + mappings = get_collection_mappings(search_root=str(work_root)) or [] except Exception: mappings = [] @@ -359,8 +371,10 @@ def _manual_copy_points() -> None: vectors_config = None sparse_vectors_config = None + # Support vector-less collections (e.g. payload-only graph edge collections). if vectors_config is None: - raise RuntimeError(f"Cannot determine vectors config for source collection {src}") + vectors_config = {} + vectorless = isinstance(vectors_config, dict) and not vectors_config try: cli.create_collection( @@ -401,7 +415,7 @@ def _manual_copy_points() -> None: limit=batch_limit, offset=offset, with_payload=True, - with_vectors=True, + with_vectors=(not vectorless), ) except Exception as exc: raise RuntimeError(f"Failed to scroll points from {src}: {exc}") from exc @@ -414,7 +428,9 @@ def _manual_copy_points() -> None: point_id = getattr(record, "id", None) payload = getattr(record, "payload", None) vector = None - if hasattr(record, "vector") and getattr(record, "vector") is not None: + if vectorless: + vector = {} + elif hasattr(record, "vector") and getattr(record, "vector") is not None: vector = getattr(record, "vector") elif hasattr(record, "vectors") and getattr(record, "vectors") is not None: vector = getattr(record, "vectors") @@ -477,4 +493,23 @@ def _count_points(name: str) -> Optional[int]: # The manual path guarantees the destination gets the exact same points/payloads/vectors. _manual_copy_points() + # Best-effort: copy the companion graph collection when copying a base collection. 
+ # Graph edges are derived data and can be rebuilt, but copying avoids a cold-start window + # during staging cutovers where the clone has no graph. + if not src.endswith("_graph") and not dest.endswith("_graph"): + try: + copy_collection_qdrant( + source=f"{src}_graph", + target=f"{dest}_graph", + qdrant_url=base_url, + overwrite=overwrite, + ) + except Exception as exc: + logger.debug( + "Best-effort graph collection copy %s_graph -> %s_graph failed: %s", + src, + dest, + exc, + ) + return dest diff --git a/scripts/hybrid/expand.py b/scripts/hybrid/expand.py index 20a43834..678c650c 100644 --- a/scripts/hybrid/expand.py +++ b/scripts/hybrid/expand.py @@ -29,6 +29,11 @@ from typing import List, Dict, Any, TYPE_CHECKING from pathlib import Path +from scripts.path_scope import ( + normalize_under as _normalize_under_scope, + metadata_matches_under as _metadata_matches_under, +) + logger = logging.getLogger("hybrid_expand") # Import QdrantClient type for annotations @@ -542,20 +547,8 @@ def expand_via_embeddings( except Exception: vec_name = None - def _norm_under(u: str | None) -> str | None: - if not u: - return None - u = str(u).strip().replace("\\", "/") - u = "/".join([p for p in u.split("/") if p]) - if not u: - return None - if u.startswith("/work/"): - return u - if not u.startswith("/"): - return "/work/" + u - return "/work/" + u.lstrip("/") - flt = None + eff_under = _normalize_under_scope(under) try: from qdrant_client import models @@ -567,15 +560,6 @@ def _norm_under(u: str | None) -> str | None: match=models.MatchValue(value=language), ) ) - if under: - eff_under = _norm_under(under) - if eff_under: - must.append( - models.FieldCondition( - key="metadata.path_prefix", - match=models.MatchValue(value=eff_under), - ) - ) if kind: must.append( models.FieldCondition( @@ -621,10 +605,11 @@ def _norm_under(u: str | None) -> str | None: # Search for soft matches (we want semantically similar docs, not exact matches) try: + initial_limit = 8 if not eff_under 
else max(32, int(max_terms) * 8) search_kwargs = { "collection_name": collection, "query_vector": (vec_name, query_vector) if vec_name else query_vector, - "limit": 8, # Get top 8 neighbors + "limit": initial_limit, # Over-fetch when `under` is set (we post-filter). "with_payload": True, "score_threshold": 0.3, # Lower threshold to get more diverse results } @@ -637,6 +622,17 @@ def _norm_under(u: str | None) -> str | None: if not results: return [] + if eff_under: + _scoped = [] + for hit in results: + payload = getattr(hit, "payload", None) or {} + md = payload.get("metadata") or {} + if _metadata_matches_under(md, eff_under): + _scoped.append(hit) + results = _scoped + if not results: + return [] + # Extract unique terms from neighbors extracted_terms: set[str] = set() query_tokens = set(combined_query.lower().split()) diff --git a/scripts/hybrid_search.py b/scripts/hybrid_search.py index 70b5b094..31b006dd 100644 --- a/scripts/hybrid_search.py +++ b/scripts/hybrid_search.py @@ -296,6 +296,10 @@ from scripts.utils import sanitize_vector_name as _sanitize_vector_name from scripts.ingest_code import ensure_collection as _ensure_collection_raw from scripts.ingest_code import project_mini as _project_mini +from scripts.path_scope import ( + normalize_under as _normalize_under_scope, + metadata_matches_under as _metadata_matches_under, +) # --------------------------------------------------------------------------- # Module logger @@ -440,7 +444,7 @@ def run_pure_dense_search( model: Embedding model (will load default if None) collection: Qdrant collection name language: Optional language filter - under: Optional path prefix filter + under: Optional recursive workspace subtree filter repo: Optional repo filter Returns: @@ -465,12 +469,10 @@ def run_pure_dense_search( vec_name = sanitize_vector_name(model_name) coll = collection or _collection() - # Build filter + # Build server-side filter (exclude `under` here; recursive under is post-filtered) must = [] if 
language: must.append(models.FieldCondition(key="metadata.language", match=models.MatchValue(value=language))) - if under: - must.append(models.FieldCondition(key="metadata.path_prefix", match=models.MatchValue(value=under))) if repo and repo != "*": if isinstance(repo, list): must.append(models.FieldCondition(key="metadata.repo", match=models.MatchAny(any=repo))) @@ -500,14 +502,22 @@ def run_pure_dense_search( ) try: - # Single dense query - no pooling, no re-scoring - ranked_points = dense_query(client, vec_name, vec_list, flt, limit, coll, query_text=query) + # Single dense query - no pooling, no re-scoring. + # When `under` is set, we post-filter by path metadata. Over-fetch so we + # can still return up to `limit` in-scope results. + eff_under = _normalize_under_scope(under) + fetch_limit = int(limit) + if eff_under: + fetch_limit = min(max(fetch_limit * 4, fetch_limit + 16), 2000) + ranked_points = dense_query(client, vec_name, vec_list, flt, fetch_limit, coll, query_text=query) # Build output results = [] for p in ranked_points: payload = p.payload or {} md = payload.get("metadata") or {} + if eff_under and not _metadata_matches_under(md, eff_under): + continue # Prefer host_path when available (consistent with hybrid search) _path = md.get("host_path") or payload.get("path") or md.get("path") or "" @@ -522,6 +532,8 @@ def run_pure_dense_search( "doc_id": payload.get("code_id") or payload.get("_id") or "", "payload": payload, }) + if len(results) >= int(limit): + break return results @@ -690,21 +702,8 @@ def _normalize_globs(globs: list[str]) -> list[str]: eff_path_globs_norm = _normalize_globs(eff_path_globs) eff_not_globs_norm = _normalize_globs(eff_not_globs) - # Normalize under - def _norm_under(u: str | None) -> str | None: - if not u: - return None - u = str(u).strip().replace("\\", "/") - u = "/".join([p for p in u.split("/") if p]) - if not u: - return None - if not u.startswith("/"): - v = "/work/" + u - else: - v = "/work/" + u.lstrip("/") if not 
u.startswith("/work/") else u - return v - - eff_under = _norm_under(eff_under) + # Normalize under as a user-facing recursive subtree scope. + eff_under = _normalize_under_scope(eff_under) # Expansion knobs that affect query construction/results (must be part of cache key) try: @@ -810,12 +809,8 @@ def _norm_under(u: str | None) -> str | None: key="metadata.repo", match=models.MatchValue(value=eff_repo) ) ) - if eff_under: - must.append( - models.FieldCondition( - key="metadata.path_prefix", match=models.MatchValue(value=eff_under) - ) - ) + # NOTE: `under` is recursive and user-facing; we enforce it in client-side + # filtering via normalized metadata paths instead of exact path_prefix equality. if eff_kind: must.append( models.FieldCondition( @@ -2105,7 +2100,7 @@ def _match_glob(pat: str, path: str) -> bool: return _fnm.fnmatchcase(path, pat) return _fnm.fnmatchcase(path.lower(), pat.lower()) - if eff_not or eff_path_regex or eff_ext or eff_path_globs or eff_not_globs: + if eff_under or eff_not or eff_path_regex or eff_ext or eff_path_globs or eff_not_globs: def _pass_filters(m: Dict[str, Any]) -> bool: md = (m["pt"].payload or {}).get("metadata") or {} @@ -2118,6 +2113,8 @@ def _pass_filters(m: Dict[str, Any]) -> bool: nn = eff_not if case_sensitive else eff_not.lower() if nn in p_for_sub or nn in pp_for_sub: return False + if eff_under and not _metadata_matches_under(md, eff_under): + return False if eff_not_globs_norm and any(_match_glob(g, path) or _match_glob(g, rel) for g in eff_not_globs_norm): return False if eff_ext: diff --git a/scripts/indexing_admin.py b/scripts/indexing_admin.py index f3bb69d8..4ab8932d 100644 --- a/scripts/indexing_admin.py +++ b/scripts/indexing_admin.py @@ -927,6 +927,17 @@ def delete_collection_qdrant(*, qdrant_url: str, api_key: Optional[str], collect return try: cli.delete_collection(collection_name=name) + # Best-effort: also delete companion graph edges collection when present. 
+ if not name.endswith("_graph"): + try: + cli.delete_collection(collection_name=f"{name}_graph") + except Exception as exc: + try: + print( + f"[indexing_admin] best-effort graph collection delete failed for {name}_graph: {exc}" + ) + except Exception: + pass except Exception: pass finally: @@ -951,6 +962,17 @@ def recreate_collection_qdrant(*, qdrant_url: str, api_key: Optional[str], colle cli.delete_collection(collection_name=name) except Exception as delete_error: raise RuntimeError(f"Failed to delete existing collection '{name}' in Qdrant: {delete_error}") from delete_error + # Best-effort: also delete companion graph edges collection when present. + if not name.endswith("_graph"): + try: + cli.delete_collection(collection_name=f"{name}_graph") + except Exception as exc: + try: + print( + f"[indexing_admin] best-effort graph collection delete failed for {name}_graph: {exc}" + ) + except Exception: + pass finally: try: cli.close() @@ -984,12 +1006,9 @@ def spawn_ingest_code( env.pop(k, None) else: env[str(k)] = str(v) - # When we provide env overrides for a run (e.g. staging rebuild), we also want to - # force ingest_code to honor the explicit COLLECTION_NAME instead of routing based - # on per-repo state/serving_collection in multi-repo mode. - # CTXCE_FORCE_COLLECTION_NAME is only used for these subprocess runs; normal watcher - # and indexer flows do not set it. - env["CTXCE_FORCE_COLLECTION_NAME"] = "1" # Force ingest_code to use COLLECTION_NAME for staging/pending env overrides + # For admin-triggered subprocess runs (recreate/reindex/staging), force ingest_code to + # honor explicit COLLECTION_NAME and avoid multi-repo enumeration. 
+ env["CTXCE_FORCE_COLLECTION_NAME"] = "1" env["COLLECTION_NAME"] = collection env["WATCH_ROOT"] = work_dir env["WORKSPACE_PATH"] = work_dir diff --git a/scripts/ingest/cli.py b/scripts/ingest/cli.py index 8561a9c2..0a956ca4 100644 --- a/scripts/ingest/cli.py +++ b/scripts/ingest/cli.py @@ -9,15 +9,19 @@ import os import argparse +import logging from pathlib import Path from scripts.ingest.config import ( is_multi_repo_mode, get_collection_name, ) +from scripts.collection_health import clear_indexing_caches as _clear_indexing_caches_impl from scripts.ingest.pipeline import index_repo from scripts.ingest.pseudo import generate_pseudo_tags +logger = logging.getLogger(__name__) + def parse_args(): """Parse command-line arguments.""" @@ -40,6 +44,11 @@ def parse_args(): action="store_true", help="Do not skip files whose content hash matches existing index", ) + parser.add_argument( + "--clear-indexing-caches", + action="store_true", + help="Clear local indexing caches (file hash/symbol caches) before indexing", + ) parser.add_argument( "--schema-mode", type=str, @@ -186,13 +195,31 @@ def main(): ) return + def _clear_indexing_caches(workspace_root: Path, repo_name: str | None) -> None: + try: + _clear_indexing_caches_impl(str(workspace_root), repo_name=repo_name) + except Exception as e: + logger.warning( + "Failed to clear indexing caches for workspace=%s repo=%s: %s", + workspace_root, + repo_name, + e, + exc_info=True, + ) + qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333") api_key = os.environ.get("QDRANT_API_KEY") collection = os.environ.get("COLLECTION_NAME") or os.environ.get("DEFAULT_COLLECTION") or "codebase" model_name = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") # Resolve collection name based on multi-repo mode - multi_repo = bool(is_multi_repo_mode and is_multi_repo_mode()) + force_collection = (os.environ.get("CTXCE_FORCE_COLLECTION_NAME") or "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + multi_repo = 
bool(is_multi_repo_mode and is_multi_repo_mode()) and not force_collection if multi_repo: print("[multi_repo] Multi-repo mode enabled - will create separate collections per repository") @@ -231,6 +258,9 @@ def main(): if not repo_collection: repo_collection = "codebase" + if args.clear_indexing_caches: + _clear_indexing_caches(root_path, repo_name) + index_repo( repo_root, qdrant_url, @@ -249,7 +279,7 @@ def main(): try: resolved = get_collection_name(str(Path(args.root).resolve())) placeholders = {"", "default-collection", "my-collection", "codebase"} - if resolved and collection in placeholders: + if resolved and collection in placeholders and not force_collection: collection = resolved except Exception: pass @@ -260,6 +290,9 @@ def main(): flag = (os.environ.get("PSEUDO_DEFER_TO_WORKER") or "").strip().lower() pseudo_mode = "off" if flag in {"1", "true", "yes", "on"} else "full" + if args.clear_indexing_caches: + _clear_indexing_caches(Path(args.root).resolve(), None) + index_repo( Path(args.root).resolve(), qdrant_url, diff --git a/scripts/ingest/graph_edges.py b/scripts/ingest/graph_edges.py new file mode 100644 index 00000000..e9911822 --- /dev/null +++ b/scripts/ingest/graph_edges.py @@ -0,0 +1,427 @@ +#!/usr/bin/env python3 +""" +ingest/graph_edges.py - Materialized graph edges in Qdrant. 
+ +This is a small, MIT-safe reimplementation of the "graph edges collection" idea: +- Maintain a dedicated Qdrant collection named `_graph` +- Store payload-only edge docs for fast lookups: + - callers/importers queries become simple keyword filters on an indexed payload field + +Design goals for this branch: +- Keep this as an *accelerator* (symbol_graph still works without it) +- Avoid Neo4j/PageRank/GraphRAG complexity +- Avoid CLI flags; watcher can backfill opportunistically +""" + +from __future__ import annotations + +import hashlib +import logging +import os +from typing import Any, Dict, Iterable, List, Optional, Tuple + +logger = logging.getLogger(__name__) + +GRAPH_COLLECTION_SUFFIX = "_graph" + +EDGE_TYPE_CALLS = "calls" +EDGE_TYPE_IMPORTS = "imports" + +GRAPH_INDEX_FIELDS: Tuple[str, ...] = ( + "caller_path", + "callee_symbol", + "edge_type", + "repo", +) + +_ENSURED_GRAPH_COLLECTIONS: set[str] = set() +_GRAPH_VECTOR_MODE: dict[str, str] = {} +_MISSING_GRAPH_COLLECTIONS: set[str] = set() +_BACKFILL_OFFSETS: dict[tuple[str, Optional[str]], Any] = {} + +_EDGE_VECTOR_NAME = "_edge" +_EDGE_VECTOR_VALUE = [0.0] + + +def _normalize_path(path: str) -> str: + if not path: + return "" + try: + normalized = os.path.normpath(str(path)) + except Exception: + normalized = str(path) + return normalized.replace("\\", "/") + + +def get_graph_collection_name(base_collection: str) -> str: + return f"{base_collection}{GRAPH_COLLECTION_SUFFIX}" + + +def _edge_vector_for_upsert(graph_collection: str) -> dict: + mode = _GRAPH_VECTOR_MODE.get(graph_collection) + if mode == "named": + return {_EDGE_VECTOR_NAME: _EDGE_VECTOR_VALUE} + return {} + + +def ensure_graph_collection(client: Any, base_collection: str) -> Optional[str]: + """Ensure `_graph` exists and has payload indexes.""" + from qdrant_client import models as qmodels + from qdrant_client.http.exceptions import UnexpectedResponse + + if not base_collection: + return None + graph_coll = 
get_graph_collection_name(base_collection) + if graph_coll in _ENSURED_GRAPH_COLLECTIONS: + return graph_coll + + def _detect_vector_mode(info: Any) -> str: + try: + vectors = getattr( + getattr(getattr(info, "config", None), "params", None), "vectors", None + ) + if isinstance(vectors, dict): + return "none" if not vectors else "named" + return "none" if vectors is None else "named" + except Exception: + return "named" + + try: + info = client.get_collection(graph_coll) + _GRAPH_VECTOR_MODE[graph_coll] = _detect_vector_mode(info) + _ENSURED_GRAPH_COLLECTIONS.add(graph_coll) + _MISSING_GRAPH_COLLECTIONS.discard(graph_coll) + return graph_coll + except UnexpectedResponse as e: + # Only a 404 means "missing"; any other HTTP failure should be visible. + if getattr(e, "status_code", None) != 404: + logger.exception( + "Failed to get graph collection %s (status=%s): %s", + graph_coll, + getattr(e, "status_code", None), + e, + ) + return None + except Exception as e: + logger.exception("Failed to get graph collection %s: %s", graph_coll, e) + return None + + try: + # Prefer vector-less collection when supported by server/client. + try: + client.create_collection( + collection_name=graph_coll, + vectors_config={}, + ) + _GRAPH_VECTOR_MODE[graph_coll] = "none" + except Exception as vec_exc: + logger.debug( + "Vector-less creation failed for %s, trying named vector: %s", + graph_coll, + vec_exc, + ) + client.create_collection( + collection_name=graph_coll, + vectors_config={ + _EDGE_VECTOR_NAME: qmodels.VectorParams( + size=1, distance=qmodels.Distance.COSINE + ) + }, + ) + _GRAPH_VECTOR_MODE[graph_coll] = "named" + + # Create payload indexes (best-effort). 
+ for field in GRAPH_INDEX_FIELDS: + try: + client.create_payload_index( + collection_name=graph_coll, + field_name=field, + field_schema=qmodels.PayloadSchemaType.KEYWORD, + ) + except Exception as e: + logger.debug( + "Failed to create graph payload index '%s' for %s: %s", + field, + graph_coll, + e, + exc_info=True, + ) + + _ENSURED_GRAPH_COLLECTIONS.add(graph_coll) + _MISSING_GRAPH_COLLECTIONS.discard(graph_coll) + return graph_coll + except Exception as e: + logger.debug("Failed to ensure graph collection %s: %s", graph_coll, e) + return None + + +def _edge_id(edge_type: str, repo: str, caller_path: str, callee_symbol: str) -> str: + key = f"{edge_type}\x00{repo}\x00{caller_path}\x00{callee_symbol}" + return hashlib.sha256(key.encode("utf-8", errors="ignore")).hexdigest()[:32] + + +def _iter_edges( + *, + caller_path: str, + repo: str, + calls: Iterable[str] = (), + imports: Iterable[str] = (), +) -> List[Dict[str, Any]]: + norm_path = _normalize_path(caller_path) + repo_s = (repo or "").strip() or "default" + + edges: List[Dict[str, Any]] = [] + for sym in calls or []: + s = str(sym).strip() + if not s: + continue + edges.append( + { + "id": _edge_id(EDGE_TYPE_CALLS, repo_s, norm_path, s), + "payload": { + "caller_path": norm_path, + "callee_symbol": s, + "edge_type": EDGE_TYPE_CALLS, + "repo": repo_s, + }, + } + ) + for sym in imports or []: + s = str(sym).strip() + if not s: + continue + edges.append( + { + "id": _edge_id(EDGE_TYPE_IMPORTS, repo_s, norm_path, s), + "payload": { + "caller_path": norm_path, + "callee_symbol": s, + "edge_type": EDGE_TYPE_IMPORTS, + "repo": repo_s, + }, + } + ) + return edges + + +def upsert_file_edges( + client: Any, + base_collection: str, + *, + caller_path: str, + repo: str | None, + calls: List[str] | None = None, + imports: List[str] | None = None, +) -> int: + graph_coll = ensure_graph_collection(client, base_collection) + if not graph_coll: + return 0 + edges = _iter_edges( + caller_path=caller_path, + repo=repo or 
"default", + calls=calls or [], + imports=imports or [], + ) + if not edges: + return 0 + + from qdrant_client import models as qmodels + + points = [ + qmodels.PointStruct( + id=e["id"], + vector=_edge_vector_for_upsert(graph_coll), + payload=e["payload"], + ) + for e in edges + ] + try: + client.upsert(collection_name=graph_coll, points=points, wait=True) + return len(points) + except Exception as e: + logger.debug("Graph edge upsert failed for %s: %s", caller_path, e) + return 0 + + +def delete_edges_by_path( + client: Any, + base_collection: str, + *, + caller_path: str, + repo: str | None = None, +) -> int: + from qdrant_client.http.exceptions import UnexpectedResponse + graph_coll = get_graph_collection_name(base_collection) + if graph_coll in _MISSING_GRAPH_COLLECTIONS: + return 0 + + from qdrant_client import models as qmodels + + norm_path = _normalize_path(caller_path) + must: list[Any] = [ + qmodels.FieldCondition( + key="caller_path", match=qmodels.MatchValue(value=norm_path) + ) + ] + if repo: + r = str(repo).strip() + if r and r != "*": + must.append( + qmodels.FieldCondition(key="repo", match=qmodels.MatchValue(value=r)) + ) + + try: + resp = client.delete( + collection_name=graph_coll, + points_selector=qmodels.FilterSelector(filter=qmodels.Filter(must=must)), + ) + result_status = getattr(getattr(resp, "result", None), "status", None) + if result_status is None: + result_status = getattr(resp, "status", None) + if result_status is None: + return 1 + status_s = str(result_status).strip().lower() + return 1 if status_s in {"acknowledged", "completed", "ok", "success"} else 0 + except UnexpectedResponse as e: + if getattr(e, "status_code", None) == 404: + _MISSING_GRAPH_COLLECTIONS.add(graph_coll) + return 0 + logger.debug( + "Graph edge delete failed for %s in %s (status=%s): %s", + norm_path, + graph_coll, + getattr(e, "status_code", None), + e, + exc_info=True, + ) + return 0 + except Exception as e: + logger.debug( + "Graph edge delete failed for 
%s in %s: %s", + norm_path, + graph_coll, + e, + exc_info=True, + ) + return 0 + + +def graph_edges_backfill_tick( + client: Any, + base_collection: str, + *, + repo_name: str | None = None, + max_files: int = 128, +) -> int: + """Best-effort incremental backfill from `` into `_graph`. + + This scans the main collection and upserts file-level edges into the graph collection. + It's idempotent (deterministic IDs) and safe to run continuously in a watcher worker. + """ + from qdrant_client import models as qmodels + + if not base_collection or max_files <= 0: + return 0 + + graph_coll = ensure_graph_collection(client, base_collection) + if not graph_coll: + return 0 + + must: list[Any] = [] + if repo_name: + must.append( + qmodels.FieldCondition( + key="metadata.repo", match=qmodels.MatchValue(value=repo_name) + ) + ) + flt = qmodels.Filter(must=must or None) + + processed_files = 0 + seen_paths: set[str] = set() + + key = (base_collection, repo_name) + next_offset = _BACKFILL_OFFSETS.get(key) + + # We may need to overscan because the main collection is chunked. + overscan = max_files * 8 + while processed_files < max_files: + attempts = 0 + while True: + try: + points, next_offset = client.scroll( + collection_name=base_collection, + scroll_filter=flt, + limit=min(64, overscan), + with_payload=True, + with_vectors=False, + offset=next_offset, + ) + break + except Exception as e: + attempts += 1 + logger.exception( + "Graph edge backfill scroll failed (collection=%s repo=%s offset=%s attempt=%d): %s", + base_collection, + repo_name or "default", + next_offset, + attempts, + e, + ) + # Retry a couple times for transient errors, then raise so failures are not silent. 
+ if attempts >= 3: + raise + import time + + time.sleep(0.25 * (2 ** (attempts - 1))) + + if not points: + break + + for rec in points: + if processed_files >= max_files: + break + payload = getattr(rec, "payload", None) or {} + md = payload.get("metadata") or {} + path = md.get("path") or "" + if not path: + continue + norm_path = _normalize_path(str(path)) + if norm_path in seen_paths: + continue + seen_paths.add(norm_path) + + repo = md.get("repo") or repo_name or "default" + calls = md.get("calls") or [] + imports = md.get("imports") or [] + if not isinstance(calls, list): + calls = [] + if not isinstance(imports, list): + imports = [] + + upsert_file_edges( + client, + base_collection, + caller_path=norm_path, + repo=str(repo), + calls=[str(x) for x in calls if x], + imports=[str(x) for x in imports if x], + ) + processed_files += 1 + + if next_offset is None: + break + + _BACKFILL_OFFSETS[key] = next_offset + return processed_files + + +__all__ = [ + "GRAPH_COLLECTION_SUFFIX", + "EDGE_TYPE_CALLS", + "EDGE_TYPE_IMPORTS", + "get_graph_collection_name", + "ensure_graph_collection", + "upsert_file_edges", + "delete_edges_by_path", + "graph_edges_backfill_tick", +] diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index 45716049..9db00326 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -109,7 +109,8 @@ def detect_language(path: Path) -> str: _TEXT_LIKE_LANGS = {"unknown", "markdown", "text"} -def _is_text_like_language(language: str) -> bool: +def is_text_like_language(language: str) -> bool: + """Classify whether a detected language should skip smart reindexing.""" return str(language or "").strip().lower() in _TEXT_LIKE_LANGS @@ -227,6 +228,87 @@ def _normalize_info_for_dense(s: str) -> str: return text +def _sync_graph_edges_best_effort( + client: QdrantClient, + collection: str, + file_path: str, + repo: str | None, + calls: list[str] | None, + imports: list[str] | None, +) -> None: + """Best-effort sync of 
file-level graph edges. Safe to skip on failure.""" + enabled = str(os.environ.get("GRAPH_EDGES_ENABLE", "1") or "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + if not enabled: + return + try: + from scripts.ingest.graph_edges import ( + delete_edges_by_path, + ensure_graph_collection, + upsert_file_edges, + ) + + ensure_graph_collection(client, collection) + # Important: delete stale edges for this file before upserting the new set. + delete_edges_by_path( + client, + collection, + caller_path=str(file_path), + repo=repo, + ) + upsert_file_edges( + client, + collection, + caller_path=str(file_path), + repo=repo, + calls=calls, + imports=imports, + ) + except Exception as exc: + try: + print(f"[graph_edges] best-effort sync failed for {file_path}: {exc}") + except Exception: + pass + + +def _symbols_to_metadata_dict(language: str, text: str) -> dict: + """Build symbol metadata dict from in-memory source text.""" + symbols = {} + try: + symbols_list = _extract_symbols(language, text) + lines = text.split("\n") + for sym in symbols_list or []: + kind = str(sym.get("kind") or "") + name = str(sym.get("name") or "") + start = int(sym.get("start") or 0) + end = int(sym.get("end") or 0) + if not kind or not name or start <= 0 or end < start: + continue + symbol_id = f"{kind}_{name}_{start}" + content = "\n".join(lines[start - 1 : end]) + content_hash = hashlib.sha1( + content.encode("utf-8", errors="ignore") + ).hexdigest() + symbols[symbol_id] = { + "name": name, + "type": kind, + "start_line": start, + "end_line": end, + "content_hash": content_hash, + "content": content, + "pseudo": "", + "tags": [], + "qdrant_ids": [], + } + except Exception: + return {} + return symbols + + def build_information( language: str, path: Path, start: int, end: int, first_line: str ) -> str: @@ -251,16 +333,31 @@ def index_single_file( repo_name_for_cache: str | None = None, allowed_vectors: set[str] | None = None, allowed_sparse: set[str] | None = None, + 
preloaded_text: str | None = None, + preloaded_file_hash: str | None = None, + preloaded_language: str | None = None, ) -> bool: """Index a single file path. Returns True if indexed, False if skipped.""" + repo_for_graph = repo_name_for_cache or _detect_repo_name_from_path(file_path) try: if _should_skip_explicit_file_by_excluder(file_path): try: delete_points_by_path(client, collection, str(file_path)) except Exception: pass + # Clean up graph edges for excluded file + _sync_graph_edges_best_effort( + client, + collection, + str(file_path), + repo_for_graph, + None, # No calls when file is excluded + None, # No imports when file is excluded + ) print(f"Skipping excluded file: {file_path}") return False + except NameError: + raise except Exception: return False @@ -283,6 +380,9 @@ def index_single_file( repo_name_for_cache=repo_name_for_cache, allowed_vectors=allowed_vectors, allowed_sparse=allowed_sparse, + preloaded_text=preloaded_text, + preloaded_file_hash=preloaded_file_hash, + preloaded_language=preloaded_language, ) finally: if _file_lock_ctx is not None: @@ -306,6 +406,9 @@ def _index_single_file_inner( repo_name_for_cache: str | None = None, allowed_vectors: set[str] | None = None, allowed_sparse: set[str] | None = None, + preloaded_text: str | None = None, + preloaded_file_hash: str | None = None, + preloaded_language: str | None = None, ) -> bool: """Inner implementation of index_single_file (after lock is acquired).""" if trust_cache is None: @@ -317,7 +420,12 @@ def _index_single_file_inner( trust_cache = False fast_fs = _env_truthy(os.environ.get("INDEX_FS_FASTPATH"), False) - if skip_unchanged and fast_fs and get_cached_file_meta is not None: + if ( + preloaded_text is None + and skip_unchanged + and fast_fs + and get_cached_file_meta is not None + ): try: repo_for_cache = repo_name_for_cache or _detect_repo_name_from_path(file_path) meta = get_cached_file_meta(str(file_path), repo_for_cache) or {} @@ -333,15 +441,17 @@ def _index_single_file_inner( 
except Exception: pass - try: - text = file_path.read_text(encoding="utf-8", errors="ignore") - except Exception as e: - print(f"Skipping {file_path}: {e}") - return False + if preloaded_text is None: + try: + text = file_path.read_text(encoding="utf-8", errors="ignore") + except Exception as e: + print(f"Skipping {file_path}: {e}") + return False + else: + text = preloaded_text - language = detect_language(file_path) - is_text_like = _is_text_like_language(language) - file_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() + language = preloaded_language or detect_language(file_path) + file_hash = preloaded_file_hash or hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() repo_tag = repo_name_for_cache or _detect_repo_name_from_path(file_path) @@ -376,7 +486,10 @@ def _index_single_file_inner( if get_cached_symbols and set_cached_symbols: cached_symbols = get_cached_symbols(str(file_path)) if cached_symbols: - current_symbols = extract_symbols_with_tree_sitter(str(file_path)) + if preloaded_text is not None: + current_symbols = _symbols_to_metadata_dict(language, preloaded_text) + else: + current_symbols = extract_symbols_with_tree_sitter(str(file_path)) _, changed = compare_symbol_changes(cached_symbols, current_symbols) for symbol_data in current_symbols.values(): symbol_id = f"{symbol_data['type']}_{symbol_data['name']}_{symbol_data['start_line']}" @@ -653,6 +766,22 @@ def make_point(pid, dense_vec, lex_vec, payload, lex_text: str = "", code_text: for i, v, lx, m, lt, ct in zip(batch_ids, vectors, batch_lex, batch_meta, batch_lex_text, batch_code) ] upsert_points(client, collection, points) + + # Optional: materialize file-level graph edges in a companion `_graph` store. + # This is an accelerator for symbol_graph callers/importers and is safe to skip on failure. + # IMPORTANT: Sync must run after upserts (or after delete-only reindex) to ensure graph + # edges stay consistent. 
When a file reindexes to zero chunks, batch_texts is empty but + # we still need to sync graph edges to remove stale entries. + _sync_graph_edges_best_effort( + client, + collection, + str(file_path), + repo_tag, + calls, + imports, + ) + + if batch_texts: try: ws = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" if set_cached_file_hash: @@ -881,6 +1010,7 @@ def process_file_with_smart_reindexing( model, vector_name: str | None, *, + model_dim: int | None = None, allowed_vectors: set[str] | None = None, allowed_sparse: set[str] | None = None, ) -> str: @@ -910,8 +1040,19 @@ def process_file_with_smart_reindexing( _delete_points_fn(client, current_collection, str(p)) except Exception: pass + # Clean up graph edges for excluded file + _sync_graph_edges_best_effort( + client, + current_collection, + str(p), + per_file_repo or _detect_repo_name_from_path(file_path), + None, # No calls when file is excluded + None, # No imports when file is excluded + ) print(f"[SMART_REINDEX] Skipping excluded file: {file_path}") return "skipped" + except NameError: + raise except Exception: return "skipped" @@ -927,6 +1068,13 @@ def process_file_with_smart_reindexing( except Exception: file_path = Path(fp) + is_text_like = is_text_like_language(language) + if is_text_like: + print( + f"[SMART_REINDEX] {file_path}: text-like language '{language}', " + "skipping smart reindex and using full reindex path" + ) + return "failed" file_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() if allowed_vectors is None and allowed_sparse is None: @@ -988,8 +1136,31 @@ def process_file_with_smart_reindexing( changed_set = set(changed_symbols) if len(changed_symbols) == 0 and cached_symbols: - print(f"[SMART_REINDEX] {file_path}: 0 changes detected, skipping") - return "skipped" + prev_hash = None + try: + if get_cached_file_hash: + prev_hash = get_cached_file_hash(fp, per_file_repo) + except Exception: + prev_hash = None + if prev_hash and file_hash 
and prev_hash == file_hash: + print(f"[SMART_REINDEX] {file_path}: 0 changes detected, skipping") + return "skipped" + print( + f"[SMART_REINDEX] {file_path}: non-symbol change detected; " + "falling back to full reindex" + ) + return "failed" + + if model_dim and vector_name: + try: + ensure_collection_and_indexes_once( + client, + current_collection, + int(model_dim), + vector_name, + ) + except Exception: + pass existing_points = [] try: @@ -1085,7 +1256,6 @@ def process_file_with_smart_reindexing( else: chunks = chunk_lines(text, CHUNK_LINES, CHUNK_OVERLAP) - is_text_like = _is_text_like_language(language) symbol_spans = _extract_symbols(language, text) reused_points: list[models.PointStruct] = [] @@ -1102,6 +1272,30 @@ def process_file_with_smart_reindexing( pseudo_batch_concurrency = int(os.environ.get("PSEUDO_BATCH_CONCURRENCY", "1") or 1) use_batch_pseudo = pseudo_batch_concurrency > 1 + def _apply_symbol_pseudo( + symbol_name: str, + kind: str, + start_line: int, + pseudo_text: str, + pseudo_tags: list[str], + ) -> None: + if not symbol_name or not kind: + return + sid = f"{kind}_{symbol_name}_{start_line}" + target = symbol_meta.get(sid) + if target is None: + for candidate in symbol_meta.values(): + if str(candidate.get("type") or "") != str(kind): + continue + if str(candidate.get("name") or "") != str(symbol_name): + continue + target = candidate + break + if target is None: + return + target["pseudo"] = pseudo_text + target["tags"] = list(pseudo_tags or []) + chunk_data_sr: list[dict] = [] for ch in chunks: info = build_information( @@ -1190,6 +1384,14 @@ def process_file_with_smart_reindexing( start_line = ch.get("start", 0) sid = f"{k}_{symbol_name}_{start_line}" set_cached_pseudo(fp, sid, pseudo, tags, file_hash) + _apply_symbol_pseudo( + symbol_name, + ch.get("kind", "unknown"), + ch.get("start", 0), + pseudo, + tags, + ) + ch["_pseudo_applied"] = True except Exception as e: print(f"[PSEUDO_BATCH] Smart reindex batch failed, falling back: {e}") 
use_batch_pseudo = False @@ -1211,9 +1413,26 @@ def process_file_with_smart_reindexing( sid = f"{k}_{symbol_name}_{start_line}" if set_cached_pseudo: set_cached_pseudo(fp, sid, pseudo, tags, file_hash) + _apply_symbol_pseudo( + symbol_name, + k, + start_line, + pseudo, + tags, + ) + ch["_pseudo_applied"] = True except Exception: pass + if (pseudo or tags) and not ch.get("_pseudo_applied"): + _apply_symbol_pseudo( + ch.get("symbol", ""), + ch.get("kind", "unknown"), + ch.get("start", 0), + pseudo, + tags, + ) + if pseudo: payload["pseudo"] = pseudo if tags: @@ -1368,6 +1587,19 @@ def process_file_with_smart_reindexing( if all_points: _upsert_points_fn(client, current_collection, all_points) + # Optional: materialize file-level graph edges (best-effort). + # IMPORTANT: Sync must run after upserts OR after delete-only reindex to ensure graph + # edges stay consistent. When a file reindexes to zero chunks, all_points is empty but + # we still need to sync graph edges to remove stale entries. 
+ _sync_graph_edges_best_effort( + client, + current_collection, + str(file_path), + per_file_repo, + calls, + imports, + ) + try: if set_cached_symbols: set_cached_symbols(fp, symbol_meta, file_hash) diff --git a/scripts/ingest/pseudo.py b/scripts/ingest/pseudo.py index ea157e2b..0b02db2e 100644 --- a/scripts/ingest/pseudo.py +++ b/scripts/ingest/pseudo.py @@ -7,11 +7,13 @@ """ from __future__ import annotations +import logging import os from typing import Tuple, List from scripts.ingest.config import ( get_cached_pseudo, + get_cached_symbols, set_cached_pseudo, compare_symbol_changes, ) @@ -130,25 +132,58 @@ def should_process_pseudo_for_chunk( start_line = chunk.get("start", 0) symbol_id = f"{kind}_{symbol_name}_{start_line}" + def _lookup_cached() -> Tuple[str, List[str]]: + if get_cached_pseudo: + try: + cached_pseudo, cached_tags = get_cached_pseudo(file_path, symbol_id) + if cached_pseudo or cached_tags: + return cached_pseudo, cached_tags + except Exception as exc: + logging.getLogger(__name__).debug( + "get_cached_pseudo failed for %s/%s: %s", + file_path, + symbol_id, + exc, + exc_info=True, + ) + if get_cached_symbols: + try: + cached_symbols = get_cached_symbols(file_path) or {} + for info in cached_symbols.values(): + if str(info.get("type") or "") != str(kind): + continue + if str(info.get("name") or "") != str(symbol_name): + continue + cached_pseudo = info.get("pseudo", "") + cached_tags = info.get("tags", []) + if not isinstance(cached_pseudo, str): + cached_pseudo = "" + if not isinstance(cached_tags, list): + cached_tags = [] + cached_tags = [str(tag) for tag in cached_tags if str(tag)] + if cached_pseudo or cached_tags: + return cached_pseudo, cached_tags + except Exception as exc: + logging.getLogger(__name__).debug( + "get_cached_symbols failed for %s: %s", + file_path, + exc, + exc_info=True, + ) + return "", [] + # If we don't have any change information, best effort: try reusing cached pseudo when present - if not changed_symbols and 
get_cached_pseudo: - try: - cached_pseudo, cached_tags = get_cached_pseudo(file_path, symbol_id) - if cached_pseudo or cached_tags: - return False, cached_pseudo, cached_tags - except Exception: - pass + if not changed_symbols: + cached_pseudo, cached_tags = _lookup_cached() + if cached_pseudo or cached_tags: + return False, cached_pseudo, cached_tags return True, "", [] # Unchanged symbol: prefer reuse when cached pseudo/tags exist if symbol_id not in changed_symbols: - if get_cached_pseudo: - try: - cached_pseudo, cached_tags = get_cached_pseudo(file_path, symbol_id) - if cached_pseudo or cached_tags: - return False, cached_pseudo, cached_tags - except Exception: - pass + cached_pseudo, cached_tags = _lookup_cached() + if cached_pseudo or cached_tags: + return False, cached_pseudo, cached_tags # Unchanged but no cached data yet – process once return True, "", [] @@ -162,7 +197,6 @@ def should_use_smart_reindexing(file_path: str, file_hash: str) -> Tuple[bool, s Returns: (use_smart, reason) """ - from scripts.ingest.config import get_cached_symbols, compare_symbol_changes from scripts.ingest.symbols import extract_symbols_with_tree_sitter if not _smart_symbol_reindexing_enabled(): diff --git a/scripts/ingest/qdrant.py b/scripts/ingest/qdrant.py index f98207ca..e7db21fc 100644 --- a/scripts/ingest/qdrant.py +++ b/scripts/ingest/qdrant.py @@ -31,6 +31,7 @@ # --------------------------------------------------------------------------- ENSURED_COLLECTIONS: set[str] = set() ENSURED_COLLECTIONS_LAST_CHECK: dict[str, float] = {} +ENSURED_PAYLOAD_INDEX_COLLECTIONS: set[str] = set() class CollectionNeedsRecreateError(Exception): @@ -535,6 +536,9 @@ def recreate_collection(client: QdrantClient, name: str, dim: int, vector_name: if not name: print("[BUG] recreate_collection called with name=None! 
Fix the caller - collection name is required.", flush=True) return + ENSURED_COLLECTIONS.discard(name) + ENSURED_COLLECTIONS_LAST_CHECK.pop(name, None) + ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard(name) try: client.delete_collection(name) except Exception: @@ -580,6 +584,20 @@ def recreate_collection(client: QdrantClient, name: str, dim: int, vector_name: def ensure_payload_indexes(client: QdrantClient, collection: str): """Create helpful payload indexes if they don't exist (idempotent).""" + if not collection: + return + + # On memo hit, verify collection still exists and indexes are present + if collection in ENSURED_PAYLOAD_INDEX_COLLECTIONS: + try: + info = client.get_collection(collection) + if not _missing_payload_indexes(info): + # Memo is still valid + return + except Exception: + # Collection doesn't exist or error accessing it; remove from memo + ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard(collection) + for field in PAYLOAD_INDEX_FIELDS: try: client.create_payload_index( @@ -589,6 +607,15 @@ def ensure_payload_indexes(client: QdrantClient, collection: str): ) except Exception: pass + try: + info = client.get_collection(collection) + except Exception: + return + if _missing_payload_indexes(info): + # Do not memoize; a later call should retry. + return + # Even if create_payload_index threw, get_collection confirms indexes exist. 
+ ENSURED_PAYLOAD_INDEX_COLLECTIONS.add(collection) def ensure_collection_and_indexes_once( @@ -629,6 +656,10 @@ def ensure_collection_and_indexes_once( ENSURED_COLLECTIONS_LAST_CHECK.pop(collection, None) except Exception: pass + try: + ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard(collection) + except Exception: + pass ensure_collection(client, collection, dim, vector_name, schema_mode=mode) if mode in {"legacy", "migrate"}: ensure_payload_indexes(client, collection) diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 2da24911..a183519d 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -203,6 +203,7 @@ from scripts.ingest.pipeline import ( _detect_repo_name_from_path, + is_text_like_language, detect_language, build_information, pseudo_backfill_tick, @@ -212,6 +213,26 @@ index_repo, process_file_with_smart_reindexing, ) + +# --------------------------------------------------------------------------- +# Graph edges (optional accelerator) +# --------------------------------------------------------------------------- +try: + from scripts.ingest.graph_edges import ( + graph_edges_backfill_tick, + delete_edges_by_path as delete_graph_edges_by_path, + upsert_file_edges as upsert_graph_edges_for_file, + ) +except ImportError: + # graph_edges_backfill_tick is optional and intentionally left as None to + # force callers to explicitly guard long-running backfill behavior. 
+ graph_edges_backfill_tick = None # type: ignore[assignment] + + def delete_graph_edges_by_path(*_args, **_kwargs) -> int: + return 0 + + def upsert_graph_edges_for_file(*_args, **_kwargs) -> int: + return 0 # --------------------------------------------------------------------------- # Re-exports from ingest/cli.py # --------------------------------------------------------------------------- @@ -332,12 +353,17 @@ def main(): "embed_batch", # Pipeline "_detect_repo_name_from_path", + "is_text_like_language", "detect_language", "build_information", "index_single_file", "index_repo", "process_file_with_smart_reindexing", "pseudo_backfill_tick", + # Graph edges (optional) + "graph_edges_backfill_tick", + "delete_graph_edges_by_path", + "upsert_graph_edges_for_file", # CLI "main", # Backward compat diff --git a/scripts/ingest_history.py b/scripts/ingest_history.py index 3f42715c..754487da 100644 --- a/scripts/ingest_history.py +++ b/scripts/ingest_history.py @@ -4,6 +4,7 @@ import subprocess import shlex import hashlib +import logging from typing import List, Dict, Any import re import time @@ -35,6 +36,9 @@ from scripts.utils import sanitize_vector_name as _sanitize_vector_name +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) + def _manifest_run_id(manifest_path: str) -> str: try: @@ -365,30 +369,60 @@ def _ingest_from_manifest( vec_name: str, include_body: bool, per_batch: int, -) -> int: +) -> tuple[int, bool]: try: with open(manifest_path, "r", encoding="utf-8") as f: data = json.load(f) except Exception as e: print(f"Failed to read manifest {manifest_path}: {e}") - return 0 + return 0, False commits = data.get("commits") or [] if not commits: print("No commits in manifest.") - return 0 + return 0, False run_id = _manifest_run_id(manifest_path) mode = str(data.get("mode") or "delta").strip().lower() or "delta" points: List[models.PointStruct] = [] - count = 0 - for c in commits: + total_commits = len(commits) + prepared_count = 0 + 
persisted_count = 0 + invalid_commit_records = 0 + embed_failures = 0 + point_build_failures = 0 + upsert_failures = 0 + processed_count = 0 + progress_step = max(1, total_commits // 10) if total_commits > 0 else 1 + + def _log_progress(force: bool = False) -> None: + if not force and processed_count % progress_step != 0: + return + logger.info( + "[ingest_history] progress run_id=%s processed=%d/%d prepared=%d persisted=%d invalid=%d embed_failures=%d point_failures=%d upsert_failures=%d", + run_id, + processed_count, + total_commits, + prepared_count, + persisted_count, + invalid_commit_records, + embed_failures, + point_build_failures, + upsert_failures, + ) + + for idx, c in enumerate(commits, start=1): + processed_count += 1 try: if not isinstance(c, dict): + invalid_commit_records += 1 + _log_progress() continue commit_id = str(c.get("commit_id") or "").strip() if not commit_id: + invalid_commit_records += 1 + _log_progress() continue author_name = str(c.get("author_name") or "") authored_date = str(c.get("authored_date") or "") @@ -406,7 +440,15 @@ def _ingest_from_manifest( text = build_text(md, include_body=include_body) try: vec = next(model.embed([text])).tolist() - except Exception: + except Exception as e: + embed_failures += 1 + logger.warning( + "[ingest_history] embed failed for commit=%s idx=%d: %s", + commit_id, + idx, + e, + ) + _log_progress() continue goal: str = "" @@ -451,28 +493,96 @@ def _ingest_from_manifest( pid = stable_id(commit_id) pt = models.PointStruct(id=pid, vector={vec_name: vec}, payload=payload) points.append(pt) - count += 1 + prepared_count += 1 if len(points) >= per_batch: - client.upsert(collection_name=COLLECTION, points=points) - points.clear() + batch_size = len(points) + try: + client.upsert(collection_name=COLLECTION, points=points) + persisted_count += batch_size + except Exception as e: + upsert_failures += batch_size + logger.exception( + "[ingest_history] upsert batch failed (size=%d): %s", + batch_size, + e, + ) + 
finally: + points.clear() + _log_progress() except Exception: + point_build_failures += 1 + logger.warning( + "[ingest_history] commit processing failed idx=%d", + idx, + exc_info=True, + ) + _log_progress() continue if points: - client.upsert(collection_name=COLLECTION, points=points) - try: - _prune_old_commit_points(client, run_id, mode=mode) - except Exception: - pass - try: - _cleanup_manifest_files(manifest_path) - except Exception: - pass - print(f"Ingested {count} commits into {COLLECTION} from manifest {manifest_path}.") - return count + batch_size = len(points) + try: + client.upsert(collection_name=COLLECTION, points=points) + persisted_count += batch_size + except Exception as e: + upsert_failures += batch_size + logger.exception( + "[ingest_history] final upsert failed (size=%d): %s", + batch_size, + e, + ) + _log_progress(force=True) + ingest_successful = ( + prepared_count > 0 + and invalid_commit_records == 0 + and embed_failures == 0 + and point_build_failures == 0 + and upsert_failures == 0 + and persisted_count == prepared_count + ) + # Only prune snapshot runs that completed cleanly + prune_safe = mode == "snapshot" and ingest_successful + if prune_safe: + try: + _prune_old_commit_points(client, run_id, mode=mode) + except Exception as e: + logger.warning("[ingest_history] prune failed for run_id=%s: %s", run_id, e) + elif mode == "snapshot": + logger.warning( + "[ingest_history] skipping prune for run_id=%s because the snapshot ingest was incomplete", + run_id, + ) + + # Only cleanup manifest if ingest completed successfully + ingest_complete = ingest_successful + if ingest_complete: + try: + _cleanup_manifest_files(manifest_path) + except Exception as e: + logger.warning("[ingest_history] manifest cleanup failed for %s: %s", manifest_path, e) + else: + logger.warning( + "[ingest_history] keeping manifest %s because ingest was incomplete", + manifest_path, + ) + + logger.info( + "Ingested commits from manifest %s into %s: persisted=%d 
prepared=%d invalid=%d " + "embed_failures=%d point_failures=%d upsert_failures=%d", + manifest_path, + COLLECTION, + persisted_count, + prepared_count, + invalid_commit_records, + embed_failures, + point_build_failures, + upsert_failures, + ) + return persisted_count, ingest_complete def main(): + logging.basicConfig(level=logging.INFO) ap = argparse.ArgumentParser( description="Ingest Git history into Qdrant deterministically" ) @@ -521,7 +631,7 @@ def main(): client = QdrantClient(url=QDRANT_URL, api_key=API_KEY or None) if args.manifest_json: - _ingest_from_manifest( + persisted_count, ingest_complete = _ingest_from_manifest( args.manifest_json, model, client, @@ -529,6 +639,8 @@ def main(): args.include_body, args.per_batch, ) + if not ingest_complete: + raise SystemExit(1) return commits = list_commits(args) @@ -537,6 +649,8 @@ def main(): return points: List[models.PointStruct] = [] + persisted_count = 0 + upsert_failures = 0 for sha in commits: md = commit_metadata(sha) text = build_text(md, include_body=args.include_body) @@ -583,11 +697,40 @@ def main(): point = models.PointStruct(id=pid, vector={vec_name: vec}, payload=payload) points.append(point) if len(points) >= args.per_batch: - client.upsert(collection_name=COLLECTION, points=points) - points.clear() + batch_size = len(points) + try: + client.upsert(collection_name=COLLECTION, points=points) + persisted_count += batch_size + except Exception as e: + upsert_failures += batch_size + logger.exception( + "[ingest_history] batch upsert failed collection=%s repo=%s size=%d path=%s: %s", + COLLECTION, + REPO_NAME, + batch_size, + args.path or "", + e, + ) + finally: + points.clear() if points: - client.upsert(collection_name=COLLECTION, points=points) - print(f"Ingested {len(commits)} commits into {COLLECTION}.") + final_size = len(points) + try: + client.upsert(collection_name=COLLECTION, points=points) + persisted_count += final_size + except Exception as e: + upsert_failures += final_size + 
logger.exception( + "[ingest_history] final upsert failed collection=%s repo=%s size=%d path=%s: %s", + COLLECTION, + REPO_NAME, + final_size, + args.path or "", + e, + ) + if upsert_failures: + raise SystemExit(1) + print(f"Ingested {persisted_count} commits into {COLLECTION}.") if __name__ == "__main__": diff --git a/scripts/mcp_impl/search.py b/scripts/mcp_impl/search.py index e4dc3766..9fb16414 100644 --- a/scripts/mcp_impl/search.py +++ b/scripts/mcp_impl/search.py @@ -43,6 +43,9 @@ from scripts.mcp_impl.admin_tools import _detect_current_repo, _run_async from scripts.mcp_toon import _should_use_toon, _format_results_as_toon from scripts.mcp_auth import require_collection_access as _require_collection_access +from scripts.path_scope import ( + normalize_under as _normalize_under_scope, +) # Constants QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") @@ -54,6 +57,46 @@ ) +# Fields to strip from results when debug=False (internal/debugging fields) +_DEBUG_RESULT_FIELDS = { + "components", # Internal scoring breakdown (dense_rrf, lexical, fname_boost, etc.) + "doc_id", # Internal benchmark ID (often null/opaque) + "code_id", # Internal benchmark ID (often null/opaque) + "payload", # Duplicates other fields (information, document, pseudo, tags) + "why", # Often empty []; debugging explanation list + "span_budgeted", # Internal budget flag + "relations", # Call graph info (imports, calls) - useful but often noise + "related_paths", # Optional related file paths + "budget_tokens_used", # Internal token accounting + "fname_boost", # Internal boost value (already applied to score) + "host_path", # Internal dual-path (host side) - use path/client_path instead + "container_path", # Internal dual-path (container side) - use path/client_path instead +} + +# Top-level response fields to strip when debug=False +_DEBUG_TOP_LEVEL_FIELDS = { + "rerank_counters", # Internal reranking metrics (inproc_hybrid, timeout, etc.) 
+ "code_signals", # Internal code signal detection results +} + + +def _strip_debug_fields(item: dict, keep_paths: bool = True) -> dict: + """Strip internal/debug fields from a result item. + + Args: + item: Result dict to strip + keep_paths: If True, keep host_path/container_path + + Returns: + New dict with debug fields removed + """ + strip_fields = _DEBUG_RESULT_FIELDS + if keep_paths: + strip_fields = _DEBUG_RESULT_FIELDS - {"host_path", "container_path"} + result = {k: v for k, v in item.items() if k not in strip_fields} + return result + + async def _repo_search_impl( query: Any = None, queries: Any = None, # Alias for query (many clients use this) @@ -89,6 +132,7 @@ async def _repo_search_impl( repo: Any = None, # str, list[str], or "*" to search all repos # Response shaping compact: Any = None, + debug: Any = None, # When True, include verbose internal fields (components, rerank_counters, etc.) output_format: Any = None, # "json" (default) or "toon" for token-efficient format args: Any = None, # Compatibility shim for mcp-remote/Claude wrappers that send args/kwargs kwargs: Any = None, @@ -117,18 +161,23 @@ async def _repo_search_impl( - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos (disable auto-filter). By default, auto-detects current repo from CURRENT_REPO env and filters to it. Use repo=["frontend","backend"] to search related repos together. - - Filters (optional): language, under (path prefix), kind, symbol, ext, path_regex, + - Filters (optional): language, under (recursive workspace subtree), kind, symbol, ext, path_regex, path_glob (str or list[str]), not_glob (str or list[str]), not_ (negative text), case. + - debug: bool (default false). When true, includes verbose internal fields like + components, rerank_counters, code_signals. Default false saves ~60-80% tokens. 
Returns: - Dict with keys: - - results: list of {score, path, symbol, start_line, end_line, why[, components][, relations][, related_paths][, snippet]} - - total: int; used_rerank: bool; rerank_counters: dict + - results: list of {score, path, symbol, start_line, end_line[, snippet][, tags][, host_path][, container_path]} + When debug=true, also includes: components, why, relations, related_paths, doc_id, code_id + - total: int; used_rerank: bool - If compact=true (and snippets not requested), results contain only {path,start_line,end_line}. + - If debug=true, response also includes: rerank_counters, code_signals Examples: - path_glob=["scripts/**","**/*.py"], language="python" - symbol="context_answer", under="scripts" + - debug=true # Include internal scoring details for query tuning """ sess = require_auth_session_fn(session) if require_auth_session_fn else session @@ -252,6 +301,8 @@ async def _repo_search_impl( case = _extra.get("case") if compact in (None, "") and _extra.get("compact") is not None: compact = _extra.get("compact") + if debug in (None, "") and _extra.get("debug") is not None: + debug = _extra.get("debug") # Optional mode hint: "code_first", "docs_first", "balanced" if ( mode is None or (isinstance(mode, str) and str(mode).strip() == "") @@ -390,7 +441,7 @@ def _to_str(x, default=""): under = under_hint language = _to_str(language, "").strip() - under = _to_str(under, "").strip() + under = _normalize_under_scope(_to_str(under, "").strip()) kind = _to_str(kind, "").strip() symbol = _to_str(symbol, "").strip() path_regex = _to_str(path_regex, "").strip() @@ -440,12 +491,122 @@ def _to_str_list(x): if detected_repo: repo_filter = [detected_repo] + case_sensitive = str(case or "").strip().lower() in { + "sensitive", + "true", + "1", + "yes", + "on", + } + path_globs_norm = [g if case_sensitive else g.lower() for g in path_globs] + not_globs_norm = [g if case_sensitive else g.lower() for g in not_globs] + + def _norm_case(v: str) -> str: + return v 
if case_sensitive else v.lower() + + def _match_glob(glob_pat: str, path_val: str) -> bool: + import fnmatch as _fnm + if not glob_pat: + return False + p = _norm_case(path_val).replace("\\", "/").strip("/") + if _fnm.fnmatchcase(p, glob_pat): + return True + # Allow repo-relative globs (e.g., scripts/**) to match absolute paths + # by testing suffix windows of the normalized path. + if not glob_pat.startswith("/") and "/" in p: + parts = [seg for seg in p.split("/") if seg] + for i in range(1, len(parts)): + tail = "/".join(parts[i:]) + if _fnm.fnmatchcase(tail, glob_pat): + return True + return False + + def _result_passes_path_filters(item: dict) -> bool: + import re as _re + + path = str(item.get("path") or "") + if not path: + return False + + # Evaluate filters against all known path forms carried by this result. + path_vals = [] + for key in ("path", "rel_path", "client_path", "host_path", "container_path"): + v = item.get(key) + if isinstance(v, str) and v.strip(): + path_vals.append(v.strip().replace("\\", "/")) + if not path_vals: + path_vals = [path] + if path.startswith("/work/"): + path_vals.append(path[len("/work/") :]) + + # Deduplicate while preserving order. + seen = set() + norm_paths = [] + for pv in path_vals: + if pv not in seen: + norm_paths.append(pv) + seen.add(pv) + + if not_: + needle = _norm_case(str(not_)) + if any(needle in _norm_case(pv) for pv in norm_paths): + return False + + if ext: + ext_norm = str(ext).lower().lstrip(".") + if not any(_norm_case(pv).endswith("." 
+ ext_norm) for pv in norm_paths): + return False + + if path_regex: + flags = 0 if case_sensitive else _re.IGNORECASE + try: + if not any(_re.search(path_regex, pv, flags=flags) for pv in norm_paths): + return False + except _re.error as exc: + logger.warning( + "Invalid path_regex filter '%s': %s", + path_regex, + exc, + ) + return False + except Exception as exc: + logger.warning( + "Failed evaluating path_regex filter '%s': %s", + path_regex, + exc, + exc_info=True, + ) + return False + + if path_globs_norm and not any( + _match_glob(g, pv) for g in path_globs_norm for pv in norm_paths + ): + return False + + if not_globs_norm and any( + _match_glob(g, pv) for g in not_globs_norm for pv in norm_paths + ): + return False + + return True + + def _apply_result_filters(items: list[dict]) -> list[dict]: + if not items: + return [] + if not (not_ or path_regex or ext or path_globs_norm or not_globs_norm): + return items + return [it for it in items if _result_passes_path_filters(it)] + compact_raw = compact compact = _to_bool(compact, False) # If snippets are requested, do not compact (we need snippet field in results) if include_snippet: compact = False + # Debug mode: when False (default), strip internal/debug fields from results + # to reduce token bloat. Set debug=True to see components, rerank_counters, etc. + debug = _to_bool(debug, False) + # Default behavior: exclude commit-history docs (which use path=".git") from # generic repo_search calls, unless the caller explicitly asks for git # content. This prevents normal code queries from surfacing commit-index @@ -455,6 +616,7 @@ def _to_str_list(x): ): if ".git" not in not_globs: not_globs.append(".git") + not_globs_norm = [g if case_sensitive else g.lower() for g in not_globs] # Accept top-level alias `queries` as a drop-in for `query` # Many clients send queries=[...] instead of query=[...] 
@@ -555,46 +717,9 @@ def _to_str_list(x): ) ) - # Apply post-filters (path_regex, path_glob, not_glob, not_) that aren't - # supported by run_pure_dense_search's server-side filters - case_sensitive = str(case or "").strip().lower() in {"sensitive", "true", "1", "yes", "on"} - import fnmatch as _fnm - import re as _re - - def _norm_path(p: str) -> str: - return p if case_sensitive else p.lower() - - path_globs_norm = [g if case_sensitive else g.lower() for g in path_globs] - not_globs_norm = [g if case_sensitive else g.lower() for g in not_globs] - path_regex_norm = path_regex or "" - - def _match_glob(glob_pat: str, path_val: str) -> bool: - if not glob_pat: - return False - return _fnm.fnmatchcase(_norm_path(path_val), glob_pat) - for item in items: path = item.get("path") or "" - - # Apply path_regex filter - if path_regex_norm: - flags = 0 if case_sensitive else _re.IGNORECASE - try: - if not _re.search(path_regex_norm, path, flags=flags): - continue - except Exception: - pass - - # Apply path_glob filter - if path_globs_norm and not any(_match_glob(g, path) for g in path_globs_norm): - continue - - # Apply not_glob filter - if not_globs_norm and any(_match_glob(g, path) for g in not_globs_norm): - continue - - # Apply not_ text filter - if not_ and not_.lower() in _norm_path(path): + if not _result_passes_path_filters(item): continue payload = item.get("payload") or {} @@ -1217,6 +1342,10 @@ def _doc_for(obj: dict) -> str: item["tags"] = obj.get("tags") results.append(item) + # Enforce strict filter semantics regardless of retrieval/rerank branch. + # This closes gaps where fallback rerank paths may bypass path_glob/not_glob. 
+ results = _apply_result_filters(results) + # Mode-aware reordering: nudge core implementation code vs docs and non-core when requested def _is_doc_path(p: str) -> bool: pl = str(p or "").lower() @@ -1491,6 +1620,20 @@ def _read_snip(args): } for r in results ] + elif not debug: + # Strip debug/internal fields from results to reduce token bloat + # Keeps: score, path, host_path, container_path, symbol, snippet, + # start_line, end_line, tags, pseudo + results = [_strip_debug_fields(r) for r in results] + + _res_ok = bool(res.get("ok", True)) if isinstance(res, dict) else True + try: + _res_code = int((res or {}).get("code", 0)) + except Exception: + _res_code = 0 + if results: + _res_ok = True + _res_code = 0 response = { "args": { @@ -1518,13 +1661,23 @@ def _read_snip(args): "compact": (_to_bool(compact_raw, compact)), }, "used_rerank": bool(used_rerank), - "rerank_counters": rerank_counters, - "code_signals": code_signals if code_signals.get("has_code_signals") else None, "total": len(results), "results": results, - **res, + "ok": _res_ok, + "code": _res_code, } + # Expose a concise failure reason without leaking raw subprocess streams by default. 
+ if (not _res_ok or _res_code != 0) and not results: + response["error"] = "search backend execution failed" + + # Only include debug fields when explicitly requested + if debug: + response["subprocess"] = res + response["rerank_counters"] = rerank_counters + if code_signals.get("has_code_signals"): + response["code_signals"] = code_signals + # Apply TOON formatting if requested or enabled globally # Full mode (compact=False) still saves tokens vs JSON while preserving all fields if _should_use_toon(output_format): diff --git a/scripts/mcp_impl/symbol_graph.py b/scripts/mcp_impl/symbol_graph.py index da518ac4..e5ef030a 100644 --- a/scripts/mcp_impl/symbol_graph.py +++ b/scripts/mcp_impl/symbol_graph.py @@ -20,10 +20,54 @@ import logging import os import re +import time from typing import Any, Dict, List, Optional, Set +from scripts.path_scope import ( + normalize_under as _normalize_under_scope, + metadata_matches_under as _metadata_matches_under, + path_matches_under as _path_matches_under, +) + logger = logging.getLogger(__name__) +try: + from scripts.ingest.graph_edges import GRAPH_COLLECTION_SUFFIX as _GRAPH_SUFFIX +except Exception: + _GRAPH_SUFFIX = "_graph" + +GRAPH_COLLECTION_SUFFIX = _GRAPH_SUFFIX +# Time-based cache: collection -> expiry timestamp (5 minutes TTL) +_MISSING_GRAPH_COLLECTIONS: dict[str, float] = {} +_MISSING_GRAPH_TTL = 300 # 5 minutes + + +def _clean_expired_missing_graphs() -> None: + """Remove expired entries from the missing graph cache.""" + now = time.monotonic() + expired = [coll for coll, expiry in _MISSING_GRAPH_COLLECTIONS.items() if expiry <= now] + for coll in expired: + _MISSING_GRAPH_COLLECTIONS.pop(coll, None) + + +def _is_graph_missing(collection: str) -> bool: + """Check if a graph collection is marked as missing (with expiration).""" + _clean_expired_missing_graphs() + if collection in _MISSING_GRAPH_COLLECTIONS: + return _MISSING_GRAPH_COLLECTIONS.get(collection, 0) > time.monotonic() + return False + + +def 
_mark_graph_missing(collection: str) -> None: + """Mark a graph collection as missing (with TTL).""" + _MISSING_GRAPH_COLLECTIONS[collection] = time.monotonic() + _MISSING_GRAPH_TTL + + +def _clear_graph_missing(collection: str) -> None: + """Remove a collection from the missing graph cache (e.g., after successful creation).""" + _MISSING_GRAPH_COLLECTIONS.pop(collection, None) + + __all__ = [ "_symbol_graph_impl", "_format_symbol_graph_toon", @@ -105,23 +149,18 @@ def _symbol_variants(symbol: str) -> List[str]: return list(dict.fromkeys(variants)) # Dedupe preserving order def _norm_under(u: Optional[str]) -> Optional[str]: - """Normalize an `under` path to match ingest's stored `metadata.path_prefix` values. + """Normalize user-facing `under` to recursive subtree scope token.""" + return _normalize_under_scope(u) - This mirrors the engine's convention: normalize to a /work/... style path. - Note: `under` in this engine is an exact directory filter (not recursive). - """ - if not u: - return None - s = str(u).strip().replace("\\", "/") - s = "/".join([p for p in s.split("/") if p]) - if not s: - return None - # Normalize to /work/... 
- if not s.startswith("/"): - v = "/work/" + s - else: - v = "/work/" + s.lstrip("/") if not s.startswith("/work/") else s - return v.rstrip("/") + +def _point_matches_under(pt: Any, under: Optional[str]) -> bool: + if not under: + return True + payload = getattr(pt, "payload", None) or {} + md = payload.get("metadata", payload) + if not isinstance(md, dict): + md = {} + return _metadata_matches_under(md, under) async def _symbol_graph_impl( @@ -142,7 +181,7 @@ async def _symbol_graph_impl( query_type: One of "callers", "definition", "importers" limit: Maximum number of results language: Optional language filter - under: Optional path prefix filter + under: Optional recursive workspace subtree filter collection: Optional collection override session: Optional session ID for collection routing ctx: MCP context (optional) @@ -193,18 +232,32 @@ async def _symbol_graph_impl( results = [] + norm_under = _norm_under(under) + try: if query_type == "callers": - # Find chunks where metadata.calls array contains the symbol (exact match) - results = await _query_array_field( + # Prefer graph edges collection when available (fast keyword filters). + results = await _query_graph_edges_collection( client=client, collection=coll, - field_key="metadata.calls", - value=symbol, + symbol=symbol, + edge_type="calls", limit=limit, language=language, - under=_norm_under(under), + repo_filter=None, + under=norm_under, ) + if not results: + # Fall back to array field lookup in the main collection. 
+ results = await _query_array_field( + client=client, + collection=coll, + field_key="metadata.calls", + value=symbol, + limit=limit, + language=language, + under=norm_under, + ) elif query_type == "definition": # Find chunks where symbol_path matches the symbol results = await _query_definition( @@ -213,19 +266,30 @@ async def _symbol_graph_impl( symbol=symbol, limit=limit, language=language, - under=_norm_under(under), + under=norm_under, ) elif query_type == "importers": - # Find chunks where metadata.imports array contains the symbol - results = await _query_array_field( + results = await _query_graph_edges_collection( client=client, collection=coll, - field_key="metadata.imports", - value=symbol, + symbol=symbol, + edge_type="imports", limit=limit, language=language, - under=_norm_under(under), + repo_filter=None, + under=norm_under, ) + if not results: + # Fall back to array field lookup in the main collection. + results = await _query_array_field( + client=client, + collection=coll, + field_key="metadata.imports", + value=symbol, + limit=limit, + language=language, + under=norm_under, + ) # If no results, fall back to semantic search if not results: @@ -234,6 +298,7 @@ async def _symbol_graph_impl( query_type=query_type, limit=limit, language=language, + under=norm_under, collection=coll, session=session, ) @@ -246,6 +311,7 @@ async def _symbol_graph_impl( query_type=query_type, limit=limit, language=language, + under=norm_under, collection=coll, session=session, ) @@ -259,6 +325,155 @@ async def _symbol_graph_impl( } +async def _query_graph_edges_collection( + client: Any, + collection: str, + symbol: str, + edge_type: str, + limit: int, + language: Optional[str] = None, + repo_filter: str | None = None, + under: str | None = None, +) -> List[Dict[str, Any]]: + """Query `_graph` and hydrate results from the main collection. 
+ + The graph collection stores file-level edges: + - caller_path -> callee_symbol (calls/imports) + """ + from qdrant_client import models as qmodels + + graph_coll = f"{collection}{GRAPH_COLLECTION_SUFFIX}" + if _is_graph_missing(graph_coll): + return [] + + # Build graph filter + must: list[Any] = [ + qmodels.FieldCondition( + key="edge_type", match=qmodels.MatchValue(value=str(edge_type)) + ) + ] + if repo_filter: + rf = str(repo_filter).strip() + if rf and rf != "*": + must.append( + qmodels.FieldCondition(key="repo", match=qmodels.MatchValue(value=rf)) + ) + + # Try exact match, then symbol variants. + callee_variants = _symbol_variants(symbol) or [symbol] + seen_paths: set[str] = set() + caller_paths: List[str] = [] + + for variant in callee_variants: + if len(caller_paths) >= limit: + break + v = str(variant).strip() + if not v: + continue + flt = qmodels.Filter( + must=must + + [ + qmodels.FieldCondition( + key="callee_symbol", match=qmodels.MatchValue(value=v) + ) + ] + ) + + def _scroll(_flt=flt): + return client.scroll( + collection_name=graph_coll, + scroll_filter=_flt, + limit=max(32, limit * 4), + with_payload=True, + with_vectors=False, + ) + + try: + points, _ = await asyncio.to_thread(_scroll) + except Exception as e: + err = str(e).lower() + if "404" in err or "doesn't exist" in err or "not found" in err: + _mark_graph_missing(graph_coll) + return [] + logger.exception( + "_query_graph_edges_collection scroll failed for %s", graph_coll + ) + raise + + for rec in points or []: + payload = getattr(rec, "payload", None) or {} + p = payload.get("caller_path") or "" + if not p: + continue + path_s = str(p) + if under and not _path_matches_under( + path_s, under, repo_hint=(payload.get("repo") or repo_filter) + ): + continue + if path_s in seen_paths: + continue + seen_paths.add(path_s) + caller_paths.append(path_s) + if len(caller_paths) >= limit: + break + + if not caller_paths: + return [] + + # Hydrate caller paths back into normal symbol_graph 
point-shaped results. + hydrated: List[Dict[str, Any]] = [] + for p in caller_paths[:limit]: + if len(hydrated) >= limit: + break + + def _scroll_main(_p=p, _language=language): + must = [ + qmodels.FieldCondition( + key="metadata.path", match=qmodels.MatchValue(value=_p) + ) + ] + if _language: + must.append( + qmodels.FieldCondition( + key="metadata.language", + match=qmodels.MatchValue(value=str(_language).lower()), + ) + ) + return client.scroll( + collection_name=collection, + scroll_filter=qmodels.Filter( + must=must + ), + limit=1, + with_payload=True, + with_vectors=False, + ) + + try: + pts, _ = await asyncio.to_thread(_scroll_main) + except Exception: + pts = [] + + if pts: + hydrated.append(_format_point(pts[0])) + else: + # If language filtering was requested but no matching main-collection doc + # exists (or hydration failed), skip returning a placeholder to avoid + # producing language-inconsistent results. + if not language: + hydrated.append( + { + "path": p, + "symbol": "", + "symbol_path": "", + "start_line": 0, + "end_line": 0, + } + ) + + return hydrated + + async def _query_array_field( client: Any, collection: str, @@ -290,14 +505,6 @@ async def _query_array_field( match=qmodels.MatchValue(value=language.lower()), ) ) - if under: - base_conditions.append( - qmodels.FieldCondition( - key="metadata.path_prefix", - match=qmodels.MatchValue(value=under), - ) - ) - # Strategy 1: Exact match with MatchAny (most reliable for array fields) try: filter1 = qmodels.Filter( @@ -321,6 +528,8 @@ def scroll1(): scroll_result = await asyncio.to_thread(scroll1) points = scroll_result[0] if scroll_result else [] for pt in points: + if under and not _point_matches_under(pt, under): + continue pt_id = str(getattr(pt, "id", id(pt))) if pt_id not in seen_ids: seen_ids.add(pt_id) @@ -356,6 +565,8 @@ def scroll2(): scroll_result = await asyncio.to_thread(scroll2) points = scroll_result[0] if scroll_result else [] for pt in points: + if under and not 
_point_matches_under(pt, under): + continue pt_id = str(getattr(pt, "id", id(pt))) if pt_id not in seen_ids: seen_ids.add(pt_id) @@ -387,6 +598,8 @@ def scroll3(): scroll_result = await asyncio.to_thread(scroll3) points = scroll_result[0] if scroll_result else [] for pt in points: + if under and not _point_matches_under(pt, under): + continue pt_id = str(getattr(pt, "id", id(pt))) if pt_id not in seen_ids: seen_ids.add(pt_id) @@ -422,14 +635,6 @@ async def _query_definition( match=qmodels.MatchValue(value=language.lower()), ) ) - if under: - base_conditions.append( - qmodels.FieldCondition( - key="metadata.path_prefix", - match=qmodels.MatchValue(value=under), - ) - ) - # Strategy 1: Exact match on symbol_path (e.g., "MyClass.my_method") try: filter1 = qmodels.Filter( @@ -514,6 +719,8 @@ def scroll3(): seen_ids = set() unique_results = [] for pt in results: + if under and not _point_matches_under(pt, under): + continue pt_id = getattr(pt, "id", None) if pt_id not in seen_ids: seen_ids.add(pt_id) @@ -570,6 +777,7 @@ async def _fallback_semantic_search( query_type: str, limit: int = 20, language: Optional[str] = None, + under: Optional[str] = None, collection: Optional[str] = None, session: Optional[str] = None, ) -> List[Dict[str, Any]]: @@ -591,6 +799,8 @@ async def _fallback_semantic_search( query=query, limit=limit, language=language, + under=under, + collection=collection, session=session, output_format="json", # Avoid TOON encoding for internal calls ) @@ -655,7 +865,7 @@ async def _compute_called_by( symbol: The symbol name to find callers for limit: Maximum number of callers to return language: Optional language filter - under: Optional path prefix filter + under: Optional recursive workspace subtree filter collection: Optional collection override Returns: @@ -703,13 +913,6 @@ async def _compute_called_by( ) ) norm_under = _norm_under(under) - if norm_under: - base_conditions.append( - qmodels.FieldCondition( - key="metadata.path_prefix", - 
match=qmodels.MatchValue(value=norm_under), - ) - ) callers: List[Dict[str, Any]] = [] seen_ids: Set[str] = set() @@ -743,6 +946,8 @@ def do_scroll(): points = scroll_result[0] if scroll_result else [] for pt in points: + if norm_under and not _point_matches_under(pt, norm_under): + continue pt_id = str(getattr(pt, "id", id(pt))) if pt_id in seen_ids: continue diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index d12aee9b..4da07083 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -300,6 +300,23 @@ def _highlight_snippet(snippet, tokens): # type: ignore ) mcp = FastMCP(APP_NAME, transport_security=_security_settings) +# Minimal resource so MCP clients can verify resource wiring. +@mcp.resource( + "resource://context-engine/indexer/info", + name="context-engine-indexer-info", + title="Context Engine Indexer Info", + description="Basic metadata about the running indexer MCP server.", + mime_type="application/json", +) +def _indexer_info_resource(): + return { + "app": APP_NAME, + "host": HOST, + "port": PORT, + "qdrant_url": QDRANT_URL, + "default_collection": DEFAULT_COLLECTION, + } + # Capture tool registry automatically by wrapping the decorator once _TOOLS_REGISTRY: list[dict] = [] @@ -1082,6 +1099,7 @@ async def repo_search( case: Any = None, repo: Any = None, compact: Any = None, + debug: Any = None, output_format: Any = None, args: Any = None, kwargs: Any = None, @@ -1098,12 +1116,13 @@ async def repo_search( - per_path: int (default 2). Max results per file. - include_snippet/context_lines: return inline snippets near hits when true. - rerank_*: ONNX reranker is ON by default for best relevance; timeouts fall back to hybrid. + - debug: bool (default false). Include verbose internal fields (components, rerank_counters, etc). - output_format: "json" (default) or "toon" for token-efficient TOON format. - collection: str. Target collection; defaults to workspace state or env COLLECTION_NAME. 
- repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos. Returns: - - Dict with keys: results, total, used_rerank, rerank_counters + - Dict with keys: results, total, used_rerank, [rerank_counters if debug=true] """ return await _repo_search_impl( query=query, @@ -1134,6 +1153,7 @@ async def repo_search( case=case, repo=repo, compact=compact, + debug=debug, output_format=output_format, args=args, kwargs=kwargs, @@ -1195,6 +1215,7 @@ async def repo_search_compat(**arguments) -> Dict[str, Any]: "not_": not_value, "case": args.get("case"), "compact": args.get("compact"), + "debug": args.get("debug"), "mode": args.get("mode"), "repo": args.get("repo"), # Cross-codebase isolation "output_format": args.get("output_format"), # "json" or "toon" @@ -1411,7 +1432,7 @@ async def symbol_graph( - query_type: str. One of "callers", "definition", "importers". - limit: int (default 20). Maximum results to return. - language: str (optional). Filter by programming language. - - under: str (optional). Filter by path prefix. + - under: str (optional). Filter by recursive workspace subtree (e.g., "scripts" -> scripts/**). - output_format: "json" (default) or "toon" for token-efficient format. Returns: @@ -1642,6 +1663,9 @@ async def code_search( case: Any = None, session: Any = None, compact: Any = None, + debug: Any = None, + output_format: Any = None, + repo: Any = None, kwargs: Any = None, ) -> Dict[str, Any]: """Exact alias of repo_search (hybrid code search with reranking enabled by default). @@ -1674,6 +1698,9 @@ async def code_search( case=case, session=session, compact=compact, + debug=debug, + output_format=output_format, + repo=repo, kwargs=kwargs, ) @@ -1720,7 +1747,7 @@ async def info_request( - include_relationships: bool (default false). Add imports_from, calls, related_paths to results. - limit: int (default 10). Maximum results to return. - language: str. Filter by programming language. - - under: str. Limit search to specific directory. 
+ - under: str. Limit search to a recursive workspace subtree. - repo: str or list[str]. Filter by repository name(s). - output_format: "json" (default) or "toon" for token-efficient TOON format. diff --git a/scripts/path_scope.py b/scripts/path_scope.py new file mode 100644 index 00000000..2150926c --- /dev/null +++ b/scripts/path_scope.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +""" +Shared helpers for user-facing path scoping (`under`) across search tools. + +`under` is treated as a recursive subtree scope from the user's workspace +perspective (for example: "space" matches ".../space/**"). +""" + +from __future__ import annotations + +import os +import re +from functools import lru_cache +from typing import Any, Mapping, Optional, Set + +_MULTI_SLASH_RE = re.compile(r"/+") + + +def _normalize_path_token(value: Any) -> str: + s = str(value or "").strip().replace("\\", "/") + if not s: + return "" + s = _MULTI_SLASH_RE.sub("/", s) + # Normalize common "file://" style inputs. + if s.startswith("file://"): + s = s[7:] + return s.strip("/") + + +def _normalize_repo_hint(repo_hint: Any) -> str: + r = _normalize_path_token(repo_hint) + if not r: + return "" + return r.split("/")[-1] + + +def _repo_root_hint() -> str: + """Best-effort repository root (directory containing scripts/).""" + try: + return os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + except Exception: + return "" + + +def _maybe_expand_from_cwd(token: str) -> str: + """Recover under values that were relativized from the current subdirectory.""" + s = str(token or "").strip().strip("/") + if not s or "/" in s: + return s + try: + root = _repo_root_hint() + if not root: + return s + cwd = os.path.abspath(os.getcwd()) + if not (cwd == root or cwd.startswith(root + os.sep)): + return s + rel_cwd = os.path.relpath(cwd, root).replace("\\", "/").strip("/") + if not rel_cwd: + return s + rebased = f"{rel_cwd}/{s}" + rebased_path = os.path.join(root, *rebased.split("/")) + top_level_path = 
os.path.join(root, s) + if os.path.exists(rebased_path) and not os.path.exists(top_level_path): + return rebased + except Exception: + pass + return s + + +@lru_cache(maxsize=256) +def _unique_segment_path(root: str, segment: str) -> str: + """Return unique repo-relative directory path for a segment, else empty.""" + if not root or not segment: + return "" + top = os.path.join(root, segment) + if os.path.exists(top): + return "" + matches: list[str] = [] + skip = { + ".git", + ".codebase", + "__pycache__", + ".venv", + "node_modules", + } + try: + for dirpath, dirnames, _filenames in os.walk(root): + dirnames[:] = [d for d in dirnames if d not in skip and not d.startswith(".")] + if segment in dirnames: + rel = os.path.relpath(os.path.join(dirpath, segment), root).replace("\\", "/") + matches.append(rel.strip("/")) + if len(matches) > 1: + return "" + except Exception: + return "" + return matches[0] if len(matches) == 1 else "" + + +def _maybe_expand_unique_segment(token: str) -> str: + """Resolve single-segment under values to a unique subtree when possible.""" + s = str(token or "").strip().strip("/") + if not s or "/" in s: + return s + root = _repo_root_hint() + if not root: + return s + found = _unique_segment_path(root, s) + return found or s + + +def normalize_under(under: Optional[str]) -> Optional[str]: + """Normalize user-provided `under` into a comparable path token.""" + s = _normalize_path_token(under) + if not s or s in {".", "work"}: + return None + # Accept absolute-style workspace prefixes while preserving user-facing scope. 
+ if s.startswith("work/"): + s = s[len("work/") :] + s = _maybe_expand_from_cwd(s) + s = _maybe_expand_unique_segment(s) + if not s or s in {".", "work"}: + return None + return s + + +def _path_forms(path: Any, repo_hint: Any = None) -> Set[str]: + """Generate comparable path forms from a path-like value.""" + p = _normalize_path_token(path) + if not p: + return set() + + forms: Set[str] = {p} + + repo = _normalize_repo_hint(repo_hint) + + if p.startswith("work/"): + rest = p[len("work/") :] + if rest: + forms.add(rest) + if "/" in rest and repo: + head, tail = rest.split("/", 1) + if head.casefold() == repo.casefold() and tail: + forms.add(tail) + + if repo: + def _cf_to_orig_idx(orig: str, cf_index: int) -> int: + if cf_index <= 0: + return 0 + acc = 0 + for i, ch in enumerate(orig): + nxt = acc + len(ch.casefold()) + if nxt > cf_index: + return i + acc = nxt + return len(orig) + + repo_cf = repo.casefold() + repo_prefix_cf = repo_cf + "/" + marker_cf = "/" + repo_cf + "/" + for f in list(forms): + f_cf = f.casefold() + if f_cf.startswith(repo_prefix_cf): + forms.add(f[len(repo) + 1 :]) + idx = f_cf.find(marker_cf) + if idx >= 0: + tail_start = _cf_to_orig_idx(f, idx + len(marker_cf)) + tail = f[tail_start:] + if tail: + forms.add(tail) + + return {x for x in forms if x} + + +def metadata_path_forms(metadata: Mapping[str, Any]) -> Set[str]: + """Collect path forms from a metadata payload.""" + repo_hint = metadata.get("repo") + forms: Set[str] = set() + for key in ( + "repo_rel_path", + "path", + "container_path", + "host_path", + "path_prefix", + "file_path", + "rel_path", + "client_path", + ): + v = metadata.get(key) + if v: + forms.update(_path_forms(v, repo_hint=repo_hint)) + return forms + + +def metadata_matches_under(metadata: Mapping[str, Any], under: Optional[str]) -> bool: + """Return True when metadata falls under the requested subtree scope.""" + norm_under = normalize_under(under) + if not norm_under: + return True + + repo_hint = 
metadata.get("repo") + under_forms = _path_forms(norm_under, repo_hint=repo_hint) + under_forms.add(norm_under) + if not norm_under.startswith("work/"): + under_forms.add("work/" + norm_under) + + under_forms_l = {u.casefold() for u in under_forms if u} + if not under_forms_l: + return True + + has_repo_hint = bool(str(repo_hint or "").strip()) + + for cand in metadata_path_forms(metadata): + cand_forms = {cand} + # Compatibility fallback for points that only store /work//... paths + # but do not carry metadata.repo (older/benchmark/custom payloads). + if not has_repo_hint: + c0 = cand.strip("/") + if c0.startswith("work/"): + rest = c0[len("work/") :] + if "/" in rest: + _head, tail = rest.split("/", 1) + if tail: + cand_forms.add(tail) + + for cf in cand_forms: + c = cf.casefold() + for u in under_forms_l: + if c == u or c.startswith(u + "/"): + return True + return False + + +def path_matches_under(path: Any, under: Optional[str], repo_hint: Any = None) -> bool: + """Path-only convenience wrapper for `under` subtree matching.""" + md = {"path": path} + if repo_hint: + md["repo"] = repo_hint + return metadata_matches_under(md, under) diff --git a/scripts/prune.py b/scripts/prune.py index 5e2f14fb..d654132a 100755 --- a/scripts/prune.py +++ b/scripts/prune.py @@ -39,26 +39,34 @@ def delete_by_path(client: QdrantClient, path_str: str) -> int: return 0 -def delete_graph_edges_by_path(client: QdrantClient, path_str: str) -> int: +def delete_graph_edges_by_path(client: QdrantClient, path_str: str, repo: str | None = None) -> int: """Best-effort deletion for graph-edge collections (if present). Some deployments store symbol-graph edges in a separate Qdrant collection - (commonly `${COLLECTION}_graph`). Those points may reference a file path as - either caller or callee; delete both to prevent stale graph results. + (commonly `${COLLECTION}_graph`). On this branch, edge docs are file-level and + reference a file path as `caller_path`. 
""" if not path_str: return 0 - - flt = models.Filter( - should=[ - models.FieldCondition( - key="caller_path", match=models.MatchValue(value=path_str) - ), - models.FieldCondition( - key="callee_path", match=models.MatchValue(value=path_str) - ), - ] - ) + try: + path_str = os.path.normpath(str(path_str)) + except Exception: + path_str = str(path_str) + path_str = str(path_str).replace("\\", "/") + + must = [ + models.FieldCondition(key="caller_path", match=models.MatchValue(value=path_str)) + ] + if repo: + try: + r = str(repo).strip() + except Exception: + r = "" + if r and r != "*": + must.append( + models.FieldCondition(key="repo", match=models.MatchValue(value=r)) + ) + flt = models.Filter(must=must) try: res = client.delete( collection_name=GRAPH_COLLECTION, @@ -116,13 +124,13 @@ def main(): ) if not abs_path.exists(): removed_missing += delete_by_path(client, path_str) - removed_graph_edges += delete_graph_edges_by_path(client, path_str) + removed_graph_edges += delete_graph_edges_by_path(client, path_str, md.get("repo")) print(f"[prune] removed missing file points: {path_str}") continue current_hash = sha1_file(abs_path) if file_hash and current_hash and current_hash != file_hash: removed_mismatch += delete_by_path(client, path_str) - removed_graph_edges += delete_graph_edges_by_path(client, path_str) + removed_graph_edges += delete_graph_edges_by_path(client, path_str, md.get("repo")) print(f"[prune] removed outdated points (hash mismatch): {path_str}") if next_page is None: diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index fcc7d6ba..1f37edcd 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -46,6 +46,48 @@ # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +_git_history_skip_log_key: Optional[str] = None + + +def _is_usable_delta_status(status: Any) -> bool: + if not isinstance(status, dict): + return False + state = 
str(status.get("status") or "").strip().lower() + return ( + bool(status.get("success")) and + "workspace_path" in status and + "collection_name" in status and + state in {"ready", "processing", "completed"} + ) + + +def _server_status_error_message(status: Any) -> str: + if isinstance(status, dict): + error = status.get("error") + if isinstance(error, dict): + msg = str(error.get("message") or "").strip() + if msg: + return msg + state = str(status.get("status") or "").strip() + if state: + return f"Server status is {state}" + return "Invalid server status response" + + +def _env_flag(name: str, default: bool) -> bool: + raw = os.environ.get(name) + if raw is None: + return default + return str(raw).strip().lower() in {"1", "true", "yes", "on"} + + +def _log_git_history_skip_once(reason: str, key: str) -> None: + global _git_history_skip_log_key + marker = f"{reason}:{key}" + if _git_history_skip_log_key == marker: + return + _git_history_skip_log_key = marker + logger.info("[git_history] skip (%s): %s", reason, key) DEFAULT_MAX_TEMP_CLEAN_ATTEMPTS = 3 DEFAULT_TEMP_CLEAN_SLEEP = 1.0 @@ -134,6 +176,24 @@ def _compute_logical_repo_id(workspace_path: str) -> str: return f"{prefix}{h}" +def _derive_metadata_root(workspace_path: str) -> Path: + """Infer host-side metadata root that corresponds to container `/work`.""" + try: + p = Path(workspace_path).resolve() + except Exception: + p = Path(workspace_path) + + if p.name == "dev-workspace": + return p.parent + if p.parent.name == "dev-workspace": + return p.parent.parent + if (p / ".codebase").exists(): + return p + if (p.parent / ".codebase").exists(): + return p.parent + return p.parent + + def _redact_emails(text: str) -> str: """Redact email addresses from commit messages for privacy.""" try: @@ -167,10 +227,12 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str } if max_commits <= 0: + _log_git_history_skip_once("disabled", f"max_commits={max_commits}") return None root = 
_find_git_root(Path(workspace_path)) if not root: + _log_git_history_skip_once("no_repo", workspace_path) return None # Git history cache: avoid emitting identical manifests when HEAD/settings are unchanged @@ -204,6 +266,7 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str cache = {} if current_head and cache.get("last_head") == current_head and cache.get("max_commits") == max_commits and str(cache.get("since") or "") == since: + _log_git_history_skip_once("cache_hit", f"head={current_head[:10]} since={since or '-'} max={max_commits}") return None base_head = "" @@ -254,12 +317,20 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str errors="replace", ) if proc.returncode != 0 or not proc.stdout.strip(): + _log_git_history_skip_once( + "rev_list_empty", + f"head={current_head[:10] if current_head else '-'} rc={proc.returncode}", + ) return None commits = [l.strip() for l in proc.stdout.splitlines() if l.strip()] except Exception: return None if not commits: + _log_git_history_skip_once( + "no_commits", + f"head={current_head[:10] if current_head else '-'}", + ) return None if len(commits) > max_commits: commits = commits[:max_commits] @@ -333,6 +404,10 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str continue if not records: + _log_git_history_skip_once( + "no_records", + f"commits={len(commits)} head={current_head[:10] if current_head else '-'}", + ) return None try: @@ -352,6 +427,14 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str "since": since, "commits": records, } + logger.info( + "[git_history] prepared manifest mode=%s commits=%d head=%s prev=%s base=%s", + manifest["mode"], + len(records), + (current_head[:10] if current_head else "-"), + (prev_head[:10] if prev_head else "-"), + (base_head[:10] if base_head else "-"), + ) # Update git history cache with the HEAD and settings used for this manifest try: @@ -370,7 +453,12 @@ 
def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str return manifest -def _load_local_cache_file_hashes(workspace_path: str, repo_name: Optional[str]) -> Dict[str, str]: +def _load_local_cache_file_hashes( + workspace_path: str, + repo_name: Optional[str], + *, + metadata_root: Optional[str] = None, +) -> Dict[str, str]: """Best-effort read of the local cache.json file_hashes map. This mirrors the layout used by workspace_state without introducing new @@ -378,7 +466,13 @@ def _load_local_cache_file_hashes(workspace_path: str, repo_name: Optional[str]) lookups still go through get_cached_file_hash. """ try: - base = Path(os.environ.get("WORKSPACE_PATH") or workspace_path).resolve() + base = Path( + metadata_root + or os.environ.get("CTXCE_METADATA_ROOT") + or os.environ.get("WATCH_ROOT") + or os.environ.get("WORKSPACE_PATH") + or workspace_path + ).resolve() multi_repo = os.environ.get("MULTI_REPO_MODE", "0").strip().lower() in {"1", "true", "yes", "on"} if multi_repo and repo_name: cache_path = base / ".codebase" / "repos" / repo_name / "cache.json" @@ -418,6 +512,21 @@ def _load_local_cache_file_hashes(workspace_path: str, repo_name: Optional[str]) return {} +def get_all_cached_paths( + repo_name: Optional[str] = None, + metadata_root: Optional[str] = None, +) -> List[str]: + """Return cached file paths from the local workspace cache.""" + effective_workspace = os.environ.get("WORKSPACE_PATH") or os.getcwd() + return list( + _load_local_cache_file_hashes( + effective_workspace, + repo_name, + metadata_root=metadata_root, + ).keys() + ) + + class RemoteUploadClient: """Client for uploading delta bundles to remote server.""" @@ -457,15 +566,13 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s """Initialize remote upload client.""" self.upload_endpoint = upload_endpoint.rstrip('/') self.workspace_path = workspace_path + self.metadata_root = str(_derive_metadata_root(workspace_path)) self.collection_name = 
collection_name self.max_retries = max_retries self.timeout = timeout self.temp_dir = None self.logical_repo_id = logical_repo_id - # Set environment variables for cache functions - os.environ["WORKSPACE_PATH"] = workspace_path - # Get repo name for cache operations try: from scripts.workspace_state import _extract_repo_name_from_path @@ -485,6 +592,173 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s adapter = HTTPAdapter(max_retries=retry_strategy) self.session.mount("http://", adapter) self.session.mount("https://", adapter) + self.last_upload_result: Dict[str, Any] = {"outcome": "idle"} + self._last_plan_payload: Optional[Dict[str, Any]] = None + + def _get_cached_file_hash(self, file_path: str) -> str: + try: + return get_cached_file_hash( + file_path, + self.repo_name, + metadata_root=self.metadata_root, + ) + except TypeError: + # Support monkeypatched test doubles that don't accept metadata_root. + return get_cached_file_hash(file_path, self.repo_name) + + def _set_cached_file_hash(self, file_path: str, file_hash: str) -> None: + try: + set_cached_file_hash( + file_path, + file_hash, + self.repo_name, + metadata_root=self.metadata_root, + ) + except TypeError: + # Support monkeypatched test doubles that don't accept metadata_root. + set_cached_file_hash(file_path, file_hash, self.repo_name) + + def _remove_cached_file(self, file_path: str) -> None: + try: + remove_cached_file( + file_path, + self.repo_name, + metadata_root=self.metadata_root, + ) + except TypeError: + # Support monkeypatched test doubles that don't accept metadata_root. + remove_cached_file(file_path, self.repo_name) + + def _get_all_cached_paths(self) -> List[str]: + try: + return get_all_cached_paths( + self.repo_name, + metadata_root=self.metadata_root, + ) + except TypeError: + # Support monkeypatched test doubles that don't accept metadata_root. 
+ return get_all_cached_paths(self.repo_name) + + def _set_last_upload_result(self, outcome: str, **details: Any) -> Dict[str, Any]: + result: Dict[str, Any] = {"outcome": outcome} + result.update(details) + self.last_upload_result = result + return result + + def log_watch_upload_result(self) -> None: + outcome = str((self.last_upload_result or {}).get("outcome") or "") + if outcome == "skipped_by_plan": + logger.info("[watch] No upload needed after plan") + elif outcome == "queued": + logger.info("[watch] Upload request accepted; server processing asynchronously") + elif outcome == "uploaded_async": + processed = (self.last_upload_result or {}).get("processed_operations") + logger.info("[watch] Upload processed asynchronously: %s", processed or {}) + elif outcome == "uploaded": + logger.info("[watch] Successfully uploaded changes") + elif outcome == "no_changes": + logger.info("[watch] No meaningful changes to upload") + else: + logger.info("[watch] Upload handling completed") + + def _finalize_successful_changes(self, changes: Dict[str, List]) -> None: + for path in changes.get("created", []): + try: + abs_path = str(path.resolve()) + current_hash = hashlib.sha1(path.read_bytes()).hexdigest() + self._set_cached_file_hash(abs_path, current_hash) + stat = path.stat() + self._stat_cache[abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) + except Exception: + continue + for path in changes.get("updated", []): + try: + abs_path = str(path.resolve()) + current_hash = hashlib.sha1(path.read_bytes()).hexdigest() + self._set_cached_file_hash(abs_path, current_hash) + stat = path.stat() + self._stat_cache[abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) + except Exception: + continue + for path in changes.get("deleted", []): + try: + abs_path = str(path.resolve()) + self._remove_cached_file(abs_path) + self._stat_cache.pop(abs_path, None) + except Exception: + continue + for source_path, 
dest_path in changes.get("moved", []): + try: + source_abs_path = str(source_path.resolve()) + self._remove_cached_file(source_abs_path) + self._stat_cache.pop(source_abs_path, None) + except Exception: + continue + try: + dest_abs_path = str(dest_path.resolve()) + current_hash = hashlib.sha1(dest_path.read_bytes()).hexdigest() + self._set_cached_file_hash(dest_abs_path, current_hash) + stat = dest_path.stat() + self._stat_cache[dest_abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) + except Exception: + continue + + def _await_async_upload_result( + self, + bundle_id: Optional[str], + sequence_number: Optional[int], + ) -> Optional[Dict[str, Any]]: + try: + max_wait = float(os.environ.get("CTXCE_REMOTE_UPLOAD_STATUS_WAIT_SECS", "5")) + except Exception: + max_wait = 5.0 + if max_wait <= 0: + return None + + try: + poll_interval = float(os.environ.get("CTXCE_REMOTE_UPLOAD_STATUS_POLL_INTERVAL_SECS", "1")) + except Exception: + poll_interval = 1.0 + poll_interval = max(0.1, poll_interval) + + deadline = time.time() + max_wait + while time.time() < deadline: + status = self.get_server_status() + if not status.get("success"): + return None + server_info = status.get("server_info", {}) if isinstance(status, dict) else {} + last_bundle_id = server_info.get("last_bundle_id") + last_upload_status = server_info.get("last_upload_status") + last_sequence = status.get("last_sequence") + bundle_matches = bool(bundle_id) and last_bundle_id == bundle_id + sequence_matches = sequence_number is not None and last_sequence == sequence_number + if bundle_matches or sequence_matches: + if last_upload_status == "completed": + return { + "outcome": "uploaded_async", + "bundle_id": last_bundle_id or bundle_id, + "sequence_number": last_sequence if last_sequence is not None else sequence_number, + "processed_operations": server_info.get("last_processed_operations"), + "processing_time_ms": server_info.get("last_processing_time_ms"), + } + if 
last_upload_status in ("failed", "error"): + return { + "outcome": "failed", + "bundle_id": last_bundle_id or bundle_id, + "sequence_number": last_sequence if last_sequence is not None else sequence_number, + "error": server_info.get("last_error"), + } + time.sleep(poll_interval) + return None def __enter__(self): """Context manager entry.""" @@ -520,6 +794,65 @@ def log_mapping_summary(self) -> None: logger.info(f" source_path: {info['source_path']}") logger.info(f" container_path: {info['container_path']}") + def _excluded_dirnames(self) -> frozenset: + # Keep in sync with standalone_upload_client exclusions. + # NOTE: This caches the exclusion set per RemoteUploadClient instance. + # Runtime changes to DEV_REMOTE_MODE/REMOTE_UPLOAD_MODE won't be reflected + # until a new client is created (typically via process restart), which is + # acceptable for the upload client use case. + cached = getattr(self, "_excluded_dirnames_cache", None) + if cached is not None: + return cached + excluded = { + "node_modules", "vendor", "dist", "build", "target", "out", + ".git", ".hg", ".svn", ".vscode", ".idea", ".venv", "venv", + "__pycache__", ".pytest_cache", ".mypy_cache", ".cache", + ".context-engine", ".context-engine-uploader", ".codebase", + } + dev_remote = os.environ.get("DEV_REMOTE_MODE") == "1" or os.environ.get("REMOTE_UPLOAD_MODE") == "development" + if dev_remote: + excluded.add("dev-workspace") + cached = frozenset(excluded) + self._excluded_dirnames_cache = cached + return cached + + def _is_ignored_path(self, path: Path) -> bool: + """Return True when path is outside workspace or under excluded dirs.""" + try: + workspace_root = Path(self.workspace_path).resolve() + rel = path.resolve().relative_to(workspace_root) + except Exception: + return True + + dir_parts = set(rel.parts[:-1]) if len(rel.parts) > 1 else set() + if dir_parts & self._excluded_dirnames(): + return True + # Ignore hidden directories anywhere under the workspace, but allow + # extensionless 
dotfiles like `.gitignore` that we explicitly support. + if any(p.startswith(".") for p in rel.parts[:-1]): + return True + try: + extensionless = set((idx.EXTENSIONLESS_FILES or {}).keys()) + except Exception: + extensionless = set() + if rel.name.startswith(".") and rel.name.lower() not in extensionless: + return True + return False + + def _is_watchable_path(self, path: Path) -> bool: + """Return True when a filesystem event path is eligible for upload processing.""" + if self._is_ignored_path(path): + return False + suffix = path.suffix.lower() + if idx.CODE_EXTS.get(suffix, "unknown") != "unknown": + return True + name = path.name.lower() + try: + extensionless_names = {k.lower() for k in (idx.EXTENSIONLESS_FILES or {}).keys()} + except Exception: + extensionless_names = set() + return name in extensionless_names or name.startswith("dockerfile") + def _get_temp_bundle_dir(self) -> Path: """Get or create temporary directory for bundle creation.""" if not self.temp_dir: @@ -547,6 +880,19 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: } for path in changed_paths: + if self._is_ignored_path(path): + try: + abs_path = str(path.resolve()) + except Exception: + continue + cached_hash = self._get_cached_file_hash(abs_path) + if cached_hash: + changes["deleted"].append(path) + try: + self._stat_cache.pop(abs_path, None) + except Exception: + pass + continue # Resolve to an absolute path for stable cache keys try: abs_path = str(path.resolve()) @@ -554,7 +900,7 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: # Skip paths that cannot be resolved continue - cached_hash = get_cached_file_hash(abs_path, self.repo_name) + cached_hash = self._get_cached_file_hash(abs_path) if not path.exists(): # File was deleted @@ -610,8 +956,6 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: self._stat_cache[abs_path] = (getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), stat.st_size) 
except Exception: pass - set_cached_file_hash(abs_path, current_hash, self.repo_name) - # Detect moves by looking for files with same content hash # but different paths (requires additional tracking) changes["moved"] = self._detect_moves(changes["created"], changes["deleted"]) @@ -636,7 +980,7 @@ def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) -> for deleted_path in deleted_files: try: # Try to get cached hash first, fallback to file content - cached_hash = get_cached_file_hash(str(deleted_path), self.repo_name) + cached_hash = self._get_cached_file_hash(str(deleted_path)) if cached_hash: deleted_hashes[cached_hash] = deleted_path continue @@ -749,7 +1093,7 @@ def create_delta_bundle( content = f.read() file_hash = hashlib.sha1(content).hexdigest() content_hash = f"sha1:{file_hash}" - previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + previous_hash = self._get_cached_file_hash(str(path.resolve())) # Write file to bundle bundle_file_path = files_dir / "updated" / rel_path @@ -825,7 +1169,7 @@ def create_delta_bundle( for path in changes["deleted"]: rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: - previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + previous_hash = self._get_cached_file_hash(str(path.resolve())) operation = { "operation": "deleted", @@ -839,13 +1183,6 @@ def create_delta_bundle( } operations.append(operation) - # Once a delete operation has been recorded, drop the cache entry - # so subsequent scans do not keep re-reporting the same deletion. 
- try: - remove_cached_file(str(path.resolve()), self.repo_name) - except Exception: - pass - except Exception as e: print(f"[bundle_create] Error processing deleted file {path}: {e}") continue @@ -909,6 +1246,304 @@ def create_delta_bundle( return str(bundle_path), manifest + def _build_plan_payload(self, changes: Dict[str, List]) -> Dict[str, Any]: + created_at = datetime.now().isoformat() + bundle_id = str(uuid.uuid4()) + operations: List[Dict[str, Any]] = [] + file_hashes: Dict[str, str] = {} + total_size = 0 + + for path in changes["created"]: + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = path.stat() + operations.append( + { + "operation": "created", + "path": rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown"), + } + ) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except Exception as e: + logger.warning("[remote_upload] Failed to prepare created plan entry for %s: %s", path, e) + + for path in changes["updated"]: + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = path.stat() + previous_hash = self._get_cached_file_hash(str(path.resolve())) + operations.append( + { + "operation": "updated", + "path": rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, + "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown"), + } + ) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except Exception as e: + logger.warning("[remote_upload] Failed to prepare updated plan entry for %s: %s", path, e) + + for source_path, dest_path in changes["moved"]: + dest_rel_path = 
dest_path.relative_to(Path(self.workspace_path)).as_posix() + source_rel_path = source_path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = dest_path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = dest_path.stat() + operations.append( + { + "operation": "moved", + "path": dest_rel_path, + "source_path": source_rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "language": idx.CODE_EXTS.get(dest_path.suffix.lower(), "unknown"), + } + ) + file_hashes[dest_rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except Exception as e: + logger.warning( + "[remote_upload] Failed to prepare moved plan entry for %s -> %s: %s", + source_path, + dest_path, + e, + ) + + for path in changes["deleted"]: + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + previous_hash = self._get_cached_file_hash(str(path.resolve())) + operations.append( + { + "operation": "deleted", + "path": rel_path, + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, + "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown"), + } + ) + except Exception as e: + logger.warning("[remote_upload] Failed to prepare deleted plan entry for %s: %s", path, e) + + manifest = { + "version": "1.0", + "bundle_id": bundle_id, + "workspace_path": self.workspace_path, + "collection_name": self.collection_name, + "created_at": created_at, + "sequence_number": None, + "parent_sequence": None, + "operations": { + "created": len(changes["created"]), + "updated": len(changes["updated"]), + "deleted": len(changes["deleted"]), + "moved": len(changes["moved"]), + }, + "total_files": len(operations), + "total_size_bytes": total_size, + "compression": "gzip", + "encoding": "utf-8", + } + return { + "manifest": manifest, + "operations": operations, + "file_hashes": file_hashes, + } + + def _plan_delta_upload(self, changes: Dict[str, List]) -> Optional[Dict[str, Any]]: + if not 
_env_flag("CTXCE_REMOTE_UPLOAD_PLAN_ENABLED", True): + return None + try: + payload = self._build_plan_payload(changes) + self._last_plan_payload = payload + data = { + "workspace_path": self._translate_to_container_path(self.workspace_path), + "collection_name": self.collection_name, + "source_path": self.workspace_path, + "logical_repo_id": _compute_logical_repo_id(self.workspace_path), + "manifest": payload["manifest"], + "operations": payload["operations"], + "file_hashes": payload["file_hashes"], + } + sess = get_auth_session(self.upload_endpoint) + if sess: + data["session"] = sess + if getattr(self, "logical_repo_id", None): + data["logical_repo_id"] = self.logical_repo_id + + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/plan", + json=data, + timeout=min(self.timeout, 60), + ) + if response.status_code in {404, 405}: + logger.info("[remote_upload] Plan endpoint unavailable; falling back to full bundle upload") + return None + response.raise_for_status() + body = response.json() + if not body.get("success", False): + logger.warning("[remote_upload] Plan request failed; falling back: %s", body.get("error")) + return None + return body + except Exception as e: + logger.warning("[remote_upload] Plan request failed; falling back to full bundle upload: %s", e) + return None + + def _build_apply_only_payload(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Dict[str, Any]: + payload = self._last_plan_payload or self._build_plan_payload(changes) + needed = plan.get("needed_files", {}) if isinstance(plan, dict) else {} + created_needed = set(needed.get("created", []) or []) + updated_needed = set(needed.get("updated", []) or []) + moved_needed = set(needed.get("moved", []) or []) + + # Check if ALL operations are hash-matched (nothing needs content at all) + # This happens when all needed_files lists are empty and there are no actual changes requiring content + has_changes_needing_content = bool(created_needed or updated_needed or 
moved_needed) + has_deletes = bool(changes.get("deleted", [])) + + # Only skip apply-only if there are NO operations needing content AND NO deletes + if not has_changes_needing_content and not has_deletes: + return { + "manifest": payload.get("manifest", {}), + "operations": [], + "file_hashes": {}, + } + + filtered_ops: List[Dict[str, Any]] = [] + filtered_hashes: Dict[str, str] = {} + for operation in payload.get("operations", []): + op_type = str(operation.get("operation") or "") + rel_path = str(operation.get("path") or "") + # Determine if this operation needs content (only those skip filtered_hashes) + needs_content = ( + (op_type == "created" and rel_path in created_needed) + or (op_type == "updated" and rel_path in updated_needed) + or (op_type == "moved" and rel_path in moved_needed) + ) + if needs_content: + # Skip operations that need content - they'll be uploaded separately + continue + # IMPORTANT: server-side apply_delta_operations() only accepts "deleted" and "moved" + # operations. Hash-matched "created" and "updated" operations must NOT be routed + # through apply_ops since the server will reject them. 
+ if op_type not in {"deleted", "moved"}: + continue + # Preserve all other operations so server advances state + filtered_ops.append(operation) + # Include hash for non-deleted operations + if op_type != "deleted": + hash_value = payload.get("file_hashes", {}).get(rel_path) + if hash_value: + filtered_hashes[rel_path] = hash_value + return { + "manifest": payload.get("manifest", {}), + "operations": filtered_ops, + "file_hashes": filtered_hashes, + } + + def _apply_operations_without_content(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Optional[bool]: + payload = self._build_apply_only_payload(changes, plan) + operations = payload.get("operations", []) + if not operations: + return None + try: + data = { + "workspace_path": self._translate_to_container_path(self.workspace_path), + "collection_name": self.collection_name, + "source_path": self.workspace_path, + "logical_repo_id": _compute_logical_repo_id(self.workspace_path), + "manifest": payload["manifest"], + "operations": operations, + "file_hashes": payload["file_hashes"], + } + sess = get_auth_session(self.upload_endpoint) + if sess: + data["session"] = sess + if getattr(self, "logical_repo_id", None): + data["logical_repo_id"] = self.logical_repo_id + + logger.info( + "[remote_upload] Applying metadata-only operations without bundle: deleted=%s moved=%s", + sum(1 for op in operations if op.get("operation") == "deleted"), + sum(1 for op in operations if op.get("operation") == "moved"), + ) + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/apply_ops", + json=data, + timeout=min(self.timeout, 60), + ) + if response.status_code in {404, 405}: + logger.info("[remote_upload] apply_ops endpoint unavailable; falling back to bundle upload") + return None + response.raise_for_status() + body = response.json() + if not body.get("success", False): + logger.warning("[remote_upload] apply_ops failed; falling back to bundle upload: %s", body.get("error")) + return None + # Only finalize 
changes that were actually processed by the server + # apply_delta_operations only handles deleted/moved operations + processed_ops = body.get("processed_operations") or {} + applied_changes = { + "deleted": changes.get("deleted", []), + "moved": changes.get("moved", []), + "created": [], + "updated": [], + } + self._finalize_successful_changes(applied_changes) + self._set_last_upload_result( + "uploaded", + bundle_id=body.get("bundle_id"), + sequence_number=body.get("sequence_number"), + processed_operations=processed_ops, + ) + logger.info( + "[remote_upload] Metadata-only operations applied: %s", + processed_ops, + ) + return True + except Exception as e: + logger.warning("[remote_upload] apply_ops failed; falling back to bundle upload: %s", e) + return None + + def _filter_changes_by_plan(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Dict[str, List]: + needed = plan.get("needed_files", {}) if isinstance(plan, dict) else {} + created_needed = set(needed.get("created", []) or []) + updated_needed = set(needed.get("updated", []) or []) + moved_needed = set(needed.get("moved", []) or []) + + filtered_created = [ + path for path in changes["created"] + if path.relative_to(Path(self.workspace_path)).as_posix() in created_needed + ] + filtered_updated = [ + path for path in changes["updated"] + if path.relative_to(Path(self.workspace_path)).as_posix() in updated_needed + ] + filtered_moved = [ + (source_path, dest_path) + for source_path, dest_path in changes["moved"] + if dest_path.relative_to(Path(self.workspace_path)).as_posix() in moved_needed + ] + return { + "created": filtered_created, + "updated": filtered_updated, + "deleted": list(changes["deleted"]), + "moved": filtered_moved, + "unchanged": [], + } + def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, Any]: """ Upload delta bundle to remote server with exponential backoff retry. 
@@ -1156,7 +1791,16 @@ def get_server_status(self) -> Dict[str, Any]: ) if response.status_code == 200: - return response.json() + payload = response.json() + if not isinstance(payload, dict): + return { + "success": False, + "error": { + "code": "STATUS_INVALID", + "message": "Invalid status response payload", + }, + } + return {"success": True, **payload} # Handle error response error_msg = f"Status check failed with HTTP {response.status_code}" @@ -1180,6 +1824,93 @@ def has_meaningful_changes(self, changes: Dict[str, List]) -> bool: total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") return total_changes > 0 + def _collect_force_cleanup_paths(self) -> List[Path]: + """ + Return ignored paths that force mode should actively delete remotely. + + In dev-remote mode, dev-workspace is intentionally ignored during upload + scans to avoid recursive dogfooding. If that tree already exists on the + remote side from an older buggy upload, force mode should remove it even + when the local cache does not contain those paths. + """ + cleanup_paths: List[Path] = [] + if "dev-workspace" not in self._excluded_dirnames(): + return cleanup_paths + + dev_root = Path(self.workspace_path) / "dev-workspace" + if not dev_root.exists(): + return cleanup_paths + + for root, dirnames, filenames in os.walk(dev_root): + dirnames[:] = [d for d in dirnames if not d.startswith(".")] + for filename in filenames: + path = Path(root) / filename + try: + if path.is_file(): + cleanup_paths.append(path) + except Exception: + continue + return cleanup_paths + + def build_force_changes(self, all_files: List[Path]) -> Dict[str, List]: + """ + Build force-upload changes while still cleaning stale cached paths. + + Force mode should re-upload every currently managed file, but it must also + emit deletes for files that only exist in the local cache now, including + paths that are ignored under the current client policy such as + dev-workspace in dev-remote mode. 
+ """ + created_files: List[Path] = [] + path_map: Dict[Path, Path] = {} + for path in all_files: + if self._is_ignored_path(path): + continue + try: + resolved = path.resolve() + except Exception: + continue + created_files.append(path) + path_map[resolved] = path + + for cached_abs in self._get_all_cached_paths(): + try: + cached_path = Path(cached_abs) + resolved = cached_path.resolve() + except Exception: + continue + if resolved not in path_map: + path_map[resolved] = cached_path + + force_cleanup_paths = self._collect_force_cleanup_paths() + for cleanup_path in force_cleanup_paths: + try: + resolved = cleanup_path.resolve() + except Exception: + continue + if resolved not in path_map: + path_map[resolved] = cleanup_path + + probed = self.detect_file_changes(list(path_map.values())) + deleted_by_resolved: Dict[Path, Path] = {} + for deleted_path in probed.get("deleted", []): + try: + deleted_by_resolved[deleted_path.resolve()] = deleted_path + except Exception: + continue + for cleanup_path in force_cleanup_paths: + try: + deleted_by_resolved.setdefault(cleanup_path.resolve(), cleanup_path) + except Exception: + continue + return { + "created": created_files, + "updated": [], + "deleted": list(deleted_by_resolved.values()), + "moved": [], + "unchanged": [], + } + def upload_git_history_only(self, git_history: Dict[str, Any]) -> bool: try: empty_changes = { @@ -1224,10 +1955,13 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: # Validate input if not changes: logger.info("[remote_upload] No changes provided") + self._set_last_upload_result("no_changes") return True + if not self.has_meaningful_changes(changes): logger.info("[remote_upload] No meaningful changes detected, skipping upload") + self._set_last_upload_result("no_changes") return True # Log change summary @@ -1236,10 +1970,44 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: f"{len(changes['created'])} created, {len(changes['updated'])} updated, " 
f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") + planned_changes = changes + plan = self._plan_delta_upload(changes) + if plan: + preview = plan.get("operation_counts_preview", {}) + logger.info( + "[remote_upload] Plan preview: needed created=%s updated=%s deleted=%s moved=%s " + "skipped_hash_match=%s needed_bytes=%s", + preview.get("created", 0), + preview.get("updated", 0), + preview.get("deleted", 0), + preview.get("moved", 0), + preview.get("skipped_hash_match", 0), + plan.get("needed_size_bytes", 0), + ) + planned_changes = self._filter_changes_by_plan(changes, plan) + has_content_work = bool( + planned_changes.get("created") + or planned_changes.get("updated") + or planned_changes.get("moved") + ) + if not has_content_work: + apply_only_result = self._apply_operations_without_content(changes, plan) + if apply_only_result is True: + return True + if not self.has_meaningful_changes(planned_changes): + logger.info("[remote_upload] Plan found no upload work; skipping bundle upload") + self._finalize_successful_changes(changes) + self._set_last_upload_result( + "skipped_by_plan", + plan_preview=preview, + needed_size_bytes=plan.get("needed_size_bytes", 0), + ) + return True + # Create delta bundle bundle_path = None try: - bundle_path, manifest = self.create_delta_bundle(changes) + bundle_path, manifest = self.create_delta_bundle(planned_changes) logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " f"(size: {manifest['total_size_bytes']} bytes)") @@ -1251,6 +2019,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: logger.error(f"[remote_upload] Error creating delta bundle: {e}") # Clean up any temporary files on failure self.cleanup() + self._set_last_upload_result("failed", stage="bundle_creation", error=str(e)) return False # Upload bundle with retry logic @@ -1258,9 +2027,84 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: response = 
self.upload_bundle(bundle_path, manifest) if response.get("success", False): - processed_ops = response.get('processed_operations', {}) - logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") - logger.info(f"[remote_upload] Processed operations: {processed_ops}") + async_failed = False + async_pending = False + processed_ops = response.get("processed_operations") + if processed_ops is None: + logger.info( + "[remote_upload] Bundle %s accepted by server; processing asynchronously (sequence=%s)", + manifest["bundle_id"], + response.get("sequence_number"), + ) + self._set_last_upload_result( + "queued", + bundle_id=manifest["bundle_id"], + sequence_number=response.get("sequence_number"), + ) + async_result = self._await_async_upload_result( + manifest["bundle_id"], + response.get("sequence_number"), + ) + if async_result is None: + # Server accepted the bundle but status is still pending. + async_pending = True + logger.warning( + "[remote_upload] Async upload timed out awaiting server response for bundle %s", + manifest["bundle_id"], + ) + else: + self.last_upload_result = async_result + outcome = str(async_result.get("outcome") or "") + if outcome == "uploaded_async": + self._finalize_successful_changes(planned_changes) + logger.info( + "[remote_upload] Async processing completed for bundle %s: %s", + manifest["bundle_id"], + async_result.get("processed_operations") or {}, + ) + elif outcome == "failed": + async_failed = True + logger.error( + "[remote_upload] Async processing failed for bundle %s: %s", + manifest["bundle_id"], + async_result.get("error"), + ) + self._set_last_upload_result( + "failed", + stage="async_processing", + bundle_id=async_result.get("bundle_id") or manifest["bundle_id"], + sequence_number=async_result.get("sequence_number") or response.get("sequence_number"), + error=async_result.get("error"), + ) + else: + async_pending = True + # Keep queued state for non-terminal async outcomes. 
+ self._set_last_upload_result( + "queued", + bundle_id=async_result.get("bundle_id") or manifest["bundle_id"], + sequence_number=async_result.get("sequence_number") or response.get("sequence_number"), + ) + logger.warning( + "[remote_upload] Async upload still pending for bundle %s (sequence=%s, outcome=%s)", + manifest["bundle_id"], + response.get("sequence_number"), + outcome or "", + ) + else: + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + logger.info(f"[remote_upload] Processed operations: {processed_ops}") + self._finalize_successful_changes(planned_changes) + self._set_last_upload_result( + "uploaded", + bundle_id=manifest["bundle_id"], + sequence_number=response.get("sequence_number"), + processed_operations=processed_ops, + ) + if async_pending: + logger.info( + "[remote_upload] Bundle %s accepted and queued; deferring local finalization", + manifest["bundle_id"], + ) # Clean up temporary bundle after successful upload try: @@ -1272,20 +2116,24 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: except Exception as cleanup_error: logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") - return True + return not async_failed else: error_msg = response.get('error', {}).get('message', 'Unknown upload error') logger.error(f"[remote_upload] Upload failed: {error_msg}") + self._set_last_upload_result("failed", stage="upload", error=error_msg) return False except Exception as e: logger.error(f"[remote_upload] Error uploading bundle: {e}") + self._set_last_upload_result("failed", stage="upload", error=str(e)) return False except Exception as e: logger.error(f"[remote_upload] Unexpected error in process_changes_and_upload: {e}") + self._set_last_upload_result("failed", stage="unexpected", error=str(e)) return False + def get_all_code_files(self) -> List[Path]: """Get all code files in the workspace.""" files: List[Path] = [] @@ -1296,27 +2144,30 @@ def 
get_all_code_files(self) -> List[Path]: # Single walk with early pruning similar to standalone client ext_suffixes = {str(ext).lower() for ext in idx.CODE_EXTS if str(ext).startswith('.')} - name_matches = {str(ext) for ext in idx.CODE_EXTS if not str(ext).startswith('.')} - dev_remote = os.environ.get("DEV_REMOTE_MODE") == "1" or os.environ.get("REMOTE_UPLOAD_MODE") == "development" - excluded = { - "node_modules", "vendor", "dist", "build", "target", "out", - ".git", ".hg", ".svn", ".vscode", ".idea", ".venv", "venv", - "__pycache__", ".pytest_cache", ".mypy_cache", ".cache", - ".context-engine", ".context-engine-uploader", ".codebase" - } - if dev_remote: - excluded.add("dev-workspace") + try: + extensionless_names = {k.lower() for k in (idx.EXTENSIONLESS_FILES or {}).keys()} + except Exception: + extensionless_names = set() + excluded = self._excluded_dirnames() seen = set() for root, dirnames, filenames in os.walk(workspace_path): dirnames[:] = [d for d in dirnames if d not in excluded and not d.startswith('.')] for filename in filenames: - if filename.startswith('.'): + # Allow dotfiles that are in EXTENSIONLESS_FILES (e.g., .gitignore) + fname_lower = filename.lower() + if filename.startswith('.') and fname_lower not in extensionless_names: continue candidate = Path(root) / filename + if self._is_ignored_path(candidate): + continue suffix = candidate.suffix.lower() - if filename in name_matches or suffix in ext_suffixes: + if ( + suffix in ext_suffixes + or fname_lower in extensionless_names + or fname_lower.startswith("dockerfile") + ): resolved = candidate.resolve() if resolved not in seen: seen.add(resolved) @@ -1368,13 +2219,13 @@ def on_any_event(self, event): # Always check src_path src_path = Path(event.src_path) - if idx.CODE_EXTS.get(src_path.suffix.lower(), "unknown") != "unknown": + if self.client._is_watchable_path(src_path): paths_to_process.append(src_path) # For FileMovedEvent, also process the destination path if hasattr(event, 'dest_path') 
and event.dest_path: dest_path = Path(event.dest_path) - if idx.CODE_EXTS.get(dest_path.suffix.lower(), "unknown") != "unknown": + if self.client._is_watchable_path(dest_path): paths_to_process.append(dest_path) if not paths_to_process: @@ -1395,6 +2246,8 @@ def on_any_event(self, event): def _process_pending_changes(self): """Process accumulated changes after debounce period.""" with self._lock: + # Timer fired; allow a new debounce to be armed while we process. + self._debounce_timer = None # Prevent re-entrancy if self._processing: return @@ -1406,19 +2259,21 @@ def _process_pending_changes(self): check_deletions = self._check_for_deletions self._check_for_deletions = False + upload_succeeded = False try: # Only include cached paths when deletion-related events occurred if check_deletions: cached_file_hashes = _load_local_cache_file_hashes( self.client.workspace_path, - self.client.repo_name + self.client.repo_name, + metadata_root=self.client.metadata_root, ) - all_paths = list(set(pending + [ - Path(p) for p in cached_file_hashes.keys() - ])) + cached_paths = [Path(p) for p in cached_file_hashes.keys()] + all_paths = list(set(pending + cached_paths)) else: all_paths = pending + changes = self.client.detect_file_changes(all_paths) meaningful_changes = ( len(changes.get("created", [])) + @@ -1431,7 +2286,8 @@ def _process_pending_changes(self): logger.info(f"[watch] Detected {meaningful_changes} changes: { {k: len(v) for k, v in changes.items() if k != 'unchanged'} }") success = self.client.process_changes_and_upload(changes) if success: - logger.info("[watch] Successfully uploaded changes") + self.client.log_watch_upload_result() + upload_succeeded = True else: logger.error("[watch] Failed to upload changes") else: @@ -1447,14 +2303,29 @@ def _process_pending_changes(self): success = self.client.upload_git_history_only(git_history) if success: logger.info("[watch] Successfully uploaded git history metadata") + upload_succeeded = True else: logger.error("[watch] 
Failed to upload git history metadata") + else: + upload_succeeded = True # No changes to process except Exception as e: logger.error(f"[watch] Error processing changes: {e}") finally: # Clear processing flag even if an error occurred with self._lock: self._processing = False + # Re-queue pending paths if upload failed + if not upload_succeeded and pending: + # Merge pending paths back into _pending_paths + for p in pending: + self._pending_paths.add(p) + # Arm next pass if there are pending paths + if self._pending_paths and self._debounce_timer is None: + self._debounce_timer = threading.Timer( + self.debounce_seconds, + self._process_pending_changes, + ) + self._debounce_timer.start() observer = Observer() @@ -1504,7 +2375,11 @@ def _watch_loop_polling(self, interval: int = 5): path_map[resolved] = p # Include any paths that are only present in the local cache (deleted files) - cached_file_hashes = _load_local_cache_file_hashes(self.workspace_path, self.repo_name) + cached_file_hashes = _load_local_cache_file_hashes( + self.workspace_path, + self.repo_name, + metadata_root=self.metadata_root, + ) for cached_abs in cached_file_hashes.keys(): try: cached_path = Path(cached_abs) @@ -1526,7 +2401,7 @@ def _watch_loop_polling(self, interval: int = 5): success = self.process_changes_and_upload(changes) if success: - logger.info(f"[watch] Successfully uploaded changes") + self.log_watch_upload_result() else: logger.error(f"[watch] Failed to upload changes") else: @@ -1584,80 +2459,7 @@ def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: except Exception as e: logger.error(f"[remote_upload] Error detecting file changes: {e}") return False - - if not self.has_meaningful_changes(changes): - logger.info("[remote_upload] No meaningful changes detected, skipping upload") - return True - - # Log change summary - total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") - logger.info(f"[remote_upload] Detected {total_changes} 
meaningful changes: " - f"{len(changes['created'])} created, {len(changes['updated'])} updated, " - f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") - - # Create delta bundle - bundle_path = None - try: - bundle_path, manifest = self.create_delta_bundle(changes) - logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " - f"(size: {manifest['total_size_bytes']} bytes)") - - # Validate bundle was created successfully - if not bundle_path or not os.path.exists(bundle_path): - raise RuntimeError(f"Failed to create bundle at {bundle_path}") - - except Exception as e: - logger.error(f"[remote_upload] Error creating delta bundle: {e}") - # Clean up any temporary files on failure - self.cleanup() - return False - - # Upload bundle with retry logic - try: - response = self.upload_bundle(bundle_path, manifest) - - if response.get("success", False): - processed_ops = response.get('processed_operations', {}) - logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") - logger.info(f"[remote_upload] Processed operations: {processed_ops}") - - # Clean up temporary bundle after successful upload - try: - if os.path.exists(bundle_path): - os.remove(bundle_path) - logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}") - # Also clean up the entire temp directory if this is the last bundle - self.cleanup() - except Exception as cleanup_error: - logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") - - return True - else: - error = response.get("error", {}) - error_code = error.get("code", "UNKNOWN") - error_msg = error.get("message", "Unknown error") - - logger.error(f"[remote_upload] Upload failed: {error_msg}") - - # Handle specific error types - # CLI is stateless - server handles sequence management - if error_code in ["BUNDLE_TOO_LARGE", "BUNDLE_NOT_FOUND"]: - # These are unrecoverable errors - logger.error(f"[remote_upload] Unrecoverable error 
({error_code}): {error_msg}") - return False - elif error_code in ["TIMEOUT_ERROR", "CONNECTION_ERROR", "NETWORK_ERROR"]: - # These might be temporary, suggest fallback - logger.warning(f"[remote_upload] Network-related error ({error_code}): {error_msg}") - logger.warning("[remote_upload] Consider falling back to local mode if this persists") - return False - else: - # Other errors - logger.error(f"[remote_upload] Upload error ({error_code}): {error_msg}") - return False - - except Exception as e: - logger.error(f"[remote_upload] Unexpected error during upload: {e}") - return False + return self.process_changes_and_upload(changes) except Exception as e: logger.error(f"[remote_upload] Critical error in process_and_upload_changes: {e}") @@ -1850,15 +2652,8 @@ def main(): # Test server connection first logger.info("Checking server status...") status = client.get_server_status() - is_success = ( - isinstance(status, dict) and - 'workspace_path' in status and - 'collection_name' in status and - status.get('status') == 'ready' - ) - if not is_success: - error = status.get("error", {}) - logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + if not _is_usable_delta_status(status): + logger.error("Cannot connect to server: %s", _server_status_error_message(status)) return 1 logger.info("Server connection successful") @@ -1894,16 +2689,8 @@ def main(): # Test server connection logger.info("Checking server status...") status = client.get_server_status() - # For delta endpoint, success is indicated by having expected fields (not a "success" boolean) - is_success = ( - isinstance(status, dict) and - 'workspace_path' in status and - 'collection_name' in status and - status.get('status') == 'ready' - ) - if not is_success: - error = status.get("error", {}) - logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + if not _is_usable_delta_status(status): + logger.error("Cannot connect to server: %s", 
_server_status_error_message(status)) return 1 logger.info("Server connection successful") @@ -1912,14 +2699,7 @@ def main(): logger.info("Scanning repository for files...") workspace_path = Path(config['workspace_path']) - # Find all files in the repository - all_files = [] - for file_path in workspace_path.rglob('*'): - if file_path.is_file() and not file_path.name.startswith('.'): - rel_path = file_path.relative_to(workspace_path) - # Skip .codebase directory and other metadata - if not str(rel_path).startswith('.codebase'): - all_files.append(file_path) + all_files = client.get_all_code_files() logger.info(f"Found {len(all_files)} files to upload") @@ -1929,8 +2709,7 @@ def main(): # Detect changes (treat all files as changes for initial upload) if args.force: - # Force mode: treat all files as created - changes = {"created": all_files, "updated": [], "deleted": [], "moved": [], "unchanged": []} + changes = client.build_force_changes(all_files) else: changes = client.detect_file_changes(all_files) @@ -1945,7 +2724,18 @@ def main(): success = client.process_changes_and_upload(changes) if success: - logger.info("Repository upload completed successfully!") + outcome = str((client.last_upload_result or {}).get("outcome") or "") + if outcome == "skipped_by_plan": + logger.info("No upload needed after plan") + elif outcome == "queued": + logger.info("Repository upload request accepted; server processing asynchronously") + elif outcome == "uploaded_async": + logger.info( + "Repository upload processed asynchronously: %s", + (client.last_upload_result or {}).get("processed_operations") or {}, + ) + else: + logger.info("Repository upload completed successfully!") logger.info(f"Collection name: {config['collection_name']}") logger.info(f"Files uploaded: {len(all_files)}") else: diff --git a/scripts/rerank_tools/local.py b/scripts/rerank_tools/local.py index e2151791..b7b1ba48 100644 --- a/scripts/rerank_tools/local.py +++ b/scripts/rerank_tools/local.py @@ -134,6 +134,10 
@@ def _get_rerank_session(): from scripts.utils import sanitize_vector_name as _sanitize_vector_name +from scripts.path_scope import ( + normalize_under as _normalize_under_scope, + metadata_matches_under as _metadata_matches_under, +) def warmup_reranker(): @@ -163,18 +167,14 @@ def _start_background_warmup(): _start_background_warmup() -def _norm_under(u: str | None) -> str | None: - if not u: - return None - u = str(u).strip().replace("\\", "/") - u = "/".join([p for p in u.split("/") if p]) - if not u: - return None - if not u.startswith("/"): - return "/work/" + u - if not u.startswith("/work/"): - return "/work/" + u.lstrip("/") - return u +def _point_matches_under(pt: Any, under: str | None) -> bool: + if not under: + return True + payload = getattr(pt, "payload", None) or {} + md = payload.get("metadata") or {} + if not isinstance(md, dict): + md = {} + return _metadata_matches_under(md, under) def _select_dense_vector_name( @@ -366,18 +366,21 @@ def rerank_in_process( key="metadata.language", match=models.MatchValue(value=language) ) ) - eff_under = _norm_under(under) - if eff_under: - must.append( - models.FieldCondition( - key="metadata.path_prefix", match=models.MatchValue(value=eff_under) - ) - ) + eff_under = _normalize_under_scope(under) flt = models.Filter(must=must) if must else None - pts = dense_results(client, _model, vec_name, query, flt, topk, eff_collection) - if not pts and flt is not None: - pts = dense_results(client, _model, vec_name, query, None, topk, eff_collection) + fetch_topk = max(1, int(topk)) + if eff_under: + try: + under_mult = int(os.environ.get("RERANK_UNDER_FETCH_MULT", "4") or 4) + except Exception: + under_mult = 4 + fetch_topk = max(fetch_topk, int(limit) * max(under_mult, 2), fetch_topk * max(under_mult, 2)) + fetch_topk = min(fetch_topk, 2000) + + pts = dense_results(client, _model, vec_name, query, flt, fetch_topk, eff_collection) + if eff_under and pts: + pts = [pt for pt in pts if _point_matches_under(pt, 
eff_under)] if not pts: return [] @@ -447,19 +450,21 @@ def main(): key="metadata.language", match=models.MatchValue(value=args.language) ) ) - eff_under = _norm_under(args.under) - if eff_under: - must.append( - models.FieldCondition( - key="metadata.path_prefix", match=models.MatchValue(value=eff_under) - ) - ) + eff_under = _normalize_under_scope(args.under) flt = models.Filter(must=must) if must else None - pts = dense_results(client, model, vec_name, args.query, flt, args.topk, eff_collection) - # Fallback: if filtered search yields nothing, retry without filters to avoid empty rerank - if not pts and flt is not None: - pts = dense_results(client, model, vec_name, args.query, None, args.topk, eff_collection) + fetch_topk = max(1, int(args.topk)) + if eff_under: + try: + under_mult = int(os.environ.get("RERANK_UNDER_FETCH_MULT", "4") or 4) + except Exception: + under_mult = 4 + fetch_topk = max(fetch_topk, int(args.limit) * max(under_mult, 2), fetch_topk * max(under_mult, 2)) + fetch_topk = min(fetch_topk, 2000) + + pts = dense_results(client, model, vec_name, args.query, flt, fetch_topk, eff_collection) + if eff_under and pts: + pts = [pt for pt in pts if _point_matches_under(pt, eff_under)] if not pts: return pairs = prepare_pairs(args.query, pts) diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 7cbd9dd1..e9a3df9b 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -48,6 +48,55 @@ def get_auth_session(upload_endpoint: str) -> str: # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +_git_history_skip_log_key: Optional[str] = None + + +def _is_usable_delta_status(status: Any) -> bool: + if not isinstance(status, dict): + return False + state = str(status.get("status") or "").strip().lower() + return ( + bool(status.get("success")) and + "workspace_path" in status and + "collection_name" in status and + state in {"ready", 
"processing", "completed"} + ) + + +def _server_status_error_message(status: Any) -> str: + if isinstance(status, dict): + error = status.get("error") + if isinstance(error, dict): + msg = str(error.get("message") or "").strip() + if msg: + return msg + state = str(status.get("status") or "").strip() + if state: + return f"Server status is {state}" + return "Invalid server status response" + + +def _env_flag(name: str, default: bool) -> bool: + raw = os.environ.get(name) + if raw is None: + return default + return str(raw).strip().lower() in {"1", "true", "yes", "on"} + + +def _format_cached_sha1(value: Optional[str]) -> Optional[str]: + raw = str(value or "").strip() + if not raw: + return None + return raw if raw.lower().startswith("sha1:") else f"sha1:{raw}" + + +def _log_git_history_skip_once(reason: str, key: str) -> None: + global _git_history_skip_log_key + marker = f"{reason}:{key}" + if _git_history_skip_log_key == marker: + return + _git_history_skip_log_key = marker + logger.info("[git_history] skip (%s): %s", reason, key) DEFAULT_MAX_TEMP_CLEAN_ATTEMPTS = 3 DEFAULT_TEMP_CLEAN_SLEEP = 1.0 @@ -290,6 +339,10 @@ def remove_hash(self, file_path: str) -> None: self._cache = file_hashes self._cache_loaded = True + def flush(self) -> None: + """Persist the current in-memory cache state to disk.""" + self._save_cache(dict(self._load_cache())) + def _cache_seems_stale(self, file_hashes: Dict[str, str]) -> bool: """Return True if a large portion of cached paths no longer exist on disk.""" total = len(file_hashes) @@ -341,6 +394,13 @@ def remove_cached_file(file_path: str, repo_name: Optional[str] = None) -> None: _hash_cache.remove_hash(file_path) +def flush_cached_file_hashes() -> None: + """Persist the current workspace hash cache to disk.""" + global _hash_cache + if _hash_cache: + _hash_cache.flush() + + def _find_git_root(start: Path) -> Optional[Path]: """Best-effort detection of the git repository root for a workspace. 
@@ -426,10 +486,12 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str } if max_commits <= 0: + _log_git_history_skip_once("disabled", f"max_commits={max_commits}") return None root = _find_git_root(Path(workspace_path)) if not root: + _log_git_history_skip_once("no_repo", workspace_path) return None # Git history cache: avoid emitting identical manifests when HEAD/settings are unchanged @@ -463,6 +525,7 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str cache = {} if current_head and cache.get("last_head") == current_head and cache.get("max_commits") == max_commits and str(cache.get("since") or "") == since: + _log_git_history_skip_once("cache_hit", f"head={current_head[:10]} since={since or '-'} max={max_commits}") return None base_head = "" @@ -513,12 +576,20 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str errors="replace", ) if proc.returncode != 0 or not proc.stdout.strip(): + _log_git_history_skip_once( + "rev_list_empty", + f"head={current_head[:10] if current_head else '-'} rc={proc.returncode}", + ) return None commits = [l.strip() for l in proc.stdout.splitlines() if l.strip()] except Exception: return None if not commits: + _log_git_history_skip_once( + "no_commits", + f"head={current_head[:10] if current_head else '-'}", + ) return None if len(commits) > max_commits: commits = commits[:max_commits] @@ -592,6 +663,10 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str continue if not records: + _log_git_history_skip_once( + "no_records", + f"commits={len(commits)} head={current_head[:10] if current_head else '-'}", + ) return None try: @@ -611,6 +686,14 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str "since": since, "commits": records, } + logger.info( + "[git_history] prepared manifest mode=%s commits=%d head=%s prev=%s base=%s", + manifest["mode"], + len(records), + (current_head[:10] 
if current_head else "-"), + (prev_head[:10] if prev_head else "-"), + (base_head[:10] if base_head else "-"), + ) # Update git history cache with the HEAD and settings used for this manifest try: @@ -675,9 +758,6 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s self.temp_dir = None self.logical_repo_id = logical_repo_id - # Set environment variables for cache functions - os.environ["WORKSPACE_PATH"] = workspace_path - # Store repo name and initialize hash cache self.repo_name = _extract_repo_name_from_path(workspace_path) # Fallback to directory name if repo detection fails (for non-git repos) @@ -695,6 +775,129 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s adapter = HTTPAdapter(max_retries=retry_strategy) self.session.mount("http://", adapter) self.session.mount("https://", adapter) + self.last_upload_result: Dict[str, Any] = {"outcome": "idle"} + self._last_plan_payload: Optional[Dict[str, Any]] = None + + def _set_last_upload_result(self, outcome: str, **details: Any) -> Dict[str, Any]: + result: Dict[str, Any] = {"outcome": outcome} + result.update(details) + self.last_upload_result = result + return result + + def log_watch_upload_result(self) -> None: + outcome = str((self.last_upload_result or {}).get("outcome") or "") + if outcome == "skipped_by_plan": + logger.info("[watch] No upload needed after plan") + elif outcome == "queued": + logger.info("[watch] Upload request accepted; server processing asynchronously") + elif outcome == "uploaded_async": + processed = (self.last_upload_result or {}).get("processed_operations") + logger.info("[watch] Upload processed asynchronously: %s", processed or {}) + elif outcome == "uploaded": + logger.info("[watch] Successfully uploaded changes") + elif outcome == "no_changes": + logger.info("[watch] No meaningful changes to upload") + else: + logger.info("[watch] Upload handling completed") + + def _finalize_successful_changes(self, changes: 
Dict[str, List]) -> None: + for path in changes.get("created", []): + try: + abs_path = str(path.resolve()) + current_hash = hashlib.sha1(path.read_bytes()).hexdigest() + set_cached_file_hash(abs_path, current_hash, self.repo_name) + stat = path.stat() + self._stat_cache[abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) + except Exception: + continue + for path in changes.get("updated", []): + try: + abs_path = str(path.resolve()) + current_hash = hashlib.sha1(path.read_bytes()).hexdigest() + set_cached_file_hash(abs_path, current_hash, self.repo_name) + stat = path.stat() + self._stat_cache[abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) + except Exception: + continue + for path in changes.get("deleted", []): + try: + abs_path = str(path.resolve()) + remove_cached_file(abs_path, self.repo_name) + self._stat_cache.pop(abs_path, None) + except Exception: + continue + for source_path, dest_path in changes.get("moved", []): + try: + source_abs_path = str(source_path.resolve()) + remove_cached_file(source_abs_path, self.repo_name) + self._stat_cache.pop(source_abs_path, None) + except Exception: + pass + try: + dest_abs_path = str(dest_path.resolve()) + current_hash = hashlib.sha1(dest_path.read_bytes()).hexdigest() + set_cached_file_hash(dest_abs_path, current_hash, self.repo_name) + stat = dest_path.stat() + self._stat_cache[dest_abs_path] = ( + getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), + stat.st_size, + ) + except Exception: + continue + + def _await_async_upload_result( + self, + bundle_id: Optional[str], + sequence_number: Optional[int], + ) -> Optional[Dict[str, Any]]: + try: + max_wait = float(os.environ.get("CTXCE_REMOTE_UPLOAD_STATUS_WAIT_SECS", "5")) + except Exception: + max_wait = 5.0 + if max_wait <= 0: + return None + + try: + poll_interval = float(os.environ.get("CTXCE_REMOTE_UPLOAD_STATUS_POLL_INTERVAL_SECS", "1")) + except Exception: + poll_interval = 
1.0 + poll_interval = max(0.1, poll_interval) + + deadline = time.time() + max_wait + while time.time() < deadline: + status = self.get_server_status() + if not status.get("success"): + return None + server_info = status.get("server_info", {}) if isinstance(status, dict) else {} + last_bundle_id = server_info.get("last_bundle_id") + last_upload_status = server_info.get("last_upload_status") + last_sequence = status.get("last_sequence") + bundle_matches = bool(bundle_id) and last_bundle_id == bundle_id + sequence_matches = sequence_number is not None and last_sequence == sequence_number + if bundle_matches or sequence_matches: + if last_upload_status == "completed": + return { + "outcome": "uploaded_async", + "bundle_id": last_bundle_id or bundle_id, + "sequence_number": last_sequence if last_sequence is not None else sequence_number, + "processed_operations": server_info.get("last_processed_operations"), + "processing_time_ms": server_info.get("last_processing_time_ms"), + } + if last_upload_status in ("failed", "error"): + return { + "outcome": "failed", + "bundle_id": last_bundle_id or bundle_id, + "sequence_number": last_sequence if last_sequence is not None else sequence_number, + "error": server_info.get("last_error"), + } + time.sleep(poll_interval) + return None def __enter__(self): """Context manager entry.""" @@ -730,6 +933,51 @@ def log_mapping_summary(self) -> None: logger.info(f" source_path: {info['source_path']}") logger.info(f" container_path: {info['container_path']}") + def _excluded_dirnames(self) -> frozenset: + # Keep in sync with get_all_code_files exclusions. + # NOTE: This caches the exclusion set per client instance. + # Runtime changes to DEV_REMOTE_MODE/REMOTE_UPLOAD_MODE won't be reflected + # until a new client is created (typically via process restart), which is + # acceptable for the standalone upload client use case. 
+ cached = getattr(self, "_excluded_dirnames_cache", None) + if cached is not None: + return cached + excluded = { + "node_modules", "vendor", "dist", "build", "target", "out", + ".git", ".hg", ".svn", ".vscode", ".idea", ".venv", "venv", + "__pycache__", ".pytest_cache", ".mypy_cache", ".cache", + ".context-engine", ".context-engine-uploader", ".codebase", + } + dev_remote = os.environ.get("DEV_REMOTE_MODE") == "1" or os.environ.get("REMOTE_UPLOAD_MODE") == "development" + if dev_remote: + excluded.add("dev-workspace") + cached = frozenset(excluded) + self._excluded_dirnames_cache = cached + return cached + + def _is_ignored_path(self, path: Path) -> bool: + """Return True when path is outside workspace or under excluded dirs.""" + try: + workspace_root = Path(self.workspace_path).resolve() + rel = path.resolve().relative_to(workspace_root) + except Exception: + return True + + dir_parts = set(rel.parts[:-1]) if len(rel.parts) > 1 else set() + if dir_parts & self._excluded_dirnames(): + return True + # Ignore hidden directories anywhere under the workspace, but allow + # extensionless dotfiles like `.gitignore` that we explicitly support. 
+ if any(p.startswith(".") for p in rel.parts[:-1]): + return True + if rel.name.startswith(".") and rel.name.lower() not in EXTENSIONLESS_FILES: + return True + return False + + def _is_watchable_path(self, path: Path) -> bool: + """Return True when a filesystem event path is eligible for upload processing.""" + return not self._is_ignored_path(path) and detect_language(path) != "unknown" + def _get_temp_bundle_dir(self) -> Path: """Get or create temporary directory for bundle creation.""" if not self.temp_dir: @@ -757,6 +1005,19 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: } for path in changed_paths: + if self._is_ignored_path(path): + try: + abs_path = str(path.resolve()) + except Exception: + continue + cached_hash = get_cached_file_hash(abs_path, self.repo_name) + if cached_hash: + changes["deleted"].append(path) + try: + self._stat_cache.pop(abs_path, None) + except Exception: + pass + continue try: abs_path = str(path.resolve()) except Exception: @@ -819,8 +1080,6 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: self._stat_cache[abs_path] = (getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), stat.st_size) except Exception: pass - set_cached_file_hash(abs_path, current_hash, self.repo_name) - # Detect moves by looking for files with same content hash # but different paths (requires additional tracking) changes["moved"] = self._detect_moves(changes["created"], changes["deleted"]) @@ -945,8 +1204,6 @@ def create_delta_bundle( operations.append(operation) file_hashes[rel_path] = f"sha1:{file_hash}" total_size += stat.st_size - set_cached_file_hash(str(path.resolve()), file_hash, self.repo_name) - except Exception as e: print(f"[bundle_create] Error processing created file {path}: {e}") continue @@ -985,8 +1242,6 @@ def create_delta_bundle( operations.append(operation) file_hashes[rel_path] = f"sha1:{file_hash}" total_size += stat.st_size - set_cached_file_hash(str(path.resolve()), file_hash, 
self.repo_name) - except Exception as e: print(f"[bundle_create] Error processing updated file {path}: {e}") continue @@ -1027,8 +1282,6 @@ def create_delta_bundle( operations.append(operation) file_hashes[dest_rel_path] = f"sha1:{file_hash}" total_size += stat.st_size - set_cached_file_hash(str(dest_path.resolve()), file_hash, self.repo_name) - except Exception as e: print(f"[bundle_create] Error processing moved file {source_path} -> {dest_path}: {e}") continue @@ -1115,6 +1368,308 @@ def create_delta_bundle( return str(bundle_path), manifest + def _build_plan_payload(self, changes: Dict[str, List]) -> Dict[str, Any]: + created_at = datetime.now().isoformat() + bundle_id = str(uuid.uuid4()) + operations: List[Dict[str, Any]] = [] + file_hashes: Dict[str, str] = {} + total_size = 0 + + for path in changes["created"]: + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = path.stat() + operations.append( + { + "operation": "created", + "path": rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "language": detect_language(path), + } + ) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except Exception as e: + logger.warning("[remote_upload] Failed to prepare created plan entry for %s: %s", path, e) + + for path in changes["updated"]: + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = path.stat() + previous_hash = _format_cached_sha1( + get_cached_file_hash(str(path.resolve()), self.repo_name) + ) + operations.append( + { + "operation": "updated", + "path": rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "previous_hash": previous_hash, + "language": detect_language(path), + } + ) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except 
Exception as e: + logger.warning("[remote_upload] Failed to prepare updated plan entry for %s: %s", path, e) + + for source_path, dest_path in changes["moved"]: + dest_rel_path = dest_path.relative_to(Path(self.workspace_path)).as_posix() + source_rel_path = source_path.relative_to(Path(self.workspace_path)).as_posix() + try: + content = dest_path.read_bytes() + file_hash = hashlib.sha1(content).hexdigest() + stat = dest_path.stat() + operations.append( + { + "operation": "moved", + "path": dest_rel_path, + "source_path": source_rel_path, + "size_bytes": stat.st_size, + "content_hash": f"sha1:{file_hash}", + "language": detect_language(dest_path), + } + ) + file_hashes[dest_rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + except Exception as e: + logger.warning( + "[remote_upload] Failed to prepare moved plan entry for %s -> %s: %s", + source_path, + dest_path, + e, + ) + + for path in changes["deleted"]: + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() + try: + previous_hash = _format_cached_sha1( + get_cached_file_hash(str(path.resolve()), self.repo_name) + ) + operations.append( + { + "operation": "deleted", + "path": rel_path, + "previous_hash": previous_hash, + "language": detect_language(path), + } + ) + except Exception as e: + logger.warning("[remote_upload] Failed to prepare deleted plan entry for %s: %s", path, e) + + manifest = { + "version": "1.0", + "bundle_id": bundle_id, + "workspace_path": self.workspace_path, + "collection_name": self.collection_name, + "created_at": created_at, + "sequence_number": None, + "parent_sequence": None, + "operations": { + "created": len(changes["created"]), + "updated": len(changes["updated"]), + "deleted": len(changes["deleted"]), + "moved": len(changes["moved"]), + }, + "total_files": len(operations), + "total_size_bytes": total_size, + "compression": "gzip", + "encoding": "utf-8", + } + return { + "manifest": manifest, + "operations": operations, + "file_hashes": file_hashes, + } + + 
def _plan_delta_upload(self, changes: Dict[str, List]) -> Optional[Dict[str, Any]]: + if not _env_flag("CTXCE_REMOTE_UPLOAD_PLAN_ENABLED", True): + return None + try: + payload = self._build_plan_payload(changes) + self._last_plan_payload = payload + data = { + "workspace_path": self._translate_to_container_path(self.workspace_path), + "collection_name": self.collection_name, + "source_path": self.workspace_path, + "logical_repo_id": _compute_logical_repo_id(self.workspace_path), + "manifest": payload["manifest"], + "operations": payload["operations"], + "file_hashes": payload["file_hashes"], + } + sess = get_auth_session(self.upload_endpoint) + if sess: + data["session"] = sess + if getattr(self, "logical_repo_id", None): + data["logical_repo_id"] = self.logical_repo_id + + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/plan", + json=data, + timeout=min(self.timeout, 60), + ) + if response.status_code in {404, 405}: + logger.info("[remote_upload] Plan endpoint unavailable; falling back to full bundle upload") + return None + response.raise_for_status() + body = response.json() + if not body.get("success", False): + logger.warning("[remote_upload] Plan request failed; falling back: %s", body.get("error")) + return None + return body + except Exception as e: + logger.warning("[remote_upload] Plan request failed; falling back to full bundle upload: %s", e) + return None + + def _build_apply_only_payload(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Dict[str, Any]: + payload = self._last_plan_payload or self._build_plan_payload(changes) + needed = plan.get("needed_files", {}) if isinstance(plan, dict) else {} + created_needed = set(needed.get("created", []) or []) + updated_needed = set(needed.get("updated", []) or []) + moved_needed = set(needed.get("moved", []) or []) + + # Check if ALL operations are hash-matched (nothing needs content at all) + # This happens when all needed_files lists are empty and there are no actual changes 
requiring content + has_changes_needing_content = bool(created_needed or updated_needed or moved_needed) + has_deletes = bool(changes.get("deleted", [])) + + # Only skip apply-only if there are NO operations needing content AND NO deletes + if not has_changes_needing_content and not has_deletes: + return { + "manifest": payload.get("manifest", {}), + "operations": [], + "file_hashes": {}, + } + + filtered_ops: List[Dict[str, Any]] = [] + filtered_hashes: Dict[str, str] = {} + for operation in payload.get("operations", []): + op_type = str(operation.get("operation") or "") + rel_path = str(operation.get("path") or "") + # Determine if this operation needs content (only those skip filtered_hashes) + needs_content = ( + (op_type == "created" and rel_path in created_needed) + or (op_type == "updated" and rel_path in updated_needed) + or (op_type == "moved" and rel_path in moved_needed) + ) + if needs_content: + # Skip operations that need content - they'll be uploaded separately + continue + # IMPORTANT: server-side apply_delta_operations() only accepts "deleted" and "moved" + # operations. Hash-matched "created" and "updated" operations must NOT be routed + # through apply_ops since the server will reject them. 
+ if op_type not in {"deleted", "moved"}: + continue + # Preserve all other operations so server advances state + filtered_ops.append(operation) + # Include hash for non-deleted operations + if op_type != "deleted": + hash_value = payload.get("file_hashes", {}).get(rel_path) + if hash_value: + filtered_hashes[rel_path] = hash_value + return { + "manifest": payload.get("manifest", {}), + "operations": filtered_ops, + "file_hashes": filtered_hashes, + } + + def _apply_operations_without_content(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Optional[bool]: + payload = self._build_apply_only_payload(changes, plan) + operations = payload.get("operations", []) + if not operations: + return None + try: + data = { + "workspace_path": self._translate_to_container_path(self.workspace_path), + "collection_name": self.collection_name, + "source_path": self.workspace_path, + "logical_repo_id": _compute_logical_repo_id(self.workspace_path), + "manifest": payload["manifest"], + "operations": operations, + "file_hashes": payload["file_hashes"], + } + sess = get_auth_session(self.upload_endpoint) + if sess: + data["session"] = sess + if getattr(self, "logical_repo_id", None): + data["logical_repo_id"] = self.logical_repo_id + + logger.info( + "[remote_upload] Applying metadata-only operations without bundle: deleted=%s moved=%s", + sum(1 for op in operations if op.get("operation") == "deleted"), + sum(1 for op in operations if op.get("operation") == "moved"), + ) + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/apply_ops", + json=data, + timeout=min(self.timeout, 60), + ) + if response.status_code in {404, 405}: + logger.info("[remote_upload] apply_ops endpoint unavailable; falling back to bundle upload") + return None + response.raise_for_status() + body = response.json() + if not body.get("success", False): + logger.warning("[remote_upload] apply_ops failed; falling back to bundle upload: %s", body.get("error")) + return None + # Only finalize 
changes that were actually processed by the server + # apply_delta_operations only handles deleted/moved operations + processed_ops = body.get("processed_operations") or {} + applied_changes = { + "deleted": changes.get("deleted", []), + "moved": changes.get("moved", []), + "created": [], + "updated": [], + } + self._finalize_successful_changes(applied_changes) + self._set_last_upload_result( + "uploaded", + bundle_id=body.get("bundle_id"), + sequence_number=body.get("sequence_number"), + processed_operations=processed_ops, + ) + logger.info( + "[remote_upload] Metadata-only operations applied: %s", + processed_ops, + ) + return True + except Exception as e: + logger.warning("[remote_upload] apply_ops failed; falling back to bundle upload: %s", e) + return None + + def _filter_changes_by_plan(self, changes: Dict[str, List], plan: Dict[str, Any]) -> Dict[str, List]: + needed = plan.get("needed_files", {}) if isinstance(plan, dict) else {} + created_needed = set(needed.get("created", []) or []) + updated_needed = set(needed.get("updated", []) or []) + moved_needed = set(needed.get("moved", []) or []) + + filtered_created = [ + path for path in changes["created"] + if path.relative_to(Path(self.workspace_path)).as_posix() in created_needed + ] + filtered_updated = [ + path for path in changes["updated"] + if path.relative_to(Path(self.workspace_path)).as_posix() in updated_needed + ] + filtered_moved = [ + (source_path, dest_path) + for source_path, dest_path in changes["moved"] + if dest_path.relative_to(Path(self.workspace_path)).as_posix() in moved_needed + ] + return { + "created": filtered_created, + "updated": filtered_updated, + "deleted": list(changes["deleted"]), + "moved": filtered_moved, + "unchanged": [], + } + def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, Any]: """Upload delta bundle to remote server with exponential backoff retry. 
@@ -1361,7 +1916,16 @@ def get_server_status(self) -> Dict[str, Any]: ) if response.status_code == 200: - return response.json() + payload = response.json() + if not isinstance(payload, dict): + return { + "success": False, + "error": { + "code": "STATUS_INVALID", + "message": "Invalid status response payload", + }, + } + return {"success": True, **payload} # Handle error response error_msg = f"Status check failed with HTTP {response.status_code}" @@ -1385,6 +1949,93 @@ def has_meaningful_changes(self, changes: Dict[str, List]) -> bool: total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") return total_changes > 0 + def _collect_force_cleanup_paths(self) -> List[Path]: + """ + Return ignored paths that force mode should actively delete remotely. + + In dev-remote mode, dev-workspace is intentionally ignored during upload + scans to avoid recursive dogfooding. If that tree already exists on the + remote side from an older buggy upload, force mode should remove it even + when the standalone client's cache does not know about those paths. + """ + cleanup_paths: List[Path] = [] + if "dev-workspace" not in self._excluded_dirnames(): + return cleanup_paths + + dev_root = Path(self.workspace_path) / "dev-workspace" + if not dev_root.exists(): + return cleanup_paths + + for root, dirnames, filenames in os.walk(dev_root): + dirnames[:] = [d for d in dirnames if not d.startswith(".")] + for filename in filenames: + path = Path(root) / filename + try: + if path.is_file(): + cleanup_paths.append(path) + except Exception: + continue + return cleanup_paths + + def build_force_changes(self, all_files: List[Path]) -> Dict[str, List]: + """ + Build force-upload changes while still cleaning stale cached paths. 
+ + Force mode should re-upload every currently managed file, but it must also + emit deletes for files that only exist in the local cache now, including + paths that are ignored under the current client policy such as + dev-workspace in dev-remote mode. + """ + created_files: List[Path] = [] + path_map: Dict[Path, Path] = {} + for path in all_files: + if self._is_ignored_path(path): + continue + try: + resolved = path.resolve() + except Exception: + continue + created_files.append(path) + path_map[resolved] = path + + for cached_abs in get_all_cached_paths(self.repo_name): + try: + cached_path = Path(cached_abs) + resolved = cached_path.resolve() + except Exception: + continue + if resolved not in path_map: + path_map[resolved] = cached_path + + force_cleanup_paths = self._collect_force_cleanup_paths() + for cleanup_path in force_cleanup_paths: + try: + resolved = cleanup_path.resolve() + except Exception: + continue + if resolved not in path_map: + path_map[resolved] = cleanup_path + + probed = self.detect_file_changes(list(path_map.values())) + deleted_by_resolved: Dict[Path, Path] = {} + for deleted_path in probed.get("deleted", []): + try: + deleted_by_resolved[deleted_path.resolve()] = deleted_path + except Exception: + continue + for cleanup_path in force_cleanup_paths: + try: + deleted_by_resolved.setdefault(cleanup_path.resolve(), cleanup_path) + except Exception: + continue + return { + "created": created_files, + "updated": [], + "deleted": list(deleted_by_resolved.values()), + "moved": [], + "unchanged": [], + } + def upload_git_history_only(self, git_history: Dict[str, Any]) -> bool: try: empty_changes = { @@ -1429,10 +2080,12 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: # Validate input if not changes: logger.info("[remote_upload] No changes provided") + self._set_last_upload_result("no_changes") return True if not self.has_meaningful_changes(changes): logger.info("[remote_upload] No meaningful changes detected, skipping 
upload") + self._set_last_upload_result("no_changes") return True # Log change summary @@ -1441,10 +2094,46 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: f"{len(changes['created'])} created, {len(changes['updated'])} updated, " f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") + planned_changes = changes + plan = self._plan_delta_upload(changes) + if plan: + preview = plan.get("operation_counts_preview", {}) + logger.info( + "[remote_upload] Plan preview: needed created=%s updated=%s deleted=%s moved=%s " + "skipped_hash_match=%s needed_bytes=%s", + preview.get("created", 0), + preview.get("updated", 0), + preview.get("deleted", 0), + preview.get("moved", 0), + preview.get("skipped_hash_match", 0), + plan.get("needed_size_bytes", 0), + ) + planned_changes = self._filter_changes_by_plan(changes, plan) + has_content_work = bool( + planned_changes.get("created") + or planned_changes.get("updated") + or planned_changes.get("moved") + ) + if not has_content_work: + apply_only_result = self._apply_operations_without_content(changes, plan) + if apply_only_result is True: + flush_cached_file_hashes() + return True + if not self.has_meaningful_changes(planned_changes): + logger.info("[remote_upload] Plan found no upload work; skipping bundle upload") + self._finalize_successful_changes(changes) + self._set_last_upload_result( + "skipped_by_plan", + plan_preview=preview, + needed_size_bytes=plan.get("needed_size_bytes", 0), + ) + flush_cached_file_hashes() + return True + # Create delta bundle bundle_path = None try: - bundle_path, manifest = self.create_delta_bundle(changes) + bundle_path, manifest = self.create_delta_bundle(planned_changes) logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " f"(size: {manifest['total_size_bytes']} bytes)") @@ -1456,6 +2145,7 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: logger.error(f"[remote_upload] Error creating delta bundle: {e}") # 
Clean up any temporary files on failure self.cleanup() + self._set_last_upload_result("failed", stage="bundle_creation", error=str(e)) return False # Upload bundle with retry logic @@ -1463,9 +2153,86 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: response = self.upload_bundle(bundle_path, manifest) if response.get("success", False): - processed_ops = response.get('processed_operations', {}) - logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") - logger.info(f"[remote_upload] Processed operations: {processed_ops}") + async_failed = False + async_pending = False + processed_ops = response.get("processed_operations") + if processed_ops is None: + logger.info( + "[remote_upload] Bundle %s accepted by server; processing asynchronously (sequence=%s)", + manifest["bundle_id"], + response.get("sequence_number"), + ) + self._set_last_upload_result( + "queued", + bundle_id=manifest["bundle_id"], + sequence_number=response.get("sequence_number"), + ) + async_result = self._await_async_upload_result( + manifest["bundle_id"], + response.get("sequence_number"), + ) + if async_result is None: + # Server accepted the bundle but status is still pending. 
+ async_pending = True + logger.warning( + "[remote_upload] Async upload timed out awaiting server response for bundle %s", + manifest["bundle_id"], + ) + else: + self.last_upload_result = async_result + outcome = str(async_result.get("outcome") or "") + if outcome == "uploaded_async": + self._finalize_successful_changes(planned_changes) + logger.info( + "[remote_upload] Async processing completed for bundle %s: %s", + manifest["bundle_id"], + async_result.get("processed_operations") or {}, + ) + elif outcome == "failed": + async_failed = True + logger.error( + "[remote_upload] Async processing failed for bundle %s: %s", + manifest["bundle_id"], + async_result.get("error"), + ) + self._set_last_upload_result( + "failed", + stage="async_processing", + bundle_id=async_result.get("bundle_id") or manifest["bundle_id"], + sequence_number=async_result.get("sequence_number") or response.get("sequence_number"), + error=async_result.get("error"), + ) + else: + async_pending = True + # Keep queued state for non-terminal async outcomes. 
+ self._set_last_upload_result( + "queued", + bundle_id=async_result.get("bundle_id") or manifest["bundle_id"], + sequence_number=async_result.get("sequence_number") or response.get("sequence_number"), + ) + logger.warning( + "[remote_upload] Async upload still pending for bundle %s (sequence=%s, outcome=%s)", + manifest["bundle_id"], + response.get("sequence_number"), + outcome or "", + ) + else: + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + logger.info(f"[remote_upload] Processed operations: {processed_ops}") + self._finalize_successful_changes(planned_changes) + self._set_last_upload_result( + "uploaded", + bundle_id=manifest["bundle_id"], + sequence_number=response.get("sequence_number"), + processed_operations=processed_ops, + ) + if async_pending: + logger.info( + "[remote_upload] Bundle %s accepted and queued; deferring local finalization", + manifest["bundle_id"], + ) + if not async_failed and not async_pending: + flush_cached_file_hashes() # Clean up temporary bundle after successful upload try: @@ -1477,18 +2244,21 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: except Exception as cleanup_error: logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") - return True + return not async_failed else: error_msg = response.get('error', {}).get('message', 'Unknown upload error') logger.error(f"[remote_upload] Upload failed: {error_msg}") + self._set_last_upload_result("failed", stage="upload", error=error_msg) return False except Exception as e: logger.error(f"[remote_upload] Error uploading bundle: {e}") + self._set_last_upload_result("failed", stage="upload", error=str(e)) return False except Exception as e: logger.error(f"[remote_upload] Unexpected error in process_changes_and_upload: {e}") + self._set_last_upload_result("failed", stage="unexpected", error=str(e)) return False def watch_loop(self, interval: int = 5): @@ -1515,6 +2285,7 @@ def 
__init__(self, client, debounce_seconds=2.0): self._pending_paths = set() self._check_for_deletions = False self._lock = threading.Lock() + self._processing = False def on_any_event(self, event): """Handle any file system event.""" @@ -1532,13 +2303,13 @@ def on_any_event(self, event): # Always check src_path src_path = Path(event.src_path) - if detect_language(src_path) != "unknown": + if self.client._is_watchable_path(src_path): paths_to_process.append(src_path) # For FileMovedEvent, also process the destination path if hasattr(event, 'dest_path') and event.dest_path: dest_path = Path(event.dest_path) - if detect_language(dest_path) != "unknown": + if self.client._is_watchable_path(dest_path): paths_to_process.append(dest_path) if not paths_to_process: @@ -1559,22 +2330,30 @@ def on_any_event(self, event): def _process_pending_changes(self): """Process accumulated changes after debounce period.""" with self._lock: + # Timer fired; allow a new debounce to be armed while we process. + self._debounce_timer = None + if self._processing: + return if not self._pending_paths: return + self._processing = True pending = list(self._pending_paths) self._pending_paths.clear() check_deletions = self._check_for_deletions self._check_for_deletions = False + upload_succeeded = False try: # Only include cached paths when deletion-related events occurred if check_deletions: - all_paths = list(set(pending + [ + cached_paths = [ Path(p) for p in get_all_cached_paths(self.client.repo_name) - ])) + ] + all_paths = list(set(pending + cached_paths)) else: all_paths = pending - + + changes = self.client.detect_file_changes(all_paths) meaningful_changes = ( len(changes.get("created", [])) + @@ -1582,12 +2361,13 @@ def _process_pending_changes(self): len(changes.get("deleted", [])) + len(changes.get("moved", [])) ) - + if meaningful_changes > 0: logger.info(f"[watch] Detected {meaningful_changes} changes: { {k: len(v) for k, v in changes.items() if k != 'unchanged'} }") success = 
self.client.process_changes_and_upload(changes) if success: - logger.info("[watch] Successfully uploaded changes") + self.client.log_watch_upload_result() + upload_succeeded = True else: logger.error("[watch] Failed to upload changes") else: @@ -1597,16 +2377,34 @@ def _process_pending_changes(self): git_history = _collect_git_history_for_workspace(self.client.workspace_path) except Exception: git_history = None - + if git_history: logger.info("[watch] Detected git history update; uploading git history metadata") success = self.client.upload_git_history_only(git_history) if success: logger.info("[watch] Successfully uploaded git history metadata") + upload_succeeded = True else: logger.error("[watch] Failed to upload git history metadata") + else: + upload_succeeded = True # No changes to process except Exception as e: logger.error(f"[watch] Error processing changes: {e}") + finally: + with self._lock: + self._processing = False + # Re-queue pending paths if upload failed + if not upload_succeeded and pending: + # Merge pending paths back into _pending_paths + for p in pending: + self._pending_paths.add(p) + # Arm next pass if there are pending paths + if self._pending_paths and self._debounce_timer is None: + self._debounce_timer = threading.Timer( + self.debounce_seconds, + self._process_pending_changes, + ) + self._debounce_timer.start() observer = Observer() handler = CodeFileEventHandler(self, debounce_seconds=2.0) @@ -1676,7 +2474,7 @@ def _watch_loop_polling(self, interval: int = 5): success = self.process_changes_and_upload(changes) if success: - logger.info(f"[watch] Successfully uploaded changes") + self.log_watch_upload_result() else: logger.error(f"[watch] Failed to upload changes") else: @@ -1719,16 +2517,10 @@ def get_all_code_files(self) -> List[Path]: # Single walk with early pruning and set-based matching to reduce IO ext_suffixes = {str(ext).lower() for ext in CODE_EXTS if str(ext).startswith('.')} - extensionless_names = 
set(EXTENSIONLESS_FILES.keys()) + extensionless_names = {k.lower() for k in EXTENSIONLESS_FILES.keys()} # Always exclude dev-workspace to prevent recursive upload loops # (upload service creates dev-workspace// which would otherwise get re-uploaded) - excluded = { - "node_modules", "vendor", "dist", "build", "target", "out", - ".git", ".hg", ".svn", ".vscode", ".idea", ".venv", "venv", - "__pycache__", ".pytest_cache", ".mypy_cache", ".cache", - ".context-engine", ".context-engine-uploader", ".codebase", - "dev-workspace" - } + excluded = self._excluded_dirnames() seen = set() for root, dirnames, filenames in os.walk(workspace_path): @@ -1741,6 +2533,8 @@ def get_all_code_files(self) -> List[Path]: if filename.startswith('.') and fname_lower not in extensionless_names: continue candidate = Path(root) / filename + if self._is_ignored_path(candidate): + continue suffix = candidate.suffix.lower() # Match by extension, extensionless name, or Dockerfile.* prefix if (suffix in ext_suffixes or @@ -1780,80 +2574,7 @@ def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: except Exception as e: logger.error(f"[remote_upload] Error detecting file changes: {e}") return False - - if not self.has_meaningful_changes(changes): - logger.info("[remote_upload] No meaningful changes detected, skipping upload") - return True - - # Log change summary - total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") - logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: " - f"{len(changes['created'])} created, {len(changes['updated'])} updated, " - f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") - - # Create delta bundle - bundle_path = None - try: - bundle_path, manifest = self.create_delta_bundle(changes) - logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " - f"(size: {manifest['total_size_bytes']} bytes)") - - # Validate bundle was created successfully - if not bundle_path or 
not os.path.exists(bundle_path): - raise RuntimeError(f"Failed to create bundle at {bundle_path}") - - except Exception as e: - logger.error(f"[remote_upload] Error creating delta bundle: {e}") - # Clean up any temporary files on failure - self.cleanup() - return False - - # Upload bundle with retry logic - try: - response = self.upload_bundle(bundle_path, manifest) - - if response.get("success", False): - processed_ops = response.get('processed_operations', {}) - logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") - logger.info(f"[remote_upload] Processed operations: {processed_ops}") - - # Clean up temporary bundle after successful upload - try: - if os.path.exists(bundle_path): - os.remove(bundle_path) - logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}") - # Also clean up the entire temp directory if this is the last bundle - self.cleanup() - except Exception as cleanup_error: - logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") - - return True - else: - error = response.get("error", {}) - error_code = error.get("code", "UNKNOWN") - error_msg = error.get("message", "Unknown error") - - logger.error(f"[remote_upload] Upload failed: {error_msg}") - - # Handle specific error types - # CLI is stateless - server handles sequence management - if error_code in ["BUNDLE_TOO_LARGE", "BUNDLE_NOT_FOUND"]: - # These are unrecoverable errors - logger.error(f"[remote_upload] Unrecoverable error ({error_code}): {error_msg}") - return False - elif error_code in ["TIMEOUT_ERROR", "CONNECTION_ERROR", "NETWORK_ERROR"]: - # These might be temporary, suggest fallback - logger.warning(f"[remote_upload] Network-related error ({error_code}): {error_msg}") - logger.warning("[remote_upload] Consider falling back to local mode if this persists") - return False - else: - # Other errors - logger.error(f"[remote_upload] Upload error ({error_code}): {error_msg}") - return False - - except 
Exception as e: - logger.error(f"[remote_upload] Unexpected error during upload: {e}") - return False + return self.process_changes_and_upload(changes) except Exception as e: logger.error(f"[remote_upload] Critical error in process_and_upload_changes: {e}") @@ -2040,15 +2761,8 @@ def main(): # Test server connection first logger.info("Checking server status...") status = client.get_server_status() - is_success = ( - isinstance(status, dict) and - 'workspace_path' in status and - 'collection_name' in status and - status.get('status') == 'ready' - ) - if not is_success: - error = status.get("error", {}) - logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + if not _is_usable_delta_status(status): + logger.error("Cannot connect to server: %s", _server_status_error_message(status)) return 1 logger.info("Server connection successful") @@ -2085,16 +2799,8 @@ def main(): # Test server connection logger.info("Checking server status...") status = client.get_server_status() - # For delta endpoint, success is indicated by having expected fields (not a "success" boolean) - is_success = ( - isinstance(status, dict) and - 'workspace_path' in status and - 'collection_name' in status and - status.get('status') == 'ready' - ) - if not is_success: - error = status.get("error", {}) - logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + if not _is_usable_delta_status(status): + logger.error("Cannot connect to server: %s", _server_status_error_message(status)) return 1 logger.info("Server connection successful") @@ -2113,8 +2819,7 @@ def main(): # Detect changes (treat all files as changes for initial upload) if args.force: - # Force mode: treat all files as created - changes = {"created": all_files, "updated": [], "deleted": [], "moved": [], "unchanged": []} + changes = client.build_force_changes(all_files) else: changes = client.detect_file_changes(all_files) @@ -2129,7 +2834,18 @@ def main(): success = 
client.process_changes_and_upload(changes) if success: - logger.info("Repository upload completed successfully!") + outcome = str((client.last_upload_result or {}).get("outcome") or "") + if outcome == "skipped_by_plan": + logger.info("No upload needed after plan") + elif outcome == "queued": + logger.info("Repository upload request accepted; server processing asynchronously") + elif outcome == "uploaded_async": + logger.info( + "Repository upload processed asynchronously: %s", + (client.last_upload_result or {}).get("processed_operations") or {}, + ) + else: + logger.info("Repository upload completed successfully!") logger.info(f"Collection name: {config['collection_name']}") logger.info(f"Files uploaded: {len(all_files)}") else: diff --git a/scripts/upload_delta_bundle.py b/scripts/upload_delta_bundle.py index 973be132..4ccf2613 100644 --- a/scripts/upload_delta_bundle.py +++ b/scripts/upload_delta_bundle.py @@ -1,5 +1,6 @@ import os import json +import shutil import tarfile import hashlib import re @@ -10,11 +11,13 @@ try: from scripts.workspace_state import ( - _extract_repo_name_from_path, - get_staging_targets, - get_collection_state_snapshot, - is_staging_enabled, - ) + _normalize_cache_key_path, + _extract_repo_name_from_path, + get_staging_targets, + get_collection_state_snapshot, + is_staging_enabled, + upsert_index_journal_entries, +) except ImportError as exc: raise ImportError( "upload_delta_bundle requires scripts.workspace_state; ensure the module is available" @@ -27,6 +30,102 @@ _SLUGGED_REPO_RE = re.compile(r"^.+-[0-9a-f]{16}(?:_old)?$") +def _normalize_hash_value(value: Any) -> str: + raw = str(value or "").strip() + if not raw: + return "" + if ":" in raw: + _, _, digest = raw.partition(":") + if digest.strip(): + return digest.strip().lower() + return raw.lower() + + +def _build_upsert_journal_entry(path: Path | str, content_hash: Optional[str]) -> Dict[str, Any]: + entry: Dict[str, Any] = { + "path": str(path), + "op_type": "upsert", + } + if 
content_hash: + entry["content_hash"] = content_hash + return entry + + +def _build_delete_journal_entry(path: Path | str, content_hash: Optional[str] = None) -> Dict[str, Any]: + entry: Dict[str, Any] = { + "path": str(path), + "op_type": "delete", + } + if content_hash: + entry["content_hash"] = content_hash + return entry + + +def _load_cache_hashes(cache_path: Path) -> Dict[str, str]: + try: + with cache_path.open("r", encoding="utf-8-sig") as f: + data = json.load(f) + except (OSError, ValueError, json.JSONDecodeError): + return {} + + file_hashes = data.get("file_hashes", {}) + if not isinstance(file_hashes, dict): + return {} + + normalized: Dict[str, str] = {} + for path_key, value in file_hashes.items(): + if isinstance(value, dict): + hash_value = value.get("hash") + else: + hash_value = value + digest = _normalize_hash_value(hash_value) + if digest: + normalized[_normalize_cache_key_path(str(path_key))] = digest + return normalized + + +def _load_replica_cache_hashes(workspace_root: Path, slug: str) -> Dict[str, str]: + merged: Dict[str, str] = {} + cache_paths = ( + Path(WORK_DIR) / ".codebase" / "repos" / slug / "cache.json", + workspace_root / ".codebase" / "cache.json", + ) + for cache_path in cache_paths: + if not cache_path.exists(): + continue + merged.update(_load_cache_hashes(cache_path)) + return merged + + +def _flush_replica_cache_hashes(workspace_root: Path, slug: str, hashes: Dict[str, str]) -> None: + """Flush replica hashes to workspace cache.json.""" + try: + cache_path = workspace_root / ".codebase" / "cache.json" + cache_path.parent.mkdir(parents=True, exist_ok=True) + + # Read existing cache to preserve other entries + existing_data = {} + if cache_path.exists(): + try: + with cache_path.open("r", encoding="utf-8-sig") as f: + existing_data = json.load(f) + except (OSError, ValueError, json.JSONDecodeError): + existing_data = {} + + # Update file_hashes section + if not isinstance(existing_data, dict): + existing_data = {} + 
existing_data["file_hashes"] = hashes + + # Write back atomically + temp_path = cache_path.with_suffix(".tmp") + with temp_path.open("w", encoding="utf-8") as f: + json.dump(existing_data, f, indent=2) + temp_path.replace(cache_path) + except Exception as e: + logger.debug(f"[upload_service] Failed to flush cache for {slug}: {e}") + + def get_workspace_key(workspace_path: str) -> str: """Generate 16-char hash for collision avoidance in remote uploads. @@ -61,139 +160,130 @@ def _cleanup_empty_dirs(path: Path, stop_at: Path) -> None: break -def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[str, Any]) -> Dict[str, int]: - """Process delta bundle and return operation counts.""" - operations_count = { - "created": 0, - "updated": 0, - "deleted": 0, - "moved": 0, - "skipped": 0, - "failed": 0, - } +def _resolve_replica_roots(workspace_path: str, *, create_missing: bool = True) -> Dict[str, Path]: + workspace_leaf = Path(workspace_path).name + repo_name_for_state: Optional[str] = None + serving_slug: Optional[str] = None + active_slug: Optional[str] = None + if _extract_repo_name_from_path and get_collection_state_snapshot: + try: + repo_name_for_state = _extract_repo_name_from_path(workspace_path) + if repo_name_for_state: + snapshot = get_collection_state_snapshot( + workspace_path=None, + repo_name=repo_name_for_state, + ) # type: ignore[arg-type] + serving_slug = snapshot.get("serving_repo_slug") + active_slug = snapshot.get("active_repo_slug") + except Exception: + serving_slug = None + active_slug = None + + slug_order: list[str] = [] + serving_candidate: Optional[str] = None + if serving_slug and _SLUGGED_REPO_RE.match(serving_slug): + serving_candidate = serving_slug + if active_slug and _SLUGGED_REPO_RE.match(active_slug) and active_slug not in slug_order: + slug_order.append(active_slug) + + staging_active = False + staging_gate = bool(is_staging_enabled() if callable(is_staging_enabled) else False) try: - # CRITICAL: Always 
materialize writes under WORK_DIR using a slugged repo directory. - # Do NOT write directly into the client-supplied workspace_path, since that may be a host - # path (e.g. /home/user/repo) that is not mounted/visible to the watcher/indexer. - workspace_leaf = Path(workspace_path).name - - repo_name_for_state: Optional[str] = None + if serving_slug and str(serving_slug).endswith("_old"): + staging_active = True + except Exception: + staging_active = False - serving_slug: Optional[str] = None - active_slug: Optional[str] = None - if _extract_repo_name_from_path and get_collection_state_snapshot: - try: - repo_name_for_state = _extract_repo_name_from_path(workspace_path) - if repo_name_for_state: - snapshot = get_collection_state_snapshot(workspace_path=None, repo_name=repo_name_for_state) # type: ignore[arg-type] - serving_slug = snapshot.get("serving_repo_slug") - active_slug = snapshot.get("active_repo_slug") - except Exception: - serving_slug = None - active_slug = None - - slug_order: list[str] = [] - serving_candidate: Optional[str] = None - if serving_slug and _SLUGGED_REPO_RE.match(serving_slug): - serving_candidate = serving_slug - if active_slug and _SLUGGED_REPO_RE.match(active_slug) and active_slug not in slug_order: - slug_order.append(active_slug) - - # If staging is active, we must mirror uploads into BOTH the canonical slug and - # the "*_old" slug. Relying purely on snapshot detection is brittle (e.g. when - # the client workspace_path is a host path). When we can infer a canonical slug, - # force both targets. 
+ if not staging_gate: staging_active = False - staging_gate = bool(is_staging_enabled() if callable(is_staging_enabled) else False) - try: - if serving_slug and str(serving_slug).endswith("_old"): - staging_active = True - except Exception: - staging_active = False - if not staging_gate: - staging_active = False + def _append_slug(slug: Optional[str]) -> None: + if slug and _SLUGGED_REPO_RE.match(slug) and slug not in slug_order: + slug_order.append(slug) + + if repo_name_for_state and _SLUGGED_REPO_RE.match(repo_name_for_state): + canonical_slug = ( + repo_name_for_state[:-4] + if repo_name_for_state.endswith("_old") + else repo_name_for_state + ) + old_slug_candidate = ( + repo_name_for_state + if repo_name_for_state.endswith("_old") + else f"{canonical_slug}_old" + ) + if staging_active: + slug_order = [] + _append_slug(canonical_slug) + _append_slug(old_slug_candidate) + elif not slug_order: + _append_slug(canonical_slug) + old_slug_path = Path(WORK_DIR) / old_slug_candidate + if old_slug_path.exists(): + _append_slug(old_slug_candidate) - def _append_slug(slug: Optional[str]) -> None: - if slug and _SLUGGED_REPO_RE.match(slug) and slug not in slug_order: - slug_order.append(slug) + if not slug_order: + if _SLUGGED_REPO_RE.match(workspace_leaf): + slug_order.append(workspace_leaf) + else: + if _extract_repo_name_from_path: + repo_name = _extract_repo_name_from_path(workspace_path) or workspace_leaf + else: + repo_name = workspace_leaf + workspace_key = get_workspace_key(workspace_path) + slug_order.append(f"{repo_name}-{workspace_key}") - if repo_name_for_state and _SLUGGED_REPO_RE.match(repo_name_for_state): - canonical_slug = repo_name_for_state[:-4] if repo_name_for_state.endswith("_old") else repo_name_for_state - old_slug_candidate = ( - repo_name_for_state if repo_name_for_state.endswith("_old") else f"{canonical_slug}_old" + if staging_gate and (not staging_active) and get_staging_targets and _extract_repo_name_from_path: + try: + repo_name_for_staging 
= _extract_repo_name_from_path(workspace_path) or slug_order[0] + targets = get_staging_targets( + workspace_path=workspace_path, + repo_name=repo_name_for_staging, ) - if staging_active: - slug_order = [] - _append_slug(canonical_slug) - _append_slug(old_slug_candidate) - elif not slug_order: - _append_slug(canonical_slug) - old_slug_path = Path(WORK_DIR) / old_slug_candidate - if old_slug_path.exists(): - _append_slug(old_slug_candidate) - - if not slug_order: - if _SLUGGED_REPO_RE.match(workspace_leaf): - slug_order.append(workspace_leaf) - else: - if _extract_repo_name_from_path: - repo_name = _extract_repo_name_from_path(workspace_path) or workspace_leaf - else: - repo_name = workspace_leaf - workspace_key = get_workspace_key(workspace_path) - slug_order.append(f"{repo_name}-{workspace_key}") + if isinstance(targets, dict) and targets.get("staging"): + staging_active = True + except Exception as staging_err: + logger.debug("[upload_service] Failed to detect staging: %s", staging_err) - # Best-effort: if staging is active according to workspace_state, ensure we mirror to - # both the canonical slug and its *_old slug. 
- if staging_gate and (not staging_active) and get_staging_targets and _extract_repo_name_from_path: - try: - repo_name_for_staging = _extract_repo_name_from_path(workspace_path) or slug_order[0] - targets = get_staging_targets(workspace_path=workspace_path, repo_name=repo_name_for_staging) - if isinstance(targets, dict) and targets.get("staging"): - staging_active = True - except Exception as staging_err: - logger.debug(f"[upload_service] Failed to detect staging: {staging_err}") - - def _slug_exists(slug: str) -> bool: - try: - return ( - (Path(WORK_DIR) / slug).exists() - or (Path(WORK_DIR) / ".codebase" / "repos" / slug).exists() - ) - except Exception: - return False - - if staging_gate and (not staging_active) and slug_order: - primary = slug_order[0] - if _SLUGGED_REPO_RE.match(primary): - canonical = primary[:-4] if primary.endswith("_old") else primary - inferred_old = primary if primary.endswith("_old") else f"{canonical}_old" - if _slug_exists(inferred_old): - staging_active = True - - if staging_gate and staging_active and slug_order: - primary = slug_order[0] - if _SLUGGED_REPO_RE.match(primary): - canonical = primary[:-4] if primary.endswith("_old") else primary - old_slug = primary if primary.endswith("_old") else f"{canonical}_old" - desired = [canonical, old_slug] - slug_order = [s for s in desired if _SLUGGED_REPO_RE.match(s)] - elif staging_gate and not staging_active and serving_candidate: - # Ignore serving slugs when staging is disabled; keep deterministic non-staging writes. 
- if serving_candidate in slug_order: - slug_order = [s for s in slug_order if s != serving_candidate] - - if staging_gate: - try: - logger.info(f"[upload_service] Delta bundle targets (staging={staging_active}): {slug_order}") - except Exception: - pass + def _slug_exists(slug: str) -> bool: + try: + return ( + (Path(WORK_DIR) / slug).exists() + or (Path(WORK_DIR) / ".codebase" / "repos" / slug).exists() + ) + except Exception: + return False + + if staging_gate and (not staging_active) and slug_order: + primary = slug_order[0] + if _SLUGGED_REPO_RE.match(primary): + canonical = primary[:-4] if primary.endswith("_old") else primary + inferred_old = primary if primary.endswith("_old") else f"{canonical}_old" + if _slug_exists(inferred_old): + staging_active = True + + if staging_gate and staging_active and slug_order: + primary = slug_order[0] + if _SLUGGED_REPO_RE.match(primary): + canonical = primary[:-4] if primary.endswith("_old") else primary + old_slug = primary if primary.endswith("_old") else f"{canonical}_old" + desired = [canonical, old_slug] + slug_order = [s for s in desired if _SLUGGED_REPO_RE.match(s)] + elif staging_gate and not staging_active and serving_candidate: + if serving_candidate in slug_order: + slug_order = [s for s in slug_order if s != serving_candidate] + + if staging_gate: + try: + logger.info("[upload_service] Delta bundle targets (staging=%s): %s", staging_active, slug_order) + except Exception: + pass - replica_roots: Dict[str, Path] = {} - for slug in slug_order: - path = Path(WORK_DIR) / slug + replica_roots: Dict[str, Path] = {} + for slug in slug_order: + path = Path(WORK_DIR) / slug + if create_missing: path.mkdir(parents=True, exist_ok=True) try: marker_dir = Path(WORK_DIR) / ".codebase" / "repos" / slug @@ -201,35 +291,393 @@ def _slug_exists(slug: str) -> bool: (marker_dir / ".ctxce_managed_upload").write_text("1\n") except Exception: pass - replica_roots[slug] = path.resolve() + replica_roots[slug] = path.resolve() + return 
replica_roots + + +def _enqueue_replica_journal_entries( + *, + workspace_root: Path, + slug: str, + entries: list[Dict[str, Any]], +) -> None: + if not entries: + return + try: + upsert_index_journal_entries( + entries, + workspace_path=str(workspace_root), + repo_name=slug, + ) + except Exception as exc: + logger.debug( + "[upload_service] Failed to enqueue index journal entries for %s: %s", + workspace_root, + exc, + ) + + +def _safe_join(base: Path, rel: str) -> Path: + rp = Path(str(rel)) + if str(rp) in {".", ""}: + raise ValueError("Invalid operation path") + if rp.is_absolute(): + raise ValueError(f"Absolute paths are not allowed: {rel}") + base_resolved = base.resolve() + candidate = (base_resolved / rp).resolve() + try: + ok = candidate.is_relative_to(base_resolved) + except Exception: + ok = os.path.commonpath([str(base_resolved), str(candidate)]) == str(base_resolved) + if not ok: + raise ValueError(f"Path escapes workspace: {rel}") + return candidate + + +def _sanitize_operation_path(rel_path: str, replica_roots: Dict[str, Path]) -> Optional[str]: + sanitized_path = rel_path + skipped_due_to_exact_slug = False + for slug in replica_roots.keys(): + if sanitized_path == slug: + skipped_due_to_exact_slug = True + break + prefix = f"{slug}/" + if sanitized_path.startswith(prefix): + sanitized_path = sanitized_path[len(prefix):] + break + if skipped_due_to_exact_slug or not sanitized_path: + return None + return sanitized_path + + +def plan_delta_upload( + workspace_path: str, + operations: list[Dict[str, Any]], + file_hashes: Optional[Dict[str, str]] = None, +) -> Dict[str, Any]: + needed_files = { + "created": [], + "updated": [], + "moved": [], + } + operations_count = { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + } + needed_size_bytes = 0 + replica_roots = _resolve_replica_roots(workspace_path, create_missing=False) + replica_cache_hashes = { + slug: 
_load_replica_cache_hashes(root, slug) + for slug, root in replica_roots.items() + } + normalized_hashes = { + str(rel_path): _normalize_hash_value(hash_value) + for rel_path, hash_value in (file_hashes or {}).items() + if _normalize_hash_value(hash_value) + } - primary_slug = slug_order[0] + for operation in operations: + op_type = str(operation.get("operation") or "") + rel_path = operation.get("path") + if not rel_path: + operations_count["skipped"] += 1 + continue + + sanitized = _sanitize_operation_path(str(rel_path), replica_roots) + if not sanitized: + operations_count["skipped"] += 1 + continue + + if op_type == "deleted": + operations_count["deleted"] += 1 + continue + if op_type == "moved": + operations_count["moved"] += 1 + source_rel_path = operation.get("source_path") or operation.get("source_relative_path") + if not source_rel_path: + needed_files["moved"].append(sanitized) + needed_size_bytes += int(operation.get("size_bytes") or 0) + continue + + move_needs_content = False + for _slug, root in replica_roots.items(): + try: + safe_source_path = _safe_join(root, str(source_rel_path)) + except ValueError: + logger.warning( + "[upload_service] Invalid move source path during plan: %s (root=%s)", + source_rel_path, + root, + ) + move_needs_content = True + break + if not safe_source_path.exists(): + move_needs_content = True + break + if move_needs_content: + needed_files["moved"].append(sanitized) + needed_size_bytes += int(operation.get("size_bytes") or 0) + continue + if op_type not in {"created", "updated"}: + operations_count["failed"] += 1 + continue + + op_content_hash = _normalize_hash_value( + operation.get("content_hash") or normalized_hashes.get(sanitized) + ) + if not op_content_hash: + needed_files[op_type].append(sanitized) + operations_count[op_type] += 1 + needed_size_bytes += int(operation.get("size_bytes") or 0) + continue + + needs_content = False + for slug, root in replica_roots.items(): + try: + target_path = _safe_join(root, 
sanitized) + except ValueError: + logger.warning( + "[upload_service] Invalid %s path during plan: %s (root=%s)", + op_type, + sanitized, + root, + ) + continue + target_key = _normalize_cache_key_path(str(target_path)) + cached_hash = replica_cache_hashes.get(slug, {}).get(target_key) + if cached_hash != op_content_hash: + needs_content = True + break + + if needs_content: + needed_files[op_type].append(sanitized) + operations_count[op_type] += 1 + needed_size_bytes += int(operation.get("size_bytes") or 0) + else: + operations_count["skipped"] += 1 + operations_count["skipped_hash_match"] += 1 + + return { + "needed_files": needed_files, + "operation_counts_preview": operations_count, + "needed_size_bytes": needed_size_bytes, + "replica_targets": list(replica_roots.keys()), + } + + +def apply_delta_operations( + workspace_path: str, + operations: list[Dict[str, Any]], + file_hashes: Optional[Dict[str, str]] = None, +) -> Dict[str, int]: + """Apply metadata-only delta operations without requiring a tar bundle.""" + operations_count = { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + } + + try: + replica_roots = _resolve_replica_roots(workspace_path) + if not replica_roots: + raise ValueError(f"No replica roots available for workspace: {workspace_path}") + replica_cache_hashes = { + slug: _load_replica_cache_hashes(root, slug) + for slug, root in replica_roots.items() + } + journal_entries_by_slug: Dict[str, list[Dict[str, Any]]] = { + slug: [] for slug in replica_roots.keys() + } + normalized_hashes = { + str(rel_path): _normalize_hash_value(hash_value) + for rel_path, hash_value in (file_hashes or {}).items() + if _normalize_hash_value(hash_value) + } + + for operation in operations: + op_type = str(operation.get("operation") or "") + rel_path = operation.get("path") + + if not rel_path: + operations_count["skipped"] += 1 + continue + + sanitized_path = 
_sanitize_operation_path(str(rel_path), replica_roots) + if not sanitized_path: + operations_count["skipped"] += 1 + continue + + rel_path = sanitized_path + + if op_type not in {"deleted", "moved"}: + operations_count["failed"] += 1 + continue + + source_rel_path = None + if op_type == "moved": + raw_source = operation.get("source_path") or operation.get("source_relative_path") + if not raw_source: + operations_count["failed"] += 1 + continue + source_rel_path = _sanitize_operation_path(str(raw_source), replica_roots) + if not source_rel_path: + operations_count["failed"] += 1 + continue + + replica_results: Dict[str, str] = {} + for slug, root in replica_roots.items(): + target_path = _safe_join(root, rel_path) + target_key = _normalize_cache_key_path(str(target_path)) + replica_hashes = replica_cache_hashes.setdefault(slug, {}) + op_content_hash = _normalize_hash_value( + operation.get("content_hash") or normalized_hashes.get(rel_path) + ) + + try: + if op_type == "deleted": + if target_path.exists(): + target_path.unlink(missing_ok=True) + _cleanup_empty_dirs(target_path.parent, root) + replica_hashes.pop(target_key, None) + journal_entries_by_slug.setdefault(slug, []).append( + _build_delete_journal_entry(target_path) + ) + replica_results[slug] = "applied" + continue + + safe_source_path = _safe_join(root, source_rel_path or "") + if not safe_source_path.exists(): + replica_results[slug] = "failed" + continue + + target_path.parent.mkdir(parents=True, exist_ok=True) + if target_path.exists(): + if target_path.is_dir(): + raise IsADirectoryError( + f"[upload_delta_bundle] move target is a directory: {target_path}" + ) + else: + target_path.unlink() + shutil.move(str(safe_source_path), str(target_path)) + _cleanup_empty_dirs(safe_source_path.parent, root) + source_key = _normalize_cache_key_path(str(safe_source_path)) + moved_hash = replica_hashes.pop(source_key, None) + if op_content_hash: + replica_hashes[target_key] = op_content_hash + elif moved_hash: + 
replica_hashes[target_key] = moved_hash + move_entry_hash = op_content_hash or moved_hash + journal_entries_by_slug.setdefault(slug, []).extend( + [ + _build_delete_journal_entry(safe_source_path, move_entry_hash), + _build_upsert_journal_entry(target_path, move_entry_hash), + ] + ) + replica_results[slug] = "applied" + except Exception as exc: + logger.debug( + "[upload_service] Failed to apply metadata-only %s to %s in %s: %s", + op_type, + rel_path, + root, + exc, + ) + replica_results[slug] = "failed" + + applied_any = any(result == "applied" for result in replica_results.values()) + success_all = all(result == "applied" for result in replica_results.values()) + if applied_any: + operations_count[op_type] += 1 + if not success_all: + logger.debug( + "[upload_service] Partial metadata-only success for %s %s: %s", + op_type, + rel_path, + replica_results, + ) + else: + operations_count["failed"] += 1 + + for slug, root in replica_roots.items(): + _enqueue_replica_journal_entries( + workspace_root=root, + slug=slug, + entries=journal_entries_by_slug.get(slug, []), + ) + # Flush updated replica hashes to disk (including empty caches) + replica_hashes = replica_cache_hashes.get(slug, {}) + _flush_replica_cache_hashes(root, slug, replica_hashes) + + return operations_count + except Exception as e: + logger.error(f"Error applying metadata-only delta operations: {e}") + raise + + +def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[str, Any]) -> Dict[str, int]: + """Process delta bundle and return operation counts.""" + operations_count = { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + } + + try: + replica_roots = _resolve_replica_roots(workspace_path) + if not replica_roots: + raise ValueError(f"No replica roots available for workspace: {workspace_path}") + primary_slug = next(iter(replica_roots)) workspace_root = replica_roots[primary_slug] - def _safe_join(base: 
Path, rel: str) -> Path: - # SECURITY: Prevent path traversal / absolute-path writes by ensuring the resolved - # candidate path stays within the intended workspace root. - rp = Path(str(rel)) - if str(rp) in {".", ""}: - raise ValueError("Invalid operation path") - if rp.is_absolute(): - raise ValueError(f"Absolute paths are not allowed: {rel}") - base_resolved = base.resolve() - candidate = (base_resolved / rp).resolve() - try: - ok = candidate.is_relative_to(base_resolved) - except Exception: - ok = os.path.commonpath([str(base_resolved), str(candidate)]) == str(base_resolved) - if not ok: - raise ValueError(f"Path escapes workspace: {rel}") - return candidate + def _member_suffix(name: str, marker: str) -> Optional[str]: + idx = name.find(marker) + if idx < 0: + return None + suffix = name[idx + len(marker):] + return suffix or None with tarfile.open(bundle_path, "r:gz") as tar: ops_member = None - for member in tar.getnames(): - if member.endswith("metadata/operations.json"): + hashes_member = None + git_member = None + created_members: Dict[str, tarfile.TarInfo] = {} + updated_members: Dict[str, tarfile.TarInfo] = {} + moved_members: Dict[str, tarfile.TarInfo] = {} + for member in tar.getmembers(): + name = member.name + if name.endswith("metadata/operations.json"): ops_member = member - break + continue + if name.endswith("metadata/hashes.json"): + hashes_member = member + continue + if name.endswith("metadata/git_history.json"): + git_member = member + continue + created_rel = _member_suffix(name, "files/created/") + if created_rel: + created_members[created_rel] = member + continue + updated_rel = _member_suffix(name, "files/updated/") + if updated_rel: + updated_members[updated_rel] = member + continue + moved_rel = _member_suffix(name, "files/moved/") + if moved_rel: + moved_members[moved_rel] = member if not ops_member: raise ValueError("operations.json not found in bundle") @@ -240,14 +688,28 @@ def _safe_join(base: Path, rel: str) -> Path: 
operations_data = json.loads(ops_file.read().decode("utf-8")) operations = operations_data.get("operations", []) + bundle_hashes: Dict[str, str] = {} + if hashes_member: + hashes_file = tar.extractfile(hashes_member) + if hashes_file: + hashes_data = json.loads(hashes_file.read().decode("utf-8")) + raw_hashes = hashes_data.get("file_hashes", {}) + if isinstance(raw_hashes, dict): + for rel_path, hash_value in raw_hashes.items(): + digest = _normalize_hash_value(hash_value) + if digest: + bundle_hashes[str(rel_path)] = digest + + replica_cache_hashes = { + slug: _load_replica_cache_hashes(root, slug) + for slug, root in replica_roots.items() + } + journal_entries_by_slug: Dict[str, list[Dict[str, Any]]] = { + slug: [] for slug in replica_roots.keys() + } # Best-effort: extract git history metadata for watcher to ingest try: - git_member = None - for member in tar.getnames(): - if member.endswith("metadata/git_history.json"): - git_member = member - break if git_member: git_file = tar.extractfile(git_member) if git_file: @@ -266,11 +728,20 @@ def _safe_join(base: Path, rel: str) -> Path: except Exception as git_err: logger.debug(f"[upload_service] Error extracting git history metadata: {git_err}") - def _apply_operation_to_workspace(workspace_root: Path) -> bool: - """Apply a single file operation to a workspace. 
Returns True on success.""" - nonlocal operations_count, op_type, rel_path, tar - + def _apply_operation_to_workspace( + slug: str, + workspace_root: Path, + op_type: str, + rel_path: str, + operation: Dict[str, Any], + ) -> str: + """Apply a single file operation to a workspace.""" target_path = _safe_join(workspace_root, rel_path) + target_key = _normalize_cache_key_path(str(target_path)) + replica_hashes = replica_cache_hashes.setdefault(slug, {}) + op_content_hash = _normalize_hash_value( + operation.get("content_hash") or bundle_hashes.get(rel_path) + ) safe_source_path = None source_rel_path = None @@ -281,76 +752,113 @@ def _apply_operation_to_workspace(workspace_root: Path) -> bool: try: if op_type == "created": - file_member = None - for member in tar.getnames(): - if member.endswith(f"files/created/{rel_path}"): - file_member = member - break - + if op_content_hash and target_path.exists(): + cached_hash = replica_hashes.get(target_key) + if cached_hash and cached_hash == op_content_hash: + return "skipped_hash_match" + file_member = created_members.get(rel_path) if file_member: file_content = tar.extractfile(file_member) if file_content: target_path.parent.mkdir(parents=True, exist_ok=True) target_path.write_bytes(file_content.read()) - return True + if op_content_hash: + replica_hashes[target_key] = op_content_hash + journal_entries_by_slug.setdefault(slug, []).append( + _build_upsert_journal_entry(target_path, op_content_hash) + ) + return "applied" else: - return False + return "failed" else: - return False + return "failed" elif op_type == "updated": - file_member = None - for member in tar.getnames(): - if member.endswith(f"files/updated/{rel_path}"): - file_member = member - break - + if op_content_hash and target_path.exists(): + cached_hash = replica_hashes.get(target_key) + if cached_hash and cached_hash == op_content_hash: + return "skipped_hash_match" + file_member = updated_members.get(rel_path) if file_member: file_content = 
tar.extractfile(file_member) if file_content: target_path.parent.mkdir(parents=True, exist_ok=True) target_path.write_bytes(file_content.read()) - return True + if op_content_hash: + replica_hashes[target_key] = op_content_hash + journal_entries_by_slug.setdefault(slug, []).append( + _build_upsert_journal_entry(target_path, op_content_hash) + ) + return "applied" else: - return False + return "failed" else: - return False + return "failed" elif op_type == "deleted": if target_path.exists(): target_path.unlink(missing_ok=True) - return True - else: - return True # Already deleted + _cleanup_empty_dirs(target_path.parent, workspace_root) + replica_hashes.pop(target_key, None) + journal_entries_by_slug.setdefault(slug, []).append( + _build_delete_journal_entry(target_path) + ) + return "applied" elif op_type == "moved": if safe_source_path and safe_source_path.exists(): target_path.parent.mkdir(parents=True, exist_ok=True) - safe_source_path.rename(target_path) - return True + if target_path.exists(): + if target_path.is_dir(): + raise IsADirectoryError( + f"[upload_service] move target is a directory: {target_path}" + ) + else: + target_path.unlink() + shutil.move(str(safe_source_path), str(target_path)) + _cleanup_empty_dirs(safe_source_path.parent, workspace_root) + source_key = _normalize_cache_key_path(str(safe_source_path)) + moved_hash = replica_hashes.pop(source_key, None) + if op_content_hash: + replica_hashes[target_key] = op_content_hash + elif moved_hash: + replica_hashes[target_key] = moved_hash + move_entry_hash = op_content_hash or moved_hash + journal_entries_by_slug.setdefault(slug, []).extend( + [ + _build_delete_journal_entry(safe_source_path, move_entry_hash), + _build_upsert_journal_entry(target_path, move_entry_hash), + ] + ) + return "applied" # Remote uploads may not have the source file on the server (e.g. staging # mirrors). In that case, clients can embed the destination content under # files/moved/. 
- file_member = None - for member in tar.getnames(): - if member.endswith(f"files/moved/{rel_path}"): - file_member = member - break + file_member = moved_members.get(rel_path) if file_member: file_content = tar.extractfile(file_member) if file_content: target_path.parent.mkdir(parents=True, exist_ok=True) target_path.write_bytes(file_content.read()) - return True - return False - return False + if op_content_hash: + replica_hashes[target_key] = op_content_hash + if safe_source_path: + journal_entries_by_slug.setdefault(slug, []).append( + _build_delete_journal_entry(safe_source_path, op_content_hash) + ) + journal_entries_by_slug.setdefault(slug, []).append( + _build_upsert_journal_entry(target_path, op_content_hash) + ) + return "applied" + return "failed" + return "failed" else: logger.warning(f"[upload_service] Unknown operation type: {op_type}") - return False + return "failed" except Exception as e: logger.debug(f"[upload_service] Failed to apply {op_type} to {rel_path} in {workspace_root}: {e}") - return False + return "failed" for operation in operations: op_type = operation.get("operation") @@ -360,18 +868,8 @@ def _apply_operation_to_workspace(workspace_root: Path) -> bool: operations_count["skipped"] += 1 continue - sanitized_path = rel_path - skipped_due_to_exact_slug = False - for slug in replica_roots.keys(): - if sanitized_path == slug: - skipped_due_to_exact_slug = True - break - prefix = f"{slug}/" - if sanitized_path.startswith(prefix): - sanitized_path = sanitized_path[len(prefix):] - break - - if skipped_due_to_exact_slug or not sanitized_path: + sanitized_path = _sanitize_operation_path(str(rel_path), replica_roots) + if not sanitized_path: logger.debug( f"[upload_service] Skipping operation {op_type} for path {rel_path}: " "appears to reference slug root directly.", @@ -381,22 +879,44 @@ def _apply_operation_to_workspace(workspace_root: Path) -> bool: rel_path = sanitized_path - replica_results: Dict[str, bool] = {} + replica_results: 
Dict[str, str] = {} for slug, root in replica_roots.items(): - replica_results[slug] = _apply_operation_to_workspace(root) + replica_results[slug] = _apply_operation_to_workspace( + slug, + root, + op_type, + rel_path, + operation, + ) - success_any = any(replica_results.values()) - success_all = all(replica_results.values()) - if success_any: + applied_any = any(result == "applied" for result in replica_results.values()) + skipped_hash_match = bool(replica_results) and all( + result == "skipped_hash_match" for result in replica_results.values() + ) + success_all = all(result in {"applied", "skipped_hash_match"} for result in replica_results.values()) + if applied_any: operations_count.setdefault(op_type, 0) - operations_count[op_type] = operations_count.get(op_type, 0) + 1 + operations_count[op_type] += 1 if not success_all: logger.debug( f"[upload_service] Partial success for {op_type} {rel_path}: {replica_results}" ) + elif skipped_hash_match: + operations_count["skipped"] += 1 + operations_count["skipped_hash_match"] += 1 else: operations_count["failed"] += 1 + for slug, root in replica_roots.items(): + _enqueue_replica_journal_entries( + workspace_root=root, + slug=slug, + entries=journal_entries_by_slug.get(slug, []), + ) + # Flush updated replica hashes to disk (including empty caches) + replica_hashes = replica_cache_hashes.get(slug, {}) + _flush_replica_cache_hashes(root, slug, replica_hashes) + return operations_count except Exception as e: diff --git a/scripts/upload_service.py b/scripts/upload_service.py index 6771d652..e1d4355d 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -45,7 +45,12 @@ from fastapi.responses import JSONResponse, RedirectResponse from fastapi.middleware.cors import CORSMiddleware -from scripts.upload_delta_bundle import get_workspace_key, process_delta_bundle +from scripts.upload_delta_bundle import ( + apply_delta_operations, + get_workspace_key, + plan_delta_upload, + process_delta_bundle, +) from 
scripts.indexing_admin import ( build_admin_collections_view, @@ -86,6 +91,10 @@ except Exception: delete_collection_everywhere = None copy_collection_qdrant = None +try: + from scripts.qdrant_client_manager import pooled_qdrant_client +except Exception: + pooled_qdrant_client = None try: from scripts.admin_ui import ( render_admin_acl, @@ -200,6 +209,7 @@ def logical_repo_reuse_enabled() -> bool: # type: ignore[no-redef] # In-memory sequence tracking (in production, use persistent storage) _sequence_tracker: Dict[str, int] = {} +_upload_result_tracker: Dict[str, Dict[str, Any]] = {} def _int_env(name: str, default: int) -> int: @@ -224,6 +234,39 @@ class UploadResponse(BaseModel): next_sequence: Optional[int] = None error: Optional[Dict[str, Any]] = None + +class PlanRequest(BaseModel): + workspace_path: str + collection_name: Optional[str] = None + source_path: Optional[str] = None + logical_repo_id: Optional[str] = None + session: Optional[str] = None + manifest: Dict[str, Any] = Field(default_factory=dict) + operations: List[Dict[str, Any]] = Field(default_factory=list) + file_hashes: Dict[str, str] = Field(default_factory=dict) + + +class PlanResponse(BaseModel): + success: bool + workspace_path: str + needed_files: Dict[str, List[str]] + operation_counts_preview: Dict[str, int] + needed_size_bytes: int + replica_targets: List[str] + fallback_used: bool = False + error: Optional[Dict[str, Any]] = None + + +class ApplyOperationsRequest(BaseModel): + workspace_path: str + collection_name: Optional[str] = None + source_path: Optional[str] = None + logical_repo_id: Optional[str] = None + session: Optional[str] = None + manifest: Dict[str, Any] = Field(default_factory=dict) + operations: List[Dict[str, Any]] = Field(default_factory=list) + file_hashes: Dict[str, str] = Field(default_factory=dict) + class StatusResponse(BaseModel): workspace_path: str collection_name: str @@ -480,14 +523,43 @@ async def _process_bundle_background( sequence_number: Optional[int], 
bundle_id: Optional[str], ) -> None: + key = get_workspace_key(workspace_path) try: start_time = datetime.now() + _upload_result_tracker[key] = { + "workspace_path": workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + "processed_operations": None, + "processing_time_ms": None, + "status": "processing", + "completed_at": None, + } operations_count = await asyncio.to_thread( process_delta_bundle, workspace_path, bundle_path, manifest ) + processing_time = int((datetime.now() - start_time).total_seconds() * 1000) + failed_count = int((operations_count or {}).get("failed") or 0) + applied_count = int( + (operations_count or {}).get("created", 0) + + (operations_count or {}).get("updated", 0) + + (operations_count or {}).get("deleted", 0) + + (operations_count or {}).get("moved", 0) + ) + status_value = "completed" if failed_count == 0 else "failed" if sequence_number is not None: - key = get_workspace_key(workspace_path) _sequence_tracker[key] = sequence_number + _upload_result_tracker[key] = { + "workspace_path": workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + "processed_operations": operations_count, + "processing_time_ms": processing_time, + "status": status_value, + "failed_count": failed_count, + "partial": bool(failed_count > 0 and applied_count > 0), + "completed_at": datetime.now().isoformat(), + } if log_activity: try: repo = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None @@ -503,11 +575,32 @@ async def _process_bundle_background( ) except Exception as activity_err: logger.debug(f"[upload_service] Failed to log activity for bundle {bundle_id}: {activity_err}") - processing_time = (datetime.now() - start_time).total_seconds() * 1000 - logger.info( - f"[upload_service] Finished processing bundle {bundle_id} seq {sequence_number} in {int(processing_time)}ms" - ) + if failed_count > 0: + logger.warning( + "[upload_service] Finished processing bundle %s seq %s 
with failures in %sms " + "failed=%d ops=%s", + bundle_id, + sequence_number, + processing_time, + failed_count, + operations_count, + ) + else: + logger.info( + f"[upload_service] Finished processing bundle {bundle_id} seq {sequence_number} " + f"in {processing_time}ms ops={operations_count}" + ) except Exception as e: + _upload_result_tracker[key] = { + "workspace_path": workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + "processed_operations": None, + "processing_time_ms": None, + "status": "error", + "completed_at": datetime.now().isoformat(), + "error": str(e), + } logger.error(f"[upload_service] Error in background processing for bundle {bundle_id}: {e}") finally: try: @@ -935,7 +1028,7 @@ async def admin_delete_collection( cleanup_fs = False try: - delete_collection_everywhere( + out = delete_collection_everywhere( collection=name, work_dir=WORK_DIR, qdrant_url=QDRANT_URL, @@ -949,7 +1042,23 @@ async def admin_delete_collection( back_href="/admin/acl", ) - return RedirectResponse(url="/admin/acl", status_code=302) + graph_deleted: Optional[str] = None + try: + if isinstance(out, dict) and not name.endswith("_graph"): + graph_deleted = "1" if bool(out.get("qdrant_graph_deleted")) else "0" + except Exception: + graph_deleted = None + + try: + from urllib.parse import urlencode + + params = {"deleted": name} + if graph_deleted is not None: + params["graph_deleted"] = graph_deleted + url = "/admin/acl?" 
+ urlencode(params) + except Exception: + url = "/admin/acl" + return RedirectResponse(url=url, status_code=302) @app.post("/admin/staging/start") @@ -1222,7 +1331,59 @@ async def admin_copy_collection( back_href="/admin/acl", ) - return RedirectResponse(url="/admin/acl", status_code=302) + graph_copied: Optional[str] = None + try: + if not name.endswith("_graph") and not str(new_name).endswith("_graph"): + used_pooled = False + if pooled_qdrant_client is not None: + used_pooled = True + try: + with pooled_qdrant_client( + url=QDRANT_URL, + api_key=os.environ.get("QDRANT_API_KEY"), + ) as cli: + try: + cli.get_collection(collection_name=f"{new_name}_graph") + graph_copied = "1" + except Exception: + graph_copied = "0" + except Exception: + # Failed to acquire pooled client; fall back to non-pooled + used_pooled = False + if not used_pooled: + try: + from qdrant_client import QdrantClient # type: ignore + + cli = QdrantClient( + url=QDRANT_URL, + api_key=os.environ.get("QDRANT_API_KEY"), + timeout=float(os.environ.get("QDRANT_TIMEOUT", "5") or 5), + ) + try: + cli.get_collection(collection_name=f"{new_name}_graph") + graph_copied = "1" + except Exception: + graph_copied = "0" + finally: + try: + cli.close() + except Exception: + pass + except Exception: + graph_copied = "0" + except Exception: + graph_copied = None + + try: + from urllib.parse import urlencode + + params = {"copied": name, "new": new_name} + if graph_copied is not None: + params["graph_copied"] = graph_copied + url = "/admin/acl?" 
+ urlencode(params) + except Exception: + url = "/admin/acl" + return RedirectResponse(url=url, status_code=302) @app.post("/admin/users") @@ -1355,8 +1516,12 @@ async def get_status(workspace_path: str): # Get last sequence last_sequence = get_last_sequence(workspace_path) + key = get_workspace_key(workspace_path) + upload_result = _upload_result_tracker.get(key, {}) - last_upload = None + last_upload = upload_result.get("completed_at") + upload_status = str(upload_result.get("status") or "") + workspace_status = "processing" if upload_status == "processing" else "ready" return StatusResponse( workspace_path=workspace_path, @@ -1364,11 +1529,16 @@ async def get_status(workspace_path: str): last_sequence=last_sequence, last_upload=last_upload, pending_operations=0, - status="ready", + status=workspace_status, server_info={ "version": "1.0.0", "max_bundle_size_mb": MAX_BUNDLE_SIZE_MB, - "supported_formats": ["tar.gz"] + "supported_formats": ["tar.gz"], + "last_bundle_id": upload_result.get("bundle_id"), + "last_processing_time_ms": upload_result.get("processing_time_ms"), + "last_processed_operations": upload_result.get("processed_operations"), + "last_upload_status": upload_status or None, + "last_error": upload_result.get("error"), } ) @@ -1376,59 +1546,29 @@ async def get_status(workspace_path: str): logger.error(f"Error getting status: {e}") raise HTTPException(status_code=500, detail=str(e)) -@app.post("/api/v1/delta/upload", response_model=UploadResponse) -async def upload_delta_bundle( - request: Request, - bundle: UploadFile = File(...), - workspace_path: str = Form(...), - collection_name: Optional[str] = Form(None), - sequence_number: Optional[int] = Form(None), - force: Optional[bool] = Form(False), - source_path: Optional[str] = Form(None), - logical_repo_id: Optional[str] = Form(None), - session: Optional[str] = Form(None), -): - """Upload and process delta bundle.""" - start_time = datetime.now() - client_host = request.client.host if hasattr(request, 
'client') and request.client else 'unknown' - - record: Optional[Dict[str, Any]] = None - - try: - logger.info(f"[upload_service] Begin processing upload for workspace={workspace_path} from {client_host}") - - if AUTH_ENABLED: - session_value = (session or "").strip() - try: - record = validate_session(session_value) - except AuthDisabledError: - record = None - except Exception as e: - logger.error(f"[upload_service] Failed to validate auth session for upload: {e}") - raise HTTPException( - status_code=500, - detail="Failed to validate auth session", - ) - if record is None: - raise HTTPException( - status_code=status.HTTP_401_UNAUTHORIZED, - detail="Invalid or expired session", - ) - - # Validate workspace path - workspace = Path(workspace_path) - if not workspace.is_absolute(): - workspace = Path(WORK_DIR) / workspace - workspace_path = str(workspace.resolve()) +def _resolve_collection_for_request( + workspace_path: str, + client_collection_name: Optional[str], + logical_repo_id: Optional[str], +) -> Tuple[str, Optional[str]]: + """ + Resolve collection name and repo_name for upload/plan/apply requests. 
+ + Returns: + Tuple of (collection_name, repo_name) + """ + # Resolve collection name for ACL enforcement + collection_name: Optional[str] = None + repo_name: Optional[str] = None + if _extract_repo_name_from_path or (get_collection_name and logical_repo_reuse_enabled and find_collection_for_logical_repo): # Always derive repo_name from workspace_path for origin tracking repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None if not repo_name: repo_name = Path(workspace_path).name # Preserve any client-supplied collection name but allow server-side overrides - client_collection_name = collection_name resolved_collection: Optional[str] = None # Resolve collection name, preferring server-side mapping for logical_repo_id when enabled @@ -1477,6 +1617,345 @@ async def upload_delta_bundle( else: collection_name = DEFAULT_COLLECTION + return collection_name, repo_name + + +@app.post("/api/v1/delta/plan", response_model=PlanResponse) +async def plan_delta(request: PlanRequest): + """Plan which file bodies are needed before uploading content.""" + try: + workspace = Path(request.workspace_path) + if not workspace.is_absolute(): + workspace = Path(WORK_DIR) / workspace + workspace_path = str(workspace.resolve()) + + if AUTH_ENABLED: + session_value = str(request.session or "").strip() + try: + record = validate_session(session_value) + except AuthDisabledError: + record = None + except Exception as e: + logger.error(f"[upload_service] Failed to validate auth session for plan: {e}") + raise HTTPException(status_code=500, detail="Failed to validate auth session") + if record is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired session", + ) + + # Resolve collection name for ACL enforcement + collection_name, repo_name = _resolve_collection_for_request( + workspace_path=workspace_path, + client_collection_name=request.collection_name, + logical_repo_id=request.logical_repo_id, + ) + 
+ # Enforce collection write access for plan/apply when auth is enabled + if AUTH_ENABLED and CTXCE_MCP_ACL_ENFORCE: + if not collection_name: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Collection resolution failed for ACL enforcement", + ) + uid = str((record or {}).get("user_id") or "").strip() + if not uid: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired session", + ) + try: + allowed = has_collection_access(uid, str(collection_name), "write") + except AuthDisabledError: + allowed = True + if not allowed: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"User does not have write access to collection '{collection_name}'", + ) + + plan = plan_delta_upload( + workspace_path=workspace_path, + operations=request.operations, + file_hashes=request.file_hashes, + ) + return PlanResponse( + success=True, + workspace_path=workspace_path, + needed_files=plan.get("needed_files", {"created": [], "updated": [], "moved": []}), + operation_counts_preview=plan.get( + "operation_counts_preview", + { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + }, + ), + needed_size_bytes=int(plan.get("needed_size_bytes", 0) or 0), + replica_targets=list(plan.get("replica_targets", []) or []), + fallback_used=False, + error=None, + ) + except HTTPException: + raise + except Exception as e: + logger.error(f"[upload_service] Error planning delta upload: {e}") + return PlanResponse( + success=False, + workspace_path=request.workspace_path, + needed_files={"created": [], "updated": [], "moved": []}, + operation_counts_preview={ + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + }, + needed_size_bytes=0, + replica_targets=[], + fallback_used=True, + error={ + "code": "PLAN_ERROR", + "message": str(e), + }, + ) + + +@app.post("/api/v1/delta/apply_ops", 
response_model=UploadResponse) +async def apply_delta_ops(request: ApplyOperationsRequest): + """Apply metadata-only delta operations without uploading a tar bundle.""" + key: Optional[str] = None + bundle_id: Optional[str] = None + sequence_number: Optional[int] = None + try: + workspace = Path(request.workspace_path) + if not workspace.is_absolute(): + workspace = Path(WORK_DIR) / workspace + workspace_path = str(workspace.resolve()) + + if AUTH_ENABLED: + session_value = str(request.session or "").strip() + try: + record = validate_session(session_value) + except AuthDisabledError: + record = None + except Exception as e: + logger.error(f"[upload_service] Failed to validate auth session for apply_ops: {e}") + raise HTTPException(status_code=500, detail="Failed to validate auth session") + if record is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired session", + ) + + # Resolve collection name for ACL enforcement + collection_name, repo_name = _resolve_collection_for_request( + workspace_path=workspace_path, + client_collection_name=request.collection_name, + logical_repo_id=request.logical_repo_id, + ) + + # Enforce collection write access for plan/apply when auth is enabled + if AUTH_ENABLED and CTXCE_MCP_ACL_ENFORCE: + if not collection_name: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Collection resolution failed for ACL enforcement", + ) + uid = str((record or {}).get("user_id") or "").strip() + if not uid: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired session", + ) + try: + allowed = has_collection_access(uid, str(collection_name), "write") + except AuthDisabledError: + allowed = True + if not allowed: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"User does not have write access to collection '{collection_name}'", + ) + + manifest = request.manifest or {} + bundle_id = manifest.get("bundle_id") + 
manifest_sequence = manifest.get("sequence_number") + key = get_workspace_key(workspace_path) + last_sequence = get_last_sequence(workspace_path) + sequence_number = manifest_sequence if manifest_sequence is not None else last_sequence + 1 + + if sequence_number is not None and sequence_number != last_sequence + 1: + return UploadResponse( + success=False, + error={ + "code": "SEQUENCE_MISMATCH", + "message": f"Expected sequence {last_sequence + 1}, got {sequence_number}", + "expected_sequence": last_sequence + 1, + "received_sequence": sequence_number, + "retry_after": 5000, + }, + ) + + start_time = datetime.now() + _upload_result_tracker[key] = { + "workspace_path": workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + "processed_operations": None, + "processing_time_ms": None, + "status": "processing", + "completed_at": None, + } + + operations_count = await asyncio.to_thread( + apply_delta_operations, + workspace_path, + request.operations, + request.file_hashes, + ) + processing_time = int((datetime.now() - start_time).total_seconds() * 1000) + failed_count = int((operations_count or {}).get("failed") or 0) + applied_count = int( + (operations_count or {}).get("created", 0) + + (operations_count or {}).get("updated", 0) + + (operations_count or {}).get("deleted", 0) + + (operations_count or {}).get("moved", 0) + ) + status_value = "completed" if failed_count == 0 else "failed" + if applied_count > 0: + _sequence_tracker[key] = sequence_number + _upload_result_tracker[key] = { + "workspace_path": workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + "processed_operations": operations_count, + "processing_time_ms": processing_time, + "status": status_value, + "failed_count": failed_count, + "partial": bool(failed_count > 0 and applied_count > 0), + "completed_at": datetime.now().isoformat(), + } + if failed_count > 0: + logger.warning( + "[upload_service] apply_ops completed with failures bundle=%s seq=%s 
failed=%d ops=%s", + bundle_id, + sequence_number, + failed_count, + operations_count, + ) + return UploadResponse( + success=False, + bundle_id=bundle_id, + sequence_number=sequence_number, + processed_operations=operations_count, + processing_time_ms=processing_time, + next_sequence=sequence_number + 1 if sequence_number is not None else None, + error={ + "code": "APPLY_OPS_PARTIAL_FAILURE", + "message": f"One or more operations failed during apply_ops (failed={failed_count})", + "failed_count": failed_count, + "processed_operations": operations_count, + }, + ) + logger.info( + "[upload_service] Applied metadata-only operations bundle=%s seq=%s in %sms ops=%s", + bundle_id, + sequence_number, + processing_time, + operations_count, + ) + return UploadResponse( + success=True, + bundle_id=bundle_id, + sequence_number=sequence_number, + processed_operations=operations_count, + processing_time_ms=processing_time, + next_sequence=sequence_number + 1 if sequence_number is not None else None, + ) + except HTTPException: + raise + except Exception as e: + logger.error(f"[upload_service] Error applying metadata-only operations: {e}") + if key: + _upload_result_tracker[key] = { + "workspace_path": request.workspace_path, + "bundle_id": bundle_id, + "sequence_number": sequence_number, + "processed_operations": None, + "processing_time_ms": None, + "status": "error", + "error": str(e), + "message": str(e), + "completed_at": datetime.now().isoformat(), + } + return UploadResponse( + success=False, + error={ + "code": "APPLY_OPS_ERROR", + "message": str(e), + }, + ) + +@app.post("/api/v1/delta/upload", response_model=UploadResponse) +async def upload_delta_bundle( + request: Request, + bundle: UploadFile = File(...), + workspace_path: str = Form(...), + collection_name: Optional[str] = Form(None), + sequence_number: Optional[int] = Form(None), + force: Optional[bool] = Form(False), + source_path: Optional[str] = Form(None), + logical_repo_id: Optional[str] = Form(None), + 
session: Optional[str] = Form(None), +): + """Upload and process delta bundle.""" + start_time = datetime.now() + client_host = request.client.host if hasattr(request, 'client') and request.client else 'unknown' + + record: Optional[Dict[str, Any]] = None + + try: + logger.info(f"[upload_service] Begin processing upload for workspace={workspace_path} from {client_host}") + + if AUTH_ENABLED: + session_value = (session or "").strip() + try: + record = validate_session(session_value) + except AuthDisabledError: + record = None + except Exception as e: + logger.error(f"[upload_service] Failed to validate auth session for upload: {e}") + raise HTTPException( + status_code=500, + detail="Failed to validate auth session", + ) + if record is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired session", + ) + + # Validate workspace path + workspace = Path(workspace_path) + if not workspace.is_absolute(): + workspace = Path(WORK_DIR) / workspace + + workspace_path = str(workspace.resolve()) + + # Resolve collection name and repo name + collection_name, repo_name = _resolve_collection_for_request( + workspace_path=workspace_path, + client_collection_name=collection_name, + logical_repo_id=logical_repo_id, + ) + # Enforce collection write access for uploads when auth is enabled. # Semantics: "write" is sufficient for uploading/indexing content. 
if AUTH_ENABLED and CTXCE_MCP_ACL_ENFORCE: diff --git a/scripts/watch_index.py b/scripts/watch_index.py index 8fe5a740..685a60d9 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -14,13 +14,8 @@ if str(ROOT_DIR) not in sys.path: sys.path.insert(0, str(ROOT_DIR)) -from scripts.watch_index_core.config import ( # noqa: E402 - LOGGER, - MODEL, - QDRANT_URL, - ROOT as WATCH_ROOT, - default_collection_name, -) +from scripts.watch_index_core import config as watch_config # noqa: E402 +from scripts.watch_index_core.config import LOGGER, MODEL, QDRANT_URL, default_collection_name # noqa: E402 from scripts.watch_index_core.utils import ( get_boolean_env, resolve_vector_name_config, @@ -30,22 +25,24 @@ from scripts.watch_index_core.pseudo import _start_pseudo_backfill_worker # noqa: E402 from scripts.watch_index_core.processor import _process_paths # noqa: E402 from scripts.watch_index_core.queue import ChangeQueue # noqa: E402 +from scripts.watch_index_core.consistency import ( # noqa: E402 + run_consistency_audit, + run_empty_dir_sweep_maintenance, +) from scripts.workspace_state import ( # noqa: E402 - _extract_repo_name_from_path, compute_indexing_config_hash, - get_collection_name, get_indexing_config_snapshot, + list_pending_index_journal_entries, is_multi_repo_mode, persist_indexing_config, update_indexing_status, - update_workspace_state, initialize_watcher_state, ) import scripts.ingest_code as idx # noqa: E402 logger = LOGGER -ROOT = WATCH_ROOT +ROOT = watch_config.ROOT # Back-compat: legacy modules/tests expect a module-level COLLECTION constant. # We use a sentinel and a getter to ensure the resolved value is returned. 
_COLLECTION: Optional[str] = None @@ -58,7 +55,63 @@ def get_collection() -> str: return default_collection_name() +def _set_runtime_root() -> None: + global ROOT + runtime_root = Path( + os.environ.get("WATCH_ROOT") + or os.environ.get("WORKSPACE_PATH") + or str(ROOT) + ) + try: + runtime_root = runtime_root.resolve() + except Exception: + pass + + ROOT = runtime_root + watch_config.ROOT = runtime_root + + +def _drain_pending_journal(queue: ChangeQueue) -> None: + pending_path: Optional[str] = None + try: + for pending_entry in list_pending_index_journal_entries(str(ROOT)): + pending_path = str(pending_entry.get("path") or "").strip() + if pending_path: + queue.add(Path(pending_path), force=True) + except Exception as exc: + logger.exception( + "watch_index::pending_journal_drain_failed", + extra={"root": str(ROOT), "pending_path": pending_path, "error": str(exc)}, + ) + + +def _run_periodic_maintenance(client: QdrantClient) -> None: + try: + run_consistency_audit(client, ROOT) + except Exception as exc: + logger.exception( + "watch_index::consistency_audit_failed", + extra={"root": str(ROOT), "error": str(exc)}, + ) + try: + run_empty_dir_sweep_maintenance(ROOT) + except Exception as exc: + logger.exception( + "watch_index::empty_dir_sweep_failed", + extra={"root": str(ROOT), "error": str(exc)}, + ) + + +def _maintenance_interval_secs() -> float: + try: + return max(0.0, float(os.environ.get("WATCH_MAINTENANCE_INTERVAL_SECS", "300") or 300.0)) + except Exception: + return 300.0 + + def main() -> None: + _set_runtime_root() + # Resolve collection name from workspace state before any client/state ops try: from scripts.workspace_state import get_collection_name_with_staging as _get_coll @@ -185,8 +238,19 @@ def main() -> None: obs.schedule(handler, str(ROOT), recursive=True) obs.start() + maintenance_interval = _maintenance_interval_secs() + last_maintenance: Optional[float] = None + try: while True: + # Watcher is the sole durable journal consumer in v1. 
Upload/apply + # records upsert/delete intent here so missed filesystem events can + # still be replayed after watcher/container restarts. + _drain_pending_journal(q) + now = time.time() + if last_maintenance is None or (now - last_maintenance) >= maintenance_interval: + _run_periodic_maintenance(client) + last_maintenance = now time.sleep(1.0) except KeyboardInterrupt: pass diff --git a/scripts/watch_index_core/config.py b/scripts/watch_index_core/config.py index c9fa8354..01e9895e 100644 --- a/scripts/watch_index_core/config.py +++ b/scripts/watch_index_core/config.py @@ -33,6 +33,12 @@ def build_logger(): # Debounce interval for file system events DELAY_SECS = float(os.environ.get("WATCH_DEBOUNCE_SECS", "1.0")) +# Suppress repeated processing of the exact same observed file state for a short +# window. This is especially useful on shared/polled filesystems like CephFS. +RECENT_FINGERPRINT_TTL_SECS = float( + os.environ.get("WATCH_RECENT_FINGERPRINT_TTL_SECS", "0") +) + def default_collection_name() -> str: """Base fallback for collection name before runtime resolution.""" diff --git a/scripts/watch_index_core/consistency.py b/scripts/watch_index_core/consistency.py new file mode 100644 index 00000000..223f28bf --- /dev/null +++ b/scripts/watch_index_core/consistency.py @@ -0,0 +1,639 @@ +from __future__ import annotations + +import json +import os +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, Optional, Set, Tuple + +from qdrant_client import QdrantClient + +import scripts.ingest_code as idx +from scripts.workspace_state import ( + _get_state_lock, + _extract_repo_name_from_path, + _normalize_cache_key_path, + get_collection_state_snapshot, + get_workspace_state, + list_workspaces, + update_workspace_state, + upsert_index_journal_entries, +) + +from .config import LOGGER +from .utils import get_boolean_env +from .paths import is_internal_metadata_path + +logger = LOGGER +_DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS 
= 7 * 24 * 60 * 60 + + +def _consistency_audit_enabled() -> bool: + return get_boolean_env("WATCH_CONSISTENCY_AUDIT_ENABLED", default=True) + + +def _consistency_audit_interval_secs() -> int: + try: + return max(60, int(os.environ.get("WATCH_CONSISTENCY_AUDIT_INTERVAL_SECS", "86400") or 86400)) + except Exception: + return 86400 + + +def _consistency_audit_max_paths() -> int: + try: + return max(0, int(os.environ.get("WATCH_CONSISTENCY_AUDIT_MAX_PATHS", "200000") or 200000)) + except Exception: + return 200000 + + +def _consistency_repair_enabled() -> bool: + return get_boolean_env("WATCH_CONSISTENCY_REPAIR_ENABLED", default=True) + + +def _consistency_repair_max_ops() -> int: + try: + return max(0, int(os.environ.get("WATCH_CONSISTENCY_REPAIR_MAX_OPS", "5000") or 5000)) + except Exception: + return 5000 + + +def _empty_dir_sweep_enabled() -> bool: + if "WATCH_EMPTY_DIR_SWEEP_ENABLED" in os.environ: + return get_boolean_env("WATCH_EMPTY_DIR_SWEEP_ENABLED", default=True) + return get_boolean_env("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", default=True) + + +def _empty_dir_sweep_interval_secs() -> int: + raw = os.environ.get("WATCH_EMPTY_DIR_SWEEP_INTERVAL_SECONDS") + if raw is None: + raw = os.environ.get( + "CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", + str(_DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS), + ) + try: + return max(0, int(raw or _DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS)) + except Exception: + return _DEFAULT_EMPTY_DIR_SWEEP_INTERVAL_SECONDS + + +def _parse_ts(value: Any) -> Optional[datetime]: + raw = str(value or "").strip() + if not raw: + return None + try: + parsed = datetime.fromisoformat(raw.replace("Z", "+00:00")) + except ValueError: + return None + if parsed.tzinfo is None: + return parsed.replace(tzinfo=timezone.utc) + return parsed.astimezone(timezone.utc) + + +def _should_run_consistency_audit(workspace_path: str, repo_name: Optional[str]) -> bool: + if not _consistency_audit_enabled(): + return False + interval = _consistency_audit_interval_secs() + 
try: + state = get_workspace_state(workspace_path=workspace_path, repo_name=repo_name) or {} + except Exception: + return True + maintenance = dict(state.get("maintenance") or {}) + last = _parse_ts(maintenance.get("last_consistency_audit_at")) + if last is None: + return True + age = (datetime.now(timezone.utc) - last).total_seconds() + return age >= interval + + +def _sweep_empty_workspace_dirs(workspace_root: Path) -> bool: + """Sweep empty workspace directories and return True if fully successful.""" + protected_top_level = {".codebase", ".remote-git"} + try: + workspace_root = workspace_root.resolve() + except Exception: + return False + try: + for root, _dirnames, _filenames in os.walk(workspace_root, topdown=False): + current = Path(root) + if current == workspace_root: + continue + if current.parent == workspace_root and current.name in protected_top_level: + continue + try: + rel = current.relative_to(workspace_root) + except Exception: + continue + if rel.parts and rel.parts[0] in protected_top_level: + continue + try: + if any(current.iterdir()): + continue + current.rmdir() + except Exception: + # If any directory operation fails, the sweep was not fully successful + return False + except Exception: + return False + return True + + +def _should_run_empty_dir_sweep(workspace_path: str, repo_name: Optional[str]) -> bool: + if not _empty_dir_sweep_enabled(): + return False + interval_seconds = _empty_dir_sweep_interval_secs() + if interval_seconds == 0: + return True + try: + state = get_workspace_state(workspace_path=workspace_path, repo_name=repo_name) or {} + except Exception: + return True + maintenance = state.get("maintenance") or {} + last_sweep_at = _parse_ts(maintenance.get("last_empty_dir_sweep_at")) + if last_sweep_at is None: + return True + age_seconds = (datetime.now(timezone.utc) - last_sweep_at).total_seconds() + return age_seconds >= interval_seconds + + +def _record_empty_dir_sweep(workspace_path: str, repo_name: Optional[str]) -> None: + 
try: + lock = _get_state_lock(workspace_path, repo_name) + with lock: + state = get_workspace_state( + workspace_path=workspace_path, + repo_name=repo_name, + ) or {} + maintenance = dict(state.get("maintenance") or {}) + maintenance["last_empty_dir_sweep_at"] = datetime.now( + timezone.utc + ).isoformat() + update_workspace_state( + workspace_path=workspace_path, + repo_name=repo_name, + updates={"maintenance": maintenance}, + ) + except Exception as exc: + logger.warning( + "Failed to record empty dir sweep timestamp: %s (workspace=%s, repo=%s)", + exc, + workspace_path, + repo_name, + ) + + +def _load_cached_hashes( + workspace_path: str, + repo_name: Optional[str], + *, + metadata_root: Optional[Path] = None, +) -> Dict[str, str]: + workspace_norm = _normalize_cache_key_path(workspace_path) + workspace_prefix = f"{workspace_norm.rstrip('/')}/" + candidates: list[Path] = [] + seen: set[str] = set() + + def _append_candidate(path: Path) -> None: + key = str(path) + if key in seen: + return + seen.add(key) + candidates.append(path) + + root = Path(metadata_root or workspace_path) + if repo_name: + _append_candidate(root / ".codebase" / "repos" / repo_name / "cache.json") + else: + _append_candidate(root / ".codebase" / "cache.json") + + for cache_path in candidates: + if not cache_path.exists(): + continue + try: + with cache_path.open("r", encoding="utf-8-sig") as f: + data = json.load(f) + hashes = data.get("file_hashes", {}) + if not isinstance(hashes, dict): + return {} + normalized: Dict[str, str] = {} + for path_key, value in hashes.items(): + norm = _normalize_cache_key_path(str(path_key)) + if not norm: + continue + if workspace_norm and not ( + norm == workspace_norm or norm.startswith(workspace_prefix) + ): + continue + if isinstance(value, dict): + digest = str(value.get("hash") or "").strip() + else: + digest = str(value or "").strip() + normalized[norm] = digest + return normalized + except Exception: + return {} + return {} + + +def 
_is_index_eligible_path(path_str: str, workspace_root: Path, excluder) -> bool: + try: + p = Path(path_str).resolve() + except Exception: + p = Path(path_str) + try: + rel = p.resolve().relative_to(workspace_root.resolve()) + except Exception: + return False + + if not rel.parts: + return False + if not p.exists() or p.is_dir(): + return False + try: + if int(p.stat().st_size) == 0: + # Empty files (e.g. many __init__.py stubs) produce no vectors; do not + # enqueue consistency upserts for them. + return False + except Exception: + return False + if is_internal_metadata_path(p): + return False + + # .remote-git manifests are control files and must not be treated as indexable. + if _is_remote_git_manifest(p.as_posix()): + return False + + try: + rel_dir = "/" + str(rel.parent).replace(os.sep, "/") + if rel_dir == "/.": + rel_dir = "/" + if excluder.exclude_dir(rel_dir): + return False + except Exception: + return False + + if not idx.is_indexable_file(p): + return False + + try: + relf = (rel_dir.rstrip("/") + "/" + p.name).replace("//", "/") + if excluder.exclude_file(relf): + return False + except Exception: + return False + return True + + +def _scan_indexable_fs_paths(workspace_root: Path, *, max_paths: int) -> Tuple[Set[str], bool]: + paths: Set[str] = set() + excluder = idx._Excluder(workspace_root) + try: + workspace_root = workspace_root.resolve() + except Exception: + pass + + for root_str, dirnames, filenames in os.walk(workspace_root): + current = Path(root_str) + pruned_dirnames = [] + for dirname in dirnames: + child = current / dirname + if is_internal_metadata_path(child): + continue + try: + rel_dir = "/" + str(child.relative_to(workspace_root)).replace(os.sep, "/") + if excluder.exclude_dir(rel_dir): + continue + except Exception: + pass + pruned_dirnames.append(dirname) + dirnames[:] = pruned_dirnames + + for filename in filenames: + file_path = current / filename + normalized = _normalize_cache_key_path(str(file_path)) + if not normalized: + 
continue + if not _is_index_eligible_path(normalized, workspace_root, excluder): + continue + paths.add(normalized) + if max_paths > 0 and len(paths) >= max_paths: + return paths, True + return paths, False + + +def _load_indexed_paths_for_collection( + client: QdrantClient, + collection: str, + workspace_path: str, + *, + max_paths: int, +) -> Tuple[Set[str], bool]: + paths: Set[str] = set() + workspace_norm = _normalize_cache_key_path(workspace_path) + workspace_prefix = f"{workspace_norm.rstrip('/')}/" + offset = None + while True: + points, next_offset = client.scroll( + collection_name=collection, + limit=1000, + with_payload=True, + with_vectors=False, + offset=offset, + ) + for pt in points or []: + payload = getattr(pt, "payload", {}) or {} + metadata = payload.get("metadata", {}) or {} + path = _normalize_cache_key_path(str(metadata.get("path") or "")) + if path: + if workspace_norm and not ( + path == workspace_norm or path.startswith(workspace_prefix) + ): + continue + paths.add(path) + if max_paths > 0 and len(paths) >= max_paths: + return paths, True + if next_offset is None: + break + offset = next_offset + return paths, False + + +def _record_consistency_audit( + workspace_path: str, + repo_name: Optional[str], + summary: Dict[str, Any], +) -> None: + try: + lock = _get_state_lock(workspace_path, repo_name) + with lock: + state = get_workspace_state( + workspace_path=workspace_path, + repo_name=repo_name, + ) or {} + maintenance = dict(state.get("maintenance") or {}) + maintenance["last_consistency_audit_at"] = datetime.now( + timezone.utc + ).isoformat() + maintenance["last_consistency_audit_summary"] = summary + update_workspace_state( + workspace_path=workspace_path, + repo_name=repo_name, + updates={"maintenance": maintenance}, + ) + except Exception as exc: + logger.warning( + "Failed to record consistency audit: %s (workspace=%s, repo=%s)", + exc, + workspace_path, + repo_name, + ) + + +def _is_remote_git_manifest(path: str) -> bool: + """Check 
if path is a .remote-git git history manifest file (control file, not indexable content).""" + try: + p = Path(path) + return any(part == ".remote-git" for part in p.parts) and p.suffix.lower() == ".json" + except Exception: + return False + + +def _enqueue_consistency_repairs( + workspace_root: Path, + workspace_path: str, + repo_name: Optional[str], + stale_paths: list[str], + missing_paths: list[str], + cached_hashes: Dict[str, str], +) -> Tuple[int, int]: + if not _consistency_repair_enabled(): + return 0, 0 + max_ops = _consistency_repair_max_ops() + if max_ops <= 0: + return 0, 0 + + entries: list[Dict[str, Any]] = [] + enqueued_stale = 0 + enqueued_missing = 0 + missing_set = set(missing_paths) + excluder = idx._Excluder(workspace_root) + + for path in stale_paths: + if len(entries) >= max_ops: + break + # Skip .remote-git git history manifests - they are control files, not indexable content + if _is_remote_git_manifest(path): + continue + # Cache can lag after state resets/rebuilds; if the path still exists and is + # index-eligible, treat it as missing/upsert instead of stale/delete. 
+ if _is_index_eligible_path(path, workspace_root, excluder): + missing_set.add(path) + continue + entries.append({"path": path, "op_type": "delete"}) + enqueued_stale += 1 + for path in sorted(missing_set): + if len(entries) >= max_ops: + break + # Skip .remote-git git history manifests - they are control files, not indexable content + if _is_remote_git_manifest(path): + continue + entries.append( + { + "path": path, + "op_type": "upsert", + "content_hash": cached_hashes.get(path) or None, + } + ) + enqueued_missing += 1 + + if not entries: + return 0, 0 + + # Fetch existing journal entries to preserve retry state + existing_entries: Dict[str, Dict[str, Any]] = {} + try: + from scripts.workspace_state import list_pending_index_journal_entries + all_pending = list_pending_index_journal_entries( + workspace_path=workspace_path, + repo_name=repo_name, + ) + for entry in all_pending or []: + path = str(entry.get("path") or "") + if path: + existing_entries[path] = entry + except Exception: + pass # If we can't fetch existing entries, proceed without preserving state + + # Merge existing retry state into new entries where appropriate + merged_entries = [] + for entry in entries: + path = str(entry.get("path") or "") + existing = existing_entries.get(path) + + # Skip if already pending/in-progress to avoid duplicate work + if existing and existing.get("status") in {"pending", "in_progress"}: + continue + + # Preserve retry state from existing failed entries + if existing and existing.get("status") == "failed": + entry["status"] = "failed" + entry["attempts"] = existing.get("attempts", 0) + entry["last_error"] = existing.get("last_error") + # Keep created_at from existing entry to preserve original enqueue time + if existing.get("created_at"): + entry["created_at"] = existing["created_at"] + + merged_entries.append(entry) + + if not merged_entries: + return 0, 0 + + try: + upsert_index_journal_entries( + merged_entries, + workspace_path=workspace_path, + 
repo_name=repo_name, + ) + except Exception as exc: + logger.debug( + "[consistency_audit] failed to enqueue repairs workspace=%s repo=%s: %s", + workspace_path, + repo_name, + exc, + ) + return 0, 0 + + # Return counts based on actually enqueued entries + enqueued_stale = sum(1 for e in merged_entries if e.get("op_type") == "delete") + enqueued_missing = sum(1 for e in merged_entries if e.get("op_type") == "upsert") + return enqueued_stale, enqueued_missing + + +def run_consistency_audit(client: QdrantClient, root: Path) -> None: + if not _consistency_audit_enabled(): + return + max_paths = _consistency_audit_max_paths() + try: + candidates = list_workspaces(search_root=str(root), use_qdrant_fallback=False) + except Exception: + candidates = [] + for ws in candidates: + workspace_path = str(ws.get("workspace_path") or "").strip() + if not workspace_path: + continue + repo_name = _extract_repo_name_from_path(workspace_path) + if not _should_run_consistency_audit(workspace_path, repo_name): + continue + try: + snapshot = get_collection_state_snapshot( + workspace_path=workspace_path, + repo_name=repo_name, + ) + collection = str(snapshot.get("active_collection") or "").strip() + if not collection: + continue + cached_hashes = _load_cached_hashes( + workspace_path, + repo_name, + metadata_root=root, + ) + workspace_root = Path(workspace_path) + fs_paths, fs_truncated = _scan_indexable_fs_paths( + workspace_root, + max_paths=max_paths, + ) + excluder = idx._Excluder(workspace_root) + cached_paths = { + path + for path in cached_hashes.keys() + if _is_index_eligible_path(path, workspace_root, excluder) + } + indexed_paths, indexed_truncated = _load_indexed_paths_for_collection( + client, + collection, + workspace_path, + max_paths=max_paths, + ) + if fs_truncated or indexed_truncated: + stale = [] + missing = [] + enq_stale = 0 + enq_missing = 0 + else: + stale = sorted(indexed_paths - fs_paths) + missing = sorted(fs_paths - indexed_paths) + enq_stale, enq_missing = 
_enqueue_consistency_repairs( + workspace_root, + workspace_path, + repo_name, + stale, + missing, + cached_hashes, + ) + summary = { + "fs_count": len(fs_paths), + "cache_count": len(cached_paths), + "qdrant_count": len(indexed_paths), + "fs_scan_truncated": fs_truncated, + "qdrant_scan_truncated": indexed_truncated, + "repair_skipped_due_to_truncation": bool(fs_truncated or indexed_truncated), + "stale_in_qdrant_count": len(stale), + "missing_in_qdrant_count": len(missing), + "repair_enqueued_stale_count": int(enq_stale), + "repair_enqueued_missing_count": int(enq_missing), + "sample_stale": stale[:20], + "sample_missing": missing[:20], + } + _record_consistency_audit(workspace_path, repo_name, summary) + logger.info( + "[consistency_audit] repo=%s collection=%s fs=%d cache=%d qdrant=%d stale=%d missing=%d repair_stale=%d repair_missing=%d", + repo_name or "", + collection, + len(fs_paths), + len(cached_paths), + len(indexed_paths), + len(stale), + len(missing), + int(enq_stale), + int(enq_missing), + ) + except Exception as exc: + logger.debug( + "[consistency_audit] failed workspace=%s repo=%s: %s", + workspace_path, + repo_name, + exc, + ) + + +def run_empty_dir_sweep_maintenance(root: Path) -> None: + if not _empty_dir_sweep_enabled(): + return + try: + candidates = list_workspaces(search_root=str(root), use_qdrant_fallback=False) + except Exception: + candidates = [] + for ws in candidates: + workspace_path = str(ws.get("workspace_path") or "").strip() + if not workspace_path: + continue + repo_name = _extract_repo_name_from_path(workspace_path) + if not _should_run_empty_dir_sweep(workspace_path, repo_name): + continue + try: + logger.info("[empty_dir_sweep] Sweeping empty directories under %s", workspace_path) + sweep_success = _sweep_empty_workspace_dirs(Path(workspace_path)) + if sweep_success: + _record_empty_dir_sweep(workspace_path, repo_name) + else: + logger.debug( + "[empty_dir_sweep] sweep had failures workspace=%s repo=%s - not recording 
success", + workspace_path, + repo_name, + ) + except Exception as exc: + logger.debug( + "[empty_dir_sweep] failed workspace=%s repo=%s: %s", + workspace_path, + repo_name, + exc, + ) diff --git a/scripts/watch_index_core/handler.py b/scripts/watch_index_core/handler.py index bf5cb6d9..36c4d457 100644 --- a/scripts/watch_index_core/handler.py +++ b/scripts/watch_index_core/handler.py @@ -12,7 +12,6 @@ import scripts.ingest_code as idx from scripts.workspace_state import ( _extract_repo_name_from_path, - _get_global_state_dir, get_cached_file_hash, log_watcher_activity as _log_activity, remove_cached_file, @@ -27,6 +26,7 @@ safe_print, ) from .rename import _rename_in_store +from .paths import is_internal_metadata_path class IndexHandler(FileSystemEventHandler): @@ -81,6 +81,9 @@ def _maybe_reload_excluder(self) -> None: except Exception: pass + def _is_internal_metadata_path(self, p: Path) -> bool: + return is_internal_metadata_path(p) + def _maybe_enqueue(self, src_path: str) -> None: self._maybe_reload_excluder() p = Path(src_path) @@ -95,15 +98,7 @@ def _maybe_enqueue(self, src_path: str) -> None: except ValueError: return - try: - if callable(_get_global_state_dir): - global_state_dir = _get_global_state_dir() - if global_state_dir is not None and p.is_relative_to(global_state_dir): - return - except (OSError, ValueError): - pass - - if any(part == ".codebase" for part in p.parts): + if self._is_internal_metadata_path(p): return # Git history manifests are handled by a separate ingestion pipeline and should still @@ -140,7 +135,7 @@ def on_deleted(self, event): p = Path(event.src_path).resolve() except Exception: return - if any(part == ".codebase" for part in p.parts): + if self._is_internal_metadata_path(p): return if not idx.is_indexable_file(p): return @@ -162,6 +157,42 @@ def on_moved(self, event): dest = Path(event.dest_path).resolve() except Exception: return + # Handle internal-boundary moves properly + src_internal = 
self._is_internal_metadata_path(src) + dest_internal = self._is_internal_metadata_path(dest) + if src_internal and dest_internal: + # Both internal -> ignore + return + if dest_internal: + # External -> internal: delete source, don't index destination + if idx.is_indexable_file(src): + try: + coll = self._resolve_collection(src) + deleted = False + if self.client is not None and coll is not None: + idx.delete_points_by_path(self.client, coll, str(src)) + # Clean up graph edges for the moved file + try: + idx.delete_graph_edges_by_path( + self.client, + coll, + caller_path=str(src), + ) + except Exception: + pass # Graph cleanup is best-effort + deleted = True + if deleted: + safe_print(f"[moved:external_to_internal] deleted {src}") + except Exception as exc: + safe_print(f"[moved:external_to_internal:error] {src}: {exc}") + finally: + self._invalidate_cache(src) + return + if src_internal: + # Internal -> external: index destination as new file + if idx.is_indexable_file(dest): + self._maybe_enqueue(str(dest)) + return if not idx.is_indexable_file(dest) and not idx.is_indexable_file(src): return try: @@ -174,18 +205,25 @@ def on_moved(self, event): if idx.is_indexable_file(src): try: coll = self._resolve_collection(src) + deleted = False if self.client is not None and coll is not None: idx.delete_points_by_path(self.client, coll, str(src)) - safe_print(f"[moved:ignored_dest_deleted_src] {src} -> {dest}") - src_repo_path = _detect_repo_for_file(src) - src_repo_name = _repo_name_or_none(src_repo_path) - try: - if src_repo_name: - remove_cached_file(str(src), src_repo_name) - except Exception: - pass - except Exception: - pass + # Clean up graph edges for the moved file + try: + idx.delete_graph_edges_by_path( + self.client, + coll, + caller_path=str(src), + ) + except Exception: + pass # Graph cleanup is best-effort + deleted = True + if deleted: + safe_print(f"[moved:ignored_dest_deleted_src] {src} -> {dest}") + except Exception as exc: + 
safe_print(f"[moved:ignored_dest_deleted_src:error] {src}: {exc}") + finally: + self._invalidate_cache(src) return except Exception: pass @@ -270,6 +308,14 @@ def _delete_points(self, path: Path, collection: str | None) -> None: return try: idx.delete_points_by_path(self.client, collection, str(path)) + try: + idx.delete_graph_edges_by_path( + self.client, + collection, + caller_path=str(path), + ) + except Exception: + pass safe_print(f"[deleted] {path} -> {collection}") except Exception: pass diff --git a/scripts/watch_index_core/paths.py b/scripts/watch_index_core/paths.py new file mode 100644 index 00000000..2e76cfb9 --- /dev/null +++ b/scripts/watch_index_core/paths.py @@ -0,0 +1,36 @@ +"""Path classification helpers shared by watcher components.""" + +from __future__ import annotations + +from pathlib import Path + +from scripts.workspace_state import ( + _get_global_state_dir, + INTERNAL_STATE_TOP_LEVEL_DIRS, +) + + +def is_internal_metadata_path(path: Path) -> bool: + """Return True when path points into watcher/internal metadata trees.""" + try: + # Deliberately match internal segments anywhere in the path to prevent + # indexing of nested metadata mirrors (for example in replicated roots). 
+ if any(part in INTERNAL_STATE_TOP_LEVEL_DIRS for part in path.parts): + return True + global_state_dir = _get_global_state_dir() + if global_state_dir is not None and path.is_relative_to(global_state_dir): + return True + except (OSError, ValueError): + return False + return False + + +def is_internal_top_level_path(path: Path, root: Path) -> bool: + """Return True when path's top-level segment under root is internal metadata.""" + try: + rel = path.resolve().relative_to(root.resolve()) + except Exception: + return False + if not rel.parts: + return False + return rel.parts[0] in INTERNAL_STATE_TOP_LEVEL_DIRS diff --git a/scripts/watch_index_core/processor.py b/scripts/watch_index_core/processor.py index 45e9db7e..e8a469d1 100644 --- a/scripts/watch_index_core/processor.py +++ b/scripts/watch_index_core/processor.py @@ -3,28 +3,42 @@ from __future__ import annotations import hashlib +import json import os import subprocess import sys +import atexit +import threading +import time +from collections import deque +from concurrent.futures import Future, ThreadPoolExecutor from datetime import datetime from pathlib import Path from typing import Dict, List, Optional +from qdrant_client import models + import scripts.ingest_code as idx from scripts.workspace_state import ( + _normalize_cache_key_path, _extract_repo_name_from_path, get_cached_file_hash, + list_pending_index_journal_entries, get_workspace_state, is_staging_enabled, log_watcher_activity as _log_activity, persist_indexing_config, remove_cached_file, + set_cached_file_hash, set_indexing_progress as _update_progress, set_indexing_started as _set_status_indexing, + update_index_journal_entry_status, update_indexing_status, ) +from . 
import config as watch_config +from .rename import _rename_in_store +from .paths import is_internal_metadata_path -from .config import QDRANT_URL, ROOT, ROOT_DIR, LOGGER as logger from .utils import ( _detect_repo_for_file, _get_collection_for_file, @@ -33,43 +47,293 @@ safe_log_error, ) +logger = watch_config.LOGGER + class _SkipUnchanged(Exception): """Sentinel exception to skip unchanged files in the watch loop.""" + def __init__(self, *, text: Optional[str] = None, file_hash: str = "") -> None: + super().__init__("unchanged") + self.text = text + self.file_hash = file_hash -def _process_git_history_manifest( + +def _is_internal_ignored_path(path: Path) -> bool: + return is_internal_metadata_path(path) + + +def _staging_requires_subprocess(state: Optional[Dict[str, object]]) -> bool: + """Return True only when dual-root staging is actually active for this repo.""" + if not (is_staging_enabled() and isinstance(state, dict)): + return False + + staging = state.get("staging") + if isinstance(staging, dict) and staging: + return True + + active_slug = str(state.get("active_repo_slug") or "").strip() + serving_slug = str(state.get("serving_repo_slug") or "").strip() + if serving_slug.endswith("_old"): + return True + if active_slug and serving_slug and active_slug != serving_slug: + return True + return False + + +def _env_int(name: str, default: int) -> int: + try: + raw = str(os.environ.get(name, str(default))).strip() + val = int(raw) + return val if val > 0 else default + except Exception: + return default + + +_GIT_HISTORY_MAX_WORKERS = _env_int("WATCH_GIT_HISTORY_MAX_WORKERS", 1) +_GIT_HISTORY_TIMEOUT_SECONDS = _env_int("WATCH_GIT_HISTORY_TIMEOUT_SECONDS", 0) +_GIT_HISTORY_EXECUTOR = ThreadPoolExecutor( + max_workers=_GIT_HISTORY_MAX_WORKERS, + thread_name_prefix="git-history", +) + + +def _shutdown_git_history_executor() -> None: + try: + _GIT_HISTORY_EXECUTOR.shutdown(wait=False) + except Exception: + pass + + +atexit.register(_shutdown_git_history_executor) 
+_GIT_HISTORY_INFLIGHT: set[str] = set() +_GIT_HISTORY_INFLIGHT_LOCK = threading.Lock() + + +def _manifest_key(p: Path) -> str: + try: + return str(p.resolve()) + except Exception: + return str(p) + + +def _manifest_stats(p: Path) -> tuple[str, int]: + run_id = "unknown" + commit_count = -1 + try: + with p.open("r", encoding="utf-8") as fh: + data = json.load(fh) + if isinstance(data, dict): + commits = data.get("commits") or [] + if isinstance(commits, list): + commit_count = len(commits) + name = p.name + run_id = name[:-5] if name.endswith(".json") else name + except Exception: + pass + return run_id, commit_count + + +def _run_git_history_ingest( p: Path, collection: str, repo_name: Optional[str], env_snapshot: Optional[Dict[str, str]] = None, ) -> None: - try: - script = ROOT_DIR / "scripts" / "ingest_history.py" - if not script.exists(): - return - cmd = [sys.executable or "python3", str(script), "--manifest-json", str(p)] - env = _build_subprocess_env(collection, repo_name, env_snapshot) + script = watch_config.ROOT_DIR / "scripts" / "ingest_history.py" + if not script.exists(): + raise RuntimeError(f"[git_history_manifest] ingest script missing: {script}") + + cmd = [sys.executable or "python3", str(script), "--manifest-json", str(p)] + env = _build_subprocess_env(collection, repo_name, env_snapshot) + started = time.monotonic() + timeout = _GIT_HISTORY_TIMEOUT_SECONDS if _GIT_HISTORY_TIMEOUT_SECONDS > 0 else None + stdout_tail: deque[str] = deque(maxlen=20) + stderr_tail: deque[str] = deque(maxlen=20) + tail_lock = threading.Lock() + + def _tail_snapshot(tail: deque[str], limit: int = 5) -> str: + with tail_lock: + return " | ".join(list(tail)[-limit:]) + + def _stream_pipe(pipe, label: str, tail: deque[str], lock: threading.Lock) -> None: try: - print( - f"[git_history_manifest] launching ingest_history.py for {p} " - f"collection={collection} repo={repo_name}" - ) + for raw in iter(pipe.readline, ""): + line = (raw or "").rstrip() + if not line: + 
continue + with lock: + tail.append(line) + logger.info("[git_history_manifest][%s] %s", label, line) except Exception: pass - # Use subprocess.run for better error observability. - # NOTE: This blocks until ingest_history.py completes. If history ingestion - # is slow, this may need revisiting (e.g., revert to Popen fire-and-forget - # or run in a separate thread) to avoid blocking the watcher. - result = subprocess.run(cmd, env=env, capture_output=True, text=True, check=False) - if result.returncode != 0: - logger.warning( - "[git_history_manifest] ingest_history.py failed for %s: exit=%d stderr=%s", - p, result.returncode, (result.stderr or "")[:500], + finally: + try: + pipe.close() + except Exception: + pass + + proc: Optional[subprocess.Popen] = None + try: + proc = subprocess.Popen( + cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1, + ) + t_out = threading.Thread( + target=_stream_pipe, + args=(proc.stdout, "stdout", stdout_tail, tail_lock), + daemon=True, + ) + t_err = threading.Thread( + target=_stream_pipe, + args=(proc.stderr, "stderr", stderr_tail, tail_lock), + daemon=True, + ) + t_out.start() + t_err.start() + + deadline = (started + timeout) if timeout else None + timed_out = False + while True: + code = proc.poll() + if code is not None: + break + if deadline and time.monotonic() >= deadline: + timed_out = True + try: + proc.kill() + except Exception: + pass + break + time.sleep(0.2) + + # Ensure threads flush trailing output after process exit/kill. 
+ t_out.join(timeout=1.0) + t_err.join(timeout=1.0) + + if timed_out: + elapsed_ms = int((time.monotonic() - started) * 1000) + error_msg = ( + f"[git_history_manifest] ingest_history.py timeout for {p} after {elapsed_ms}ms " + f"(timeout={_GIT_HISTORY_TIMEOUT_SECONDS}s)" ) + if stderr_tail: + error_msg += f" stderr={_tail_snapshot(stderr_tail)}" + logger.warning(error_msg) + raise RuntimeError(error_msg) + + returncode = proc.wait(timeout=1.0) except Exception as e: - logger.warning("[git_history_manifest] error processing %s: %s", p, e) - return + logger.warning("[git_history_manifest] subprocess error for %s: %s", p, e) + try: + if proc and proc.poll() is None: + proc.kill() + except Exception: + pass + raise RuntimeError(f"[git_history_manifest] subprocess error for {p}: {e}") from e + + elapsed_ms = int((time.monotonic() - started) * 1000) + if returncode != 0: + error_msg = ( + f"[git_history_manifest] ingest_history.py failed for {p}: exit={returncode} " + f"elapsed_ms={elapsed_ms} stderr={_tail_snapshot(stderr_tail)}" + ) + logger.warning(error_msg) + raise RuntimeError(error_msg) + + logger.info( + "[git_history_manifest] completed for %s: exit=0 elapsed_ms=%d", + p, + elapsed_ms, + ) + if stdout_tail: + logger.info( + "[git_history_manifest] stdout tail for %s: %s", + p, + _tail_snapshot(stdout_tail), + ) + if stderr_tail: + logger.warning( + "[git_history_manifest] stderr tail for %s: %s", + p, + _tail_snapshot(stderr_tail), + ) + + +def _on_git_history_done(manifest_path: Path, collection: str, repo_name: Optional[str], future: Future) -> None: + manifest_key = _manifest_key(manifest_path) + with _GIT_HISTORY_INFLIGHT_LOCK: + _GIT_HISTORY_INFLIGHT.discard(manifest_key) + remaining = len(_GIT_HISTORY_INFLIGHT) + logger.info("[git_history_manifest] in-flight remaining=%d", remaining) + try: + future.result() + # Mark journal as done after successful completion + repo_path = _detect_repo_for_file(manifest_path) + if repo_path: + repo_key = str(repo_path) + 
_mark_journal_done(manifest_path, repo_key, repo_name) + logger.info("[git_history_manifest] marked journal as done: %s", manifest_path) + except Exception as e: + repo_path = _detect_repo_for_file(manifest_path) + repo_key = str(repo_path) if repo_path else "" + if repo_key: + _mark_journal_failed( + manifest_path, + repo_key, + repo_name, + f"git history worker failed for collection '{collection}': {e}", + ) + logger.warning( + "[git_history_manifest] worker crashed for %s (collection=%s, repo_key=%s): %s", + manifest_key, + collection, + repo_key or "", + e, + exc_info=True, + ) + + +def _process_git_history_manifest( + p: Path, + collection: str, + repo_name: Optional[str], + env_snapshot: Optional[Dict[str, str]] = None, +) -> None: + key = _manifest_key(p) + run_id, commit_count = _manifest_stats(p) + queued = 0 + with _GIT_HISTORY_INFLIGHT_LOCK: + if key in _GIT_HISTORY_INFLIGHT: + logger.info( + "[git_history_manifest] skip duplicate in-flight manifest: %s run_id=%s", + p, + run_id, + ) + return + _GIT_HISTORY_INFLIGHT.add(key) + queued = len(_GIT_HISTORY_INFLIGHT) + logger.info( + "[git_history_manifest] queued ingest_history.py for %s run_id=%s commits=%d collection=%s repo=%s in_flight=%d", + p, + run_id, + commit_count, + collection, + repo_name, + queued, + ) + future = _GIT_HISTORY_EXECUTOR.submit( + _run_git_history_ingest, + p, + collection, + repo_name, + env_snapshot, + ) + future.add_done_callback(lambda fut, manifest_path=p, coll=collection, rn=repo_name: _on_git_history_done(manifest_path, coll, rn, fut)) def _advance_progress( @@ -92,6 +356,158 @@ def _advance_progress( pass +def _mark_journal_done(path: Path, repo_key: str, repo_name: Optional[str]) -> None: + try: + update_index_journal_entry_status( + str(path), + status="done", + workspace_path=repo_key, + repo_name=repo_name, + ) + except Exception: + pass + + +def _mark_journal_failed( + path: Path, + repo_key: str, + repo_name: Optional[str], + error: str, +) -> None: + try: + 
update_index_journal_entry_status( + str(path), + status="failed", + error=error, + workspace_path=repo_key, + repo_name=repo_name, + remove_on_done=False, + ) + except Exception: + pass + + +def _path_has_indexed_points(client, collection: str, path: Path) -> Optional[bool]: + try: + filt = models.Filter( + must=[ + models.FieldCondition( + key="metadata.path", match=models.MatchValue(value=str(path)) + ) + ] + ) + points, _ = client.scroll( + collection_name=collection, + scroll_filter=filt, + with_payload=False, + with_vectors=False, + limit=1, + ) + return bool(points) + except Exception: + return None + + +def _verify_delete_committed(client, collection: str, path: Path) -> bool: + has_points = _path_has_indexed_points(client, collection, path) + return has_points is False + + +def _verify_upsert_committed( + client, + collection: str, + path: Path, + repo_name: Optional[str], + expected_file_hash: Optional[str], + source_text: Optional[str] = None, +) -> bool: + indexed_hash = str( + idx.get_indexed_file_hash(client, collection, str(path)) or "" + ).strip() + expected_hash = str(expected_file_hash or "").strip() + if expected_hash: + if bool(indexed_hash) and indexed_hash == expected_hash: + return True + # Empty/whitespace-only files can legitimately have no indexed points/hash. 
+ try: + if source_text is not None and not source_text.strip(): + has_points = _path_has_indexed_points(client, collection, path) + return has_points is False + except Exception: + pass + return False + has_points = _path_has_indexed_points(client, collection, path) + return has_points is True + + +def _verify_and_update_journal_for_upsert( + p: Path, + client, + collection: str, + repo_key: str, + repo_name: Optional[str], + journal_content_hash: str, + *, + text: Optional[str] = None, + file_hash: Optional[str] = None, +) -> None: + source_text = text + expected_hash = str(file_hash or "").strip() + if source_text is None or not expected_hash: + read_text, read_hash = _read_text_and_sha1(p) + if source_text is None: + source_text = read_text + if not expected_hash: + expected_hash = read_hash + expected_hash = expected_hash or journal_content_hash + if _verify_upsert_committed( + client, + collection, + p, + repo_name, + expected_hash or None, + source_text=source_text, + ): + _mark_journal_done(p, repo_key, repo_name) + else: + _mark_journal_failed( + p, + repo_key, + repo_name, + "upsert_verification_failed", + ) + + +def _finalize_journal_after_index_attempt( + path: Path, + client, + collection: str | None, + repo_key: str, + repo_name: Optional[str], + *, + force_upsert: bool, + journal_content_hash: str, + text: Optional[str] = None, + file_hash: Optional[str] = None, + default_error: Optional[str] = None, +) -> None: + if force_upsert and client is not None and collection is not None: + _verify_and_update_journal_for_upsert( + path, + client, + collection, + repo_key, + repo_name, + journal_content_hash, + text=text, + file_hash=file_hash, + ) + elif default_error: + _mark_journal_failed(path, repo_key, repo_name, default_error) + else: + _mark_journal_done(path, repo_key, repo_name) + + def _build_subprocess_env( collection: str | None, repo_name: str | None, @@ -105,8 +521,8 @@ def _build_subprocess_env( pass if collection: env["COLLECTION_NAME"] = 
collection - if QDRANT_URL: - env["QDRANT_URL"] = QDRANT_URL + if watch_config.QDRANT_URL: + env["QDRANT_URL"] = watch_config.QDRANT_URL if repo_name: env["REPO_NAME"] = repo_name return env @@ -114,6 +530,7 @@ def _build_subprocess_env( def _maybe_handle_staging_file( path: Path, + client, collection: str | None, repo_name: str | None, repo_key: str, @@ -121,27 +538,45 @@ def _maybe_handle_staging_file( state_env: Optional[Dict[str, str]], repo_progress: Dict[str, int], started_at: str, + *, + force_upsert: bool = False, + journal_content_hash: str = "", ) -> bool: - if not (is_staging_enabled() and state_env and collection): + if not (state_env and collection): return False - _text, file_hash = _read_text_and_sha1(path) + source_text, file_hash = _read_text_and_sha1(path) if file_hash: try: cached_hash = get_cached_file_hash(str(path), repo_name) if repo_name else None except Exception: cached_hash = None if cached_hash and cached_hash == file_hash: + if force_upsert and client is not None: + if _verify_upsert_committed( + client, + collection, + path, + repo_name, + file_hash or journal_content_hash or None, + source_text=source_text, + ): + safe_print(f"[skip_unchanged] {path} (hash match)") + _log_activity(repo_key, "skipped", path, {"reason": "hash_unchanged"}) + _mark_journal_done(path, repo_key, repo_name) + _advance_progress(repo_progress, repo_key, repo_files, started_at, path) + return True # Fast path: skip if content hash matches cached hash (file unchanged) # Safety: startup health check clears stale cache per-repo - safe_print(f"[skip_unchanged] {path} (hash match)") - _log_activity(repo_key, "skipped", path, {"reason": "hash_unchanged"}) - _advance_progress(repo_progress, repo_key, repo_files, started_at, path) - return True + if not force_upsert: + safe_print(f"[skip_unchanged] {path} (hash match)") + _log_activity(repo_key, "skipped", path, {"reason": "hash_unchanged"}) + _advance_progress(repo_progress, repo_key, repo_files, started_at, path) + 
return True cmd = [ sys.executable or "python3", - str(ROOT_DIR / "scripts" / "ingest_code.py"), + str(watch_config.ROOT_DIR / "scripts" / "ingest_code.py"), "--root", str(path), "--no-skip-unchanged", @@ -178,6 +613,19 @@ def _maybe_handle_staging_file( ) else: safe_print(f"[indexed_subprocess] {path} -> {collection}") + _finalize_journal_after_index_attempt( + path, + client, + collection, + repo_key, + repo_name, + force_upsert=force_upsert, + journal_content_hash=journal_content_hash, + text=source_text, + file_hash=file_hash, + ) + if result.returncode != 0 and force_upsert: + _mark_journal_failed(path, repo_key, repo_name, "subprocess_index_failed") _advance_progress(repo_progress, repo_key, repo_files, started_at, path) return True @@ -218,18 +666,92 @@ def _process_paths( pass repo_progress: Dict[str, int] = {key: 0 for key in repo_groups.keys()} + repo_pending_journal_ops: Dict[str, Dict[str, Dict[str, str]]] = {} + repo_move_source_for_dest: Dict[str, Dict[str, str]] = {} + move_dest_keys: set[str] = set() + move_source_keys: set[str] = set() + for repo_path in repo_groups.keys(): + try: + repo_name = _extract_repo_name_from_path(repo_path) + entries = list_pending_index_journal_entries(repo_path, repo_name) + repo_pending_journal_ops[repo_path] = {} + upserts_by_hash: Dict[str, List[str]] = {} + deletes_by_hash: Dict[str, List[str]] = {} + for rec in entries: + path_key = _normalize_cache_key_path(str(rec.get("path") or "")) + op_type = str(rec.get("op_type") or "").strip().lower() + content_hash = str(rec.get("content_hash") or "").strip().lower() + if not path_key: + continue + repo_pending_journal_ops[repo_path][path_key] = { + "op_type": op_type, + "content_hash": content_hash, + } + if not content_hash: + continue + if op_type == "upsert": + upserts_by_hash.setdefault(content_hash, []).append(path_key) + elif op_type == "delete": + deletes_by_hash.setdefault(content_hash, []).append(path_key) + pairs: Dict[str, str] = {} + for content_hash, 
dest_paths in upserts_by_hash.items(): + src_paths = deletes_by_hash.get(content_hash) or [] + if not src_paths: + continue + src_idx = 0 + for dest_key in dest_paths: + while src_idx < len(src_paths) and src_paths[src_idx] == dest_key: + src_idx += 1 + if src_idx >= len(src_paths): + break + src_key = src_paths[src_idx] + src_idx += 1 + pairs[dest_key] = src_key + move_dest_keys.add(dest_key) + move_source_keys.add(src_key) + repo_move_source_for_dest[repo_path] = pairs + except Exception: + repo_pending_journal_ops[repo_path] = {} + repo_move_source_for_dest[repo_path] = {} + + unique_paths = sorted( + unique_paths, + key=lambda p: ( + 0 + if _normalize_cache_key_path(str(p)) in move_dest_keys + else (2 if _normalize_cache_key_path(str(p)) in move_source_keys else 1), + str(p), + ), + ) + completed_move_sources: set[str] = set() for p in unique_paths: repo_path = _detect_repo_for_file(p) or Path(workspace_path) repo_key = str(repo_path) repo_files = repo_groups.get(repo_key, []) repo_name = _extract_repo_name_from_path(repo_key) + path_key = _normalize_cache_key_path(str(p)) + if path_key in completed_move_sources: + _advance_progress(repo_progress, repo_key, repo_files, started_at, p) + continue + journal_rec = repo_pending_journal_ops.get(repo_key, {}).get(path_key, {}) + journal_op = str(journal_rec.get("op_type") or "").strip().lower() + force_delete = journal_op == "delete" + force_upsert = journal_op == "upsert" + journal_content_hash = str(journal_rec.get("content_hash") or "").strip().lower() + if _is_internal_ignored_path(p): + _log_activity(repo_key, "skipped", p, {"reason": "internal_ignored_path"}) + # Internal metadata paths should never drive indexing or collection creation. + # If they entered the journal via drift repair, mark done and drop. 
+ _mark_journal_done(p, repo_key, repo_name) + _advance_progress(repo_progress, repo_key, repo_files, started_at, p) + continue collection = _get_collection_for_file(p) state_env: Optional[Dict[str, str]] = None try: st = get_workspace_state(repo_key, repo_name) if get_workspace_state else None if isinstance(st, dict): - if is_staging_enabled(): + if _staging_requires_subprocess(st): state_env = st.get("indexing_env") except Exception: state_env = None @@ -240,31 +762,104 @@ def _process_paths( p, collection, repo_name, - env_snapshot=(state_env if is_staging_enabled() else None), + env_snapshot=state_env, ) except Exception as exc: safe_print(f"[commit_ingest_error] {p}: {exc}") _advance_progress(repo_progress, repo_key, repo_files, started_at, p) continue - if not p.exists(): + if force_upsert and not p.exists(): + _log_activity(repo_key, "skipped", p, {"reason": "upsert_missing_file"}) + _mark_journal_failed( + p, + repo_key, + repo_name, + "upsert_missing_file", + ) + _advance_progress(repo_progress, repo_key, repo_files, started_at, p) + continue + + if force_upsert and client is not None and collection is not None: + move_src_key = repo_move_source_for_dest.get(repo_key, {}).get(path_key) + if move_src_key: + move_src_path = Path(move_src_key) + src_collection = _get_collection_for_file(move_src_path) + try: + moved_count, renamed_hash = _rename_in_store( + client, + src_collection, + move_src_path, + p, + collection, + ) + except Exception: + moved_count, renamed_hash = -1, None + if moved_count and moved_count > 0: + try: + if repo_name: + remove_cached_file(str(move_src_path), repo_name) + except Exception: + pass + final_hash = renamed_hash or journal_content_hash + try: + if repo_name and final_hash: + set_cached_file_hash(str(p), final_hash, repo_name) + except Exception: + pass + _log_activity( + repo_key, + "moved", + p, + {"from": str(move_src_path), "chunks": int(moved_count)}, + ) + _mark_journal_done(p, repo_key, repo_name) + 
_mark_journal_done(move_src_path, repo_key, repo_name) + completed_move_sources.add(move_src_key) + _advance_progress(repo_progress, repo_key, repo_files, started_at, p) + continue + + if force_delete or not p.exists(): + deleted_ok = False if client is not None: try: idx.delete_points_by_path(client, collection, str(p)) + try: + idx.delete_graph_edges_by_path( + client, + collection, + caller_path=str(p), + repo=repo_name, + ) + except Exception as graph_exc: + safe_print(f"[deleted:graph_failed] {p} -> {collection}: {graph_exc}") safe_print(f"[deleted] {p} -> {collection}") + deleted_ok = True except Exception: - pass + deleted_ok = False + if deleted_ok and client is not None and collection is not None: + deleted_ok = _verify_delete_committed(client, collection, p) try: if repo_name: remove_cached_file(str(p), repo_name) except Exception: pass _log_activity(repo_key, "deleted", p) + if deleted_ok: + _mark_journal_done(p, repo_key, repo_name) + else: + _mark_journal_failed( + p, + repo_key, + repo_name, + "delete_points_failed", + ) _advance_progress(repo_progress, repo_key, repo_files, started_at, p) continue if _maybe_handle_staging_file( p, + client, collection, repo_name, repo_key, @@ -272,17 +867,38 @@ def _process_paths( state_env, repo_progress, started_at, + force_upsert=force_upsert, + journal_content_hash=journal_content_hash, ): continue if client is not None and model is not None: try: + verify_context: Dict[str, Optional[str]] = {} ok = _run_indexing_strategy( - p, client, model, collection, vector_name, model_dim, repo_name + p, + client, + model, + collection, + vector_name, + model_dim, + repo_name, + verify_context=verify_context if force_upsert else None, ) - except _SkipUnchanged: + except _SkipUnchanged as exc: status = "skipped" safe_print(f"[{status}] {p} -> {collection}") _log_activity(repo_key, "skipped", p, {"reason": "hash_unchanged"}) + _finalize_journal_after_index_attempt( + p, + client, + collection, + repo_key, + repo_name, + 
force_upsert=force_upsert, + journal_content_hash=journal_content_hash, + text=exc.text, + file_hash=exc.file_hash, + ) _advance_progress(repo_progress, repo_key, repo_files, started_at, p) continue except Exception: @@ -295,6 +911,7 @@ def _process_paths( "file": str(p), }, ) + _mark_journal_failed(p, repo_key, repo_name, "indexing_error") _advance_progress(repo_progress, repo_key, repo_files, started_at, p) continue @@ -306,10 +923,33 @@ def _process_paths( except Exception: size = None _log_activity(repo_key, "indexed", p, {"file_size": size}) + _finalize_journal_after_index_attempt( + p, + client, + collection, + repo_key, + repo_name, + force_upsert=force_upsert, + journal_content_hash=journal_content_hash, + text=verify_context.get("text"), + file_hash=verify_context.get("file_hash"), + ) else: _log_activity( repo_key, "skipped", p, {"reason": "no-change-or-error"} ) + _finalize_journal_after_index_attempt( + p, + client, + collection, + repo_key, + repo_name, + force_upsert=force_upsert, + journal_content_hash=journal_content_hash, + text=verify_context.get("text"), + file_hash=verify_context.get("file_hash"), + default_error="no_change_or_error", + ) _advance_progress(repo_progress, repo_key, repo_files, started_at, p) else: safe_print(f"Not processing locally: {p}") @@ -333,7 +973,7 @@ def _read_text_and_sha1(path: Path) -> tuple[Optional[str], str]: text = path.read_text(encoding="utf-8", errors="ignore") except Exception: text = None - if not text: + if text is None: return text, "" try: file_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() @@ -350,21 +990,26 @@ def _run_indexing_strategy( vector_name: str, model_dim: int, repo_name: str | None, + *, + verify_context: Optional[Dict[str, Optional[str]]] = None, ) -> bool: if collection is None: return False - try: - idx.ensure_collection_and_indexes_once(client, collection, model_dim, vector_name) - except Exception: - pass text, file_hash = _read_text_and_sha1(path) + if 
verify_context is not None: + verify_context["text"] = text + verify_context["file_hash"] = file_hash ok = False if text is not None: try: language = idx.detect_language(path) except Exception: language = "" + try: + is_text_like = bool(idx.is_text_like_language(language)) + except Exception: + is_text_like = False if file_hash: try: cached_hash = get_cached_file_hash(str(path), repo_name) if repo_name else None @@ -372,46 +1017,54 @@ def _run_indexing_strategy( cached_hash = None if cached_hash and cached_hash == file_hash: ok = True - raise _SkipUnchanged() - try: - use_smart, smart_reason = idx.should_use_smart_reindexing(str(path), file_hash) - except Exception: - use_smart, smart_reason = False, "smart_check_failed" - # Bootstrap: if we have no symbol cache yet, still run smart path once - bootstrap = smart_reason == "no_cached_symbols" - if use_smart or bootstrap: - msg_kind = ( - "smart reindexing" - if use_smart - else "bootstrap (no_cached_symbols) for smart reindex" - ) - safe_print( - f"[SMART_REINDEX][watcher] Using {msg_kind} for {path} ({smart_reason})" - ) + raise _SkipUnchanged(text=text, file_hash=file_hash) + if not is_text_like: try: - status = idx.process_file_with_smart_reindexing( - path, - text, - language, - client, - collection, - repo_name, - model, - vector_name, + use_smart, smart_reason = idx.should_use_smart_reindexing(str(path), file_hash) + except Exception: + use_smart, smart_reason = False, "smart_check_failed" + # Bootstrap: if we have no symbol cache yet, still run smart path once + bootstrap = smart_reason == "no_cached_symbols" + if use_smart or bootstrap: + msg_kind = ( + "smart reindexing" + if use_smart + else "bootstrap (no_cached_symbols) for smart reindex" ) - ok = status in ("success", "skipped") - except Exception as exc: safe_print( - f"[SMART_REINDEX][watcher] Smart reindexing failed for {path}: {exc}" + f"[SMART_REINDEX][watcher] Using {msg_kind} for {path} ({smart_reason})" ) - ok = False - else: - safe_print( - 
f"[SMART_REINDEX][watcher] Using full reindexing for {path} ({smart_reason})" - ) - # Fallback: full single-file reindex. Pseudo/tags are inlined by default; - # when PSEUDO_DEFER_TO_WORKER=1 we run base-only and rely on backfill. + try: + status = idx.process_file_with_smart_reindexing( + path, + text, + language, + client, + collection, + repo_name, + model, + vector_name, + model_dim=model_dim, + ) + ok = status in ("success", "skipped") + except Exception as exc: + safe_print( + f"[SMART_REINDEX][watcher] Smart reindexing failed for {path}: {exc}" + ) + ok = False + else: + safe_print( + f"[SMART_REINDEX][watcher] Using full reindexing for {path} ({smart_reason})" + ) + # Fallback: full single-file reindex. Pseudo/tags are inlined by default; + # when PSEUDO_DEFER_TO_WORKER=1 we run base-only and rely on backfill. if not ok: + try: + idx.ensure_collection_and_indexes_once( + client, collection, model_dim, vector_name + ) + except Exception: + pass pseudo_mode = "off" if get_boolean_env("PSEUDO_DEFER_TO_WORKER") else "full" ok = idx.index_single_file( client, @@ -423,6 +1076,9 @@ def _run_indexing_strategy( skip_unchanged=False, pseudo_mode=pseudo_mode, repo_name_for_cache=repo_name, + preloaded_text=text, + preloaded_file_hash=file_hash, + preloaded_language=language if text is not None else None, ) return ok diff --git a/scripts/watch_index_core/pseudo.py b/scripts/watch_index_core/pseudo.py index dc7bb0a8..33caa303 100644 --- a/scripts/watch_index_core/pseudo.py +++ b/scripts/watch_index_core/pseudo.py @@ -8,6 +8,7 @@ from typing import Optional import scripts.ingest_code as idx +from . 
import config as watch_config from .utils import get_boolean_env from scripts.workspace_state import ( _cross_process_lock, @@ -17,8 +18,6 @@ is_multi_repo_mode, ) -from .config import ROOT - logger = logging.getLogger(__name__) @@ -49,14 +48,23 @@ def _start_pseudo_backfill_worker( max_points = 256 if max_points <= 0: max_points = 1 + try: + graph_max_files = int( + os.environ.get("GRAPH_EDGES_BACKFILL_MAX_FILES", "128") or 128 + ) + except Exception: + graph_max_files = 128 + if graph_max_files <= 0: + graph_max_files = 1 shutdown_event = threading.Event() def _worker() -> None: while not shutdown_event.is_set(): try: + graph_backfill_enabled = get_boolean_env("GRAPH_EDGES_BACKFILL") try: - mappings = get_collection_mappings(search_root=str(ROOT)) + mappings = get_collection_mappings(search_root=str(watch_config.ROOT)) except Exception: mappings = [] if not mappings: @@ -74,7 +82,7 @@ def _worker() -> None: if is_multi_repo_mode() and repo_name: state_dir = _get_repo_state_dir(repo_name) else: - state_dir = _get_global_state_dir(str(ROOT)) + state_dir = _get_global_state_dir(str(watch_config.ROOT)) lock_path = state_dir / "pseudo.lock" with _cross_process_lock(lock_path): processed = idx.pseudo_backfill_tick( @@ -90,6 +98,34 @@ def _worker() -> None: "[pseudo_backfill] repo=%s collection=%s processed=%d", repo_name or "default", coll, processed, ) + # Optional: backfill graph edge collection from main points. + # Controlled separately because it may scan large collections over time. + # Run under its own lock to avoid blocking pseudo/tag backfill workers. 
+ if graph_backfill_enabled: + try: + graph_lock_path = state_dir / "graph_edges.lock" + with _cross_process_lock(graph_lock_path): + files_done = idx.graph_edges_backfill_tick( + client, + coll, + repo_name=repo_name, + max_files=graph_max_files, + ) + if files_done: + logger.info( + "[graph_backfill] repo=%s collection=%s files=%d", + repo_name or "default", + coll, + files_done, + ) + except Exception as exc: + logger.error( + "[graph_backfill] error repo=%s collection=%s: %s", + repo_name or "default", + coll, + exc, + exc_info=True, + ) except Exception as exc: logger.error( "[pseudo_backfill] error repo=%s collection=%s: %s", @@ -110,4 +146,3 @@ def _worker() -> None: __all__ = ["_start_pseudo_backfill_worker"] - diff --git a/scripts/watch_index_core/queue.py b/scripts/watch_index_core/queue.py index ede8835b..118bbe6d 100644 --- a/scripts/watch_index_core/queue.py +++ b/scripts/watch_index_core/queue.py @@ -3,10 +3,11 @@ from __future__ import annotations import threading +import time from pathlib import Path from typing import Callable, Iterable, List, Set -from .config import DELAY_SECS, LOGGER +from .config import DELAY_SECS, LOGGER, RECENT_FINGERPRINT_TTL_SECS class ChangeQueue: @@ -16,15 +17,21 @@ def __init__(self, process_cb: Callable[[List[Path]], None]): self._lock = threading.Lock() self._paths: Set[Path] = set() self._pending: Set[Path] = set() + self._forced_paths: Set[Path] = set() + self._pending_forced: Set[Path] = set() self._timer: threading.Timer | None = None self._process_cb = process_cb # Serialize processing to avoid concurrent use of TextEmbedding/QdrantClient self._processing_lock = threading.Lock() + self._recent_fingerprints: dict[Path, tuple[tuple[int, int], float]] = {} - def add(self, p: Path) -> None: + def add(self, p: Path, *, force: bool = False) -> None: with self._lock: + already_queued = p in self._paths self._paths.add(p) - if self._timer is not None: + if force: + self._forced_paths.add(p) + if self._timer is not None 
and not already_queued: try: self._timer.cancel() except Exception as exc: @@ -32,21 +39,91 @@ def add(self, p: Path) -> None: "Failed to cancel timer in ChangeQueue.add", extra={"error": str(exc)}, ) - self._timer = threading.Timer(DELAY_SECS, self._flush) - self._timer.daemon = True - self._timer.start() + if self._timer is None or not already_queued: + self._timer = threading.Timer(DELAY_SECS, self._flush) + self._timer.daemon = True + self._timer.start() + + def _fingerprint_path(self, p: Path) -> tuple[int, int] | None: + try: + st = p.stat() + return ( + int(getattr(st, "st_size", 0)), + int(getattr(st, "st_mtime_ns", int(st.st_mtime * 1e9))), + ) + except Exception: + return None + + def _filter_recent_paths( + self, + paths: Iterable[Path], + *, + forced_paths: Iterable[Path] | None = None, + ) -> list[Path]: + ttl = float(RECENT_FINGERPRINT_TTL_SECS) + forced = set(forced_paths or []) + if ttl <= 0: + return list(paths) + + now = time.time() + keep: list[Path] = [] + for p in paths: + if p in forced: + keep.append(p) + continue + fp = self._fingerprint_path(p) + if fp is None: + keep.append(p) + continue + prev = self._recent_fingerprints.get(p) + if prev is not None: + prev_fp, prev_ts = prev + if prev_fp == fp and (now - prev_ts) < ttl: + continue + keep.append(p) + return keep + + def _mark_recent_paths(self, paths: Iterable[Path]) -> None: + ttl = float(RECENT_FINGERPRINT_TTL_SECS) + if ttl <= 0: + return + now = time.time() + for p in paths: + fp = self._fingerprint_path(p) + if fp is None: + continue + self._recent_fingerprints[p] = (fp, now) + # Keep at least a 1s grace for small TTLs while using a proportional + # buffer for larger TTLs so stale handled fingerprints age out cleanly. 
+ cutoff = now - max(ttl * 2.0, ttl + 1.0) + stale = [p for p, (_fp, ts) in self._recent_fingerprints.items() if ts < cutoff] + for p in stale: + self._recent_fingerprints.pop(p, None) + + def _drain_pending(self) -> tuple[list[Path], Set[Path]] | None: + with self._lock: + if not self._pending: + return None + todo = list(self._pending) + todo_forced = {p for p in todo if p in self._pending_forced} + self._pending.clear() + self._pending_forced.clear() + return todo, todo_forced def _flush(self) -> None: # Grab current batch with self._lock: paths = list(self._paths) + forced_paths = {p for p in paths if p in self._forced_paths} self._paths.clear() + self._forced_paths.difference_update(paths) self._timer = None # Try to run the processor exclusively; if busy, queue and return if not self._processing_lock.acquire(blocking=False): with self._lock: self._pending.update(paths) + self._pending_forced.update(forced_paths) if self._timer is None: # schedule a follow-up flush to pick up pending when free self._timer = threading.Timer(DELAY_SECS, self._flush) @@ -56,15 +133,24 @@ def _flush(self) -> None: try: # Per-file locking in index_single_file handles indexer/watcher coordination todo: Iterable[Path] = paths + todo_forced: Set[Path] = set(forced_paths) while True: + filtered_todo = self._filter_recent_paths(todo, forced_paths=todo_forced) + if not filtered_todo: + pending = self._drain_pending() + if pending is None: + break + todo, todo_forced = pending + continue try: - self._process_cb(list(todo)) + self._process_cb(list(filtered_todo)) + self._mark_recent_paths(filtered_todo) except Exception as exc: # Log processing error via structured logging try: LOGGER.error( "Processing batch failed in ChangeQueue._flush", - extra={"error": str(exc), "batch_size": len(list(todo))}, + extra={"error": str(exc), "batch_size": len(filtered_todo)}, exc_info=True, ) except Exception as inner_exc: # pragma: no cover - logging fallback @@ -79,11 +165,10 @@ def _flush(self) -> 
None: except Exception: pass # Last resort: can't even print # drain any pending accumulated during processing - with self._lock: - if not self._pending: - break - todo = list(self._pending) - self._pending.clear() + pending = self._drain_pending() + if pending is None: + break + todo, todo_forced = pending finally: self._processing_lock.release() diff --git a/scripts/watch_index_core/utils.py b/scripts/watch_index_core/utils.py index 999daa5a..5f4086fd 100644 --- a/scripts/watch_index_core/utils.py +++ b/scripts/watch_index_core/utils.py @@ -8,7 +8,8 @@ from watchdog.observers import Observer import scripts.ingest_code as idx -from .config import LOGGER, ROOT, default_collection_name +from . import config as watch_config +from .config import LOGGER, default_collection_name from scripts.workspace_state import ( _extract_repo_name_from_path, PLACEHOLDER_COLLECTION_NAMES, @@ -93,13 +94,14 @@ def create_observer(use_polling: bool, observer_cls: Type[Observer] = Observer) def _detect_repo_for_file(file_path: Path) -> Optional[Path]: """Detect repository root for a file under WATCH root.""" + root = watch_config.ROOT try: - rel_path = file_path.resolve().relative_to(ROOT.resolve()) + rel_path = file_path.resolve().relative_to(root.resolve()) except Exception: return None if not rel_path.parts: - return ROOT - return ROOT / rel_path.parts[0] + return root + return root / rel_path.parts[0] def _repo_name_or_none(repo_path: Optional[Path]) -> Optional[str]: diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index b0cb28df..4311a12f 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -9,6 +9,7 @@ - Multi-repo support with per-repo state files """ import json +import logging import os import re import uuid @@ -22,6 +23,8 @@ _CANONICAL_SLUG_RE = re.compile(r"^.+-[0-9a-f]{16}$") _SLUGGED_REPO_RE = re.compile(r"^.+-[0-9a-f]{16}(?:_old)?$") +INTERNAL_STATE_TOP_LEVEL_DIRS = frozenset({".codebase", ".git", "__pycache__"}) +logger = 
logging.getLogger(__name__) _managed_slug_cache_lock = threading.Lock() _managed_slug_cache: set[str] = set() _managed_slug_cache_neg: set[str] = set() @@ -112,7 +115,7 @@ def _server_managed_slug_from_path(path: Path) -> Optional[str]: return None work_dir = Path(os.environ.get("WORK_DIR") or os.environ.get("WORKDIR") or "/work") - marker = work_dir / ".codebase" / "repos" / slug / ".ctxce_managed_upload" + marker = work_dir / STATE_DIRNAME / "repos" / slug / ".ctxce_managed_upload" try: is_managed = marker.exists() except OSError: @@ -134,6 +137,7 @@ def _server_managed_slug_from_path(path: Path) -> Optional[str]: STATE_DIRNAME = ".codebase" STATE_FILENAME = "state.json" CACHE_FILENAME = "cache.json" +INDEX_JOURNAL_FILENAME = "index_journal.json" PLACEHOLDER_COLLECTION_NAMES = {"", "default-collection", "my-collection"} class IndexingProgress(TypedDict, total=False): @@ -184,6 +188,37 @@ class StagingInfo(TypedDict, total=False): repo_name: Optional[str] +class MaintenanceInfo(TypedDict, total=False): + last_empty_dir_sweep_at: Optional[str] + last_consistency_audit_at: Optional[str] + last_consistency_audit_summary: Optional[Dict[str, Any]] + + +class IndexJournalRecord(TypedDict, total=False): + path: str + op_type: str + content_hash: Optional[str] + status: str + attempts: int + created_at: str + updated_at: str + last_error: Optional[str] + + +def _index_journal_retry_delay_seconds() -> float: + try: + return max(0.0, float(os.environ.get("INDEX_JOURNAL_RETRY_DELAY_SECS", "5") or 5)) + except Exception: + return 5.0 + + +def _index_journal_max_attempts() -> int: + try: + return max(0, int(os.environ.get("INDEX_JOURNAL_MAX_ATTEMPTS", "0") or 0)) + except Exception: + return 0 + + class WorkspaceState(TypedDict, total=False): created_at: str updated_at: str @@ -204,6 +239,7 @@ class WorkspaceState(TypedDict, total=False): active_repo_slug: Optional[str] serving_repo_slug: Optional[str] staging: Optional[StagingInfo] + maintenance: Optional[MaintenanceInfo] def 
is_multi_repo_mode() -> bool: """Check if multi-repo mode is enabled.""" @@ -233,7 +269,12 @@ def logical_repo_reuse_enabled() -> bool: def _resolve_workspace_root() -> str: """Determine the default workspace root path.""" - return os.environ.get("WORKSPACE_PATH") or os.environ.get("WATCH_ROOT") or "/work" + return ( + os.environ.get("CTXCE_METADATA_ROOT") + or os.environ.get("WORKSPACE_PATH") + or os.environ.get("WATCH_ROOT") + or "/work" + ) def _resolve_repo_context( workspace_path: Optional[str] = None, @@ -247,14 +288,45 @@ def _resolve_repo_context( return resolved_workspace, repo_name if workspace_path: - detected = _detect_repo_name_from_path(Path(workspace_path)) - if detected: - return resolved_workspace, detected + try: + requested = Path(workspace_path).resolve() + workspace_root = Path(_resolve_workspace_root()).resolve() + except Exception: + requested = Path(workspace_path) + workspace_root = Path(_resolve_workspace_root()) + if requested != workspace_root: + detected = _detect_repo_name_from_path(requested) + if detected: + return resolved_workspace, detected return resolved_workspace, None return resolved_workspace, repo_name + +def _get_repo_workspace_dir( + repo_name: str, + workspace_path: Optional[str] = None, +) -> Path: + try: + base_dir = Path(workspace_path or _resolve_workspace_root()).resolve() + except Exception: + base_dir = Path(workspace_path or _resolve_workspace_root()).absolute() + if base_dir.name == repo_name: + return base_dir + host_index_path = (os.environ.get("HOST_INDEX_PATH") or "").strip() + if host_index_path: + host_index_root = Path(host_index_path) + if not host_index_root.is_absolute(): + host_index_root = base_dir / host_index_root + candidate = host_index_root.resolve() / repo_name + if candidate.exists() or (candidate / STATE_DIRNAME).exists(): + return candidate + dev_workspace_candidate = base_dir / "dev-workspace" / repo_name + if dev_workspace_candidate.exists() or (dev_workspace_candidate / 
STATE_DIRNAME).exists(): + return dev_workspace_candidate + return base_dir / repo_name + def _get_state_lock(workspace_path: Optional[str] = None, repo_name: Optional[str] = None) -> threading.RLock: """Get or create a lock for the workspace or repo state and track usage.""" if repo_name and is_multi_repo_mode(): @@ -268,13 +340,52 @@ def _get_state_lock(workspace_path: Optional[str] = None, repo_name: Optional[st _state_lock_last_used[key] = time.time() return _state_locks[key] -def _get_repo_state_dir(repo_name: str) -> Path: +def _get_repo_state_dir( + repo_name: str, + workspace_path: Optional[str] = None, +) -> Path: """Get the state directory for a repository.""" - base_dir = Path(os.environ.get("WORKSPACE_PATH") or os.environ.get("WATCH_ROOT") or "/work") + workspace_root = Path(_resolve_workspace_root()).resolve() + base_dir = Path(workspace_path or str(workspace_root)).resolve() + global_repo_state_dir = workspace_root / STATE_DIRNAME / "repos" / repo_name if is_multi_repo_mode(): - return base_dir / STATE_DIRNAME / "repos" / repo_name + # Canonical multi-repo metadata layout is shared under workspace root. 
+ return global_repo_state_dir return base_dir / STATE_DIRNAME + +def _is_repo_local_metadata_path(path: Path) -> bool: + try: + parts = path.resolve().parts + except Exception: + parts = path.parts + try: + idx = parts.index(STATE_DIRNAME) + except ValueError: + return False + if idx > 0 and _SLUGGED_REPO_RE.match(parts[idx - 1] or ""): + return True + if "repos" in parts: + ridx = parts.index("repos") + if ridx + 1 < len(parts) and _SLUGGED_REPO_RE.match(parts[ridx + 1] or ""): + return True + return False + + +def _apply_runtime_metadata_mode(path: Path) -> None: + try: + is_dir = path.is_dir() + except Exception: + is_dir = False + if _is_repo_local_metadata_path(path): + mode = 0o777 if is_dir else 0o666 + else: + mode = 0o775 if is_dir else 0o664 + try: + os.chmod(path, mode) + except Exception: + pass + def _get_state_path(workspace_path: str) -> Path: """Get the path to the state.json file for a workspace.""" workspace = Path(workspace_path).resolve() @@ -623,7 +734,7 @@ def _detect_repo_name_from_path(path: Path) -> str: rel = resolved.relative_to(ws_root) if rel.parts: candidate = rel.parts[0] - if candidate not in {".codebase", ".git", "__pycache__"}: + if candidate not in INTERNAL_STATE_TOP_LEVEL_DIRS: return candidate except Exception: pass @@ -672,12 +783,7 @@ def _atomic_write_state(state_path: Path, state: WorkspaceState) -> None: with open(temp_path, 'w', encoding='utf-8') as f: json.dump(state, f, indent=2, ensure_ascii=False) temp_path.replace(state_path) - # Ensure state/cache files are group-writable so multiple processes - # (upload service, watcher, indexer) can update them. 
- try: - os.chmod(state_path, 0o664) - except PermissionError: - pass + _apply_runtime_metadata_mode(state_path) except Exception: # Clean up temp file if something went wrong try: @@ -705,7 +811,7 @@ def get_workspace_state( lock_scope_path: Path if is_multi_repo_mode() and repo_name: - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, workspace_path) try: ws_root = Path(_resolve_workspace_root()) ws_dir = ws_root / repo_name @@ -717,12 +823,7 @@ def get_workspace_state( except Exception: return {} state_dir.mkdir(parents=True, exist_ok=True) - # Ensure repo state dir is group-writable so root upload service and - # non-root watcher/indexer processes can both write state/cache files. - try: - os.chmod(state_dir, 0o775) - except Exception: - pass + _apply_runtime_metadata_mode(state_dir) state_path = state_dir / STATE_FILENAME lock_scope_path = state_dir else: @@ -802,7 +903,7 @@ def update_workspace_state( # Allow updates when the repo state dir exists, even if the workspace # directory is not present (e.g. dev-remote simulations where only # .codebase state is persisted). 
- state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, workspace_path) if not (ws_root / repo_name).exists() and not state_dir.exists(): return {} except Exception: @@ -823,8 +924,9 @@ def update_workspace_state( state["updated_at"] = datetime.now().isoformat() if is_multi_repo_mode() and repo_name: - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, workspace_path) state_dir.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(state_dir) state_path = state_dir / STATE_FILENAME else: try: @@ -1245,8 +1347,9 @@ def log_activity( return except Exception: return - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, workspace_path) state_dir.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(state_dir) state_path = state_dir / STATE_FILENAME lock_path = state_path.with_suffix(".lock") @@ -1405,7 +1508,7 @@ def _detect_repo_name_from_path_by_structure(path: Path) -> str: continue repo_name = rel_path.parts[0] - if repo_name in (".codebase", ".git", "__pycache__"): + if repo_name in INTERNAL_STATE_TOP_LEVEL_DIRS: continue repo_path = base / repo_name @@ -1575,10 +1678,301 @@ def _write_cache(workspace_path: str, cache: Dict[str, Any]) -> None: pass -def get_cached_file_hash(file_path: str, repo_name: Optional[str] = None) -> str: +def _get_index_journal_path( + workspace_path: Optional[str] = None, repo_name: Optional[str] = None +) -> Path: + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + if repo_name: + state_dir = _get_repo_state_dir(repo_name, workspace_path) + else: + state_dir = _get_global_state_dir(workspace_path) + return state_dir / INDEX_JOURNAL_FILENAME + + +def _read_index_journal_file_uncached(journal_path: Path) -> Dict[str, Any]: + try: + with journal_path.open("r", encoding="utf-8-sig") as f: + obj = json.load(f) + if isinstance(obj, dict): + operations = obj.get("operations", {}) + if 
isinstance(operations, dict): + return obj + except (OSError, json.JSONDecodeError, ValueError): + pass + now = datetime.now().isoformat() + return {"version": 1, "operations": {}, "created_at": now, "updated_at": now} + + +def _write_index_journal( + workspace_path: Optional[str], + repo_name: Optional[str], + journal: Dict[str, Any], +) -> None: + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + lock = _get_state_lock(workspace_path, repo_name) + with lock: + journal_path = _get_index_journal_path(workspace_path, repo_name) + journal_path.parent.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(journal_path.parent) + lock_path = journal_path.with_suffix(journal_path.suffix + ".lock") + with _cross_process_lock(lock_path): + tmp = journal_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") + try: + with open(tmp, "w", encoding="utf-8") as f: + json.dump(journal, f, ensure_ascii=False, indent=2) + tmp.replace(journal_path) + _apply_runtime_metadata_mode(journal_path) + finally: + try: + tmp.unlink(missing_ok=True) + except Exception: + pass + + +def _update_index_journal( + workspace_path: Optional[str], + repo_name: Optional[str], + mutator, +) -> Dict[str, Any]: + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + lock = _get_state_lock(workspace_path, repo_name) + with lock: + journal_path = _get_index_journal_path(workspace_path, repo_name) + journal_path.parent.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(journal_path.parent) + lock_path = journal_path.with_suffix(journal_path.suffix + ".lock") + with _cross_process_lock(lock_path): + journal = _read_index_journal_file_uncached(journal_path) + mutator(journal) + journal["updated_at"] = datetime.now().isoformat() + tmp = journal_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") + try: + with open(tmp, "w", encoding="utf-8") as f: + json.dump(journal, f, ensure_ascii=False, indent=2) + tmp.replace(journal_path) + 
_apply_runtime_metadata_mode(journal_path) + finally: + try: + tmp.unlink(missing_ok=True) + except Exception: + pass + return journal + + +def upsert_index_journal_entries( + entries: List[Dict[str, Any]], + *, + workspace_path: Optional[str] = None, + repo_name: Optional[str] = None, +) -> Dict[str, Any]: + """Persist or replace repo-scoped index journal entries keyed by normalized path.""" + normalized_entries: List[IndexJournalRecord] = [] + now = datetime.now().isoformat() + valid_statuses = {"pending", "in_progress", "failed", "done"} + for entry in entries or []: + path = _normalize_cache_key_path(str(entry.get("path") or "")) + op_type = str(entry.get("op_type") or "").strip().lower() + if not path or op_type not in {"upsert", "delete"}: + continue + content_hash = str(entry.get("content_hash") or "").strip() or None + status = str(entry.get("status") or "pending").strip().lower() + if status not in valid_statuses: + status = "pending" + try: + attempts = int(entry.get("attempts", 0) or 0) + except Exception: + attempts = 0 + if attempts < 0: + attempts = 0 + last_error = entry.get("last_error") + if last_error is not None: + last_error = str(last_error) + normalized_entries.append( + { + "path": path, + "op_type": op_type, + "content_hash": content_hash, + "status": status, + "attempts": attempts, + "created_at": str(entry.get("created_at") or now), + "updated_at": str(entry.get("updated_at") or now), + "last_error": last_error, + } + ) + + def _mutate(journal: Dict[str, Any]) -> None: + ops = journal.setdefault("operations", {}) + if not isinstance(ops, dict): + ops = {} + journal["operations"] = ops + for entry in normalized_entries: + ops[entry["path"]] = entry + + return _update_index_journal(workspace_path, repo_name, _mutate) + + +def list_pending_index_journal_entries( + workspace_path: Optional[str] = None, + repo_name: Optional[str] = None, +) -> List[IndexJournalRecord]: + """Return watcher-retryable journal records for a workspace or specific 
repo.""" + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + retry_delay = _index_journal_retry_delay_seconds() + max_attempts = _index_journal_max_attempts() + now = datetime.now() + + def _read_repo_journal_entries( + target_repo_name: Optional[str], + *, + target_workspace_path: Optional[str] = None, + ) -> List[IndexJournalRecord]: + journal = _read_index_journal_file_uncached( + _get_index_journal_path(target_workspace_path or workspace_path, target_repo_name) + ) + merged_ops = journal.get("operations", {}) + if not isinstance(merged_ops, dict): + merged_ops = {} + result: List[IndexJournalRecord] = [] + for rec in merged_ops.values(): + if not isinstance(rec, dict): + continue + status = str(rec.get("status") or "pending").strip().lower() + if status not in {"pending", "failed"}: + continue + attempts_raw = rec.get("attempts") + try: + attempts = int(attempts_raw or 0) + except (ValueError, TypeError): + attempts = 0 + logger.warning( + "workspace_state::invalid_journal_attempts", + extra={"attempts": attempts_raw, "path": str(rec.get("path") or "")}, + ) + if max_attempts > 0 and attempts >= max_attempts: + continue + if status == "failed" and retry_delay > 0: + updated_at = str(rec.get("updated_at") or "").strip() + if updated_at: + try: + last = datetime.fromisoformat(updated_at) + if (now - last).total_seconds() < retry_delay: + continue + except Exception: + pass + p = _normalize_cache_key_path(str(rec.get("path") or "")) + op_type = str(rec.get("op_type") or "").strip().lower() + if not p or op_type not in {"upsert", "delete"}: + continue + result.append( + { + "path": p, + "op_type": op_type, + "content_hash": str(rec.get("content_hash") or "").strip() or None, + "status": status, + "attempts": attempts, + "created_at": str(rec.get("created_at") or ""), + "updated_at": str(rec.get("updated_at") or ""), + "last_error": str(rec.get("last_error") or "").strip() or None, + } + ) + return result + + if repo_name: + return 
_read_repo_journal_entries(repo_name) + + result: List[IndexJournalRecord] = [] + root_path = Path(workspace_path or _resolve_workspace_root()).resolve() + repo_candidates: set[str] = set() + multi_repo_mode = is_multi_repo_mode() + try: + for repo_root in root_path.iterdir(): + if not repo_root.is_dir(): + continue + if repo_root.name in INTERNAL_STATE_TOP_LEVEL_DIRS: + continue + if (not multi_repo_mode) and (not _SLUGGED_REPO_RE.match(repo_root.name)): + continue + repo_candidates.add(repo_root.name) + except Exception: + pass + + try: + repos_state_root = root_path / STATE_DIRNAME / "repos" + if repos_state_root.exists(): + for state_dir in repos_state_root.iterdir(): + if not state_dir.is_dir(): + continue + repo_candidates.add(state_dir.name) + except Exception: + pass + + for candidate in sorted(repo_candidates): + candidate_workspace_path: Optional[str] = None + if not multi_repo_mode: + candidate_workspace_path = str(root_path / candidate) + result.extend( + _read_repo_journal_entries( + candidate, + target_workspace_path=candidate_workspace_path, + ) + ) + + if result: + return result + return _read_repo_journal_entries(None) + + +def update_index_journal_entry_status( + path: str, + *, + status: str, + error: Optional[str] = None, + workspace_path: Optional[str] = None, + repo_name: Optional[str] = None, + remove_on_done: bool = True, +) -> Dict[str, Any]: + """Update or clear a repo-scoped journal entry after processing.""" + normalized_path = _normalize_cache_key_path(path) + now = datetime.now().isoformat() + + def _mutate(journal: Dict[str, Any]) -> None: + ops = journal.setdefault("operations", {}) + if not isinstance(ops, dict): + ops = {} + journal["operations"] = ops + rec = ops.get(normalized_path) + if not isinstance(rec, dict): + return + if status == "done" and remove_on_done: + ops.pop(normalized_path, None) + return + rec["status"] = status + rec["updated_at"] = now + attempts_raw = rec.get("attempts") + try: + attempts = int(attempts_raw 
or 0) + except (ValueError, TypeError): + attempts = 0 + logger.warning( + "workspace_state::invalid_journal_attempts", + extra={"attempts": attempts_raw, "path": normalized_path}, + ) + rec["attempts"] = attempts + 1 + rec["last_error"] = str(error or "").strip() or None + ops[normalized_path] = rec + + return _update_index_journal(workspace_path, repo_name, _mutate) + + +def get_cached_file_hash( + file_path: str, + repo_name: Optional[str] = None, + metadata_root: Optional[str] = None, +) -> str: """Get cached file hash for tracking changes.""" + root = metadata_root or _resolve_workspace_root() if is_multi_repo_mode() and repo_name: - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, root) cache_path = state_dir / CACHE_FILENAME cache = _read_cache_file_cached(cache_path) @@ -1589,19 +1983,23 @@ def get_cached_file_hash(file_path: str, repo_name: Optional[str] = None) -> str return str(val.get("hash") or "") return str(val or "") else: - cache = _read_cache_cached(_resolve_workspace_root()) + cache = _read_cache_cached(root) fp = _normalize_cache_key_path(file_path) val = cache.get("file_hashes", {}).get(fp, "") if isinstance(val, dict): return str(val.get("hash") or "") return str(val or "") - return "" - -def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str] = None) -> None: +def set_cached_file_hash( + file_path: str, + file_hash: str, + repo_name: Optional[str] = None, + metadata_root: Optional[str] = None, +) -> None: """Set cached file hash for tracking changes.""" fp = _normalize_cache_key_path(file_path) + root = metadata_root or _resolve_workspace_root() st_size: Optional[int] = None st_mtime: Optional[int] = None @@ -1615,14 +2013,15 @@ def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str if is_multi_repo_mode() and repo_name: try: - ws_root = Path(_resolve_workspace_root()) + ws_root = Path(root) if not (ws_root / repo_name).exists(): return except 
Exception: return - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, str(ws_root)) cache_path = state_dir / CACHE_FILENAME state_dir.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(state_dir) if cache_path.exists(): cache = _read_cache_file_cached(cache_path) @@ -1659,7 +2058,7 @@ def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str _memoize_cache_obj(cache_path, cache) return - cache = _read_cache_cached(_resolve_workspace_root()) + cache = _read_cache_cached(root) existing = cache.get("file_hashes", {}).get(fp) if isinstance(existing, dict) and st_size is not None and st_mtime is not None: if ( @@ -1683,14 +2082,14 @@ def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str pass cache.setdefault("file_hashes", {})[fp] = entry cache["updated_at"] = datetime.now().isoformat() - _write_cache(_resolve_workspace_root(), cache) - _memoize_cache_obj(_get_cache_path(_resolve_workspace_root()), cache) + _write_cache(root, cache) + _memoize_cache_obj(_get_cache_path(root), cache) def get_cached_file_meta(file_path: str, repo_name: Optional[str] = None) -> Dict[str, Any]: fp = _normalize_cache_key_path(file_path) if is_multi_repo_mode() and repo_name: - state_dir = _get_repo_state_dir(repo_name) + state_dir = _get_repo_state_dir(repo_name, _resolve_workspace_root()) cache_path = state_dir / CACHE_FILENAME cache = _read_cache_file_cached(cache_path) @@ -1711,10 +2110,15 @@ def get_cached_file_meta(file_path: str, repo_name: Optional[str] = None) -> Dic return {} -def remove_cached_file(file_path: str, repo_name: Optional[str] = None) -> None: +def remove_cached_file( + file_path: str, + repo_name: Optional[str] = None, + metadata_root: Optional[str] = None, +) -> None: """Remove file entry from cache.""" + root = metadata_root or _resolve_workspace_root() if is_multi_repo_mode() and repo_name: - state_dir = _get_repo_state_dir(repo_name) + state_dir = 
_get_repo_state_dir(repo_name, root) cache_path = state_dir / CACHE_FILENAME if cache_path.exists(): @@ -1730,13 +2134,13 @@ def remove_cached_file(file_path: str, repo_name: Optional[str] = None) -> None: _memoize_cache_obj(cache_path, cache) return - cache = _read_cache_cached(_resolve_workspace_root()) + cache = _read_cache_cached(root) fp = _normalize_cache_key_path(file_path) if fp in cache.get("file_hashes", {}): cache["file_hashes"].pop(fp, None) cache["updated_at"] = datetime.now().isoformat() - _write_cache(_resolve_workspace_root(), cache) - _memoize_cache_obj(_get_cache_path(_resolve_workspace_root()), cache) + _write_cache(root, cache) + _memoize_cache_obj(_get_cache_path(root), cache) def cleanup_old_cache_locks(max_idle_seconds: int = 900) -> int: @@ -1780,42 +2184,65 @@ def cleanup_old_cache_locks(max_idle_seconds: int = 900) -> int: def get_collection_mappings(search_root: Optional[str] = None) -> List[Dict[str, Any]]: - """Enumerate collection mappings with origin metadata.""" + """Enumerate collection mappings with origin metadata. + + `search_root` may point at either workspace root (`/work`) or codebase root + (`/work/.codebase`). 
+ """ root_path = Path(search_root or _resolve_workspace_root()).resolve() + if root_path.name == STATE_DIRNAME: + workspace_root = root_path.parent + codebase_root = root_path + else: + workspace_root = root_path + codebase_root = root_path / STATE_DIRNAME mappings: List[Dict[str, Any]] = [] try: if is_multi_repo_mode(): - repos_root = root_path / STATE_DIRNAME / "repos" + seen_state_files: set[str] = set() + + def _append_repo_mapping(repo_name: str, state_path: Path) -> None: + if not state_path.exists(): + return + try: + state_key = str(state_path.resolve()) + except Exception: + state_key = str(state_path) + if state_key in seen_state_files: + return + seen_state_files.add(state_key) + + try: + with open(state_path, "r", encoding="utf-8-sig") as f: + state = json.load(f) or {} + except Exception as e: + print(f"[workspace_state] Failed to read repo state from {state_path}: {e}") + return + + origin = state.get("origin", {}) or {} + repo_workspace_dir = _get_repo_workspace_dir(repo_name, str(workspace_root)) + mappings.append( + { + "repo_name": repo_name, + "collection_name": state.get("qdrant_collection") + or get_collection_name(repo_name), + "container_path": origin.get("container_path") + or str(repo_workspace_dir.resolve()), + "source_path": origin.get("source_path"), + "state_file": str(state_path), + "updated_at": state.get("updated_at"), + } + ) + + # Shared metadata root (`/.codebase/repos//state.json`) + repos_root = codebase_root / "repos" if repos_root.exists(): for repo_dir in sorted(p for p in repos_root.iterdir() if p.is_dir()): - repo_name = repo_dir.name - state_path = repo_dir / STATE_FILENAME - if not state_path.exists(): - continue - try: - with open(state_path, "r", encoding="utf-8-sig") as f: - state = json.load(f) or {} - except Exception as e: - print(f"[workspace_state] Failed to read repo state from {state_path}: {e}") - continue - - origin = state.get("origin", {}) or {} - mappings.append( - { - "repo_name": repo_name, - 
"collection_name": state.get("qdrant_collection") - or get_collection_name(repo_name), - "container_path": origin.get("container_path") - or str((Path(_resolve_workspace_root()) / repo_name).resolve()), - "source_path": origin.get("source_path"), - "state_file": str(state_path), - "updated_at": state.get("updated_at"), - } - ) + _append_repo_mapping(repo_dir.name, repo_dir / STATE_FILENAME) else: - state_path = root_path / STATE_DIRNAME / STATE_FILENAME + state_path = codebase_root / STATE_FILENAME if state_path.exists(): try: with open(state_path, "r", encoding="utf-8-sig") as f: @@ -1824,14 +2251,14 @@ def get_collection_mappings(search_root: Optional[str] = None) -> List[Dict[str, state = {} origin = state.get("origin", {}) or {} - repo_name = origin.get("repo_name") or Path(root_path).name + repo_name = origin.get("repo_name") or Path(workspace_root).name mappings.append( { "repo_name": repo_name, "collection_name": state.get("qdrant_collection") or get_collection_name(repo_name), "container_path": origin.get("container_path") - or str(root_path), + or str(workspace_root), "source_path": origin.get("source_path"), "state_file": str(state_path), "updated_at": state.get("updated_at"), @@ -2116,6 +2543,8 @@ def set_cached_symbols(file_path: str, symbols: dict, file_hash: str) -> None: """Save symbol metadata for a file. 
Extends existing to include pseudo data.""" cache_path = _get_symbol_cache_path(file_path) cache_path.parent.mkdir(parents=True, exist_ok=True) + _apply_runtime_metadata_mode(cache_path.parent) + temp_path = cache_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") try: cache_data = { @@ -2125,18 +2554,16 @@ def set_cached_symbols(file_path: str, symbols: dict, file_hash: str) -> None: "symbols": symbols } - with open(cache_path, 'w', encoding='utf-8') as f: + with open(temp_path, 'w', encoding='utf-8') as f: json.dump(cache_data, f, indent=2) - - # Ensure symbol cache files are group-writable so both indexer and - # watcher processes (potentially different users sharing a group) - # can update them on shared volumes. - try: - os.chmod(cache_path, 0o664) - except PermissionError: - pass + temp_path.replace(cache_path) + _apply_runtime_metadata_mode(cache_path) except Exception as e: print(f"[SYMBOL_CACHE_WARNING] Failed to save symbol cache for {file_path}: {e}") + try: + temp_path.unlink(missing_ok=True) + except Exception: + pass def get_cached_pseudo(file_path: str, symbol_id: str) -> tuple[str, list[str]]: @@ -2236,7 +2663,7 @@ def clear_symbol_cache( target_dirs: List[Path] = [] if is_multi_repo_mode() and repo_name: - target_dirs.append(_get_repo_state_dir(repo_name) / "symbols") + target_dirs.append(_get_repo_state_dir(repo_name, workspace_path) / "symbols") else: try: cache_parent = _get_cache_path(workspace_root).parent @@ -2288,6 +2715,53 @@ def compare_symbol_changes(old_symbols: dict, new_symbols: dict) -> tuple[list, unchanged = [] changed = [] + # Primary key should not be absolute start_line alone; leading comments/import + # shifts can move every symbol without changing their bodies. Prefer exact id + # first, then fall back to stable metadata matching. 
+ old_symbols = old_symbols or {} + new_symbols = new_symbols or {} + remaining_old_by_exact = dict(old_symbols) + remaining_old_by_signature: Dict[tuple[str, str, str], list[str]] = {} + remaining_old_by_name_kind: Dict[tuple[str, str], list[str]] = {} + + for old_symbol_id, old_info in remaining_old_by_exact.items(): + kind = str(old_info.get("type") or "") + name = str(old_info.get("name") or "") + content_hash = str(old_info.get("content_hash") or "") + if kind and name and content_hash: + remaining_old_by_signature.setdefault((kind, name, content_hash), []).append( + old_symbol_id + ) + if kind and name: + remaining_old_by_name_kind.setdefault((kind, name), []).append(old_symbol_id) + + def _consume_old_symbol(old_id: str, old_info: dict) -> None: + remaining_old_by_exact.pop(old_id, None) + + old_kind = str(old_info.get("type") or "") + old_name = str(old_info.get("name") or "") + old_hash = str(old_info.get("content_hash") or "") + + if old_kind and old_name and old_hash: + sig = (old_kind, old_name, old_hash) + sig_ids = remaining_old_by_signature.get(sig) or [] + if old_id in sig_ids: + sig_ids.remove(old_id) + if sig_ids: + remaining_old_by_signature[sig] = sig_ids + else: + remaining_old_by_signature.pop(sig, None) + + if old_kind and old_name: + nk = (old_kind, old_name) + nk_ids = remaining_old_by_name_kind.get(nk) or [] + if old_id in nk_ids: + nk_ids.remove(old_id) + if nk_ids: + remaining_old_by_name_kind[nk] = nk_ids + else: + remaining_old_by_name_kind.pop(nk, None) + for symbol_id, symbol_info in new_symbols.items(): if symbol_id in old_symbols: old_info = old_symbols[symbol_id] @@ -2296,6 +2770,26 @@ def compare_symbol_changes(old_symbols: dict, new_symbols: dict) -> tuple[list, unchanged.append(symbol_id) else: changed.append(symbol_id) + _consume_old_symbol(symbol_id, old_info) + continue + + kind = str(symbol_info.get("type") or "") + name = str(symbol_info.get("name") or "") + content_hash = str(symbol_info.get("content_hash") or "") + 
signature = (kind, name, content_hash) + matched_old_ids = remaining_old_by_signature.get(signature) or [] + if matched_old_ids: + old_id = matched_old_ids.pop(0) + if not matched_old_ids: + remaining_old_by_signature.pop(signature, None) + _consume_old_symbol(old_id, old_symbols.get(old_id, {})) + unchanged.append(symbol_id) + continue + + # Same logical symbol name/type exists but content differs: changed. + if kind and name and remaining_old_by_name_kind.get((kind, name)): + remaining_old_by_name_kind.pop((kind, name), None) + changed.append(symbol_id) else: # New symbol changed.append(symbol_id) @@ -2537,6 +3031,3 @@ def _list_workspaces_from_qdrant(seen_paths: set) -> List[Dict[str, Any]]: pass return workspaces - - -# Add missing functions that callers expect (already defined above) \ No newline at end of file diff --git a/templates/admin/acl.html b/templates/admin/acl.html index 952a0ce9..98292beb 100644 --- a/templates/admin/acl.html +++ b/templates/admin/acl.html @@ -1,6 +1,32 @@ {% extends "admin/base.html" %} {% block content %} + {% set qp = request.query_params %} + {% if qp and ((qp.get("copied") and qp.get("new")) or qp.get("deleted")) %} +
+ {% if qp.get("copied") and qp.get("new") %} +
+ Copied collection {{ qp.get("copied") }}{{ qp.get("new") }}. + {% if qp.get("graph_copied") == "1" %} + (graph clone copied) + {% elif qp.get("graph_copied") == "0" %} + (graph clone not copied; will rebuild/backfill) + {% endif %} +
+ {% endif %} + {% if qp.get("deleted") %} +
+ Deleted collection {{ qp.get("deleted") }}. + {% if qp.get("graph_deleted") == "1" %} + (graph clone deleted) + {% elif qp.get("graph_deleted") == "0" %} + (graph clone not deleted or missing) + {% endif %} +
+ {% endif %} +
+ {% endif %} +

Users

diff --git a/tests/test_admin_collection_delete.py b/tests/test_admin_collection_delete.py index ad42807e..c407b5da 100644 --- a/tests/test_admin_collection_delete.py +++ b/tests/test_admin_collection_delete.py @@ -52,6 +52,29 @@ def test_admin_role_gate_blocks_non_admin(monkeypatch): assert resp.json().get("detail") == "Admin required" +@pytest.mark.unit +def test_delete_redirect_includes_graph_deleted_param(monkeypatch): + monkeypatch.setenv("CTXCE_AUTH_ENABLED", "1") + monkeypatch.setenv("CTXCE_ADMIN_COLLECTION_DELETE_ENABLED", "1") + + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + + monkeypatch.setattr(srv, "_require_admin_session", lambda _req: {"user_id": "admin"}) + + def _fake_delete_collection_everywhere(**_kwargs): + return {"qdrant_deleted": True, "qdrant_graph_deleted": True} + + monkeypatch.setattr(srv, "delete_collection_everywhere", _fake_delete_collection_everywhere) + + client = TestClient(srv.app) + resp = client.post("/admin/collections/delete", data={"collection": "c1", "delete_fs": ""}, follow_redirects=False) + assert resp.status_code == 302 + loc = resp.headers.get("location") or "" + assert "deleted=c1" in loc + assert "graph_deleted=1" in loc + + @pytest.mark.unit def test_collection_admin_refuses_when_env_disabled(monkeypatch): monkeypatch.setenv("CTXCE_ADMIN_COLLECTION_DELETE_ENABLED", "0") diff --git a/tests/test_change_history_for_path.py b/tests/test_change_history_for_path.py index 52be7592..7d822150 100644 --- a/tests/test_change_history_for_path.py +++ b/tests/test_change_history_for_path.py @@ -15,6 +15,10 @@ def tool(self, *args, **kwargs): def _decorator(fn): return fn return _decorator + def resource(self, *args, **kwargs): + def _decorator(fn): + return fn + return _decorator class _Context: def __init__(self, *args, **kwargs): @@ -98,4 +102,3 @@ async def test_change_history_strict_match_under_work(monkeypatch): assert summary.get("ingested_min") == 90 assert 
summary.get("ingested_max") == 115 assert summary.get("churn_count_max") == 5 - diff --git a/tests/test_globs_and_snippet.py b/tests/test_globs_and_snippet.py index 4c30ff48..e367bab3 100644 --- a/tests/test_globs_and_snippet.py +++ b/tests/test_globs_and_snippet.py @@ -116,6 +116,32 @@ def test_run_hybrid_search_slugged_path_globs(monkeypatch): assert "/work/other/docs/readme.md" not in paths +@pytest.mark.unit +def test_run_hybrid_search_under_recursive_scope(monkeypatch): + pts = [ + _Pt("1", "/work/repo/space/ship/a.py"), + _Pt("2", "/work/repo/direct/tools/b.py"), + ] + monkeypatch.setattr(hyb, "get_qdrant_client", lambda *a, **k: FakeQdrant(pts)) + monkeypatch.setattr(hyb, "return_qdrant_client", lambda *a, **k: None) + monkeypatch.setenv("EMBEDDING_MODEL", "unit-test") + monkeypatch.setenv("QDRANT_URL", "http://localhost:6333") + monkeypatch.setattr(hyb, "TextEmbedding", lambda *a, **k: FakeEmbed()) + monkeypatch.setattr(hyb, "_get_embedding_model", lambda *a, **k: FakeEmbed()) + + items = hyb.run_hybrid_search( + queries=["rotate heading"], + limit=10, + per_path=2, + under="space", + expand=False, + model=FakeEmbed(), + ) + paths = {it.get("path") for it in items} + assert "/work/repo/space/ship/a.py" in paths + assert "/work/repo/direct/tools/b.py" not in paths + + @pytest.mark.unit def test_dense_query_preserves_collection_on_filter_drop(monkeypatch): calls = [] diff --git a/tests/test_index_journal.py b/tests/test_index_journal.py new file mode 100644 index 00000000..ee8776e5 --- /dev/null +++ b/tests/test_index_journal.py @@ -0,0 +1,431 @@ +#!/usr/bin/env python3 +import importlib +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + + +pytestmark = pytest.mark.unit + + +@pytest.fixture +def ws_module(monkeypatch, tmp_path): + ws_root = tmp_path / "work" + ws_root.mkdir(parents=True, exist_ok=True) + monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + 
monkeypatch.delenv("MULTI_REPO_MODE", raising=False) + ws = importlib.import_module("scripts.workspace_state") + return importlib.reload(ws) + + +def test_index_journal_roundtrip(ws_module, tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "app.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + + ws_module.upsert_index_journal_entries( + [ + {"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}, + {"path": str(file_path.with_name("old.py")), "op_type": "delete"}, + ], + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + + pending = [ + str(e["path"]) + for e in ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ] + assert str(file_path.resolve()) in pending + assert str((file_path.with_name("old.py")).resolve()) in pending + + ws_module.update_index_journal_entry_status( + str(file_path), + status="done", + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + pending_after = [ + str(e["path"]) + for e in ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ] + assert str(file_path.resolve()) not in pending_after + assert str((file_path.with_name("old.py")).resolve()) in pending_after + + +def test_index_journal_entries_include_operation_types(ws_module, tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "entry.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + + ws_module.upsert_index_journal_entries( + [ + {"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}, + {"path": str(file_path.with_name("gone.py")), "op_type": "delete"}, + ], + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + + entries = ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / 
"work" / repo_name), + repo_name=repo_name, + ) + by_path = {entry["path"]: entry for entry in entries} + assert by_path[str(file_path.resolve())]["op_type"] == "upsert" + assert by_path[str((file_path.with_name("gone.py")).resolve())]["op_type"] == "delete" + + +def test_index_journal_aggregates_repo_scoped_entries(ws_module, tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "x.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + + ws_module.upsert_index_journal_entries( + [{"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}], + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + + pending = [ + str(e["path"]) + for e in ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / "work") + ) + ] + assert str(file_path.resolve()) in pending + + +@pytest.mark.parametrize("repo_name", ["repo-1234567890abcdef", "frontend"]) +def test_index_journal_aggregates_repo_scoped_entries_in_multi_repo_mode( + monkeypatch, tmp_path, repo_name +): + ws_root = tmp_path / "work" + ws_root.mkdir(parents=True, exist_ok=True) + monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + monkeypatch.setenv("MULTI_REPO_MODE", "1") + ws_module = importlib.import_module("scripts.workspace_state") + ws_module = importlib.reload(ws_module) + + file_name = "app.ts" if repo_name == "frontend" else "multi.py" + file_path = ws_root / repo_name / "src" / file_name + file_path.parent.mkdir(parents=True, exist_ok=True) + + ws_module.upsert_index_journal_entries( + [{"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}], + workspace_path=str(ws_root / repo_name), + repo_name=repo_name, + ) + + pending = [ + str(e["path"]) + for e in ws_module.list_pending_index_journal_entries(workspace_path=str(ws_root)) + ] + assert str(file_path.resolve()) in pending + + +def test_index_journal_file_is_group_writable(ws_module, 
tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "perm.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + + ws_module.upsert_index_journal_entries( + [{"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}], + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + + journal_path = ws_module._get_index_journal_path( + str(tmp_path / "work" / repo_name), repo_name + ) + assert journal_path.exists() + assert oct(journal_path.stat().st_mode & 0o777) == "0o666" + + +def test_index_journal_failed_entry_respects_retry_delay(ws_module, monkeypatch, tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "retry.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + monkeypatch.setenv("INDEX_JOURNAL_RETRY_DELAY_SECS", "60") + + ws_module.upsert_index_journal_entries( + [{"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}], + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ws_module.update_index_journal_entry_status( + str(file_path), + status="failed", + error="boom", + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + remove_on_done=False, + ) + + pending = [ + str(e["path"]) + for e in ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ] + assert str(file_path.resolve()) not in pending + + +def test_index_journal_failed_entry_honors_max_attempts(ws_module, monkeypatch, tmp_path): + repo_name = "repo-1234567890abcdef" + file_path = tmp_path / "work" / repo_name / "src" / "retry2.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + monkeypatch.setenv("INDEX_JOURNAL_RETRY_DELAY_SECS", "0") + monkeypatch.setenv("INDEX_JOURNAL_MAX_ATTEMPTS", "1") + + ws_module.upsert_index_journal_entries( + [{"path": str(file_path), "op_type": "upsert", "content_hash": "abc123"}], 
+ workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ws_module.update_index_journal_entry_status( + str(file_path), + status="failed", + error="boom", + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + remove_on_done=False, + ) + + pending = [ + str(e["path"]) + for e in ws_module.list_pending_index_journal_entries( + workspace_path=str(tmp_path / "work" / repo_name), + repo_name=repo_name, + ) + ] + assert str(file_path.resolve()) not in pending + + +def test_processor_delete_marks_journal_done(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + missing = tmp_path / "missing.py" + assert not missing.exists() + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "get_workspace_state", lambda *a, **k: {}) + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: False) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "remove_cached_file", lambda *a, **k: None) + + delete_mock = MagicMock() + graph_delete_mock = MagicMock() + journal_mock = MagicMock() + monkeypatch.setattr(proc_mod.idx, "delete_points_by_path", delete_mock) + monkeypatch.setattr(proc_mod.idx, "delete_graph_edges_by_path", graph_delete_mock) + monkeypatch.setattr(proc_mod, "_verify_delete_committed", lambda *a, **k: True) + monkeypatch.setattr(proc_mod, "update_index_journal_entry_status", journal_mock) + + proc_mod._process_paths( + [missing], + client=MagicMock(), + model=None, + 
vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + delete_mock.assert_called_once() + graph_delete_mock.assert_called_once() + journal_mock.assert_called_once() + assert journal_mock.call_args.kwargs["status"] == "done" + + +def test_processor_honors_delete_journal_for_existing_file(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + existing = tmp_path / "present.py" + existing.write_text("print('x')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "get_workspace_state", lambda *a, **k: {}) + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: False) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "remove_cached_file", lambda *a, **k: None) + monkeypatch.setattr( + proc_mod, + "list_pending_index_journal_entries", + lambda *a, **k: [{"path": str(existing.resolve()), "op_type": "delete"}], + ) + + delete_mock = MagicMock() + graph_delete_mock = MagicMock() + journal_mock = MagicMock() + monkeypatch.setattr(proc_mod.idx, "delete_points_by_path", delete_mock) + monkeypatch.setattr(proc_mod.idx, "delete_graph_edges_by_path", graph_delete_mock) + monkeypatch.setattr(proc_mod, "_verify_delete_committed", lambda *a, **k: True) + monkeypatch.setattr(proc_mod, "update_index_journal_entry_status", journal_mock) + + proc_mod._process_paths( + [existing], + client=MagicMock(), + model=None, + vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + 
delete_mock.assert_called_once() + graph_delete_mock.assert_called_once() + journal_mock.assert_called_once() + assert journal_mock.call_args.kwargs["status"] == "done" + + +def test_processor_relinks_move_journal_before_delete(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + src = tmp_path / "src.py" + dest = tmp_path / "dest.py" + dest.write_text("print('dest')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "get_workspace_state", lambda *a, **k: {}) + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: False) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "remove_cached_file", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "set_cached_file_hash", lambda *a, **k: None) + monkeypatch.setattr( + proc_mod, + "list_pending_index_journal_entries", + lambda *a, **k: [ + {"path": str(src.resolve()), "op_type": "delete", "content_hash": "cafebabe"}, + {"path": str(dest.resolve()), "op_type": "upsert", "content_hash": "cafebabe"}, + ], + ) + + rename_mock = MagicMock(return_value=(3, "cafebabe")) + delete_mock = MagicMock() + journal_mock = MagicMock() + monkeypatch.setattr(proc_mod, "_rename_in_store", rename_mock) + monkeypatch.setattr(proc_mod.idx, "delete_points_by_path", delete_mock) + monkeypatch.setattr(proc_mod, "update_index_journal_entry_status", journal_mock) + + proc_mod._process_paths( + [src, dest], + client=MagicMock(), + model=MagicMock(), + 
vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + rename_mock.assert_called_once() + delete_mock.assert_not_called() + done_paths = [call.args[0] for call in journal_mock.call_args_list if call.kwargs.get("status") == "done"] + assert str(dest.resolve()) in done_paths + assert str(src.resolve()) in done_paths + + +def test_processor_skips_internal_git_path_without_collection_resolution(monkeypatch): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + internal = Path("/work/.git/HEAD") + + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "get_workspace_state", lambda *a, **k: {}) + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: False) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + + collection_mock = MagicMock(return_value="should-not-be-used") + journal_mock = MagicMock() + monkeypatch.setattr(proc_mod, "_get_collection_for_file", collection_mock) + monkeypatch.setattr(proc_mod, "update_index_journal_entry_status", journal_mock) + monkeypatch.setattr( + proc_mod, + "list_pending_index_journal_entries", + lambda *a, **k: [{"path": str(internal), "op_type": "delete"}], + ) + + proc_mod._process_paths( + [internal], + client=MagicMock(), + model=None, + vector_name="vec", + model_dim=1, + workspace_path="/work", + ) + + collection_mock.assert_not_called() + journal_mock.assert_called_once() + assert journal_mock.call_args.kwargs["status"] == "done" + + +def test_processor_force_upsert_empty_file_marks_done(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + empty_file = tmp_path / "pkg" / "__init__.py" + 
empty_file.parent.mkdir(parents=True, exist_ok=True) + empty_file.write_text("", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "get_workspace_state", lambda *a, **k: {}) + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: False) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "remove_cached_file", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_run_indexing_strategy", lambda *a, **k: False) + monkeypatch.setattr(proc_mod, "_path_has_indexed_points", lambda *a, **k: False) + + journal_mock = MagicMock() + monkeypatch.setattr(proc_mod, "update_index_journal_entry_status", journal_mock) + monkeypatch.setattr( + proc_mod, + "list_pending_index_journal_entries", + lambda *a, **k: [ + { + "path": str(empty_file.resolve()), + "op_type": "upsert", + "content_hash": "da39a3ee5e6b4b0d3255bfef95601890afd80709", + } + ], + ) + + proc_mod._process_paths( + [empty_file], + client=MagicMock(), + model=MagicMock(), + vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + journal_mock.assert_called_once() + assert journal_mock.call_args.kwargs["status"] == "done" diff --git a/tests/test_ingest_cli.py b/tests/test_ingest_cli.py new file mode 100644 index 00000000..493d1a68 --- /dev/null +++ b/tests/test_ingest_cli.py @@ -0,0 +1,57 @@ +import sys +from pathlib import Path + +import pytest + + +@pytest.mark.unit +def test_cli_force_collection_disables_multi_repo_enumeration(monkeypatch, tmp_path: Path): + from 
scripts.ingest import cli + + # Create fake repo dirs to prove we are not enumerating them. + (tmp_path / "repo_a").mkdir() + (tmp_path / "repo_b").mkdir() + + calls = [] + + def _fake_index_repo( + root, + qdrant_url, + api_key, + collection, + model_name, + recreate, + dedupe, + skip_unchanged, + pseudo_mode, + schema_mode, + ): + calls.append( + { + "root": Path(root), + "collection": collection, + "recreate": recreate, + "dedupe": dedupe, + "skip_unchanged": skip_unchanged, + } + ) + + monkeypatch.setattr(cli, "index_repo", _fake_index_repo) + monkeypatch.setattr(cli, "is_multi_repo_mode", lambda: True) + monkeypatch.setattr(cli, "get_collection_name", lambda *_: "should-not-use") + + monkeypatch.setenv("MULTI_REPO_MODE", "1") + monkeypatch.setenv("COLLECTION_NAME", "forced-collection") + monkeypatch.setenv("CTXCE_FORCE_COLLECTION_NAME", "1") + + monkeypatch.setattr( + sys, + "argv", + ["ingest_code.py", "--root", str(tmp_path)], + ) + + cli.main() + + assert len(calls) == 1 + assert calls[0]["root"] == tmp_path + assert calls[0]["collection"] == "forced-collection" diff --git a/tests/test_ingest_schema_mode.py b/tests/test_ingest_schema_mode.py index c766089b..461212e3 100644 --- a/tests/test_ingest_schema_mode.py +++ b/tests/test_ingest_schema_mode.py @@ -91,6 +91,7 @@ def test_schema_mode_validate_errors_on_missing_vectors(monkeypatch): def test_schema_mode_migrate_adds_missing_vectors_and_indexes(monkeypatch): monkeypatch.setenv("PATTERN_VECTORS", "1") monkeypatch.setattr(ingq, "LEX_SPARSE_MODE", False) + ingq.ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard("test-collection") existing_vectors = { "code": object(), @@ -122,6 +123,7 @@ def test_schema_mode_migrate_adds_missing_vectors_and_indexes(monkeypatch): def test_schema_mode_create_creates_collection_only(monkeypatch): monkeypatch.setenv("PATTERN_VECTORS", "0") monkeypatch.setattr(ingq, "LEX_SPARSE_MODE", False) + ingq.ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard("test-collection") client = 
FakeClient(collection_exists=False) @@ -138,3 +140,15 @@ def test_schema_mode_create_creates_collection_only(monkeypatch): assert any( c["field_name"] == "metadata.language" for c in client.payload_index_calls ) + + +def test_ensure_payload_indexes_memoized_per_process(): + client = FakeClient(collection_exists=True) + ingq.ENSURED_PAYLOAD_INDEX_COLLECTIONS.discard("test-collection") + + ingq.ensure_payload_indexes(client, "test-collection") + first_count = len(client.payload_index_calls) + ingq.ensure_payload_indexes(client, "test-collection") + + assert first_count == len(ingq.PAYLOAD_INDEX_FIELDS) + assert len(client.payload_index_calls) == first_count diff --git a/tests/test_integration_qdrant.py b/tests/test_integration_qdrant.py index 65cef7f5..ab5d71d1 100644 --- a/tests/test_integration_qdrant.py +++ b/tests/test_integration_qdrant.py @@ -1,6 +1,7 @@ import os import json import uuid +import asyncio import importlib import pytest @@ -75,7 +76,7 @@ def test_index_and_search_minirepo(tmp_path, monkeypatch, qdrant_container): ) # Search directly via async function - res = srv.asyncio.get_event_loop().run_until_complete( + res = asyncio.run( srv.repo_search( queries=["def f"], limit=5, @@ -127,19 +128,19 @@ def test_filters_language_and_path(tmp_path, monkeypatch, qdrant_container): f_md = str(tmp_path / "pkg" / "b.md") # Filter by language=python should bias toward .py - res1 = srv.asyncio.get_event_loop().run_until_complete( + res1 = asyncio.run( srv.repo_search(queries=["def"], limit=5, language="python", compact=False) ) assert any(f_py in (r.get("path") or "") for r in res1.get("results", [])) # Filter by ext=txt should retrieve text file - res2 = srv.asyncio.get_event_loop().run_until_complete( + res2 = asyncio.run( srv.repo_search(queries=["hello"], limit=5, ext="md", compact=False) ) assert any(f_md in (r.get("path") or "") for r in res2.get("results", [])) # Path glob to only allow pkg/*.py - res3 = srv.asyncio.get_event_loop().run_until_complete( + 
res3 = asyncio.run( srv.repo_search( queries=["def"], limit=5, diff --git a/tests/test_negative_args.py b/tests/test_negative_args.py index 32dadd52..f5181375 100644 --- a/tests/test_negative_args.py +++ b/tests/test_negative_args.py @@ -1,4 +1,5 @@ import os +import asyncio import pytest import scripts.mcp_indexer_server as srv @@ -14,7 +15,7 @@ def test_repo_search_conflicting_filters_empty_ok(monkeypatch): monkeypatch.setattr(hy, "run_hybrid_search", lambda *a, **k: []) - res = srv.asyncio.get_event_loop().run_until_complete( + res = asyncio.run( srv.repo_search(queries=["foo"], limit=3, ext="cpp", compact=True) ) diff --git a/tests/test_path_scope.py b/tests/test_path_scope.py new file mode 100644 index 00000000..06369860 --- /dev/null +++ b/tests/test_path_scope.py @@ -0,0 +1,61 @@ +import importlib + + +ps = importlib.import_module("scripts.path_scope") + + +def test_normalize_under_strips_work_prefix(): + assert ps.normalize_under("/work/scripts/mcp_impl") == "scripts/mcp_impl" + + +def test_normalize_under_keeps_repo_prefixed_path(): + assert ( + ps.normalize_under("/work/Context-Engine/scripts/mcp_impl") + == "Context-Engine/scripts/mcp_impl" + ) + + +def test_normalize_under_rebases_single_segment_from_cwd(monkeypatch, tmp_path): + repo = tmp_path / "repo" + (repo / "nested" / "scope").mkdir(parents=True) + monkeypatch.setattr(ps, "_repo_root_hint", lambda: str(repo)) + monkeypatch.setattr(ps.os, "getcwd", lambda: str(repo / "nested")) + + assert ps.normalize_under("scope") == "nested/scope" + + +def test_normalize_under_does_not_rebase_when_top_level_exists(monkeypatch, tmp_path): + repo = tmp_path / "repo" + (repo / "nested" / "scope").mkdir(parents=True) + (repo / "scope").mkdir(parents=True) + monkeypatch.setattr(ps, "_repo_root_hint", lambda: str(repo)) + monkeypatch.setattr(ps.os, "getcwd", lambda: str(repo / "nested")) + + assert ps.normalize_under("scope") == "scope" + + +def test_normalize_under_expands_unique_segment(monkeypatch, tmp_path): + 
repo = tmp_path / "repo" + (repo / "alpha" / "mcp_impl").mkdir(parents=True) + monkeypatch.setattr(ps, "_repo_root_hint", lambda: str(repo)) + monkeypatch.setattr(ps.os, "getcwd", lambda: str(repo)) + ps._unique_segment_path.cache_clear() + + assert ps.normalize_under("mcp_impl") == "alpha/mcp_impl" + + +def test_normalize_under_keeps_ambiguous_segment(monkeypatch, tmp_path): + repo = tmp_path / "repo" + (repo / "alpha" / "dup").mkdir(parents=True) + (repo / "beta" / "dup").mkdir(parents=True) + monkeypatch.setattr(ps, "_repo_root_hint", lambda: str(repo)) + monkeypatch.setattr(ps.os, "getcwd", lambda: str(repo)) + ps._unique_segment_path.cache_clear() + + assert ps.normalize_under("dup") == "dup" + + +def test_metadata_matches_under_without_repo_hint_for_work_repo_paths(): + md = {"path": "/work/repo/space/ship/a.py"} + assert ps.metadata_matches_under(md, "space") + assert not ps.metadata_matches_under(md, "direct") diff --git a/tests/test_rerank_under_scope.py b/tests/test_rerank_under_scope.py new file mode 100644 index 00000000..080da9cd --- /dev/null +++ b/tests/test_rerank_under_scope.py @@ -0,0 +1,66 @@ +import importlib + + +rr = importlib.import_module("scripts.rerank_tools.local") + + +class _Pt: + def __init__(self, pid: str, path: str): + self.id = pid + self.payload = { + "metadata": { + "path": path, + "start_line": 1, + "end_line": 2, + "symbol": "f", + } + } + + +class _FakeModel: + def embed(self, texts): + for _ in texts: + yield [0.01] * 8 + + +def test_rerank_in_process_under_excludes_out_of_scope(monkeypatch): + monkeypatch.setattr(rr, "QdrantClient", lambda *a, **k: object()) + monkeypatch.setattr(rr, "_select_dense_vector_name", lambda *a, **k: "vec") + monkeypatch.setattr( + rr, + "dense_results", + lambda *a, **k: [_Pt("1", "/work/repo/direct/tools/b.py")], + ) + monkeypatch.setattr(rr, "rerank_local", lambda pairs: [0.9] * len(pairs)) + + out = rr.rerank_in_process( + query="rotate heading", + topk=10, + limit=5, + under="space", + 
model=_FakeModel(), + collection="codebase", + ) + assert out == [] + + +def test_rerank_in_process_under_keeps_in_scope(monkeypatch): + monkeypatch.setattr(rr, "QdrantClient", lambda *a, **k: object()) + monkeypatch.setattr(rr, "_select_dense_vector_name", lambda *a, **k: "vec") + monkeypatch.setattr( + rr, + "dense_results", + lambda *a, **k: [_Pt("1", "/work/repo/space/ship/a.py")], + ) + monkeypatch.setattr(rr, "rerank_local", lambda pairs: [0.9] * len(pairs)) + + out = rr.rerank_in_process( + query="rotate heading", + topk=10, + limit=5, + under="space", + model=_FakeModel(), + collection="codebase", + ) + assert len(out) == 1 + assert out[0]["path"] == "/work/repo/space/ship/a.py" diff --git a/tests/test_reranker_verification.py b/tests/test_reranker_verification.py index e7a05245..b43441eb 100644 --- a/tests/test_reranker_verification.py +++ b/tests/test_reranker_verification.py @@ -16,6 +16,10 @@ def tool(self, *args, **kwargs): def _decorator(fn): return fn return _decorator + def resource(self, *args, **kwargs): + def _decorator(fn): + return fn + return _decorator class _Context: def __init__(self, *args, **kwargs): @@ -102,7 +106,9 @@ def fake_rerank_local(pairs): assert [r["path"] for r in base["results"]] == ["/work/a.py", "/work/b.py"] # With rerank enabled, order should flip to B then A; counters should show inproc_hybrid - rr = await server.repo_search(query="q", limit=2, per_path=2, rerank_enabled=True, compact=True) + rr = await server.repo_search( + query="q", limit=2, per_path=2, rerank_enabled=True, compact=True, debug=True + ) assert rr.get("used_rerank") is True assert rr.get("rerank_counters", {}).get("inproc_hybrid", 0) >= 1 assert [r["path"] for r in rr["results"]] == ["/work/b.py", "/work/a.py"] @@ -147,6 +153,69 @@ def fake_rerank_in_process(**kwargs): assert captured.get("collection") == "other-collection" +@pytest.mark.service +@pytest.mark.anyio +async def test_rerank_inproc_dense_respects_path_filters(monkeypatch): + 
monkeypatch.setenv("HYBRID_IN_PROCESS", "1") + monkeypatch.setenv("RERANK_IN_PROCESS", "1") + + def fake_run_hybrid_search(**kwargs): + return [] + + monkeypatch.setitem(sys.modules, "scripts.hybrid_search", _make_hybrid_stub(fake_run_hybrid_search)) + monkeypatch.delitem(sys.modules, "scripts.mcp_indexer_server", raising=False) + server = importlib.import_module("scripts.mcp_indexer_server") + monkeypatch.setattr(server, "_get_embedding_model", _fake_embedding_model) + + def fake_rerank_in_process(**kwargs): + return [ + {"score": 0.9, "path": "/work/src/a.py", "symbol": "", "start_line": 1, "end_line": 3}, + {"score": 0.8, "path": "/work/tests/b.py", "symbol": "", "start_line": 5, "end_line": 9}, + { + "score": 0.7, + "path": "/home/coder/project/Context-Engine/scripts/mcp_impl/search.py", + "symbol": "", + "start_line": 10, + "end_line": 20, + }, + ] + + monkeypatch.setattr( + importlib.import_module("scripts.rerank_local"), + "rerank_in_process", + fake_rerank_in_process, + ) + + only_tests = await server.repo_search( + query="q", + limit=10, + rerank_enabled=True, + path_glob=["tests/**"], + compact=True, + ) + assert [r["path"] for r in only_tests["results"]] == ["/work/tests/b.py"] + + no_tests = await server.repo_search( + query="q", + limit=10, + rerank_enabled=True, + not_glob=["**/tests/**"], + compact=True, + ) + assert all("/tests/" not in r["path"] for r in no_tests["results"]) + + host_rel_glob = await server.repo_search( + query="q", + limit=10, + rerank_enabled=True, + path_glob=["scripts/mcp_impl/**"], + compact=True, + ) + assert [r["path"] for r in host_rel_glob["results"]] == [ + "/home/coder/project/Context-Engine/scripts/mcp_impl/search.py" + ] + + @pytest.mark.service @pytest.mark.anyio async def test_rerank_subprocess_timeout_fallback(monkeypatch): @@ -187,9 +256,9 @@ async def fake_run_async(cmd, env=None, timeout=None): rerank_enabled=True, compact=True, collection="test-coll", + debug=True, ) # Fallback should keep original order from 
hybrid; timeout counter incremented assert rr.get("used_rerank") is False assert rr.get("rerank_counters", {}).get("timeout", 0) >= 1 assert [r["path"] for r in rr["results"]] == ["/work/a.py", "/work/b.py"] - diff --git a/tests/test_server_helpers.py b/tests/test_server_helpers.py index 94212145..84856ee6 100644 --- a/tests/test_server_helpers.py +++ b/tests/test_server_helpers.py @@ -1,4 +1,5 @@ import json +import asyncio import types import importlib @@ -53,7 +54,7 @@ def test_repo_search_arg_normalization(monkeypatch, tmp_path): # Ensure in-process branch stays off monkeypatch.delenv("HYBRID_IN_PROCESS", raising=False) - res = srv.asyncio.get_event_loop().run_until_complete( + res = asyncio.run( _call_repo_search( queries=["FooBar"], limit="12", # str on purpose to test coercion diff --git a/tests/test_service_qdrant_status.py b/tests/test_service_qdrant_status.py index df04254c..38a1ec6b 100644 --- a/tests/test_service_qdrant_status.py +++ b/tests/test_service_qdrant_status.py @@ -1,4 +1,5 @@ import types +import asyncio import importlib import pytest @@ -31,7 +32,7 @@ def test_qdrant_status_mocked(monkeypatch): monkeypatch.setattr(qdrant_client, "QdrantClient", lambda *a, **k: FakeQdrant()) - out = srv.asyncio.get_event_loop().run_until_complete( + out = asyncio.run( srv.qdrant_status(collection="test") ) # qdrant_status returns a summary shape without an 'ok' key diff --git a/tests/test_smart_reindex_vectors.py b/tests/test_smart_reindex_vectors.py index 2e77056e..aca0e819 100644 --- a/tests/test_smart_reindex_vectors.py +++ b/tests/test_smart_reindex_vectors.py @@ -2,6 +2,7 @@ import sys from types import SimpleNamespace from pathlib import Path +from unittest.mock import MagicMock import pytest @@ -119,6 +120,178 @@ def fake_upsert_points(_client, _collection, points): assert out_vec[ingest_code.LEX_VECTOR_NAME] != old_lex +def test_should_process_pseudo_for_chunk_reuses_cache_after_line_shift(monkeypatch): + from scripts.ingest import pseudo as 
pseudo_mod + + monkeypatch.setattr(pseudo_mod, "get_cached_pseudo", lambda *a, **k: ("", [])) + monkeypatch.setattr( + pseudo_mod, + "get_cached_symbols", + lambda _fp: { + "function_foo_10": { + "name": "foo", + "type": "function", + "pseudo": "cached pseudo", + "tags": ["alpha", "beta"], + } + }, + ) + + needs_processing, pseudo, tags = pseudo_mod.should_process_pseudo_for_chunk( + "x.py", + {"symbol": "foo", "kind": "function", "start": 12}, + changed_symbols=set(), + ) + + assert needs_processing is False + assert pseudo == "cached pseudo" + assert tags == ["alpha", "beta"] + + +def test_smart_reindex_persists_pseudo_on_shifted_symbol_ids(tmp_path, monkeypatch): + monkeypatch.setitem(sys.modules, "fastembed", SimpleNamespace(TextEmbedding=object)) + monkeypatch.setenv("PSEUDO_BATCH_CONCURRENCY", "1") + + from scripts import ingest_code + from scripts.ingest import pipeline as ingest_pipeline + + fp = tmp_path / "x.py" + fp.write_text("def foo():\n return 1\n", encoding="utf-8") + + monkeypatch.setattr( + ingest_pipeline, + "extract_symbols_with_tree_sitter", + lambda _fp: { + "function_foo_12": { + "name": "foo", + "type": "function", + "start_line": 12, + "end_line": 13, + "content_hash": "samehash", + "pseudo": "", + "tags": [], + "qdrant_ids": [], + }, + "function_bar_20": { + "name": "bar", + "type": "function", + "start_line": 20, + "end_line": 21, + "content_hash": "barhash-new", + "pseudo": "", + "tags": [], + "qdrant_ids": [], + }, + }, + ) + monkeypatch.setattr( + ingest_pipeline, + "get_cached_symbols", + lambda _fp: { + "function_foo_10": { + "name": "foo", + "type": "function", + "start_line": 10, + "end_line": 11, + "content_hash": "samehash", + "pseudo": "cached pseudo", + "tags": ["tag1"], + "qdrant_ids": [], + }, + "function_bar_20": { + "name": "bar", + "type": "function", + "start_line": 20, + "end_line": 21, + "content_hash": "barhash-old", + "pseudo": "old bar", + "tags": ["old"], + "qdrant_ids": [], + }, + }, + ) + monkeypatch.setattr( + 
ingest_pipeline, + "compare_symbol_changes", + lambda *_: (["function_foo_12"], ["function_bar_20"]), + ) + monkeypatch.setattr(ingest_pipeline, "ensure_collection_and_indexes_once", lambda *a, **k: None) + + class FakeClient: + def scroll(self, **kwargs): + return ([], None) + + monkeypatch.setattr(ingest_pipeline, "delete_points_by_path", lambda *a, **k: None) + monkeypatch.setattr(ingest_pipeline, "upsert_points", lambda *a, **k: None) + monkeypatch.setattr( + ingest_pipeline, + "_sync_graph_edges_best_effort", + lambda *a, **k: None, + raising=False, + ) + monkeypatch.setattr(ingest_pipeline, "_get_imports_calls", lambda *a, **k: ([], [])) + monkeypatch.setattr(ingest_pipeline, "_git_metadata", lambda *a, **k: (0, 0, 0)) + monkeypatch.setattr(ingest_pipeline, "_compute_host_and_container_paths", lambda _p: ("", "")) + monkeypatch.setattr(ingest_pipeline, "_lex_hash_vector_text", lambda _t: [0.0] * ingest_code.LEX_VECTOR_DIM) + monkeypatch.setattr(ingest_pipeline, "_select_dense_text", lambda **kwargs: kwargs.get("code_text") or "") + monkeypatch.setattr(ingest_pipeline, "embed_batch", lambda _model, texts: [[0.1, 0.2, 0.3] for _ in texts]) + monkeypatch.setattr(ingest_code, "embed_batch", lambda _model, texts: [[0.1, 0.2, 0.3] for _ in texts]) + monkeypatch.setattr(ingest_pipeline, "generate_pseudo_tags", lambda _t: ("NEW", ["fresh"])) + monkeypatch.setattr( + ingest_pipeline, + "chunk_lines", + lambda text, *_a, **_k: [ + {"start": 12, "end": 13, "text": text, "symbol": "foo", "kind": "function"}, + {"start": 20, "end": 21, "text": text, "symbol": "bar", "kind": "function"}, + ], + ) + monkeypatch.setattr( + ingest_pipeline, + "chunk_semantic", + lambda text, *_a, **_k: [ + {"start": 12, "end": 13, "text": text, "symbol": "foo", "kind": "function"}, + {"start": 20, "end": 21, "text": text, "symbol": "bar", "kind": "function"}, + ], + ) + monkeypatch.setattr( + ingest_pipeline, + "chunk_by_tokens", + lambda text, *_a, **_k: [ + {"start": 12, "end": 13, "text": 
text, "symbol": "foo", "kind": "function"}, + {"start": 20, "end": 21, "text": text, "symbol": "bar", "kind": "function"}, + ], + ) + monkeypatch.setattr(ingest_pipeline, "_extract_symbols", lambda *_a, **_k: []) + monkeypatch.setattr(ingest_pipeline, "build_information", lambda *a, **k: "info") + monkeypatch.setattr(ingest_pipeline, "hash_id", lambda *a, **k: 1) + monkeypatch.setattr(ingest_pipeline, "generate_pseudo_tags_batch", None, raising=False) + + saved = {} + monkeypatch.setattr(ingest_pipeline, "set_cached_pseudo", lambda *a, **k: None) + monkeypatch.setattr(ingest_pipeline, "set_cached_file_hash", lambda *a, **k: None) + monkeypatch.setattr(ingest_pipeline, "should_process_pseudo_for_chunk", ingest_code.should_process_pseudo_for_chunk) + monkeypatch.setattr(ingest_pipeline, "set_cached_symbols", lambda _fp, symbols, _hash: saved.update(symbols)) + + status = ingest_code.process_file_with_smart_reindexing( + file_path=fp, + text=fp.read_text(encoding="utf-8"), + language="python", + client=FakeClient(), + current_collection="c", + per_file_repo="r", + model=object(), + vector_name="dense", + model_dim=3, + ) + + assert status == "success" + # `foo` is logically reusable across the line shift, but chunk-level pseudo + # generation may still refresh it depending on chunk processing order. 
+ assert saved["function_foo_12"]["pseudo"] in {"cached pseudo", "NEW"} + assert saved["function_bar_20"]["pseudo"] == "NEW" + assert saved["function_foo_12"]["tags"] + + def test_smart_reindex_does_not_reuse_when_info_changes(tmp_path, monkeypatch): """Dense embeddings must not be reused if `information` differs.""" @@ -291,3 +464,42 @@ def fake_upsert_points(_client, _collection, points): assert len(captured["points"]) == 1 out_vec = captured["points"][0].vector assert out_vec == embedded_vec + + +def test_smart_reindex_no_symbol_changes_falls_back_without_hash_cache(tmp_path, monkeypatch): + monkeypatch.setitem(sys.modules, "fastembed", SimpleNamespace(TextEmbedding=object)) + + from scripts.ingest import pipeline as ingest_pipeline + + code = "def hi():\n return 1\n" + fp = tmp_path / "x.py" + fp.write_text(code, encoding="utf-8") + + monkeypatch.setattr( + ingest_pipeline, + "extract_symbols_with_tree_sitter", + lambda _fp: {"function_hi_1": {"name": "hi", "type": "function", "start_line": 1}}, + ) + monkeypatch.setattr( + ingest_pipeline, + "get_cached_symbols", + lambda _fp: {"function_hi_1": {"name": "hi", "type": "function", "start_line": 1}}, + ) + monkeypatch.setattr(ingest_pipeline, "compare_symbol_changes", lambda *_: ([], [])) + monkeypatch.setattr(ingest_pipeline, "get_cached_file_hash", lambda *_: None) + set_cached_file_hash = MagicMock() + monkeypatch.setattr(ingest_pipeline, "set_cached_file_hash", set_cached_file_hash) + + status = ingest_pipeline.process_file_with_smart_reindexing( + file_path=Path(fp), + text=code, + language="python", + client=MagicMock(), + current_collection="c", + per_file_repo="r", + model=object(), + vector_name="dense", + ) + + assert status == "failed" + set_cached_file_hash.assert_not_called() diff --git a/tests/test_staging_lifecycle.py b/tests/test_staging_lifecycle.py index 01734e32..ea7ab528 100644 --- a/tests/test_staging_lifecycle.py +++ b/tests/test_staging_lifecycle.py @@ -542,6 +542,55 @@ def 
fake_abort(**kwargs): assert calls["abort"] == 1 +def test_admin_copy_endpoint_reports_graph_clone_in_redirect(monkeypatch: pytest.MonkeyPatch): + import sys + import types + from urllib.parse import parse_qs, urlparse + + from scripts import upload_service + + monkeypatch.setattr(upload_service, "AUTH_ENABLED", True) + monkeypatch.setattr(upload_service, "_require_admin_session", lambda request: {"user_id": "admin"}) + monkeypatch.setattr(upload_service, "WORK_DIR", "/fake/work") + monkeypatch.setenv("WORK_DIR", "/fake/work") + monkeypatch.setattr(upload_service, "pooled_qdrant_client", None, raising=False) + + def fake_copy_collection_qdrant(**kwargs): + assert kwargs.get("source") == "src" + assert kwargs.get("target") == "dst" + return "dst" + + monkeypatch.setattr(upload_service, "copy_collection_qdrant", fake_copy_collection_qdrant) + + class _FakeQdrantClient: + def __init__(self, *args, **kwargs): + pass + + def get_collection(self, collection_name: str): + if collection_name == "dst_graph": + return {"name": collection_name} + raise RuntimeError("not found") + + def close(self): + return None + + monkeypatch.setitem(sys.modules, "qdrant_client", types.SimpleNamespace(QdrantClient=_FakeQdrantClient)) + + client = TestClient(upload_service.app) + resp = client.post( + "/admin/staging/copy", + data={"collection": "src", "target": "dst", "overwrite": ""}, + follow_redirects=False, + ) + assert resp.status_code == 302 + loc = resp.headers.get("location") or "" + parsed = urlparse(loc) + qs = parse_qs(parsed.query) + assert qs.get("copied") == ["src"] + assert qs.get("new") == ["dst"] + assert qs.get("graph_copied") == ["1"] + + def test_watcher_collection_resolution_prefers_serving_state_when_staging_enabled(monkeypatch: pytest.MonkeyPatch, tmp_path: Path): from scripts.watch_index_core import utils as watch_utils @@ -650,7 +699,9 @@ class _Proc: env = captured["env"] assert env["BASE_ONLY"] == "system" assert env["COLLECTION_NAME"] == "primary-coll" - assert 
"CTXCE_FORCE_COLLECTION_NAME" not in env + # Admin-spawned ingests should never enumerate `/work/*` in multi-repo mode; + # force exact collection/root handling even when no explicit overrides are provided. + assert env.get("CTXCE_FORCE_COLLECTION_NAME") == "1" def test_promote_pending_env_without_pending_config(staging_workspace: dict): diff --git a/tests/test_symbol_graph_tool.py b/tests/test_symbol_graph_tool.py index e148fc17..1d3861d0 100644 --- a/tests/test_symbol_graph_tool.py +++ b/tests/test_symbol_graph_tool.py @@ -2,21 +2,43 @@ @pytest.mark.asyncio -async def test_symbol_graph_under_uses_path_prefix_matchvalue(): - # Import internal helper to validate filter construction without needing a real Qdrant instance. - from qdrant_client import models as qmodels +async def test_symbol_graph_under_filters_results_by_recursive_scope(): + # Validate that under applies as recursive subtree filter (user-facing scope). from scripts.mcp_impl import symbol_graph as sg - captured = {} + class _Pt: + def __init__(self, pid, path): + self.id = pid + self.payload = { + "metadata": { + "repo": "repo", + "path": path, + "start_line": 1, + "end_line": 2, + "symbol": "f", + "symbol_path": "f", + "language": "python", + "calls": ["foo"], + } + } class FakeClient: + def __init__(self): + self.scroll_filters = [] + def scroll(self, *, collection_name, scroll_filter, limit, with_payload, with_vectors): - captured["collection_name"] = collection_name - captured["scroll_filter"] = scroll_filter - return ([], None) + self.scroll_filters.append(scroll_filter) + return ( + [ + _Pt("1", "/work/repo/scripts/a.py"), + _Pt("2", "/work/repo/tests/b.py"), + ], + None, + ) - await sg._query_array_field( # type: ignore[attr-defined] - client=FakeClient(), + client = FakeClient() + out = await sg._query_array_field( # type: ignore[attr-defined] + client=client, collection="codebase", field_key="metadata.calls", value="foo", @@ -25,15 +47,29 @@ def scroll(self, *, collection_name, scroll_filter, 
limit, with_payload, with_ve under=sg._norm_under("scripts"), # type: ignore[attr-defined] ) - flt = captured.get("scroll_filter") - assert isinstance(flt, qmodels.Filter) - must = list(flt.must or []) - keys = [getattr(c, "key", None) for c in must] - assert "metadata.path_prefix" in keys - - # Ensure it's an exact match (MatchValue), not substring (MatchText) - cond = next(c for c in must if getattr(c, "key", None) == "metadata.path_prefix") - assert isinstance(cond.match, qmodels.MatchValue) - assert cond.match.value == "/work/scripts" - + # Validate _query_array_field forwards language/value constraints to scroll_filter. + assert client.scroll_filters, "Expected at least one scroll() call" + first_filter = client.scroll_filters[0] + first_must = list(getattr(first_filter, "must", []) or []) + assert any( + getattr(cond, "key", None) == "metadata.calls" + and getattr(getattr(cond, "match", None), "any", None) == ["foo"] + for cond in first_must + ) + assert any( + getattr(cond, "key", None) == "metadata.language" + and getattr(getattr(cond, "match", None), "value", None) == "python" + for cond in first_must + ) + assert any( + any( + getattr(cond, "key", None) == "metadata.calls" + and getattr(getattr(cond, "match", None), "text", None) == "foo" + for cond in list(getattr(sf, "must", []) or []) + ) + for sf in client.scroll_filters + ), "Expected MatchText fallback filter for metadata.calls" + paths = {r.get("path") for r in out} + assert "/work/repo/scripts/a.py" in paths + assert "/work/repo/tests/b.py" not in paths diff --git a/tests/test_upload_client_ignore_cleanup.py b/tests/test_upload_client_ignore_cleanup.py new file mode 100644 index 00000000..1c01cc67 --- /dev/null +++ b/tests/test_upload_client_ignore_cleanup.py @@ -0,0 +1,585 @@ +import importlib +from pathlib import Path +from unittest.mock import MagicMock + + +def _exercise_ignored_path_cleanup(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + 
workspace = tmp_path / "repo" + ignored = workspace / "dev-workspace" / "nested.py" + ignored.parent.mkdir(parents=True, exist_ok=True) + ignored.write_text("print('dogfood')\n", encoding="utf-8") + + monkeypatch.setenv("DEV_REMOTE_MODE", "1") + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: "abc123") + monkeypatch.setattr(mod, "set_cached_file_hash", lambda *a, **k: None) + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + changes = client.detect_file_changes([ignored]) + + assert ignored in changes["deleted"] + assert not changes["created"] + assert not changes["updated"] + assert not changes["moved"] + + +def test_remote_upload_client_marks_ignored_cached_paths_deleted(monkeypatch, tmp_path): + _exercise_ignored_path_cleanup("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_marks_ignored_cached_paths_deleted(monkeypatch, tmp_path): + _exercise_ignored_path_cleanup("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def _exercise_force_mode_cleanup(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + stale_ignored = workspace / "dev-workspace" / "nested.py" + stale_ignored.parent.mkdir(parents=True, exist_ok=True) + stale_ignored.write_text("print('stale')\n", encoding="utf-8") + + monkeypatch.setenv("DEV_REMOTE_MODE", "1") + monkeypatch.setattr(mod, "get_all_cached_paths", lambda repo_name=None: [str(stale_ignored)]) + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: "abc123") + monkeypatch.setattr(mod, "set_cached_file_hash", lambda *a, **k: None) + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + 
workspace_path=str(workspace), + collection_name="test-coll", + ) + + changes = client.build_force_changes([current]) + + assert current in changes["created"] + assert stale_ignored in changes["deleted"] + assert not changes["updated"] + assert not changes["moved"] + + +def test_remote_upload_client_force_mode_keeps_creates_and_deletes_ignored_cached_paths(monkeypatch, tmp_path): + _exercise_force_mode_cleanup("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_force_mode_keeps_creates_and_deletes_ignored_cached_paths(monkeypatch, tmp_path): + _exercise_force_mode_cleanup("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def _exercise_force_mode_excludes_ignored_current_files(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + ignored_current = workspace / "dev-workspace" / "ignored.py" + ignored_current.parent.mkdir(parents=True, exist_ok=True) + ignored_current.write_text("print('ignored')\n", encoding="utf-8") + + monkeypatch.setenv("DEV_REMOTE_MODE", "1") + monkeypatch.setattr(mod, "get_all_cached_paths", lambda repo_name=None: []) + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: None) + monkeypatch.setattr(mod, "set_cached_file_hash", lambda *a, **k: None) + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + changes = client.build_force_changes([current, ignored_current]) + + assert current in changes["created"] + assert ignored_current not in changes["created"] + assert ignored_current in changes["deleted"] + assert not changes["updated"] + assert not changes["moved"] + + +def test_remote_upload_client_force_mode_excludes_ignored_current_files(monkeypatch, 
tmp_path): + _exercise_force_mode_excludes_ignored_current_files( + "scripts.remote_upload_client", + monkeypatch, + tmp_path, + ) + + +def test_standalone_upload_client_force_mode_excludes_ignored_current_files(monkeypatch, tmp_path): + _exercise_force_mode_excludes_ignored_current_files( + "scripts.standalone_upload_client", + monkeypatch, + tmp_path, + ) + + +def _exercise_force_mode_dev_workspace_cleanup_without_cache(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + mirrored = workspace / "dev-workspace" / "nested" / "stale.py" + mirrored.parent.mkdir(parents=True, exist_ok=True) + mirrored.write_text("print('stale')\n", encoding="utf-8") + + monkeypatch.setenv("DEV_REMOTE_MODE", "1") + monkeypatch.setattr(mod, "get_all_cached_paths", lambda repo_name=None: []) + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: None) + monkeypatch.setattr(mod, "set_cached_file_hash", lambda *a, **k: None) + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + changes = client.build_force_changes([current]) + + assert current in changes["created"] + assert mirrored in changes["deleted"] + assert not changes["updated"] + assert not changes["moved"] + + +def test_remote_upload_client_force_mode_deletes_dev_workspace_without_cache(monkeypatch, tmp_path): + _exercise_force_mode_dev_workspace_cleanup_without_cache("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_force_mode_deletes_dev_workspace_without_cache(monkeypatch, tmp_path): + _exercise_force_mode_dev_workspace_cleanup_without_cache("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def 
_exercise_plan_skip_avoids_bundle_upload(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + monkeypatch.setattr( + client, + "_plan_delta_upload", + lambda changes: { + "needed_files": {"created": [], "updated": [], "moved": []}, + "operation_counts_preview": { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 1, + "skipped_hash_match": 1, + "failed": 0, + }, + "needed_size_bytes": 0, + }, + ) + monkeypatch.setattr(client, "create_delta_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not bundle"))) + monkeypatch.setattr(client, "upload_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not upload"))) + + assert client.process_changes_and_upload( + { + "created": [current], + "updated": [], + "deleted": [], + "moved": [], + "unchanged": [], + } + ) is True + assert client.last_upload_result["outcome"] == "skipped_by_plan" + + +def test_remote_upload_client_plan_skip_avoids_bundle_upload(monkeypatch, tmp_path): + _exercise_plan_skip_avoids_bundle_upload("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_plan_skip_avoids_bundle_upload(monkeypatch, tmp_path): + _exercise_plan_skip_avoids_bundle_upload("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def _exercise_detect_file_changes_does_not_persist_hash(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + set_hash = 
MagicMock() + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: "oldhash") + monkeypatch.setattr(mod, "set_cached_file_hash", set_hash) + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + changes = client.detect_file_changes([current]) + + assert current in changes["updated"] + set_hash.assert_not_called() + + +def test_remote_upload_client_detect_file_changes_does_not_persist_hash(monkeypatch, tmp_path): + _exercise_detect_file_changes_does_not_persist_hash( + "scripts.remote_upload_client", monkeypatch, tmp_path + ) + + +def test_standalone_upload_client_detect_file_changes_does_not_persist_hash(monkeypatch, tmp_path): + _exercise_detect_file_changes_does_not_persist_hash( + "scripts.standalone_upload_client", monkeypatch, tmp_path + ) + + +def _exercise_plan_skip_finalizes_hash(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + set_hash = MagicMock() + monkeypatch.setattr(mod, "set_cached_file_hash", set_hash) + monkeypatch.setattr( + client, + "_plan_delta_upload", + lambda changes: { + "needed_files": {"created": [], "updated": [], "moved": []}, + "operation_counts_preview": { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 1, + "skipped_hash_match": 1, + "failed": 0, + }, + "needed_size_bytes": 0, + }, + ) + monkeypatch.setattr(client, "create_delta_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not bundle"))) + monkeypatch.setattr(client, "upload_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not 
upload"))) + + assert client.process_changes_and_upload( + { + "created": [], + "updated": [current], + "deleted": [], + "moved": [], + "unchanged": [], + } + ) is True + assert client.last_upload_result["outcome"] == "skipped_by_plan" + set_hash.assert_called_once() + + +def test_remote_upload_client_plan_skip_finalizes_hash(monkeypatch, tmp_path): + _exercise_plan_skip_finalizes_hash( + "scripts.remote_upload_client", monkeypatch, tmp_path + ) + + +def test_standalone_upload_client_plan_skip_finalizes_hash(monkeypatch, tmp_path): + _exercise_plan_skip_finalizes_hash( + "scripts.standalone_upload_client", monkeypatch, tmp_path + ) + + +def test_standalone_upload_client_plan_payload_prefixes_previous_hash(monkeypatch, tmp_path): + mod = importlib.import_module("scripts.standalone_upload_client") + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + updated = workspace / "app.py" + updated.write_text("print('updated')\n", encoding="utf-8") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + monkeypatch.setattr(mod, "get_cached_file_hash", lambda path, repo_name=None: "abc123") + + payload = client._build_plan_payload( + { + "created": [], + "updated": [updated], + "deleted": [updated], + "moved": [], + } + ) + + updated_op = next(op for op in payload["operations"] if op["operation"] == "updated") + deleted_op = next(op for op in payload["operations"] if op["operation"] == "deleted") + assert updated_op["previous_hash"] == "sha1:abc123" + assert deleted_op["previous_hash"] == "sha1:abc123" + + +def _exercise_delete_only_plan_uses_apply_ops(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + deleted = workspace / "old.py" + deleted.write_text("print('old')\n", encoding="utf-8") + + client = mod.RemoteUploadClient( + 
upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + removed_paths = [] + + monkeypatch.setattr( + client, + "_plan_delta_upload", + lambda changes: { + "needed_files": {"created": [], "updated": [], "moved": []}, + "operation_counts_preview": { + "created": 0, + "updated": 0, + "deleted": 1, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + }, + "needed_size_bytes": 0, + }, + ) + monkeypatch.setattr( + client, + "_build_plan_payload", + lambda changes: { + "manifest": {"bundle_id": "b1", "sequence_number": None}, + "operations": [{"operation": "deleted", "path": "old.py"}], + "file_hashes": {}, + }, + ) + monkeypatch.setattr(client, "create_delta_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not bundle"))) + monkeypatch.setattr(client, "upload_bundle", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not upload"))) + + class _Resp: + status_code = 200 + + @staticmethod + def raise_for_status(): + return None + + @staticmethod + def json(): + return { + "success": True, + "bundle_id": "b1", + "sequence_number": 3, + "processed_operations": {"deleted": 1, "created": 0, "updated": 0, "moved": 0, "skipped": 0, "skipped_hash_match": 0, "failed": 0}, + } + + monkeypatch.setattr(client.session, "post", lambda *a, **k: _Resp()) + monkeypatch.setattr(mod, "remove_cached_file", lambda path, repo_name=None: removed_paths.append((path, repo_name))) + + assert client.process_changes_and_upload( + { + "created": [], + "updated": [], + "deleted": [deleted], + "moved": [], + "unchanged": [], + } + ) is True + assert client.last_upload_result["outcome"] == "uploaded" + assert client.last_upload_result["processed_operations"]["deleted"] == 1 + assert removed_paths == [(str(deleted.resolve()), client.repo_name)] + + +def test_remote_upload_client_delete_only_plan_uses_apply_ops(monkeypatch, tmp_path): + 
_exercise_delete_only_plan_uses_apply_ops("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_delete_only_plan_uses_apply_ops(monkeypatch, tmp_path): + _exercise_delete_only_plan_uses_apply_ops("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def _exercise_async_upload_sets_queued_result(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + bundle_path = workspace / "bundle.tar.gz" + bundle_path.write_bytes(b"bundle") + monkeypatch.setattr(client, "_plan_delta_upload", lambda changes: None) + monkeypatch.setattr( + client, + "create_delta_bundle", + lambda changes: (str(bundle_path), {"bundle_id": "bundle-1", "total_size_bytes": 6}), + ) + monkeypatch.setattr( + client, + "upload_bundle", + lambda *a, **k: {"success": True, "sequence_number": 7, "processed_operations": None}, + ) + monkeypatch.setattr(mod, "flush_cached_file_hashes", lambda: None, raising=False) + + assert client.process_changes_and_upload( + { + "created": [current], + "updated": [], + "deleted": [], + "moved": [], + "unchanged": [], + } + ) is True + assert client.last_upload_result["outcome"] == "queued" + assert client.last_upload_result["sequence_number"] == 7 + + +def _exercise_async_upload_promotes_completed_result(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + current = workspace / "app.py" + current.write_text("print('current')\n", encoding="utf-8") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + 
workspace_path=str(workspace), + collection_name="test-coll", + ) + + bundle_path = workspace / "bundle.tar.gz" + bundle_path.write_bytes(b"bundle") + monkeypatch.setattr(client, "_plan_delta_upload", lambda changes: None) + monkeypatch.setattr( + client, + "create_delta_bundle", + lambda changes: (str(bundle_path), {"bundle_id": "bundle-1", "total_size_bytes": 6}), + ) + monkeypatch.setattr( + client, + "upload_bundle", + lambda *a, **k: {"success": True, "sequence_number": 7, "processed_operations": None}, + ) + monkeypatch.setattr( + client, + "get_server_status", + lambda: { + "success": True, + "last_sequence": 7, + "server_info": { + "last_bundle_id": "bundle-1", + "last_upload_status": "completed", + "last_processed_operations": {"updated": 1, "failed": 0}, + "last_processing_time_ms": 12, + }, + }, + ) + monkeypatch.setattr(mod, "flush_cached_file_hashes", lambda: None, raising=False) + + assert client.process_changes_and_upload( + { + "created": [current], + "updated": [], + "deleted": [], + "moved": [], + "unchanged": [], + } + ) is True + assert client.last_upload_result["outcome"] == "uploaded_async" + assert client.last_upload_result["processed_operations"] == {"updated": 1, "failed": 0} + + +def test_remote_upload_client_async_upload_sets_queued_result(monkeypatch, tmp_path): + _exercise_async_upload_sets_queued_result("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_async_upload_sets_queued_result(monkeypatch, tmp_path): + _exercise_async_upload_sets_queued_result("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def test_remote_upload_client_async_upload_promotes_completed_result(monkeypatch, tmp_path): + _exercise_async_upload_promotes_completed_result("scripts.remote_upload_client", monkeypatch, tmp_path) + + +def test_standalone_upload_client_async_upload_promotes_completed_result(monkeypatch, tmp_path): + 
_exercise_async_upload_promotes_completed_result("scripts.standalone_upload_client", monkeypatch, tmp_path) + + +def _exercise_watchable_path_excludes_ignored_updates(mod_name: str, monkeypatch, tmp_path: Path) -> None: + mod = importlib.import_module(mod_name) + + workspace = tmp_path / "repo" + workspace.mkdir(parents=True, exist_ok=True) + source = workspace / "src" / "tracked.py" + source.parent.mkdir(parents=True, exist_ok=True) + source.write_text("print('tracked')\n", encoding="utf-8") + + mirrored = workspace / "dev-workspace" / "nested" / "ignored.py" + mirrored.parent.mkdir(parents=True, exist_ok=True) + mirrored.write_text("print('ignored')\n", encoding="utf-8") + + monkeypatch.setenv("DEV_REMOTE_MODE", "1") + + client = mod.RemoteUploadClient( + upload_endpoint="http://localhost:8004", + workspace_path=str(workspace), + collection_name="test-coll", + ) + + assert client._is_watchable_path(source) is True + assert client._is_watchable_path(mirrored) is False + + +def test_remote_upload_client_watchable_path_excludes_ignored_updates(monkeypatch, tmp_path): + _exercise_watchable_path_excludes_ignored_updates( + "scripts.remote_upload_client", + monkeypatch, + tmp_path, + ) + + +def test_standalone_upload_client_watchable_path_excludes_ignored_updates(monkeypatch, tmp_path): + _exercise_watchable_path_excludes_ignored_updates( + "scripts.standalone_upload_client", + monkeypatch, + tmp_path, + ) diff --git a/tests/test_upload_service_path_traversal.py b/tests/test_upload_service_path_traversal.py index 0d01478f..796de7c1 100644 --- a/tests/test_upload_service_path_traversal.py +++ b/tests/test_upload_service_path_traversal.py @@ -1,5 +1,6 @@ import io import json +import os import tarfile from pathlib import Path @@ -52,6 +53,59 @@ def _write_bundle_with_created_file(tmp_path: Path, rel_path: str, content: byte return bundle_path +def _write_bundle_with_hash_metadata( + tmp_path: Path, + *, + operations: list[dict], + file_hashes: dict[str, str] | None = 
None, + created_files: dict[str, bytes] | None = None, + updated_files: dict[str, bytes] | None = None, +) -> Path: + bundle_path = tmp_path / "bundle-hashes.tar.gz" + payload = json.dumps({"operations": operations}).encode("utf-8") + hashes_payload = json.dumps({"file_hashes": file_hashes or {}}).encode("utf-8") + + with tarfile.open(bundle_path, "w:gz") as tar: + info = tarfile.TarInfo(name="metadata/operations.json") + info.size = len(payload) + tar.addfile(info, io.BytesIO(payload)) + + hashes_info = tarfile.TarInfo(name="metadata/hashes.json") + hashes_info.size = len(hashes_payload) + tar.addfile(hashes_info, io.BytesIO(hashes_payload)) + + for rel_path, content in (created_files or {}).items(): + file_info = tarfile.TarInfo(name=f"files/created/{rel_path}") + file_info.size = len(content) + tar.addfile(file_info, io.BytesIO(content)) + + for rel_path, content in (updated_files or {}).items(): + file_info = tarfile.TarInfo(name=f"files/updated/{rel_path}") + file_info.size = len(content) + tar.addfile(file_info, io.BytesIO(content)) + + return bundle_path + + +def _write_repo_cache(work_dir: Path, slug: str, rel_path: str, file_hash: str) -> None: + target = (work_dir / slug / rel_path).resolve() + cache_path = work_dir / ".codebase" / "repos" / slug / "cache.json" + cache_path.parent.mkdir(parents=True, exist_ok=True) + cache_path.write_text( + json.dumps( + { + "file_hashes": { + str(target): { + "hash": file_hash, + } + } + }, + indent=2, + ), + encoding="utf-8", + ) + + def test_process_delta_bundle_rejects_traversal_created(tmp_path, monkeypatch): import scripts.upload_delta_bundle as us @@ -197,3 +251,484 @@ def test_process_delta_bundle_rejects_traversal_moved_source(tmp_path, monkeypat bundle_path=bundle, manifest={"bundle_id": "b1"}, ) + + +def test_process_delta_bundle_skips_created_write_when_server_hash_matches(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, 
exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "src/file.txt" + content = b"same-content" + file_hash = "sha1:efb5d7d4d38013264f2c00fceeb401f8c8d77d9f" + + target = work_dir / slug / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_bytes(content) + os.utime(target, ns=(1_000_000_000, 1_000_000_000)) + before_mtime_ns = target.stat().st_mtime_ns + _write_repo_cache(work_dir, slug, rel_path, file_hash) + + bundle = _write_bundle_with_hash_metadata( + tmp_path, + operations=[ + { + "operation": "created", + "path": rel_path, + "content_hash": file_hash, + } + ], + file_hashes={rel_path: file_hash}, + created_files={rel_path: content}, + ) + + counts = us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-skip-created"}, + ) + + assert counts.get("created") == 0 + assert counts.get("skipped") == 1 + assert counts.get("skipped_hash_match") == 1 + assert target.read_bytes() == content + assert target.stat().st_mtime_ns == before_mtime_ns + + +def test_process_delta_bundle_uses_hashes_metadata_for_updated_skip(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "src/keep.txt" + content = b"existing-content" + file_hash = "sha1:2910e29d6f6d3d2f01f8cc52ec386a4936ca9d2f" + + target = work_dir / slug / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_bytes(content) + os.utime(target, ns=(2_000_000_000, 2_000_000_000)) + before_mtime_ns = target.stat().st_mtime_ns + _write_repo_cache(work_dir, slug, rel_path, file_hash) + + bundle = _write_bundle_with_hash_metadata( + tmp_path, + operations=[ + { + "operation": "updated", + "path": rel_path, + } + ], + file_hashes={rel_path: file_hash}, + updated_files={rel_path: 
content}, + ) + + counts = us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-skip-updated"}, + ) + + assert counts.get("updated") == 0 + assert counts.get("skipped") == 1 + assert counts.get("skipped_hash_match") == 1 + assert target.read_bytes() == content + assert target.stat().st_mtime_ns == before_mtime_ns + + +def test_normalize_hash_value_strips_algorithm_prefixes(): + import scripts.upload_delta_bundle as us + + assert us._normalize_hash_value("sha1:ABCDEF") == "abcdef" + assert us._normalize_hash_value("md5:ABCDEF") == "abcdef" + assert us._normalize_hash_value("sha256:ABCDEF") == "abcdef" + assert us._normalize_hash_value("ABCDEF") == "abcdef" + + +def test_process_delta_bundle_uses_first_marker_match_for_created_members(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "nested/files/created/path.txt" + content = b"marker-safe" + bundle = _write_bundle_with_created_file(tmp_path, rel_path, content) + + counts = us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-created-marker"}, + ) + + assert counts.get("created") == 1 + assert (work_dir / slug / rel_path).read_bytes() == content + + +def test_process_delta_bundle_deleted_prunes_empty_parent_dirs(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "dev-workspace/nested/stale.py" + target = work_dir / slug / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text("stale\n", encoding="utf-8") + + bundle = _write_bundle( + tmp_path, + [{"operation": "deleted", "path": rel_path}], + ) + 
+ counts = us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-delete-prune"}, + ) + + assert counts.get("deleted") == 1 + assert not target.exists() + assert not (work_dir / slug / "dev-workspace" / "nested").exists() + assert not (work_dir / slug / "dev-workspace").exists() + assert (work_dir / slug).exists() + + +def test_process_delta_bundle_moved_prunes_empty_source_parent_dirs(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + src = work_dir / slug / "dev-workspace" / "nested" / "from.py" + dest_rel_path = "dest/to.py" + src.parent.mkdir(parents=True, exist_ok=True) + src.write_text("payload\n", encoding="utf-8") + + bundle = _write_bundle( + tmp_path, + [{"operation": "moved", "path": dest_rel_path, "source_path": "dev-workspace/nested/from.py"}], + ) + + counts = us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-move-prune"}, + ) + + assert counts.get("moved") == 1 + assert not src.exists() + assert (work_dir / slug / dest_rel_path).read_text(encoding="utf-8") == "payload\n" + assert not (work_dir / slug / "dev-workspace" / "nested").exists() + assert not (work_dir / slug / "dev-workspace").exists() + assert (work_dir / slug).exists() + + +def test_process_delta_bundle_does_not_sweep_stranded_empty_dirs_without_file_ops(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + slug = "repo-0123456789abcdef" + stranded = work_dir / slug / "dev-workspace" / "nested" / "empty" + stranded.mkdir(parents=True, exist_ok=True) + + bundle = _write_bundle(tmp_path, []) + + counts = us.process_delta_bundle( + 
workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-sweep-empty"}, + ) + + assert counts == { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + } + assert stranded.exists() + assert (work_dir / slug / "dev-workspace").exists() + assert (work_dir / slug).exists() + + +def test_process_delta_bundle_skips_broad_empty_dir_sweep_when_disabled(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", "0") + + slug = "repo-0123456789abcdef" + stranded = work_dir / slug / "dev-workspace" / "nested" / "empty" + stranded.mkdir(parents=True, exist_ok=True) + + bundle = _write_bundle(tmp_path, []) + + us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-sweep-disabled"}, + ) + + assert stranded.exists() + + +def test_process_delta_bundle_skips_broad_empty_dir_sweep_when_recent(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + slug = "repo-0123456789abcdef" + stranded = work_dir / slug / "dev-workspace" / "nested" / "empty" + stranded.mkdir(parents=True, exist_ok=True) + + bundle = _write_bundle(tmp_path, []) + + us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-sweep-recent"}, + ) + + assert stranded.exists() + + +def test_process_delta_bundle_preserves_protected_top_level_dirs_when_empty(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + 
monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", "1") + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", "0") + + slug = "repo-0123456789abcdef" + protected = work_dir / slug / ".remote-git" + protected.mkdir(parents=True, exist_ok=True) + + bundle = _write_bundle(tmp_path, []) + + us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-protected-empty"}, + ) + + assert protected.exists() + + +def test_process_delta_bundle_preserves_nested_dirs_under_protected_top_level(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP", "1") + monkeypatch.setenv("CTXCE_UPLOAD_EMPTY_DIR_SWEEP_INTERVAL_SECONDS", "0") + + slug = "repo-0123456789abcdef" + protected_nested = work_dir / slug / ".codebase" / "repos" / "empty" + protected_nested.mkdir(parents=True, exist_ok=True) + + bundle = _write_bundle(tmp_path, []) + + us.process_delta_bundle( + workspace_path=f"/work/{slug}", + bundle_path=bundle, + manifest={"bundle_id": "b-protected-nested-empty"}, + ) + + assert protected_nested.exists() + + +def test_plan_delta_upload_skips_matching_created_files(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "src/file.txt" + content = b"same-content" + file_hash = "sha1:efb5d7d4d38013264f2c00fceeb401f8c8d77d9f" + + target = work_dir / slug / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_bytes(content) + _write_repo_cache(work_dir, slug, rel_path, file_hash) + + plan = us.plan_delta_upload( + workspace_path=f"/work/{slug}", + operations=[ + { + "operation": "created", + "path": rel_path, + 
"content_hash": file_hash, + "size_bytes": len(content), + } + ], + file_hashes={rel_path: file_hash}, + ) + + assert plan["needed_files"]["created"] == [] + assert plan["operation_counts_preview"]["skipped_hash_match"] == 1 + assert plan["needed_size_bytes"] == 0 + + +def test_plan_delta_upload_marks_updated_file_needed_when_hash_missing(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + rel_path = "src/keep.txt" + file_hash = "sha1:2910e29d6f6d3d2f01f8cc52ec386a4936ca9d2f" + + plan = us.plan_delta_upload( + workspace_path=f"/work/{slug}", + operations=[ + { + "operation": "updated", + "path": rel_path, + "content_hash": file_hash, + "size_bytes": 17, + } + ], + file_hashes={rel_path: file_hash}, + ) + + assert plan["needed_files"]["updated"] == [rel_path] + assert plan["operation_counts_preview"]["updated"] == 1 + assert plan["needed_size_bytes"] == 17 + + +def test_plan_delta_upload_skips_move_content_when_source_exists_on_server(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + source_rel = "src/old.py" + dest_rel = "src/new.py" + source = work_dir / slug / source_rel + source.parent.mkdir(parents=True, exist_ok=True) + source.write_text("print('move')\n", encoding="utf-8") + + plan = us.plan_delta_upload( + workspace_path=f"/work/{slug}", + operations=[ + { + "operation": "moved", + "path": dest_rel, + "source_path": source_rel, + "content_hash": "sha1:abc123", + "size_bytes": 12, + } + ], + file_hashes={dest_rel: "sha1:abc123"}, + ) + + assert plan["needed_files"]["moved"] == [] + assert plan["operation_counts_preview"]["moved"] == 1 + assert plan["needed_size_bytes"] == 0 + + +def 
test_plan_delta_upload_marks_move_needed_when_source_path_is_invalid(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + dest_rel = "src/new.py" + + plan = us.plan_delta_upload( + workspace_path=f"/work/{slug}", + operations=[ + { + "operation": "moved", + "path": dest_rel, + "source_path": "../escape.py", + "content_hash": "sha1:abc123", + "size_bytes": 12, + } + ], + file_hashes={dest_rel: "sha1:abc123"}, + ) + + assert plan["needed_files"]["moved"] == [dest_rel] + assert plan["operation_counts_preview"]["moved"] == 1 + assert plan["needed_size_bytes"] == 12 + + +def test_apply_delta_operations_moves_file_without_bundle(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + + slug = "repo-0123456789abcdef" + source_rel = "src/old.py" + dest_rel = "src/new.py" + source = work_dir / slug / source_rel + source.parent.mkdir(parents=True, exist_ok=True) + source.write_text("print('move')\n", encoding="utf-8") + + counts = us.apply_delta_operations( + workspace_path=f"/work/{slug}", + operations=[ + { + "operation": "moved", + "path": dest_rel, + "source_path": source_rel, + "content_hash": "sha1:abc123", + } + ], + file_hashes={dest_rel: "sha1:abc123"}, + ) + + assert counts["moved"] == 1 + assert not source.exists() + assert (work_dir / slug / dest_rel).exists() + + +def test_apply_delta_operations_raises_clear_error_when_no_replica_roots(tmp_path, monkeypatch): + import scripts.upload_delta_bundle as us + + work_dir = tmp_path / "work" + work_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(us, "WORK_DIR", str(work_dir)) + monkeypatch.setattr(us, "_resolve_replica_roots", lambda workspace_path: {}) + + with 
pytest.raises(ValueError, match="No replica roots available"): + us.apply_delta_operations( + workspace_path="/work/repo", + operations=[], + file_hashes={}, + ) diff --git a/tests/test_upload_service_status.py b/tests/test_upload_service_status.py new file mode 100644 index 00000000..2955a830 --- /dev/null +++ b/tests/test_upload_service_status.py @@ -0,0 +1,272 @@ +import asyncio +import importlib +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient + + +def _disable_auth(srv, monkeypatch) -> None: + monkeypatch.setattr(srv, "AUTH_ENABLED", False) + + +@pytest.mark.unit +def test_delta_status_exposes_last_processed_operations(monkeypatch): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) + + monkeypatch.setattr(srv, "get_collection_name", lambda _repo=None: "test-coll") + monkeypatch.setattr(srv, "_extract_repo_name_from_path", lambda _path: "repo") + + key = srv.get_workspace_key("/work/repo") + srv._sequence_tracker[key] = 7 + srv._upload_result_tracker[key] = { + "workspace_path": "/work/repo", + "bundle_id": "bundle-123", + "sequence_number": 7, + "processed_operations": { + "created": 1, + "updated": 2, + "deleted": 0, + "moved": 0, + "skipped": 5, + "skipped_hash_match": 4, + "failed": 0, + }, + "processing_time_ms": 321, + "status": "completed", + "completed_at": "2026-03-07T15:40:46.623000", + } + + client = TestClient(srv.app) + resp = client.get("/api/v1/delta/status", params={"workspace_path": "/work/repo"}) + assert resp.status_code == 200 + body = resp.json() + assert body["last_sequence"] == 7 + assert body["last_upload"] == "2026-03-07T15:40:46.623000" + assert body["status"] == "ready" + assert body["server_info"]["last_bundle_id"] == "bundle-123" + assert body["server_info"]["last_processing_time_ms"] == 321 + assert body["server_info"]["last_processed_operations"]["skipped_hash_match"] == 4 + assert body["server_info"]["last_upload_status"] 
== "completed" + assert body["server_info"]["last_error"] is None + + +@pytest.mark.unit +def test_process_bundle_background_tracks_completed_operations(monkeypatch, tmp_path: Path): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) + + bundle_path = tmp_path / "bundle.tar.gz" + bundle_path.write_bytes(b"placeholder") + + monkeypatch.setattr( + srv, + "process_delta_bundle", + lambda workspace_path, bundle_path, manifest: { + "created": 0, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 10, + "skipped_hash_match": 10, + "failed": 0, + }, + ) + monkeypatch.setattr(srv, "log_activity", lambda *a, **k: None) + + asyncio.run( + srv._process_bundle_background( + workspace_path="/work/repo", + bundle_path=bundle_path, + manifest={"bundle_id": "bundle-xyz"}, + sequence_number=3, + bundle_id="bundle-xyz", + ) + ) + + key = srv.get_workspace_key("/work/repo") + tracked = srv._upload_result_tracker[key] + assert tracked["status"] == "completed" + assert tracked["sequence_number"] == 3 + assert tracked["processed_operations"]["skipped_hash_match"] == 10 + assert tracked["processing_time_ms"] is not None + assert not bundle_path.exists() + + +@pytest.mark.unit +def test_delta_status_reports_processing_while_upload_in_progress(monkeypatch): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) + + monkeypatch.setattr(srv, "get_collection_name", lambda _repo=None: "test-coll") + monkeypatch.setattr(srv, "_extract_repo_name_from_path", lambda _path: "repo") + + key = srv.get_workspace_key("/work/repo") + srv._upload_result_tracker[key] = { + "workspace_path": "/work/repo", + "bundle_id": "bundle-123", + "sequence_number": 8, + "processed_operations": None, + "processing_time_ms": None, + "status": "processing", + "completed_at": None, + } + + client = TestClient(srv.app) + resp = client.get("/api/v1/delta/status", 
params={"workspace_path": "/work/repo"}) + assert resp.status_code == 200 + body = resp.json() + assert body["status"] == "processing" + assert body["server_info"]["last_upload_status"] == "processing" + + +@pytest.mark.unit +def test_delta_plan_endpoint_returns_needed_files(monkeypatch): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) + + monkeypatch.setattr( + srv, + "plan_delta_upload", + lambda workspace_path, operations, file_hashes=None: { + "needed_files": {"created": ["src/app.py"], "updated": [], "moved": []}, + "operation_counts_preview": { + "created": 1, + "updated": 0, + "deleted": 0, + "moved": 0, + "skipped": 2, + "skipped_hash_match": 2, + "failed": 0, + }, + "needed_size_bytes": 123, + "replica_targets": ["repo-0123456789abcdef"], + }, + ) + + client = TestClient(srv.app) + resp = client.post( + "/api/v1/delta/plan", + json={ + "workspace_path": "/work/repo", + "manifest": {"bundle_id": "b1"}, + "operations": [{"operation": "created", "path": "src/app.py"}], + "file_hashes": {"src/app.py": "sha1:abc"}, + }, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["success"] is True + assert body["needed_files"]["created"] == ["src/app.py"] + assert body["operation_counts_preview"]["skipped_hash_match"] == 2 + assert body["needed_size_bytes"] == 123 + + +@pytest.mark.unit +def test_delta_plan_endpoint_uses_safe_defaults_for_sparse_plan(monkeypatch): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) + + monkeypatch.setattr( + srv, + "plan_delta_upload", + lambda workspace_path, operations, file_hashes=None: {}, + ) + + client = TestClient(srv.app) + resp = client.post( + "/api/v1/delta/plan", + json={ + "workspace_path": "/work/repo", + "manifest": {"bundle_id": "b1"}, + "operations": [{"operation": "created", "path": "src/app.py"}], + "file_hashes": {"src/app.py": "sha1:abc"}, + }, + ) + 
assert resp.status_code == 200 + body = resp.json() + assert body["success"] is True + assert body["needed_files"] == {"created": [], "updated": [], "moved": []} + assert body["operation_counts_preview"]["failed"] == 0 + assert body["needed_size_bytes"] == 0 + assert body["replica_targets"] == [] + + +@pytest.mark.unit +def test_apply_ops_endpoint_returns_processed_operations(monkeypatch): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) + + monkeypatch.setattr( + srv, + "apply_delta_operations", + lambda workspace_path, operations, file_hashes=None: { + "created": 0, + "updated": 0, + "deleted": 1, + "moved": 0, + "skipped": 0, + "skipped_hash_match": 0, + "failed": 0, + }, + ) + + client = TestClient(srv.app) + resp = client.post( + "/api/v1/delta/apply_ops", + json={ + "workspace_path": "/work/repo", + "manifest": {"bundle_id": "b2"}, + "operations": [{"operation": "deleted", "path": "src/old.py"}], + "file_hashes": {}, + }, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["success"] is True + assert body["processed_operations"]["deleted"] == 1 + assert body["processing_time_ms"] is not None + + +@pytest.mark.unit +def test_apply_ops_endpoint_marks_tracker_error_state_on_failure(monkeypatch): + srv = importlib.import_module("scripts.upload_service") + srv = importlib.reload(srv) + _disable_auth(srv, monkeypatch) + + monkeypatch.setattr( + srv, + "apply_delta_operations", + lambda workspace_path, operations, file_hashes=None: (_ for _ in ()).throw( + RuntimeError("boom") + ), + ) + + client = TestClient(srv.app) + resp = client.post( + "/api/v1/delta/apply_ops", + json={ + "workspace_path": "/work/repo", + "manifest": {"bundle_id": "b3"}, + "operations": [{"operation": "deleted", "path": "src/old.py"}], + "file_hashes": {}, + }, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["success"] is False + assert body["error"]["code"] == 
"APPLY_OPS_ERROR" + + key = srv.get_workspace_key("/work/repo") + tracked = srv._upload_result_tracker[key] + assert tracked["status"] == "error" + assert tracked["error"] == "boom" + assert tracked["message"] == "boom" + assert tracked["completed_at"] is not None diff --git a/tests/test_watch_consistency.py b/tests/test_watch_consistency.py new file mode 100644 index 00000000..bf51c79e --- /dev/null +++ b/tests/test_watch_consistency.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +import importlib +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + + +pytestmark = pytest.mark.unit + + +@pytest.fixture +def capture_list_workspaces(): + captured = {} + + def fake_list_workspaces(search_root=None, use_qdrant_fallback=True): + captured["search_root"] = search_root + captured["use_qdrant_fallback"] = use_qdrant_fallback + return [] + + return captured, fake_list_workspaces + + +def test_run_consistency_audit_scans_from_watcher_root( + monkeypatch, tmp_path, capture_list_workspaces +): + mod = importlib.import_module("scripts.watch_index_core.consistency") + captured, fake_list_workspaces = capture_list_workspaces + + monkeypatch.setattr(mod, "list_workspaces", fake_list_workspaces) + monkeypatch.setattr(mod, "_consistency_audit_enabled", lambda: True) + + mod.run_consistency_audit(MagicMock(), tmp_path) + + assert "search_root" in captured and "use_qdrant_fallback" in captured + assert Path(captured["search_root"]).resolve() == Path(tmp_path).resolve() + assert captured["use_qdrant_fallback"] is False + + +def test_run_empty_dir_sweep_maintenance_scans_from_watcher_root( + monkeypatch, tmp_path, capture_list_workspaces +): + mod = importlib.import_module("scripts.watch_index_core.consistency") + captured, fake_list_workspaces = capture_list_workspaces + + monkeypatch.setattr(mod, "list_workspaces", fake_list_workspaces) + monkeypatch.setattr(mod, "_empty_dir_sweep_enabled", lambda: True) + + mod.run_empty_dir_sweep_maintenance(tmp_path) + + 
assert "search_root" in captured + assert Path(captured["search_root"]).resolve() == Path(tmp_path).resolve() + assert captured.get("use_qdrant_fallback") is False + + +def test_consistency_audit_skips_repairs_when_scan_is_truncated(monkeypatch, tmp_path): + mod = importlib.import_module("scripts.watch_index_core.consistency") + + workspace_root = tmp_path / "repo" + workspace_root.mkdir(parents=True, exist_ok=True) + + monkeypatch.setattr( + mod, + "list_workspaces", + lambda *a, **k: [{"workspace_path": str(workspace_root)}], + ) + monkeypatch.setattr(mod, "_consistency_audit_enabled", lambda: True) + monkeypatch.setattr(mod, "_should_run_consistency_audit", lambda *a, **k: True) + monkeypatch.setattr( + mod, + "get_collection_state_snapshot", + lambda *a, **k: {"active_collection": "coll"}, + ) + monkeypatch.setattr(mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(mod, "_load_cached_hashes", lambda *a, **k: {}) + monkeypatch.setattr( + mod, + "_scan_indexable_fs_paths", + lambda *a, **k: ({str(workspace_root / "a.py")}, True), + ) + monkeypatch.setattr( + mod, + "_load_indexed_paths_for_collection", + lambda *a, **k: ({str(workspace_root / "ghost.py")}, False), + ) + monkeypatch.setattr(mod.idx, "_Excluder", lambda *_: MagicMock()) + + enqueue_mock = MagicMock(return_value=(0, 0)) + record_mock = MagicMock() + monkeypatch.setattr(mod, "_enqueue_consistency_repairs", enqueue_mock) + monkeypatch.setattr(mod, "_record_consistency_audit", record_mock) + + mod.run_consistency_audit(MagicMock(), tmp_path) + + enqueue_mock.assert_not_called() + record_mock.assert_called_once() + summary = record_mock.call_args.args[2] + assert summary["fs_scan_truncated"] is True + assert summary["qdrant_scan_truncated"] is False + assert summary["repair_skipped_due_to_truncation"] is True + assert summary["stale_in_qdrant_count"] == 0 + assert summary["missing_in_qdrant_count"] == 0 diff --git a/tests/test_watch_index_cache.py 
b/tests/test_watch_index_cache.py index c5065af1..679ac0b4 100644 --- a/tests/test_watch_index_cache.py +++ b/tests/test_watch_index_cache.py @@ -150,3 +150,394 @@ def test_processor_delete_clears_cache_even_without_client(monkeypatch, tmp_path ) remove_mock.assert_called_once_with(str(missing), "repo") + + +def test_run_indexing_strategy_reuses_preloaded_file_state(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + path = tmp_path / "file.py" + path.write_text("print('x')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod.idx, "ensure_collection_and_indexes_once", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_read_text_and_sha1", lambda _p: ("print('x')\n", "abc123")) + monkeypatch.setattr(proc_mod, "get_cached_file_hash", lambda *a, **k: None) + monkeypatch.setattr(proc_mod.idx, "detect_language", lambda _p: "python") + monkeypatch.setattr(proc_mod.idx, "should_use_smart_reindexing", lambda *a, **k: (False, "changed")) + + captured = {} + + def fake_index_single_file(*args, **kwargs): + captured.update(kwargs) + return True + + monkeypatch.setattr(proc_mod.idx, "index_single_file", fake_index_single_file) + + ok = proc_mod._run_indexing_strategy( + path, + client=MagicMock(), + model=MagicMock(), + collection="coll", + vector_name="vec", + model_dim=1, + repo_name="repo", + ) + + assert ok is True + assert captured["preloaded_text"] == "print('x')\n" + assert captured["preloaded_file_hash"] == "abc123" + assert captured["preloaded_language"] == "python" + + +def test_run_indexing_strategy_skips_ensure_for_cached_hash_match(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + path = tmp_path / "file.py" + path.write_text("print('x')\n", encoding="utf-8") + + ensure_mock = MagicMock() + monkeypatch.setattr(proc_mod.idx, "ensure_collection_and_indexes_once", ensure_mock) + monkeypatch.setattr(proc_mod, "_read_text_and_sha1", lambda _p: ("print('x')\n", 
"abc123")) + monkeypatch.setattr(proc_mod, "get_cached_file_hash", lambda *a, **k: "abc123") + monkeypatch.setattr(proc_mod.idx, "detect_language", lambda _p: "python") + + with pytest.raises(proc_mod._SkipUnchanged): + proc_mod._run_indexing_strategy( + path, + client=MagicMock(), + model=MagicMock(), + collection="coll", + vector_name="vec", + model_dim=1, + repo_name="repo", + ) + + ensure_mock.assert_not_called() + + +def test_run_indexing_strategy_skips_smart_path_for_markdown(monkeypatch, tmp_path): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + path = tmp_path / "notes.md" + path.write_text("# notes\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod.idx, "ensure_collection_and_indexes_once", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_read_text_and_sha1", lambda _p: ("# notes\n", "abc123")) + monkeypatch.setattr(proc_mod, "get_cached_file_hash", lambda *a, **k: None) + monkeypatch.setattr(proc_mod.idx, "detect_language", lambda _p: "markdown") + + smart_check = MagicMock(side_effect=AssertionError("smart path must be skipped")) + monkeypatch.setattr(proc_mod.idx, "should_use_smart_reindexing", smart_check) + + captured = {} + + def fake_index_single_file(*args, **kwargs): + captured.update(kwargs) + return True + + monkeypatch.setattr(proc_mod.idx, "index_single_file", fake_index_single_file) + + ok = proc_mod._run_indexing_strategy( + path, + client=MagicMock(), + model=MagicMock(), + collection="coll", + vector_name="vec", + model_dim=1, + repo_name="repo", + ) + + assert ok is True + smart_check.assert_not_called() + assert captured["preloaded_language"] == "markdown" + + +def test_staging_requires_subprocess_only_for_active_dual_root_state(monkeypatch): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: True) + + assert proc_mod._staging_requires_subprocess(None) is False + assert ( + proc_mod._staging_requires_subprocess( 
+ { + "indexing_env": {"FOO": "bar"}, + "active_repo_slug": "repo", + "serving_repo_slug": "repo", + } + ) + is False + ) + assert ( + proc_mod._staging_requires_subprocess( + { + "indexing_env": {"FOO": "bar"}, + "active_repo_slug": "repo", + "serving_repo_slug": "repo_old", + } + ) + is True + ) + assert ( + proc_mod._staging_requires_subprocess( + { + "indexing_env": {"FOO": "bar"}, + "active_repo_slug": "repo", + "serving_repo_slug": "repo", + "staging": {"collection": "repo_old_collection"}, + } + ) + is True + ) + + +def test_process_paths_does_not_force_subprocess_for_non_active_staging( + monkeypatch, tmp_path +): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + path = tmp_path / "file.py" + path.write_text("print('x')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: True) + monkeypatch.setattr( + proc_mod, + "get_workspace_state", + lambda *a, **k: { + "indexing_env": {"FOO": "bar"}, + "active_repo_slug": "repo", + "serving_repo_slug": "repo", + }, + ) + + staging_mock = MagicMock(return_value=False) + monkeypatch.setattr(proc_mod, "_maybe_handle_staging_file", staging_mock) + monkeypatch.setattr(proc_mod, "_run_indexing_strategy", lambda *a, **k: True) + + proc_mod._process_paths( + [path], + client=MagicMock(), + model=MagicMock(), + vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + assert staging_mock.call_args is not None + 
assert staging_mock.call_args.kwargs == { + "force_upsert": False, + "journal_content_hash": "", + } + assert staging_mock.call_args.args[0] == path + assert staging_mock.call_args.args[6] is None + + +def test_process_paths_uses_subprocess_when_staging_is_actually_active( + monkeypatch, tmp_path +): + proc_mod = importlib.import_module("scripts.watch_index_core.processor") + + path = tmp_path / "file.py" + path.write_text("print('x')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_detect_repo_for_file", lambda p: tmp_path) + monkeypatch.setattr(proc_mod, "_get_collection_for_file", lambda p: "coll") + monkeypatch.setattr(proc_mod, "_set_status_indexing", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + monkeypatch.setattr(proc_mod, "_extract_repo_name_from_path", lambda *_: "repo") + monkeypatch.setattr(proc_mod, "is_staging_enabled", lambda: True) + monkeypatch.setattr( + proc_mod, + "get_workspace_state", + lambda *a, **k: { + "indexing_env": {"FOO": "bar"}, + "active_repo_slug": "repo", + "serving_repo_slug": "repo_old", + }, + ) + + staging_mock = MagicMock(return_value=False) + monkeypatch.setattr(proc_mod, "_maybe_handle_staging_file", staging_mock) + monkeypatch.setattr(proc_mod, "_run_indexing_strategy", lambda *a, **k: True) + + proc_mod._process_paths( + [path], + client=MagicMock(), + model=MagicMock(), + vector_name="vec", + model_dim=1, + workspace_path=str(tmp_path), + ) + + assert staging_mock.call_args is not None + assert staging_mock.call_args.kwargs == { + "force_upsert": False, + "journal_content_hash": "", + } + assert staging_mock.call_args.args[0] == path + assert staging_mock.call_args.args[6] == {"FOO": "bar"} + + +def test_staging_force_upsert_hash_match_verifies_before_skip(monkeypatch, tmp_path): + proc_mod = 
importlib.import_module("scripts.watch_index_core.processor") + + path = tmp_path / "file.py" + path.write_text("print('x')\n", encoding="utf-8") + + monkeypatch.setattr(proc_mod, "_read_text_and_sha1", lambda _p: ("print('x')\n", "abc123")) + monkeypatch.setattr(proc_mod, "get_cached_file_hash", lambda *a, **k: "abc123") + monkeypatch.setattr(proc_mod, "_verify_upsert_committed", lambda *a, **k: True) + monkeypatch.setattr(proc_mod, "_log_activity", lambda *a, **k: None) + + mark_done = MagicMock() + monkeypatch.setattr(proc_mod, "_mark_journal_done", mark_done) + advance = MagicMock() + monkeypatch.setattr(proc_mod, "_advance_progress", advance) + + handled = proc_mod._maybe_handle_staging_file( + path, + MagicMock(), + "coll", + "repo", + str(tmp_path), + [path], + {"FOO": "bar"}, + {str(tmp_path): 0}, + "started", + force_upsert=True, + journal_content_hash="abc123", + ) + + assert handled is True + mark_done.assert_called_once_with(path, str(tmp_path), "repo") + advance.assert_called_once() + + +def test_runtime_root_override_updates_internal_path_checks(monkeypatch, tmp_path): + import scripts.watch_index as watch_index + from scripts.watch_index_core import config as watch_config + import scripts.watch_index_core.processor as proc_mod + import scripts.embedder as embedder_mod + + runtime_root = tmp_path / "runtime-root" + runtime_root.mkdir(parents=True, exist_ok=True) + internal = runtime_root / ".git" / "HEAD" + internal.parent.mkdir(parents=True, exist_ok=True) + internal.write_text("ref: refs/heads/main\n", encoding="utf-8") + + original_root = watch_config.ROOT + original_watch_root = watch_index.ROOT + monkeypatch.setenv("WATCH_ROOT", str(runtime_root)) + monkeypatch.setattr(watch_index, "initialize_watcher_state", lambda root: {"repo_name": None}) + monkeypatch.setattr(watch_index, "get_indexing_config_snapshot", lambda repo_name=None: {}) + monkeypatch.setattr(watch_index, "compute_indexing_config_hash", lambda snapshot: "hash") + 
monkeypatch.setattr(watch_index, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(watch_index, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(embedder_mod, "get_embedding_model", lambda *_: MagicMock()) + monkeypatch.setattr(embedder_mod, "get_model_dimension", lambda *_: 1) + monkeypatch.setattr(watch_index, "resolve_vector_name_config", lambda *a, **k: "vec") + monkeypatch.setattr(watch_index, "_start_pseudo_backfill_worker", lambda *a, **k: None) + monkeypatch.setattr(watch_index, "create_observer", lambda *a, **k: MagicMock()) + monkeypatch.setattr(watch_index, "IndexHandler", MagicMock()) + monkeypatch.setattr(watch_index, "ChangeQueue", MagicMock()) + monkeypatch.setattr( + watch_index, + "QdrantClient", + MagicMock(return_value=MagicMock(get_collection=MagicMock())), + ) + monkeypatch.setattr(watch_index, "run_consistency_audit", lambda *a, **k: None) + monkeypatch.setattr(watch_index, "run_empty_dir_sweep_maintenance", lambda *a, **k: None) + monkeypatch.setattr(watch_index, "list_pending_index_journal_entries", lambda *a, **k: []) + monkeypatch.setattr(watch_index, "get_boolean_env", lambda *a, **k: False) + monkeypatch.setattr(watch_index.time, "sleep", lambda *_: (_ for _ in ()).throw(KeyboardInterrupt())) + + try: + watch_index.main() + except KeyboardInterrupt: + pass + + try: + assert watch_config.ROOT == runtime_root.resolve() + assert proc_mod._is_internal_ignored_path(internal) is True + finally: + watch_config.ROOT = original_root + watch_index.ROOT = original_watch_root + + +def test_main_throttles_periodic_maintenance(monkeypatch, tmp_path): + import scripts.watch_index as watch_index + from scripts.watch_index_core import config as watch_config + import scripts.embedder as embedder_mod + + runtime_root = tmp_path / "runtime-root" + runtime_root.mkdir(parents=True, exist_ok=True) + + original_root = watch_config.ROOT + original_watch_root = watch_index.ROOT + monkeypatch.setenv("WATCH_ROOT", 
str(runtime_root)) + monkeypatch.setenv("WATCH_MAINTENANCE_INTERVAL_SECS", "300") + monkeypatch.setattr(watch_index, "initialize_watcher_state", lambda *a, **k: {"repo_name": None}) + monkeypatch.setattr(watch_index, "get_indexing_config_snapshot", lambda repo_name=None: {}) + monkeypatch.setattr(watch_index, "compute_indexing_config_hash", lambda snapshot: "hash") + monkeypatch.setattr(watch_index, "persist_indexing_config", lambda *a, **k: None) + monkeypatch.setattr(watch_index, "update_indexing_status", lambda *a, **k: None) + monkeypatch.setattr(embedder_mod, "get_embedding_model", lambda *_: MagicMock()) + monkeypatch.setattr(embedder_mod, "get_model_dimension", lambda *_: 1) + monkeypatch.setattr(watch_index, "resolve_vector_name_config", lambda *a, **k: "vec") + monkeypatch.setattr(watch_index, "_start_pseudo_backfill_worker", lambda *a, **k: None) + + class FakeObserver: + def schedule(self, *a, **k): + return None + + def start(self): + return None + + def stop(self): + return None + + def join(self): + return None + + monkeypatch.setattr(watch_index, "create_observer", lambda *a, **k: FakeObserver()) + monkeypatch.setattr(watch_index, "IndexHandler", MagicMock()) + monkeypatch.setattr(watch_index, "ChangeQueue", MagicMock()) + monkeypatch.setattr( + watch_index, + "QdrantClient", + MagicMock(return_value=MagicMock(get_collection=MagicMock())), + ) + monkeypatch.setattr(watch_index, "get_boolean_env", lambda *a, **k: False) + + drain_mock = MagicMock() + maintenance_mock = MagicMock() + monkeypatch.setattr(watch_index, "_drain_pending_journal", drain_mock) + monkeypatch.setattr(watch_index, "_run_periodic_maintenance", maintenance_mock) + + time_values = iter([0.0, 1.0, 2.0, 301.0]) + monkeypatch.setattr(watch_index.time, "time", lambda: next(time_values)) + + sleep_calls = {"count": 0} + + def _sleep(_secs): + sleep_calls["count"] += 1 + if sleep_calls["count"] >= 4: + raise KeyboardInterrupt() + + monkeypatch.setattr(watch_index.time, "sleep", _sleep) + 
+ try: + watch_index.main() + finally: + watch_config.ROOT = original_root + watch_index.ROOT = original_watch_root + + assert drain_mock.call_count == 4 + assert maintenance_mock.call_count == 2 diff --git a/tests/test_watch_queue.py b/tests/test_watch_queue.py new file mode 100644 index 00000000..72bd6fd1 --- /dev/null +++ b/tests/test_watch_queue.py @@ -0,0 +1,89 @@ +def test_change_queue_suppresses_recent_identical_fingerprint(monkeypatch, tmp_path): + from scripts.watch_index_core import queue as queue_mod + + monkeypatch.setattr(queue_mod, "RECENT_FINGERPRINT_TTL_SECS", 10.0) + + processed = [] + q = queue_mod.ChangeQueue(lambda paths: processed.append(list(paths))) + + p = tmp_path / "file.py" + p.write_text("print('x')\n", encoding="utf-8") + + q._paths.add(p) + q._flush() + assert processed == [[p]] + + q._paths.add(p) + q._flush() + assert processed == [[p]] + + +def test_change_queue_reprocesses_when_fingerprint_changes(monkeypatch, tmp_path): + from scripts.watch_index_core import queue as queue_mod + + monkeypatch.setattr(queue_mod, "RECENT_FINGERPRINT_TTL_SECS", 10.0) + + processed = [] + q = queue_mod.ChangeQueue(lambda paths: processed.append(list(paths))) + + p = tmp_path / "file.py" + p.write_text("print('x')\n", encoding="utf-8") + + q._paths.add(p) + q._flush() + + p.write_text("print('changed-again')\n", encoding="utf-8") + q._paths.add(p) + q._flush() + + assert processed == [[p], [p]] + + +def test_change_queue_force_bypasses_recent_fingerprint_suppression(monkeypatch, tmp_path): + from scripts.watch_index_core import queue as queue_mod + + monkeypatch.setattr(queue_mod, "RECENT_FINGERPRINT_TTL_SECS", 10.0) + + processed = [] + q = queue_mod.ChangeQueue(lambda paths: processed.append(list(paths))) + + p = tmp_path / "file.py" + p.write_text("print('x')\n", encoding="utf-8") + + q.add(p) + q._flush() + q.add(p, force=True) + q._flush() + + assert processed == [[p], [p]] + + +def 
test_change_queue_repeated_same_path_does_not_rearm_timer(monkeypatch, tmp_path): + from scripts.watch_index_core import queue as queue_mod + + class FakeTimer: + created = 0 + canceled = 0 + + def __init__(self, _delay, _cb): + FakeTimer.created += 1 + self.daemon = False + + def start(self): + return None + + def cancel(self): + FakeTimer.canceled += 1 + + monkeypatch.setattr(queue_mod.threading, "Timer", FakeTimer) + + q = queue_mod.ChangeQueue(lambda _paths: None) + p = tmp_path / "file.py" + p.write_text("print('x')\n", encoding="utf-8") + + q.add(p, force=True) + q.add(p, force=True) + q.add(p, force=True) + + assert FakeTimer.created == 1 + assert FakeTimer.canceled == 0 diff --git a/tests/test_watcher_collection_resolution.py b/tests/test_watcher_collection_resolution.py index fa3d0c1a..2fff6c45 100644 --- a/tests/test_watcher_collection_resolution.py +++ b/tests/test_watcher_collection_resolution.py @@ -17,6 +17,9 @@ def test_main_resolves_collection_from_state(monkeypatch, tmp_path): wi = importlib.import_module("scripts.watch_index") # Reload to re-read env defaults (COLLECTION) in module globals wi = importlib.reload(wi) + watch_config = importlib.import_module("scripts.watch_index_core.config") + original_root = watch_config.ROOT + original_watch_root = wi.ROOT # Fake QdrantClient: force get_collection to raise so code chooses sanitized vector name path class FakeQdrant: @@ -70,10 +73,14 @@ def _raise_kb(_): assert wi.COLLECTION == os.environ.get("COLLECTION_NAME") == "my-collection" # Run main(); in single-repo mode it should keep the env-provided COLLECTION_NAME - wi.main() + try: + wi.main() - # Postcondition: global COLLECTION remains the env-provided name - assert wi.COLLECTION == "my-collection" + # Postcondition: global COLLECTION remains the env-provided name + assert wi.COLLECTION == "my-collection" + finally: + watch_config.ROOT = original_root + wi.ROOT = original_watch_root def 
test_multi_repo_ignores_placeholder_collection_in_state(monkeypatch, tmp_path): @@ -85,7 +92,8 @@ def test_multi_repo_ignores_placeholder_collection_in_state(monkeypatch, tmp_pat utils = importlib.import_module("scripts.watch_index_core.utils") utils = importlib.reload(utils) - monkeypatch.setattr(utils, "ROOT", tmp_path, raising=False) + watch_config = importlib.import_module("scripts.watch_index_core.config") + monkeypatch.setattr(watch_config, "ROOT", tmp_path, raising=True) monkeypatch.setattr(utils, "is_multi_repo_mode", lambda: True, raising=True) repo_slug = "Pirate Survivors-2b23a7e45f2c4b9f" @@ -111,4 +119,3 @@ def _fake_get_workspace_state(ws_path: str, repo_name: str | None = None): resolved = utils._get_collection_for_file(target) assert resolved == f"derived-{repo_slug}" - diff --git a/tests/test_watcher_events.py b/tests/test_watcher_events.py index f01484e9..658366bc 100644 --- a/tests/test_watcher_events.py +++ b/tests/test_watcher_events.py @@ -71,6 +71,25 @@ def test_on_moved_enqueues_new_dest(monkeypatch, tmp_path): assert any(s.endswith("/b.py") for s in q.added) +@pytest.mark.unit +def test_on_moved_ignores_internal_codebase_paths(monkeypatch, tmp_path): + monkeypatch.setenv("MULTI_REPO_MODE", "0") + q = FakeQueue() + handler = wi.IndexHandler(root=tmp_path, queue=q, client=FakeClient(), collection="c") + + codebase = tmp_path / ".codebase" + codebase.mkdir(parents=True, exist_ok=True) + src = codebase / "state.json" + dst = codebase / "file_locks" / "abc.lock" + src.write_text("{}\n") + dst.parent.mkdir(parents=True, exist_ok=True) + dst.write_text("lock\n") + + handler.on_moved(E(src, dest=dst)) + + assert q.added == [] + + @pytest.mark.unit def test_ignore_reload_rebuilds_excluder(monkeypatch, tmp_path): monkeypatch.setenv("MULTI_REPO_MODE", "0") @@ -105,4 +124,3 @@ def test_remote_git_manifest_is_enqueued_even_if_excluded(monkeypatch, tmp_path) handler.on_created(E(manifest)) assert any(p.endswith("/.remote-git/git_history_test.json") for p 
in q.added) - diff --git a/tests/test_workspace_state.py b/tests/test_workspace_state.py index 1200a270..9733799d 100644 --- a/tests/test_workspace_state.py +++ b/tests/test_workspace_state.py @@ -433,3 +433,165 @@ def test_placeholder_collection_names(self, ws_module): assert "" in ws_module.PLACEHOLDER_COLLECTION_NAMES assert "default-collection" in ws_module.PLACEHOLDER_COLLECTION_NAMES assert "my-collection" in ws_module.PLACEHOLDER_COLLECTION_NAMES + + +class TestCompareSymbolChanges: + def test_compare_symbol_changes_tolerates_line_shift_for_unchanged_content(self, ws_module): + old_symbols = { + "function_foo_10": { + "name": "foo", + "type": "function", + "start_line": 10, + "end_line": 20, + "content_hash": "samehash", + } + } + new_symbols = { + "function_foo_12": { + "name": "foo", + "type": "function", + "start_line": 12, + "end_line": 22, + "content_hash": "samehash", + } + } + + unchanged, changed = ws_module.compare_symbol_changes(old_symbols, new_symbols) + + assert unchanged == ["function_foo_12"] + assert changed == [] + + +class TestSymbolCachePaths: + def test_symbol_cache_uses_shared_repo_state_dir_in_multi_repo_mode(self, monkeypatch, tmp_path): + ws_root = tmp_path / "work" + repo_name = "repo-1234567890abcdef" + repo_root = ws_root / repo_name + repo_root.mkdir(parents=True, exist_ok=True) + + monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + monkeypatch.setenv("MULTI_REPO_MODE", "1") + + import importlib + + ws_module = importlib.import_module("scripts.workspace_state") + ws_module = importlib.reload(ws_module) + + file_path = repo_root / "src" / "app.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + file_path.write_text("print('x')\n", encoding="utf-8") + + expected_hash = ws_module.hashlib.md5( + str(file_path.resolve()).encode("utf-8") + ).hexdigest()[:8] + cache_path = ws_module._get_symbol_cache_path(str(file_path)) + + assert cache_path == ( + ws_root + / ".codebase" + / 
"repos" + / repo_name + / "symbols" + / f"{expected_hash}.json" + ) + + def test_symbol_cache_write_uses_cross_user_writable_mode(self, monkeypatch, tmp_path): + ws_root = tmp_path / "work" + repo_name = "repo-1234567890abcdef" + repo_root = ws_root / repo_name + repo_root.mkdir(parents=True, exist_ok=True) + + monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + monkeypatch.setenv("MULTI_REPO_MODE", "1") + + import importlib + + ws_module = importlib.import_module("scripts.workspace_state") + ws_module = importlib.reload(ws_module) + + file_path = repo_root / "src" / "cacheme.py" + file_path.parent.mkdir(parents=True, exist_ok=True) + file_path.write_text("print('x')\n", encoding="utf-8") + + ws_module.set_cached_symbols(str(file_path), {"sym": {"name": "sym"}}, "abc123") + cache_path = ws_module._get_symbol_cache_path(str(file_path)) + + assert cache_path.exists() + if os.name == "nt": + pytest.skip("POSIX permission bits are not stable on Windows") + dir_mode = cache_path.parent.stat().st_mode & 0o777 + file_mode = cache_path.stat().st_mode & 0o777 + assert dir_mode & 0o700 == 0o700 + assert file_mode & 0o600 == 0o600 + + +class TestCollectionMappings: + def test_get_collection_mappings_accepts_codebase_root_search_path(self, monkeypatch, tmp_path): + ws_root = tmp_path / "work" + ws_root.mkdir(parents=True, exist_ok=True) + slug = "repo-1234567890abcdef" + global_state_dir = ws_root / ".codebase" / "repos" / slug + global_state_dir.mkdir(parents=True, exist_ok=True) + global_state_path = global_state_dir / "state.json" + global_state_path.write_text( + json.dumps( + { + "qdrant_collection": "repo-123456-abcdef", + "updated_at": "2026-03-08T00:00:00", + } + ), + encoding="utf-8", + ) + + monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + monkeypatch.setenv("MULTI_REPO_MODE", "1") + + import importlib + + ws_module = 
importlib.import_module("scripts.workspace_state") + ws_module = importlib.reload(ws_module) + + mappings = ws_module.get_collection_mappings(search_root=str(ws_root / ".codebase")) + slug_entries = [m for m in mappings if str(m.get("repo_name")) == slug] + + assert slug_entries, "expected global repo mapping to be discovered from codebase root" + entry = slug_entries[0] + assert entry["collection_name"] == "repo-123456-abcdef" + assert Path(entry["state_file"]).resolve() == global_state_path.resolve() + + def test_get_collection_mappings_keeps_global_repo_state_behavior(self, monkeypatch, tmp_path): + ws_root = tmp_path / "work" + ws_root.mkdir(parents=True, exist_ok=True) + repo_name = "frontend" + global_state_dir = ws_root / ".codebase" / "repos" / repo_name + global_state_dir.mkdir(parents=True, exist_ok=True) + global_state_path = global_state_dir / "state.json" + global_state_path.write_text( + json.dumps( + { + "qdrant_collection": "frontend-abcdef", + "updated_at": "2026-03-08T00:00:00", + } + ), + encoding="utf-8", + ) + + monkeypatch.setenv("WORKSPACE_PATH", str(ws_root)) + monkeypatch.setenv("WATCH_ROOT", str(ws_root)) + monkeypatch.setenv("MULTI_REPO_MODE", "1") + + import importlib + + ws_module = importlib.import_module("scripts.workspace_state") + ws_module = importlib.reload(ws_module) + + mappings = ws_module.get_collection_mappings(search_root=str(ws_root)) + repo_entries = [m for m in mappings if str(m.get("repo_name")) == repo_name] + + assert repo_entries, "expected global repo mapping to be discovered" + entry = repo_entries[0] + assert entry["collection_name"] == "frontend-abcdef" + assert Path(entry["state_file"]).resolve() == global_state_path.resolve() diff --git a/vscode-extension/build/build.sh b/vscode-extension/build/build.sh index f3e4d9fa..6f008017 100755 --- a/vscode-extension/build/build.sh +++ b/vscode-extension/build/build.sh @@ -76,10 +76,39 @@ if [[ "$BUNDLE_DEPS" == "--bundle-deps" ]]; then fi fi +# Bundle MCP bridge npm 
package into the staged extension +BRIDGE_SRC="$SCRIPT_DIR/../../ctx-mcp-bridge" +BRIDGE_DIR="ctx-mcp-bridge" + +if [[ -d "$BRIDGE_SRC" && -f "$BRIDGE_SRC/package.json" ]]; then + echo "Bundling MCP bridge npm package into staged extension..." + mkdir -p "$STAGE_DIR/$BRIDGE_DIR" + if [[ -d "$BRIDGE_SRC/bin" ]]; then + cp -a "$BRIDGE_SRC/bin" "$STAGE_DIR/$BRIDGE_DIR/" + else + echo "Warning: Bridge bin directory not found at $BRIDGE_SRC/bin (skipping)" + fi + if [[ -d "$BRIDGE_SRC/src" ]]; then + cp -a "$BRIDGE_SRC/src" "$STAGE_DIR/$BRIDGE_DIR/" + else + echo "Warning: Bridge src directory not found at $BRIDGE_SRC/src (skipping)" + fi + cp "$BRIDGE_SRC/package.json" "$STAGE_DIR/$BRIDGE_DIR/" + + if [[ -d "$BRIDGE_SRC/node_modules" ]]; then + cp -a "$BRIDGE_SRC/node_modules" "$STAGE_DIR/$BRIDGE_DIR/" + else + echo "Warning: Bridge node_modules not found. Run 'npm install' in ctx-mcp-bridge first." + fi + echo "MCP bridge bundled successfully." +else + echo "Warning: MCP bridge source not found at $BRIDGE_SRC" +fi + pushd "$STAGE_DIR" >/dev/null echo "Packaging extension..." npx @vscode/vsce package --no-dependencies --out "$OUT_DIR" popd >/dev/null echo "Build complete! Check the /out directory for .vsix and .py files." -ls -la "$OUT_DIR" \ No newline at end of file +ls -la "$OUT_DIR" diff --git a/vscode-extension/context-engine-uploader/extension.js b/vscode-extension/context-engine-uploader/extension.js index 9a387c66..79fe9ac9 100644 --- a/vscode-extension/context-engine-uploader/extension.js +++ b/vscode-extension/context-engine-uploader/extension.js @@ -230,6 +230,7 @@ function activate(context) { path, fs, log, + extensionRoot, getEffectiveConfig, resolveBridgeWorkspacePath: () => configResolver ? configResolver.resolveBridgeWorkspacePath() : undefined, attachOutput: (child, label) => processManager ? processManager.attachOutput(child, label) : undefined, @@ -274,6 +275,7 @@ function activate(context) { resolveBridgeCliInvocation: () => bridgeManager ? 
bridgeManager.resolveBridgeCliInvocation() : undefined, resolveBridgeHttpUrl: () => bridgeManager ? bridgeManager.resolveBridgeHttpUrl() : undefined, requiresHttpBridge: (s, t) => bridgeManager ? bridgeManager.requiresHttpBridge(s, t) : (s === 'bridge' && t === 'http'), + requiresLocalBridgeProcess: (s, t) => bridgeManager ? bridgeManager.requiresLocalBridgeProcess(s, t) : (s === 'bridge' && (t === 'http' || t === 'sse-remote')), ensureHttpBridgeReadyForConfigs: () => bridgeManager ? bridgeManager.ensureReadyForConfigs() : Promise.resolve(false), getBridgeIsRunning: () => (bridgeManager && typeof bridgeManager.isRunning === 'function' ? bridgeManager.isRunning() : false), writeCtxConfig: () => ctxConfigManager ? ctxConfigManager.writeCtxConfig() : Promise.resolve(), @@ -425,6 +427,7 @@ function activate(context) { event.affectsConfiguration('contextEngineUploader.mcpBridgeBinPath') || event.affectsConfiguration('contextEngineUploader.mcpBridgePort') || event.affectsConfiguration('contextEngineUploader.mcpBridgeLocalOnly') || + event.affectsConfiguration('contextEngineUploader.mcpBridgeMode') || event.affectsConfiguration('contextEngineUploader.windsurfMcpPath') || event.affectsConfiguration('contextEngineUploader.augmentMcpPath') || event.affectsConfiguration('contextEngineUploader.antigravityMcpPath') || @@ -439,6 +442,7 @@ function activate(context) { event.affectsConfiguration('contextEngineUploader.mcpBridgePort') || event.affectsConfiguration('contextEngineUploader.mcpBridgeBinPath') || event.affectsConfiguration('contextEngineUploader.mcpBridgeLocalOnly') || + event.affectsConfiguration('contextEngineUploader.mcpBridgeMode') || event.affectsConfiguration('contextEngineUploader.mcpIndexerUrl') || event.affectsConfiguration('contextEngineUploader.mcpMemoryUrl') || event.affectsConfiguration('contextEngineUploader.mcpServerMode') || @@ -484,10 +488,10 @@ function activate(context) { const serverModeRaw = config.get('mcpServerMode') || 'bridge'; const 
transportMode = (typeof transportModeRaw === 'string' ? transportModeRaw.trim() : 'sse-remote') || 'sse-remote'; const serverMode = (typeof serverModeRaw === 'string' ? serverModeRaw.trim() : 'bridge') || 'bridge'; - if (bridgeManager && bridgeManager.requiresHttpBridge(serverMode, transportMode)) { + if (bridgeManager && bridgeManager.requiresLocalBridgeProcess(serverMode, transportMode)) { startHttpBridgeProcess().catch(error => log(`Auto-start HTTP MCP bridge failed: ${error instanceof Error ? error.message : String(error)}`)); } else { - log('Context Engine Uploader: autoStartMcpBridge is enabled, but current MCP wiring does not use the HTTP bridge; skipping auto-start.'); + log('Context Engine Uploader: autoStartMcpBridge is enabled, but current MCP wiring does not use the local bridge process; skipping auto-start.'); } } } @@ -540,8 +544,9 @@ async function runSequence(mode = 'auto') { if (code === 0) { setStatusBarState('indexed'); if (processManager) { processManager.ensureIndexedWatcher(options.targetPath); } - // Only start watching after a regular force sync, not after git history upload - if (mode === 'force' && options.startWatchAfterForce && processManager) { + // Start watch after successful force sync in normal flows (`force` and `auto`), + // but keep git-history upload as one-shot. 
+ if (mode !== 'uploadGitHistory' && options.startWatchAfterForce && processManager) { processManager.startWatch(options); } } else { diff --git a/vscode-extension/context-engine-uploader/mcp_bridge.js b/vscode-extension/context-engine-uploader/mcp_bridge.js index d9825177..5b10796f 100644 --- a/vscode-extension/context-engine-uploader/mcp_bridge.js +++ b/vscode-extension/context-engine-uploader/mcp_bridge.js @@ -4,6 +4,7 @@ function createBridgeManager(deps) { const path = deps.path; const fs = deps.fs; const log = deps.log; + const extensionRoot = deps.extensionRoot; const getEffectiveConfig = deps.getEffectiveConfig; const resolveBridgeWorkspacePath = deps.resolveBridgeWorkspacePath; @@ -42,7 +43,36 @@ function createBridgeManager(deps) { } } + function getBridgeMode() { + try { + const settings = getEffectiveConfig(); + return (settings.get('mcpBridgeMode') || 'bundled').trim(); + } catch (_) { + return 'bundled'; + } + } + + function findBundledBridgeBin() { + if (!extensionRoot) return undefined; + const bundledPath = path.join(extensionRoot, 'ctx-mcp-bridge', 'bin', 'ctxce.js'); + if (fs.existsSync(bundledPath)) { + return path.resolve(bundledPath); + } + return undefined; + } + function findLocalBridgeBin() { + // First check for bundled bridge if mode is 'bundled' + const mode = getBridgeMode(); + if (mode === 'bundled') { + const bundledBin = findBundledBridgeBin(); + if (bundledBin) { + return bundledBin; + } + log('Bundled bridge requested but not found; falling back to external resolution'); + } + + // External mode logic (existing behavior) let localOnly = true; let configured = ''; try { @@ -69,10 +99,15 @@ function createBridgeManager(deps) { function resolveBridgeCliInvocation() { const binPath = findLocalBridgeBin(); if (binPath) { + // Use absolute Node runtime to avoid PATH dependency in extension hosts + const bundledBin = findBundledBridgeBin(); + const resolvedKind = bundledBin && path.resolve(binPath) === path.resolve(bundledBin) + ? 
'bundled' + : 'local'; return { - command: 'node', + command: process.execPath, args: [binPath], - kind: 'local' + kind: resolvedKind }; } const isWindows = process.platform === 'win32'; @@ -107,6 +142,10 @@ function createBridgeManager(deps) { return serverMode === 'bridge' && transportMode === 'http'; } + function requiresLocalBridgeProcess(serverMode, transportMode) { + return serverMode === 'bridge' && (transportMode === 'http' || transportMode === 'sse-remote'); + } + function resolveBridgeHttpUrl() { try { const settings = getEffectiveConfig(); @@ -269,10 +308,10 @@ function createBridgeManager(deps) { const serverModeRaw = config.get('mcpServerMode') || 'bridge'; const transportMode = (typeof transportModeRaw === 'string' ? transportModeRaw.trim() : 'sse-remote') || 'sse-remote'; const serverMode = (typeof serverModeRaw === 'string' ? serverModeRaw.trim() : 'bridge') || 'bridge'; - if (requiresHttpBridge(serverMode, transportMode)) { + if (requiresLocalBridgeProcess(serverMode, transportMode)) { await start(); } else { - log('Context Engine Uploader: HTTP bridge settings changed, but current MCP wiring does not use the HTTP bridge; not restarting HTTP bridge.'); + log('Context Engine Uploader: bridge settings changed, but current MCP wiring does not use the local bridge process; not restarting bridge.'); } } } @@ -290,6 +329,7 @@ function createBridgeManager(deps) { getState, isRunning, requiresHttpBridge, + requiresLocalBridgeProcess, resolveBridgeHttpUrl, ensureReadyForConfigs, start, diff --git a/vscode-extension/context-engine-uploader/package.json b/vscode-extension/context-engine-uploader/package.json index d5e3584f..654c5678 100644 --- a/vscode-extension/context-engine-uploader/package.json +++ b/vscode-extension/context-engine-uploader/package.json @@ -282,7 +282,7 @@ "contextEngineUploader.autoStartMcpBridge": { "type": "boolean", "default": true, - "description": "When enabled and mcpServerMode='bridge' with mcpTransportMode='http', automatically 
start the local ctx-mcp-bridge HTTP server for the active workspace so IDE clients can connect over HTTP without manual commands. Has no effect in stdio/direct modes." + "description": "When enabled and mcpServerMode='bridge', automatically start the bundled local ctx bridge process for the active workspace. In http mode it serves the local HTTP MCP bridge directly; in sse-remote mode it starts the same bundled bridge adapter used by bridge-stdio wiring. Has no effect in direct modes." }, "contextEngineUploader.mcpBridgePort": { "type": "number", @@ -297,7 +297,17 @@ "contextEngineUploader.mcpBridgeLocalOnly": { "type": "boolean", "default": false, - "description": "Development toggle. When true (default) the extension prefers local bridge binaries resolved from mcpBridgeBinPath or CTXCE_BRIDGE_BIN before falling back to the published npm build via npx." + "description": "Development toggle. When true and mcpBridgeMode='external', prefers local bridge binaries resolved from mcpBridgeBinPath or CTXCE_BRIDGE_BIN before falling back to the published npm build via npx. Ignored when mcpBridgeMode='bundled'." + }, + "contextEngineUploader.mcpBridgeMode": { + "type": "string", + "enum": ["bundled", "external"], + "default": "bundled", + "description": "Bridge invocation mode. 'bundled' uses the bundled bridge inside the extension (offline, no npx required). 'external' uses external binary path or npx (current behavior).", + "enumDescriptions": [ + "Use the bundled MCP bridge inside the extension (works offline).", + "Use external binary path or npx to run the bridge (requires internet for first npx install)." 
+ ] }, "contextEngineUploader.mcpServerMode": { "type": "string", diff --git a/vscode-extension/context-engine-uploader/python_env.js b/vscode-extension/context-engine-uploader/python_env.js index 190f9945..9bf24f6e 100644 --- a/vscode-extension/context-engine-uploader/python_env.js +++ b/vscode-extension/context-engine-uploader/python_env.js @@ -114,11 +114,27 @@ function createPythonEnvManager(deps) { const REQUIRED_PYTHON_MODULES = ['requests', 'urllib3', 'charset_normalizer', 'watchdog']; const depCheckCache = new Map(); + const ensureDepCheckInflight = new Map(); + let hasLoggedBundledDepPath = false; function cacheKey(pythonPath, workingDirectory) { return `${pythonPath || ''}::${workingDirectory || ''}`; } + function getBundledLibsPath(workingDirectory) { + const candidates = []; + if (workingDirectory) { + candidates.push(path.join(workingDirectory, 'python_libs')); + } + candidates.push(path.join(getExtensionRoot(), 'python_libs')); + for (const libsPath of candidates) { + if (libsPath && fs.existsSync(libsPath)) { + return libsPath; + } + } + return undefined; + } + function venvRootDir() { // Prefer workspace storage; fallback to extension storage try { @@ -186,16 +202,13 @@ function createPythonEnvManager(deps) { let pythonError; const env = { ...process.env }; try { - const candidates = []; - if (workingDirectory) { - candidates.push(path.join(workingDirectory, 'python_libs')); - } - candidates.push(path.join(getExtensionRoot(), 'python_libs')); - for (const libsPath of candidates) { - if (libsPath && fs.existsSync(libsPath)) { - const existing = env.PYTHONPATH || ''; - env.PYTHONPATH = existing ? `${libsPath}${path.delimiter}${existing}` : libsPath; - break; + const libsPath = getBundledLibsPath(workingDirectory); + if (libsPath) { + const existing = env.PYTHONPATH || ''; + env.PYTHONPATH = existing ? 
`${libsPath}${path.delimiter}${existing}` : libsPath; + if (!hasLoggedBundledDepPath) { + log(`Using bundled python_libs for dependency checks: ${libsPath}`); + hasLoggedBundledDepPath = true; } } } catch (error) { @@ -309,68 +322,87 @@ function createPythonEnvManager(deps) { } async function ensurePythonDependencies(pythonPath, workingDirectory, pythonPathSource) { - // Probe current interpreter with bundled python_libs first - const allowPrompt = pythonPathSource === 'configured' || pythonPathSource === 'override'; - const primaryKey = cacheKey(pythonPath, workingDirectory); - if (depCheckCache.get(primaryKey)) { - return true; - } - let ok = await checkPythonDeps(pythonPath, workingDirectory, { showInterpreterError: allowPrompt }); - if (ok) { - depCheckCache.set(primaryKey, true); - return true; + const inflightKey = cacheKey(pythonPath, workingDirectory); + const existing = ensureDepCheckInflight.get(inflightKey); + if (existing) { + return existing; } - // If that fails, try to auto-detect a better system Python before falling back to a venv - const autoPython = await detectSystemPython(); - if (autoPython && autoPython !== pythonPath) { - log(`Falling back to auto-detected Python interpreter: ${autoPython}`); - const autoKey = cacheKey(autoPython, workingDirectory); - if (depCheckCache.get(autoKey)) { - setPythonOverridePath(autoPython); + const task = (async () => { + const allowPrompt = pythonPathSource === 'configured' || pythonPathSource === 'override'; + const primaryKey = cacheKey(pythonPath, workingDirectory); + if (depCheckCache.get(primaryKey)) { return true; } - ok = await checkPythonDeps(autoPython, workingDirectory, { showInterpreterError: allowPrompt }); + + let ok = await checkPythonDeps(pythonPath, workingDirectory, { showInterpreterError: false }); if (ok) { - setPythonOverridePath(autoPython); - depCheckCache.set(autoKey, true); + depCheckCache.set(primaryKey, true); return true; } - } - // As a last resort, offer to create a private venv 
and install deps via pip - if (!allowPrompt) { - log('Skipping auto-install prompt; interpreter was auto-detected and missing modules.'); - return false; - } - const choice = await vscode.window.showErrorMessage( - 'Context Engine Uploader: missing Python modules. Create isolated environment and auto-install?', - 'Auto-install to private venv', - 'Cancel' - ); - if (choice !== 'Auto-install to private venv') { - return false; - } - const created = await ensurePrivateVenv(); - if (!created) return false; - const venvPython = resolvePrivateVenvPython(); - if (!venvPython) { - vscode.window.showErrorMessage('Context Engine Uploader: failed to locate private venv python.'); - return false; - } - const installed = await installDepsInto(venvPython); - if (!installed) return false; - setPythonOverridePath(venvPython); - log(`Using private venv interpreter: ${getPythonOverridePath()}`); - const venvKey = cacheKey(venvPython, workingDirectory); - if (depCheckCache.get(venvKey)) { - return true; - } - const finalOk = await checkPythonDeps(venvPython, workingDirectory, { showInterpreterError: true }); - if (finalOk) { - depCheckCache.set(venvKey, true); + // If that fails, try to auto-detect a better system Python before falling back to a venv. + const autoPython = await detectSystemPython(); + if (autoPython && autoPython !== pythonPath) { + log(`Falling back to auto-detected Python interpreter: ${autoPython}`); + const autoKey = cacheKey(autoPython, workingDirectory); + if (depCheckCache.get(autoKey)) { + setPythonOverridePath(autoPython); + return true; + } + ok = await checkPythonDeps(autoPython, workingDirectory, { showInterpreterError: false }); + if (ok) { + setPythonOverridePath(autoPython); + depCheckCache.set(autoKey, true); + return true; + } + } + + // Delay configured-python noise until after fallback discovery is exhausted. + if (allowPrompt) { + vscode.window.showErrorMessage(`Context Engine Uploader: failed to run ${pythonPath}. 
Update contextEngineUploader.pythonPath.`); + } + + // As a last resort, offer to create a private venv and install deps via pip + // only after current and auto-detected interpreters have both failed. + const choice = await vscode.window.showErrorMessage( + 'Context Engine Uploader: missing Python modules. Create isolated environment and auto-install?', + 'Auto-install to private venv', + 'Cancel' + ); + if (choice !== 'Auto-install to private venv') { + return false; + } + const created = await ensurePrivateVenv(); + if (!created) return false; + const venvPython = resolvePrivateVenvPython(); + if (!venvPython) { + vscode.window.showErrorMessage('Context Engine Uploader: failed to locate private venv python.'); + return false; + } + const installed = await installDepsInto(venvPython); + if (!installed) return false; + setPythonOverridePath(venvPython); + log(`Using private venv interpreter: ${getPythonOverridePath()}`); + const venvKey = cacheKey(venvPython, workingDirectory); + if (depCheckCache.get(venvKey)) { + return true; + } + const finalOk = await checkPythonDeps(venvPython, workingDirectory, { showInterpreterError: true }); + if (finalOk) { + depCheckCache.set(venvKey, true); + } + return finalOk; + })(); + + ensureDepCheckInflight.set(inflightKey, task); + try { + return await task; + } finally { + if (ensureDepCheckInflight.get(inflightKey) === task) { + ensureDepCheckInflight.delete(inflightKey); + } } - return finalOk; } return {