From f2f2d45e5a6ebe0abc38235dd461b8f79861c490 Mon Sep 17 00:00:00 2001 From: Victor Sumner Date: Tue, 24 Feb 2026 17:04:22 -0500 Subject: [PATCH 1/8] feat(worker): add durable terminal delivery, checkpoints, and resilience --- docs/content/docs/(configuration)/config.mdx | 2 +- docs/content/docs/(features)/workers.mdx | 24 +- ...0260224000001_worker_delivery_receipts.sql | 28 + src/agent/channel.rs | 989 ++++++++++++++++-- src/api/channels.rs | 70 +- src/api/webchat.rs | 3 + src/conversation/history.rs | 862 ++++++++++++++- src/cron/scheduler.rs | 12 +- src/lib.rs | 38 + src/llm/model.rs | 24 +- src/main.rs | 398 +++++-- src/messaging/discord.rs | 140 ++- src/messaging/slack.rs | 18 +- src/messaging/webchat.rs | 19 +- src/tools.rs | 5 +- src/tools/browser.rs | 45 +- src/tools/cancel.rs | 2 +- src/tools/react.rs | 8 +- src/tools/reply.rs | 49 +- src/tools/send_file.rs | 36 +- src/tools/skip.rs | 8 +- 21 files changed, 2489 insertions(+), 291 deletions(-) create mode 100644 migrations/20260224000001_worker_delivery_receipts.sql diff --git a/docs/content/docs/(configuration)/config.mdx b/docs/content/docs/(configuration)/config.mdx index f098b6d73..21a78bfb9 100644 --- a/docs/content/docs/(configuration)/config.mdx +++ b/docs/content/docs/(configuration)/config.mdx @@ -476,7 +476,7 @@ Thresholds are fractions of `context_window`. 
| Key | Type | Default | Description | |-----|------|---------|-------------| | `tick_interval_secs` | integer | 30 | How often the cortex checks system state | -| `worker_timeout_secs` | integer | 300 | Worker timeout before cancellation | +| `worker_timeout_secs` | integer | 300 | Inactivity timeout for worker progress events before forced cancellation | | `branch_timeout_secs` | integer | 60 | Branch timeout before cancellation | | `circuit_breaker_threshold` | integer | 3 | Consecutive failures before auto-disable | diff --git a/docs/content/docs/(features)/workers.mdx b/docs/content/docs/(features)/workers.mdx index 3e0a1096a..1e1275f30 100644 --- a/docs/content/docs/(features)/workers.mdx +++ b/docs/content/docs/(features)/workers.mdx @@ -95,7 +95,7 @@ Workers run in segments of 25 turns each. After each segment: - If the agent returned a result: done - If max turns hit: compact if needed, continue with "Continue where you left off" -- If cancelled: state = Failed +- If cancelled: state = Cancelled - If context overflow: force compact, retry This prevents runaway workers and handles long tasks that exceed a single agent loop. @@ -111,10 +111,32 @@ Workers report progress via the `set_status` tool. The status string (max 256 ch The channel LLM sees this and can decide whether to wait, ask for more info, or cancel. +Spacebot also forwards throttled worker checkpoints to the user-facing adapter: + +- Start and completion updates are always surfaced. +- Mid-run checkpoints are deduped and rate-limited (default: at most one every 20s per worker, with urgent states bypassing the limit). +- Adapters that support message editing (for example Discord) update a single progress message in place to avoid channel spam. + ## Concurrency Workers run concurrently. The default limit is `max_concurrent_workers: 5` per channel (configurable per agent). Attempting to spawn beyond the limit returns an error to the LLM so it can wait or cancel an existing worker. 
+## Timeouts + +Worker runs are bounded by `worker_timeout_secs` (default `300`) as an inactivity timeout. Any worker progress event (status updates, tool activity, permission/question prompts) resets the timer. + +If no progress arrives within the timeout window, Spacebot marks the worker as `timed_out`, records a terminal result, and removes it from active worker state so the channel can continue delegating work. + +## Terminal Delivery Reliability + +Terminal worker notices (`done`, `failed`, `timed_out`, `cancelled`) are queued as durable delivery receipts before they are sent to the messaging adapter. + +- Receipts are retried with bounded backoff on adapter delivery errors. +- Successful delivery marks the receipt as acknowledged. +- On process restart, in-flight (`sending`) receipts are re-queued so completion notices are not silently dropped. +- Old terminal receipts (`acked`, `failed`) are pruned periodically to keep storage bounded. +- `/api/channels/status` includes `worker_delivery_receipts` counts (`pending`, `failed`) per channel for observability. + ## Model Routing Workers default to `anthropic/claude-haiku-4.5-20250514`. Task-type overrides apply — for example, a `coding` task type routes to `anthropic/claude-sonnet-4-20250514`. Fallback chains are supported. All hot-reloadable. diff --git a/migrations/20260224000001_worker_delivery_receipts.sql b/migrations/20260224000001_worker_delivery_receipts.sql new file mode 100644 index 000000000..4c31c75ea --- /dev/null +++ b/migrations/20260224000001_worker_delivery_receipts.sql @@ -0,0 +1,28 @@ +-- Durable delivery receipts for terminal worker notifications. +-- +-- Tracks whether a terminal worker completion notice has been delivered to the +-- user-facing channel, with bounded retry metadata for transient adapter +-- failures. 
+ +CREATE TABLE IF NOT EXISTS worker_delivery_receipts ( + id TEXT PRIMARY KEY, + worker_id TEXT NOT NULL, + channel_id TEXT NOT NULL, + kind TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', + terminal_state TEXT NOT NULL, + payload_text TEXT NOT NULL, + attempt_count INTEGER NOT NULL DEFAULT 0, + last_error TEXT, + next_attempt_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + acked_at TIMESTAMP, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + UNIQUE(worker_id, kind) +); + +CREATE INDEX idx_worker_delivery_receipts_due + ON worker_delivery_receipts(status, next_attempt_at); + +CREATE INDEX idx_worker_delivery_receipts_channel + ON worker_delivery_receipts(channel_id, created_at); diff --git a/src/agent/channel.rs b/src/agent/channel.rs index c30430585..43c55b217 100644 --- a/src/agent/channel.rs +++ b/src/agent/channel.rs @@ -10,8 +10,8 @@ use crate::error::{AgentError, Result}; use crate::hooks::SpacebotHook; use crate::llm::SpacebotModel; use crate::{ - AgentDeps, BranchId, ChannelId, InboundMessage, OutboundResponse, ProcessEvent, ProcessId, - ProcessType, WorkerId, + AgentDeps, BranchId, ChannelId, InboundMessage, OutboundEnvelope, OutboundResponse, + ProcessEvent, ProcessId, ProcessType, WorkerId, }; use rig::agent::AgentBuilder; use rig::completion::{CompletionModel, Prompt}; @@ -33,6 +33,21 @@ const RETRIGGER_DEBOUNCE_MS: u64 = 500; /// infinite retrigger cascades where each retrigger spawns more work. const MAX_RETRIGGERS_PER_TURN: usize = 3; +/// Minimum interval between user-facing worker checkpoint updates. +/// This keeps progress useful without flooding channel messages. +const WORKER_CHECKPOINT_MIN_INTERVAL_SECS: u64 = 20; + +/// Maximum length for user-facing checkpoint text. 
+const WORKER_CHECKPOINT_MAX_CHARS: usize = 220; +const WORKER_RECEIPT_DISPATCH_INTERVAL_SECS: u64 = 5; +const WORKER_RECEIPT_DISPATCH_BATCH_SIZE: i64 = 8; + +#[derive(Debug, Clone)] +struct WorkerCheckpointState { + last_status: String, + last_sent_at: tokio::time::Instant, +} + /// Shared state that channel tools need to act on the channel. /// /// Wrapped in Arc and passed to tools (branch, spawn_worker, route, cancel) @@ -62,7 +77,11 @@ pub struct ChannelState { impl ChannelState { /// Cancel a running worker by aborting its tokio task and cleaning up state. /// Returns an error message if the worker is not found. - pub async fn cancel_worker(&self, worker_id: WorkerId) -> std::result::Result<(), String> { + pub async fn cancel_worker( + &self, + worker_id: WorkerId, + reason: Option<&str>, + ) -> std::result::Result<(), String> { let handle = self.worker_handles.write().await.remove(&worker_id); let removed = self .active_workers @@ -74,13 +93,49 @@ impl ChannelState { if let Some(handle) = handle { handle.abort(); - // Mark the DB row as cancelled since the abort prevents WorkerComplete from firing - self.process_run_logger - .log_worker_completed(worker_id, "Worker cancelled", false); + let reason = reason + .map(str::trim) + .filter(|value| !value.is_empty()) + .unwrap_or("cancelled by request"); + let _ = self.deps.event_tx.send(crate::ProcessEvent::WorkerStatus { + agent_id: self.deps.agent_id.clone(), + worker_id, + channel_id: Some(self.channel_id.clone()), + status: "cancelled".to_string(), + }); + let _ = self + .deps + .event_tx + .send(crate::ProcessEvent::WorkerComplete { + agent_id: self.deps.agent_id.clone(), + worker_id, + channel_id: Some(self.channel_id.clone()), + result: format!("Worker cancelled: {reason}."), + notify: true, + }); Ok(()) } else if removed { - self.process_run_logger - .log_worker_completed(worker_id, "Worker cancelled", false); + // Worker was in active_workers but had no handle (shouldn't happen, but handle gracefully) + 
let reason = reason + .map(str::trim) + .filter(|value| !value.is_empty()) + .unwrap_or("cancelled by request"); + let _ = self.deps.event_tx.send(crate::ProcessEvent::WorkerStatus { + agent_id: self.deps.agent_id.clone(), + worker_id, + channel_id: Some(self.channel_id.clone()), + status: "cancelled".to_string(), + }); + let _ = self + .deps + .event_tx + .send(crate::ProcessEvent::WorkerComplete { + agent_id: self.deps.agent_id.clone(), + worker_id, + channel_id: Some(self.channel_id.clone()), + result: format!("Worker cancelled: {reason}."), + notify: true, + }); Ok(()) } else { Err(format!("Worker {worker_id} not found")) @@ -122,7 +177,7 @@ pub struct Channel { /// Event receiver for process events. pub event_rx: broadcast::Receiver, /// Outbound response sender for the messaging layer. - pub response_tx: mpsc::Sender, + pub response_tx: mpsc::Sender, /// Self-sender for re-triggering the channel after background process completion. pub self_tx: mpsc::Sender, /// Conversation ID from the first message (for synthetic re-trigger messages). @@ -149,19 +204,10 @@ pub struct Channel { pending_retrigger_metadata: HashMap, /// Deadline for firing the pending retrigger (debounce timer). retrigger_deadline: Option, - /// Optional send_agent_message tool (only when agent has active links). - send_agent_message_tool: Option, - /// Turn counter for link channels (used for safety cap). - link_turn_count: u32, - /// Originating channel that triggered this link conversation (for routing conclusions back). - originating_channel: Option, - /// Messaging adapter name from the originating channel (e.g. "webchat", "discord"). - /// Used by `route_link_conclusion` to set the correct `source` on injected messages. - originating_source: Option, - /// Set after `conclude_link` fires. Prevents the channel from processing - /// further messages, stopping the ping-pong that happens when both sides - /// keep responding to each other after the task is done. 
- link_concluded: bool, + /// Per-worker checkpoint state used for status dedupe/throttling. + worker_checkpoints: HashMap, + /// Periodic deadline for checking due worker terminal delivery receipts. + worker_receipt_dispatch_deadline: tokio::time::Instant, } impl Channel { @@ -173,7 +219,7 @@ impl Channel { pub fn new( id: ChannelId, deps: AgentDeps, - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, event_rx: broadcast::Receiver, screenshot_dir: std::path::PathBuf, logs_dir: std::path::PathBuf, @@ -265,11 +311,9 @@ impl Channel { pending_retrigger: false, pending_retrigger_metadata: HashMap::new(), retrigger_deadline: None, - send_agent_message_tool, - link_turn_count: 0, - originating_channel: None, - originating_source: None, - link_concluded: false, + worker_checkpoints: HashMap::new(), + worker_receipt_dispatch_deadline: tokio::time::Instant::now() + + std::time::Duration::from_secs(WORKER_RECEIPT_DISPATCH_INTERVAL_SECS), }; (channel, message_tx) @@ -289,13 +333,15 @@ impl Channel { tracing::info!(channel_id = %self.id, "channel started"); loop { - // Compute next deadline from coalesce and retrigger timers - let next_deadline = match (self.coalesce_deadline, self.retrigger_deadline) { - (Some(a), Some(b)) => Some(a.min(b)), - (Some(a), None) => Some(a), - (None, Some(b)) => Some(b), - (None, None) => None, - }; + // Compute next deadline from coalesce/retrigger timers and receipt dispatch. 
+ let next_deadline = [ + self.coalesce_deadline, + self.retrigger_deadline, + Some(self.worker_receipt_dispatch_deadline), + ] + .into_iter() + .flatten() + .min(); let sleep_duration = next_deadline .map(|deadline| { let now = tokio::time::Instant::now(); @@ -323,13 +369,28 @@ impl Channel { } } } - Ok(event) = self.event_rx.recv() => { - // Events bypass coalescing - flush buffer first if needed - if let Err(error) = self.flush_coalesce_buffer().await { - tracing::error!(%error, channel_id = %self.id, "error flushing coalesce buffer"); - } - if let Err(error) = self.handle_event(event).await { - tracing::error!(%error, channel_id = %self.id, "error handling event"); + event = self.event_rx.recv() => { + match event { + Ok(event) => { + // Events bypass coalescing - flush buffer first if needed + if let Err(error) = self.flush_coalesce_buffer().await { + tracing::error!(%error, channel_id = %self.id, "error flushing coalesce buffer"); + } + if let Err(error) = self.handle_event(event).await { + tracing::error!(%error, channel_id = %self.id, "error handling event"); + } + } + Err(tokio::sync::broadcast::error::RecvError::Lagged(skipped)) => { + tracing::warn!( + channel_id = %self.id, + skipped, + "channel event stream lagged; continuing after dropping stale events" + ); + } + Err(tokio::sync::broadcast::error::RecvError::Closed) => { + tracing::warn!(channel_id = %self.id, "channel event stream closed"); + break; + } } } _ = tokio::time::sleep(sleep_duration), if next_deadline.is_some() => { @@ -344,6 +405,14 @@ impl Channel { if self.retrigger_deadline.is_some_and(|d| d <= now) { self.flush_pending_retrigger().await; } + // Check worker terminal receipt dispatch deadline + if self.worker_receipt_dispatch_deadline <= now { + self.flush_due_worker_delivery_receipts().await; + self.worker_receipt_dispatch_deadline = tokio::time::Instant::now() + + std::time::Duration::from_secs( + WORKER_RECEIPT_DISPATCH_INTERVAL_SECS, + ); + } } else => break, } @@ -1318,7 +1387,9 
@@ impl Channel { let _ = self .response_tx - .send(OutboundResponse::Status(crate::StatusUpdate::Thinking)) + .send(OutboundEnvelope::from(OutboundResponse::Status( + crate::StatusUpdate::Thinking, + ))) .await; // Inject attachments as a user message before the text prompt @@ -1421,18 +1492,27 @@ impl Channel { source, ); if !final_text.is_empty() { - if extracted.is_some() { - tracing::warn!(channel_id = %self.id, "extracted reply from malformed tool syntax in retrigger fallback"); - } - self.state - .conversation_logger - .log_bot_message(&self.state.channel_id, &final_text); - if let Err(error) = self - .response_tx - .send(OutboundResponse::Text(final_text)) - .await - { - tracing::error!(%error, channel_id = %self.id, "failed to send retrigger fallback reply"); + if crate::tools::reply::is_low_value_waiting_update(&final_text) { + tracing::info!( + channel_id = %self.id, + "suppressing low-value waiting retrigger fallback text" + ); + } else { + if extracted.is_some() { + tracing::warn!(channel_id = %self.id, "extracted reply from malformed tool syntax in retrigger fallback"); + } + self.state + .conversation_logger + .log_bot_message(&self.state.channel_id, &final_text); + if let Err(error) = self + .response_tx + .send(OutboundEnvelope::from(OutboundResponse::Text( + final_text, + ))) + .await + { + tracing::error!(%error, channel_id = %self.id, "failed to send retrigger fallback reply"); + } } } } else { @@ -1446,16 +1526,16 @@ impl Channel { } else if replied { tracing::debug!(channel_id = %self.id, "channel turn replied via tool (fallback suppressed)"); } else if is_retrigger { - // On retrigger turns the LLM should use the reply tool, but - // some models return the result as raw text instead. Send it - // as a fallback so the user still gets the worker/branch output. + // Retrigger turns are vulnerable to tool-call misses; when the + // model emits substantive text without calling `reply`, relay it. 
+ // Keep suppressing low-value "still waiting" chatter. let text = response.trim(); - if !text.is_empty() { - tracing::info!( + if text.is_empty() { + tracing::debug!( channel_id = %self.id, - response_len = text.len(), - "retrigger produced text without reply tool, sending as fallback" + "retrigger turn fallback suppressed (empty text)" ); + } else { let extracted = extract_reply_from_tool_syntax(text); let source = self .conversation_id @@ -1466,23 +1546,31 @@ impl Channel { extracted.as_deref().unwrap_or(text), source, ); - if !final_text.is_empty() { + if final_text.is_empty() { + tracing::debug!( + channel_id = %self.id, + "retrigger turn fallback suppressed (empty normalized text)" + ); + } else if crate::tools::reply::is_low_value_waiting_update(&final_text) { + tracing::info!( + channel_id = %self.id, + "suppressing low-value waiting retrigger fallback text" + ); + } else { + if extracted.is_some() { + tracing::warn!(channel_id = %self.id, "extracted reply from malformed tool syntax in retrigger text output"); + } self.state .conversation_logger .log_bot_message(&self.state.channel_id, &final_text); if let Err(error) = self .response_tx - .send(OutboundResponse::Text(final_text)) + .send(OutboundEnvelope::from(OutboundResponse::Text(final_text))) .await { tracing::error!(%error, channel_id = %self.id, "failed to send retrigger fallback reply"); } } - } else { - tracing::debug!( - channel_id = %self.id, - "retrigger turn produced no text and no reply tool call" - ); } } else { // If the LLM returned text without using the reply tool, send it @@ -1501,20 +1589,25 @@ impl Channel { source, ); if !final_text.is_empty() { - if extracted.is_some() { - tracing::warn!(channel_id = %self.id, "extracted reply from malformed tool syntax in LLM text output"); - } - self.state.conversation_logger.log_bot_message_with_name( - &self.state.channel_id, - &final_text, - Some(self.agent_display_name()), - ); - if let Err(error) = self - .response_tx - 
.send(OutboundResponse::Text(final_text)) - .await - { - tracing::error!(%error, channel_id = %self.id, "failed to send fallback reply"); + if crate::tools::reply::is_low_value_waiting_update(&final_text) { + tracing::info!( + channel_id = %self.id, + "suppressing low-value waiting fallback text" + ); + } else { + if extracted.is_some() { + tracing::warn!(channel_id = %self.id, "extracted reply from malformed tool syntax in LLM text output"); + } + self.state + .conversation_logger + .log_bot_message(&self.state.channel_id, &final_text); + if let Err(error) = self + .response_tx + .send(OutboundEnvelope::from(OutboundResponse::Text(final_text))) + .await + { + tracing::error!(%error, channel_id = %self.id, "failed to send fallback reply"); + } } } @@ -1539,7 +1632,9 @@ impl Channel { // Ensure typing indicator is always cleaned up, even on error paths let _ = self .response_tx - .send(OutboundResponse::Status(crate::StatusUpdate::StopTyping)) + .send(OutboundEnvelope::from(OutboundResponse::Status( + crate::StatusUpdate::StopTyping, + ))) .await; } @@ -1620,18 +1715,32 @@ impl Channel { worker_type, .. } => { - run_logger.log_worker_started( - channel_id.as_ref(), - *worker_id, - task, - worker_type, - &self.deps.agent_id, - ); + run_logger.log_worker_started(channel_id.as_ref(), *worker_id, task); + let public_task_summary = summarize_worker_start_for_status(task); + if self.worker_is_user_visible(*worker_id).await { + self.send_status_update(crate::StatusUpdate::WorkerStarted { + worker_id: *worker_id, + task: public_task_summary.clone(), + }) + .await; + if let Some(status) = normalize_worker_checkpoint_status(&public_task_summary) { + self.worker_checkpoints.insert( + *worker_id, + WorkerCheckpointState { + last_status: status, + last_sent_at: tokio::time::Instant::now(), + }, + ); + } + } } ProcessEvent::WorkerStatus { worker_id, status, .. 
} => { run_logger.log_worker_status(*worker_id, status); + if self.worker_is_user_visible(*worker_id).await { + self.maybe_send_worker_checkpoint(*worker_id, status).await; + } } ProcessEvent::WorkerComplete { worker_id, @@ -1640,7 +1749,50 @@ impl Channel { success, .. } => { - run_logger.log_worker_completed(*worker_id, result, *success); + run_logger.log_worker_completed(*worker_id, result); + self.worker_checkpoints.remove(worker_id); + if *notify { + self.send_status_update(crate::StatusUpdate::WorkerCompleted { + worker_id: *worker_id, + result: summarize_worker_result_for_status(result), + }) + .await; + + let terminal_state = classify_worker_terminal_state(result); + let payload_text = + build_worker_terminal_receipt_payload(terminal_state, result); + match self + .state + .process_run_logger + .upsert_worker_terminal_receipt( + &self.id, + *worker_id, + terminal_state, + &payload_text, + ) + .await + { + Ok(receipt_id) => { + tracing::info!( + channel_id = %self.id, + worker_id = %worker_id, + receipt_id = %receipt_id, + terminal_state, + "queued worker terminal receipt" + ); + self.worker_receipt_dispatch_deadline = tokio::time::Instant::now(); + } + Err(error) => { + tracing::warn!( + %error, + channel_id = %self.id, + worker_id = %worker_id, + terminal_state, + "failed to queue worker terminal receipt" + ); + } + } + } let mut workers = self.state.active_workers.write().await; workers.remove(worker_id); @@ -1649,7 +1801,7 @@ impl Channel { self.state.worker_handles.write().await.remove(worker_id); self.state.worker_inputs.write().await.remove(worker_id); - if *notify { + if *notify && !is_worker_terminal_failure(result) { let mut history = self.state.history.write().await; let worker_message = format!("[Worker {worker_id} completed]: {result}"); history.push(rig::message::Message::from(worker_message)); @@ -1688,6 +1840,108 @@ impl Channel { Ok(()) } + async fn send_status_update(&self, status: crate::StatusUpdate) { + if let Err(error) = self + 
.response_tx + .send(OutboundEnvelope::from(OutboundResponse::Status(status))) + .await + { + tracing::debug!( + %error, + channel_id = %self.id, + "failed to route status update to messaging adapter" + ); + } + } + + async fn maybe_send_worker_checkpoint(&mut self, worker_id: WorkerId, raw_status: &str) { + let Some(status) = normalize_worker_checkpoint_status(raw_status) else { + return; + }; + + let now = tokio::time::Instant::now(); + let previous = self.worker_checkpoints.get(&worker_id); + if !should_emit_worker_checkpoint(previous, &status, now) { + return; + } + + self.send_status_update(crate::StatusUpdate::WorkerCheckpoint { + worker_id, + status: status.clone(), + }) + .await; + + self.worker_checkpoints.insert( + worker_id, + WorkerCheckpointState { + last_status: status, + last_sent_at: now, + }, + ); + } + + async fn flush_due_worker_delivery_receipts(&mut self) { + let due = match self + .state + .process_run_logger + .claim_due_worker_terminal_receipts(&self.id, WORKER_RECEIPT_DISPATCH_BATCH_SIZE) + .await + { + Ok(receipts) => receipts, + Err(error) => { + tracing::warn!( + %error, + channel_id = %self.id, + "failed to claim due worker terminal receipts" + ); + return; + } + }; + + if due.is_empty() { + return; + } + + for receipt in due { + let message = OutboundResponse::Text(receipt.payload_text.clone()); + let envelope = OutboundEnvelope::tracked(message, receipt.id.clone()); + + if let Err(error) = self.response_tx.send(envelope).await { + tracing::warn!( + %error, + channel_id = %self.id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + "failed to queue worker terminal receipt for outbound delivery" + ); + + if let Err(update_error) = self + .state + .process_run_logger + .fail_worker_delivery_receipt_attempt(&receipt.id, &error.to_string()) + .await + { + tracing::warn!( + %update_error, + channel_id = %self.id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + "failed to mark worker terminal receipt send 
failure" + ); + } + } + } + } + + async fn worker_is_user_visible(&self, worker_id: WorkerId) -> bool { + let status_block = self.state.status_block.read().await; + status_block + .active_workers + .iter() + .find(|worker| worker.id == worker_id) + .is_some_and(|worker| worker.notify_on_complete) + } + /// Flush the pending retrigger: send a synthetic system message to re-trigger /// the channel LLM so it can process background results and respond. async fn flush_pending_retrigger(&mut self) { @@ -1975,8 +2229,8 @@ async fn spawn_branch( /// Check whether the channel has capacity for another worker. async fn check_worker_limit(state: &ChannelState) -> std::result::Result<(), AgentError> { let max_workers = **state.deps.runtime_config.max_concurrent_workers.load(); - let workers = state.active_workers.read().await; - if workers.len() >= max_workers { + let worker_handles = state.worker_handles.read().await; + if worker_handles.len() >= max_workers { return Err(AgentError::WorkerLimitReached { channel_id: state.channel_id.to_string(), max: max_workers, @@ -2066,6 +2320,7 @@ pub async fn spawn_worker_from_state( state.deps.event_tx.clone(), state.deps.agent_id.clone(), Some(state.channel_id.clone()), + (**state.deps.runtime_config.cortex.load()).worker_timeout_secs, worker.run().instrument(worker_span), ); @@ -2073,7 +2328,7 @@ pub async fn spawn_worker_from_state( { let mut status = state.status_block.write().await; - status.add_worker(worker_id, &task, false); + status.add_worker(worker_id, &task, true); } state @@ -2161,6 +2416,7 @@ pub async fn spawn_opencode_worker_from_state( state.deps.event_tx.clone(), state.deps.agent_id.clone(), Some(state.channel_id.clone()), + (**state.deps.runtime_config.cortex.load()).worker_timeout_secs, async move { let result = worker.run().await?; Ok::(result.result_text) @@ -2173,7 +2429,7 @@ pub async fn spawn_opencode_worker_from_state( let opencode_task = format!("[opencode] {task}"); { let mut status = 
state.status_block.write().await; - status.add_worker(worker_id, &opencode_task, false); + status.add_worker(worker_id, &opencode_task, true); } state @@ -2203,6 +2459,7 @@ fn spawn_worker_task( event_tx: broadcast::Sender, agent_id: crate::AgentId, channel_id: Option, + timeout_secs: u64, future: F, ) -> tokio::task::JoinHandle<()> where @@ -2219,13 +2476,74 @@ where .with_label_values(&[&*agent_id]) .inc(); - let (result_text, notify, success) = match future.await { - Ok(text) => (text, true, true), - Err(error) => { - tracing::error!(worker_id = %worker_id, %error, "worker failed"); - (format!("Worker failed: {error}"), true, false) + let outcome = if timeout_secs == 0 { + match future.await { + Ok(text) => ("done", text, true), + Err(error) => { + tracing::error!(worker_id = %worker_id, %error, "worker failed"); + ("failed", format!("Worker failed: {error}"), true) + } + } + } else { + let timeout_duration = std::time::Duration::from_secs(timeout_secs.max(1)); + let mut event_rx = event_tx.subscribe(); + let future = future; + tokio::pin!(future); + let mut deadline = tokio::time::Instant::now() + timeout_duration; + + loop { + let sleep = tokio::time::sleep_until(deadline); + tokio::pin!(sleep); + + tokio::select! 
{ + result = &mut future => { + let outcome = match result { + Ok(text) => ("done", text, true), + Err(error) => { + tracing::error!(worker_id = %worker_id, %error, "worker failed"); + ("failed", format!("Worker failed: {error}"), true) + } + }; + break outcome; + } + event = event_rx.recv() => { + match event { + Ok(event) => { + if is_worker_progress_event(&event, worker_id) { + deadline = tokio::time::Instant::now() + timeout_duration; + } + } + Err(tokio::sync::broadcast::error::RecvError::Lagged(skipped)) => { + tracing::warn!( + worker_id = %worker_id, + skipped, + "worker timeout watcher lagged on event stream" + ); + } + Err(tokio::sync::broadcast::error::RecvError::Closed) => { + tracing::warn!( + worker_id = %worker_id, + "worker timeout watcher event stream closed" + ); + } + } + } + _ = &mut sleep => { + tracing::error!( + worker_id = %worker_id, + timeout_secs, + "worker timed out due to inactivity" + ); + break ( + "timed_out", + format!("Worker timed out after {timeout_secs} seconds without progress."), + true, + ); + } + } } }; + let (terminal_status, result_text, notify) = outcome; #[cfg(feature = "metrics")] { let metrics = crate::telemetry::Metrics::global(); @@ -2239,6 +2557,13 @@ where .observe(worker_start.elapsed().as_secs_f64()); } + let _ = event_tx.send(ProcessEvent::WorkerStatus { + agent_id: agent_id.clone(), + worker_id, + channel_id: channel_id.clone(), + status: terminal_status.to_string(), + }); + let _ = event_tx.send(ProcessEvent::WorkerComplete { agent_id, worker_id, @@ -2250,6 +2575,32 @@ where }) } +fn is_worker_progress_event(event: &ProcessEvent, worker_id: WorkerId) -> bool { + match event { + ProcessEvent::WorkerStatus { + worker_id: event_worker_id, + .. + } => *event_worker_id == worker_id, + ProcessEvent::ToolStarted { + process_id: crate::ProcessId::Worker(event_worker_id), + .. + } => *event_worker_id == worker_id, + ProcessEvent::ToolCompleted { + process_id: crate::ProcessId::Worker(event_worker_id), + .. 
+ } => *event_worker_id == worker_id, + ProcessEvent::WorkerPermission { + worker_id: event_worker_id, + .. + } => *event_worker_id == worker_id, + ProcessEvent::WorkerQuestion { + worker_id: event_worker_id, + .. + } => *event_worker_id == worker_id, + _ => false, + } +} + /// Some models emit tool call syntax as plain text instead of making actual tool calls. /// When the text starts with a tool-like prefix (e.g. `[reply]`, `(reply)`), try to /// extract the reply content so we can send it cleanly instead of showing raw JSON. @@ -2370,6 +2721,118 @@ fn extract_discord_message_id(message: &InboundMessage) -> Option { .and_then(|value| value.as_u64()) } +fn normalize_worker_checkpoint_status(status: &str) -> Option { + let trimmed = status.trim(); + if trimmed.is_empty() { + return None; + } + if trimmed.len() <= WORKER_CHECKPOINT_MAX_CHARS { + return Some(trimmed.to_string()); + } + + let end = trimmed.floor_char_boundary(WORKER_CHECKPOINT_MAX_CHARS); + let boundary = trimmed[..end].rfind(char::is_whitespace).unwrap_or(end); + Some(format!("{}...", &trimmed[..boundary])) +} + +fn is_high_priority_worker_checkpoint(status: &str) -> bool { + let normalized = status.to_ascii_lowercase(); + normalized.contains("waiting for input") + || normalized.contains("permission") + || normalized.contains("question") + || normalized.contains("failed") + || normalized.contains("error") + || normalized.contains("cancelled") + || normalized.contains("timed out") +} + +fn should_emit_worker_checkpoint( + previous: Option<&WorkerCheckpointState>, + next_status: &str, + now: tokio::time::Instant, +) -> bool { + let Some(previous) = previous else { + return true; + }; + + if previous.last_status == next_status { + return false; + } + + if is_high_priority_worker_checkpoint(next_status) { + return true; + } + + now.duration_since(previous.last_sent_at) + >= std::time::Duration::from_secs(WORKER_CHECKPOINT_MIN_INTERVAL_SECS) +} + +fn summarize_worker_result_for_status(result: &str) -> 
String { + let first_non_empty_line = result + .lines() + .find(|line| !line.trim().is_empty()) + .unwrap_or(result); + normalize_worker_checkpoint_status(first_non_empty_line).unwrap_or_else(|| "completed".into()) +} + +fn summarize_worker_start_for_status(task: &str) -> String { + let lowered = task.to_ascii_lowercase(); + if lowered.contains("research") + || lowered.contains("investigat") + || lowered.contains("verify") + || lowered.contains("source") + { + "research task".to_string() + } else if lowered.contains("[opencode]") + || lowered.contains("code") + || lowered.contains("implement") + || lowered.contains("refactor") + || lowered.contains("fix") + { + "coding task".to_string() + } else if lowered.contains("test") + || lowered.contains("pytest") + || lowered.contains("cargo test") + { + "test task".to_string() + } else if lowered.contains("summar") || lowered.contains("analy") || lowered.contains("review") + { + "analysis task".to_string() + } else { + "background task".to_string() + } +} + +fn is_worker_terminal_failure(result: &str) -> bool { + let trimmed = result.trim_start(); + trimmed.starts_with("Worker failed:") + || trimmed.starts_with("Worker timed out after ") + || trimmed.starts_with("Worker cancelled:") +} + +fn classify_worker_terminal_state(result: &str) -> &'static str { + let trimmed = result.trim_start(); + if trimmed.starts_with("Worker failed:") { + "failed" + } else if trimmed.starts_with("Worker timed out after ") { + "timed_out" + } else if trimmed.starts_with("Worker cancelled:") { + "cancelled" + } else { + "done" + } +} + +fn build_worker_terminal_receipt_payload(terminal_state: &str, result: &str) -> String { + let summary = summarize_worker_result_for_status(result); + match terminal_state { + "failed" => format!("Background task failed: {summary}"), + "timed_out" => format!("Background task timed out: {summary}"), + "cancelled" => "Background task was cancelled.".to_string(), + _ => format!("Background task completed: 
{summary}"), + } +} + /// Check if a ProcessEvent is targeted at a specific channel. /// /// Events from branches and workers carry a channel_id. We only process events @@ -2389,6 +2852,10 @@ fn event_is_for_channel(event: &ProcessEvent, channel_id: &ChannelId) -> bool { channel_id: event_channel, .. } => event_channel.as_ref() == Some(channel_id), + ProcessEvent::WorkerStarted { + channel_id: event_channel, + .. + } => event_channel.as_ref() == Some(channel_id), // Status block updates, tool events, etc. — match on agent_id which // is already filtered by the event bus subscription. Let them through. _ => true, @@ -2802,10 +3269,26 @@ fn apply_history_after_turn( #[cfg(test)] mod tests { + use super::WORKER_CHECKPOINT_MIN_INTERVAL_SECS; + use super::WorkerCheckpointState; use super::apply_history_after_turn; + use super::build_worker_terminal_receipt_payload; + use super::classify_worker_terminal_state; + use super::is_worker_progress_event; + use super::is_worker_terminal_failure; + use super::normalize_worker_checkpoint_status; + use super::should_emit_worker_checkpoint; + use super::spawn_worker_task; + use super::summarize_worker_result_for_status; + use super::summarize_worker_start_for_status; + use crate::ProcessEvent; use rig::completion::{CompletionError, PromptError}; use rig::message::Message; use rig::tool::ToolSetError; + use std::sync::Arc; + use std::time::Duration; + use tokio::sync::{broadcast, oneshot}; + use uuid::Uuid; fn user_msg(text: &str) -> Message { Message::User { @@ -3035,4 +3518,306 @@ mod tests { "no dangling tool-call messages in history after rollback" ); } + + #[tokio::test] + async fn worker_task_timeout_emits_terminal_events() { + let (event_tx, mut event_rx) = broadcast::channel(16); + let worker_id = Uuid::new_v4(); + let agent_id: crate::AgentId = Arc::from("test-agent"); + let channel_id: crate::ChannelId = Arc::from("test-channel"); + + let handle = spawn_worker_task(worker_id, event_tx, agent_id, Some(channel_id), 1, async 
{ + tokio::time::sleep(Duration::from_secs(3)).await; + Ok::("should not complete".to_string()) + }); + + let mut saw_status = false; + let mut saw_complete = false; + let deadline = tokio::time::Instant::now() + Duration::from_secs(4); + + while tokio::time::Instant::now() < deadline && !(saw_status && saw_complete) { + let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); + let event = tokio::time::timeout(remaining, event_rx.recv()) + .await + .expect("timed out waiting for worker events") + .expect("failed to receive worker event"); + + match event { + ProcessEvent::WorkerStatus { + worker_id: event_worker_id, + status, + .. + } if event_worker_id == worker_id => { + saw_status = true; + assert_eq!(status, "timed_out"); + } + ProcessEvent::WorkerComplete { + worker_id: event_worker_id, + result, + .. + } if event_worker_id == worker_id => { + saw_complete = true; + assert!(result.contains("timed out after 1 seconds")); + } + _ => {} + } + } + + handle.await.expect("worker task join failed"); + assert!(saw_status, "expected terminal WorkerStatus event"); + assert!(saw_complete, "expected WorkerComplete event"); + } + + #[tokio::test] + async fn worker_timeout_resets_on_progress_events() { + let (event_tx, mut event_rx) = broadcast::channel(32); + let worker_id = Uuid::new_v4(); + let agent_id: crate::AgentId = Arc::from("test-agent"); + let channel_id: crate::ChannelId = Arc::from("test-channel"); + + let progress_tx = event_tx.clone(); + let progress_agent_id = agent_id.clone(); + let progress_channel_id = channel_id.clone(); + let progress_task = tokio::spawn(async move { + tokio::time::sleep(Duration::from_millis(700)).await; + let _ = progress_tx.send(ProcessEvent::WorkerStatus { + agent_id: progress_agent_id, + worker_id, + channel_id: Some(progress_channel_id), + status: "still working".to_string(), + }); + }); + + let handle = spawn_worker_task(worker_id, event_tx, agent_id, Some(channel_id), 1, async { + 
tokio::time::sleep(Duration::from_millis(1500)).await; + Ok::("completed after progress heartbeat".to_string()) + }); + + let mut terminal_status = None::; + let mut complete_result = None::; + let deadline = tokio::time::Instant::now() + Duration::from_secs(5); + while tokio::time::Instant::now() < deadline + && (terminal_status.is_none() || complete_result.is_none()) + { + let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); + let event = tokio::time::timeout(remaining, event_rx.recv()) + .await + .expect("timed out waiting for worker events") + .expect("failed to receive worker event"); + match event { + ProcessEvent::WorkerStatus { + worker_id: event_worker_id, + status, + .. + } if event_worker_id == worker_id => { + if status == "done" || status == "timed_out" || status == "failed" { + terminal_status = Some(status); + } + } + ProcessEvent::WorkerComplete { + worker_id: event_worker_id, + result, + .. + } if event_worker_id == worker_id => { + complete_result = Some(result); + } + _ => {} + } + } + + progress_task.await.expect("progress sender task failed"); + handle.await.expect("worker task join failed"); + + assert_eq!( + terminal_status.as_deref(), + Some("done"), + "worker should finish after progress heartbeat" + ); + assert_eq!( + complete_result.as_deref(), + Some("completed after progress heartbeat") + ); + } + + #[tokio::test] + async fn aborting_worker_task_drops_inner_future() { + struct DropSignal(Option>); + + impl Drop for DropSignal { + fn drop(&mut self) { + if let Some(sender) = self.0.take() { + let _ = sender.send(()); + } + } + } + + let (event_tx, _event_rx) = broadcast::channel(8); + let worker_id = Uuid::new_v4(); + let agent_id: crate::AgentId = Arc::from("test-agent"); + let channel_id: crate::ChannelId = Arc::from("test-channel"); + let (drop_tx, drop_rx) = oneshot::channel(); + let (started_tx, started_rx) = oneshot::channel(); + + let handle = + spawn_worker_task(worker_id, event_tx, agent_id, 
Some(channel_id), 30, async { + let _guard = DropSignal(Some(drop_tx)); + let _ = started_tx.send(()); + tokio::time::sleep(Duration::from_secs(120)).await; + Ok::("should not finish".to_string()) + }); + + tokio::time::timeout(Duration::from_secs(1), started_rx) + .await + .expect("future should start before cancellation") + .expect("start signal channel unexpectedly closed"); + handle.abort(); + let _ = handle.await; + + tokio::time::timeout(Duration::from_secs(1), drop_rx) + .await + .expect("future should be dropped when worker task is aborted") + .expect("drop signal channel unexpectedly closed"); + } + + #[test] + fn progress_event_detection_matches_worker_events() { + let worker_id = Uuid::new_v4(); + let other_worker_id = Uuid::new_v4(); + let agent_id: crate::AgentId = Arc::from("test-agent"); + let channel_id: crate::ChannelId = Arc::from("test-channel"); + + let progress = ProcessEvent::WorkerStatus { + agent_id: agent_id.clone(), + worker_id, + channel_id: Some(channel_id.clone()), + status: "working".to_string(), + }; + let non_progress = ProcessEvent::WorkerStatus { + agent_id, + worker_id: other_worker_id, + channel_id: Some(channel_id), + status: "working".to_string(), + }; + + assert!(is_worker_progress_event(&progress, worker_id)); + assert!(!is_worker_progress_event(&non_progress, worker_id)); + } + + #[test] + fn worker_failure_detection_matches_terminal_messages() { + assert!(is_worker_terminal_failure( + "Worker timed out after 300 seconds." + )); + assert!(is_worker_terminal_failure("Worker failed: request error")); + assert!(is_worker_terminal_failure( + "Worker cancelled: cancelled by request." 
+ )); + assert!(!is_worker_terminal_failure( + "Completed summary with citations" + )); + } + + #[test] + fn worker_terminal_receipt_payload_reflects_terminal_state() { + assert_eq!( + classify_worker_terminal_state("Worker failed: provider error"), + "failed" + ); + assert_eq!( + classify_worker_terminal_state("Worker timed out after 45 seconds."), + "timed_out" + ); + assert_eq!( + classify_worker_terminal_state("Worker cancelled: cancelled by request."), + "cancelled" + ); + assert_eq!( + classify_worker_terminal_state("Completed report with citations"), + "done" + ); + + assert_eq!( + build_worker_terminal_receipt_payload("failed", "Worker failed: provider error"), + "Background task failed: Worker failed: provider error" + ); + assert_eq!( + build_worker_terminal_receipt_payload("done", "Completed report with citations"), + "Background task completed: Completed report with citations" + ); + } + + #[test] + fn worker_checkpoint_status_normalizes_and_truncates() { + assert_eq!(normalize_worker_checkpoint_status(" "), None); + assert_eq!( + normalize_worker_checkpoint_status("running tests"), + Some("running tests".to_string()) + ); + + let long_status = "word ".repeat(80); + let normalized = + normalize_worker_checkpoint_status(&long_status).expect("expected normalized status"); + assert!( + normalized.len() <= 223, + "status should be capped with ellipsis" + ); + assert!( + normalized.ends_with("..."), + "truncated checkpoint should end with ellipsis" + ); + } + + #[test] + fn worker_checkpoint_throttles_non_critical_updates() { + let now = tokio::time::Instant::now(); + let previous = WorkerCheckpointState { + last_status: "running".to_string(), + last_sent_at: now, + }; + + assert!( + !should_emit_worker_checkpoint(Some(&previous), "still running", now), + "non-critical updates should be throttled inside the interval" + ); + assert!( + should_emit_worker_checkpoint( + Some(&previous), + "waiting for input", + now + Duration::from_secs(1), + ), + 
"high-priority checkpoints should bypass throttle" + ); + assert!( + should_emit_worker_checkpoint( + Some(&previous), + "indexing repository", + now + Duration::from_secs(WORKER_CHECKPOINT_MIN_INTERVAL_SECS + 1), + ), + "non-critical updates should flow once interval elapsed" + ); + } + + #[test] + fn worker_result_summary_uses_first_non_empty_line() { + let summary = summarize_worker_result_for_status( + "\n\nCompleted research with 8 cited sources.\nAdditional detail that should not be included.", + ); + assert_eq!(summary, "Completed research with 8 cited sources."); + } + + #[test] + fn worker_start_summary_redacts_task_details() { + let research = summarize_worker_start_for_status( + "Research this GitHub commit thoroughly: https://github.com/openai/codex/commit/...", + ); + let coding = + summarize_worker_start_for_status("[opencode] Implement a retry loop in channel.rs"); + + assert_eq!(research, "research task"); + assert_eq!(coding, "coding task"); + assert!( + !research.contains("http"), + "public summary should not expose raw task content" + ); + } } diff --git a/src/api/channels.rs b/src/api/channels.rs index 857c41674..73c8285ac 100644 --- a/src/api/channels.rs +++ b/src/api/channels.rs @@ -132,19 +132,81 @@ pub(super) async fn channel_messages( pub(super) async fn channel_status( State(state): State>, ) -> Json> { - let snapshot: Vec<_> = { + let status_snapshot: Vec<_> = { let blocks = state.channel_status_blocks.read().await; blocks.iter().map(|(k, v)| (k.clone(), v.clone())).collect() }; + let state_snapshot: HashMap = { + let channel_states = state.channel_states.read().await; + channel_states + .iter() + .map(|(channel_id, channel_state)| (channel_id.clone(), channel_state.clone())) + .collect() + }; let mut result = HashMap::new(); - for (channel_id, status_block) in snapshot { + for (channel_id, status_block) in status_snapshot { let block = status_block.read().await; - if let Ok(value) = serde_json::to_value(&*block) { + if let Ok(mut value) 
= serde_json::to_value(&*block) { + if let Some(channel_state) = state_snapshot.get(&channel_id) { + match channel_state + .process_run_logger + .load_worker_delivery_receipt_stats(&channel_id) + .await + { + Ok(stats) => { + if let Some(object) = value.as_object_mut() { + if let Ok(stats_value) = serde_json::to_value(stats) { + object.insert("worker_delivery_receipts".to_string(), stats_value); + } + } + } + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + "failed to load worker delivery receipt stats" + ); + } + } + } result.insert(channel_id, value); } } + // Include channels that are active in state but missing from the + // channel_status_blocks snapshot (for example, during registration races). + for (channel_id, channel_state) in &state_snapshot { + if result.contains_key(channel_id) { + continue; + } + + let block = channel_state.status_block.read().await; + if let Ok(mut value) = serde_json::to_value(&*block) { + match channel_state + .process_run_logger + .load_worker_delivery_receipt_stats(channel_id) + .await + { + Ok(stats) => { + if let Some(object) = value.as_object_mut() { + if let Ok(stats_value) = serde_json::to_value(stats) { + object.insert("worker_delivery_receipts".to_string(), stats_value); + } + } + } + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + "failed to load worker delivery receipt stats" + ); + } + } + result.insert(channel_id.clone(), value); + } + } + Json(result) } @@ -198,7 +260,7 @@ pub(super) async fn cancel_process( .parse() .map_err(|_| StatusCode::BAD_REQUEST)?; channel_state - .cancel_worker(worker_id) + .cancel_worker(worker_id, None) .await .map_err(|_| StatusCode::NOT_FOUND)?; Ok(Json(CancelProcessResponse { diff --git a/src/api/webchat.rs b/src/api/webchat.rs index 352d6d487..ffcb3b503 100644 --- a/src/api/webchat.rs +++ b/src/api/webchat.rs @@ -86,6 +86,9 @@ pub(super) async fn webchat_send( WebChatEvent::StreamEnd => "stream_end", WebChatEvent::ToolStarted { .. 
} => "tool_started", WebChatEvent::ToolCompleted { .. } => "tool_completed", + WebChatEvent::WorkerStarted { .. } => "worker_started", + WebChatEvent::WorkerCheckpoint { .. } => "worker_checkpoint", + WebChatEvent::WorkerCompleted { .. } => "worker_completed", WebChatEvent::StopTyping => "stop_typing", WebChatEvent::Done => "done", }; diff --git a/src/conversation/history.rs b/src/conversation/history.rs index 9f0db80da..1086735fa 100644 --- a/src/conversation/history.rs +++ b/src/conversation/history.rs @@ -214,6 +214,43 @@ pub enum TimelineItem { }, } +const WORKER_TERMINAL_RECEIPT_KIND: &str = "worker_terminal"; +const WORKER_RECEIPT_MAX_ATTEMPTS: i64 = 6; +const WORKER_RECEIPT_BACKOFF_SECS: [i64; 5] = [5, 15, 45, 120, 300]; +const WORKER_RECEIPT_RETENTION_DAYS: i64 = 30; + +fn worker_receipt_backoff_secs(attempt_count: i64) -> Option { + if attempt_count <= 0 { + return WORKER_RECEIPT_BACKOFF_SECS.first().copied(); + } + WORKER_RECEIPT_BACKOFF_SECS + .get((attempt_count - 1) as usize) + .copied() +} + +#[derive(Debug, Clone)] +pub struct PendingWorkerDeliveryReceipt { + pub id: String, + pub worker_id: String, + pub channel_id: String, + pub terminal_state: String, + pub payload_text: String, + pub attempt_count: i64, +} + +#[derive(Debug, Clone, Serialize)] +pub struct WorkerDeliveryReceiptStats { + pub pending: u64, + pub failed: u64, +} + +#[derive(Debug, Clone, Serialize)] +pub struct WorkerDeliveryRetryOutcome { + pub status: String, + pub attempt_count: i64, + pub next_attempt_at: Option, +} + /// Persists branch and worker run records for channel timeline history. /// /// All write methods are fire-and-forget, same pattern as ConversationLogger. @@ -328,10 +365,22 @@ impl ProcessRunLogger { tokio::spawn(async move { if let Err(error) = sqlx::query( - "UPDATE worker_runs SET result = ?, status = ?, completed_at = CURRENT_TIMESTAMP WHERE id = ?" 
+ "UPDATE worker_runs \ + SET result = ?, \ + status = CASE \ + WHEN status IN ('cancelled', 'failed', 'timed_out') THEN status \ + WHEN ? LIKE 'Worker cancelled:%' THEN 'cancelled' \ + WHEN ? LIKE 'Worker failed:%' THEN 'failed' \ + WHEN ? LIKE 'Worker timed out after %' THEN 'timed_out' \ + ELSE 'done' \ + END, \ + completed_at = CURRENT_TIMESTAMP \ + WHERE id = ?", ) .bind(&result) - .bind(status) + .bind(&result) + .bind(&result) + .bind(&result) .bind(&id) .execute(&pool) .await @@ -341,6 +390,399 @@ impl ProcessRunLogger { }); } + /// Create (or refresh) the durable terminal delivery receipt for a worker. + /// + /// One terminal receipt exists per worker (`kind = worker_terminal`). If the + /// receipt already exists and is not acked, it is reset to pending so it can + /// be retried. + pub async fn upsert_worker_terminal_receipt( + &self, + channel_id: &ChannelId, + worker_id: WorkerId, + terminal_state: &str, + payload_text: &str, + ) -> crate::error::Result { + let worker_id = worker_id.to_string(); + let channel_id = channel_id.to_string(); + + let existing = sqlx::query( + "SELECT id, status \ + FROM worker_delivery_receipts \ + WHERE worker_id = ? 
AND kind = ?", + ) + .bind(&worker_id) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .fetch_optional(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + if let Some(row) = existing { + let receipt_id: String = row.try_get("id").unwrap_or_default(); + let status: String = row.try_get("status").unwrap_or_default(); + + if status != "acked" { + sqlx::query( + "UPDATE worker_delivery_receipts \ + SET channel_id = ?, \ + terminal_state = ?, \ + payload_text = ?, \ + status = 'pending', \ + last_error = NULL, \ + next_attempt_at = CURRENT_TIMESTAMP, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ?", + ) + .bind(&channel_id) + .bind(terminal_state) + .bind(payload_text) + .bind(&receipt_id) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + } + + return Ok(receipt_id); + } + + let receipt_id = uuid::Uuid::new_v4().to_string(); + sqlx::query( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at) \ + VALUES (?, ?, ?, ?, 'pending', ?, ?, CURRENT_TIMESTAMP)", + ) + .bind(&receipt_id) + .bind(&worker_id) + .bind(&channel_id) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind(terminal_state) + .bind(payload_text) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + Ok(receipt_id) + } + + /// Claim due pending terminal receipts for delivery. + /// + /// Claimed receipts are transitioned to `sending` so we can distinguish in-flight + /// deliveries from queued retries. + pub async fn claim_due_worker_terminal_receipts( + &self, + channel_id: &ChannelId, + limit: i64, + ) -> crate::error::Result> { + let channel_id = channel_id.to_string(); + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let rows = sqlx::query( + "SELECT id, worker_id, channel_id, terminal_state, payload_text, attempt_count \ + FROM worker_delivery_receipts \ + WHERE channel_id = ? \ + AND kind = ? 
\ + AND status = 'pending' \ + AND next_attempt_at <= CURRENT_TIMESTAMP \ + ORDER BY next_attempt_at ASC, created_at ASC \ + LIMIT ?", + ) + .bind(&channel_id) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind(limit) + .fetch_all(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let mut claimed = Vec::with_capacity(rows.len()); + for row in rows { + let receipt_id: String = row.try_get("id").unwrap_or_default(); + let updated = sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'sending', updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? AND status = 'pending'", + ) + .bind(&receipt_id) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); + + if updated == 0 { + continue; + } + + claimed.push(PendingWorkerDeliveryReceipt { + id: receipt_id, + worker_id: row.try_get("worker_id").unwrap_or_default(), + channel_id: row.try_get("channel_id").unwrap_or_default(), + terminal_state: row.try_get("terminal_state").unwrap_or_default(), + payload_text: row.try_get("payload_text").unwrap_or_default(), + attempt_count: row.try_get("attempt_count").unwrap_or_default(), + }); + } + + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + Ok(claimed) + } + + /// Claim due pending terminal receipts across all channels. + /// + /// Used by the global receipt dispatcher to drain terminal notices even + /// when no channel loop is currently active. + pub async fn claim_due_worker_terminal_receipts_any( + &self, + limit: i64, + ) -> crate::error::Result> { + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let rows = sqlx::query( + "SELECT id, worker_id, channel_id, terminal_state, payload_text, attempt_count \ + FROM worker_delivery_receipts \ + WHERE kind = ? 
\ + AND status = 'pending' \ + AND next_attempt_at <= CURRENT_TIMESTAMP \ + ORDER BY next_attempt_at ASC, created_at ASC \ + LIMIT ?", + ) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind(limit) + .fetch_all(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let mut claimed = Vec::with_capacity(rows.len()); + for row in rows { + let receipt_id: String = row.try_get("id").unwrap_or_default(); + let updated = sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'sending', updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? AND status = 'pending'", + ) + .bind(&receipt_id) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); + + if updated == 0 { + continue; + } + + claimed.push(PendingWorkerDeliveryReceipt { + id: receipt_id, + worker_id: row.try_get("worker_id").unwrap_or_default(), + channel_id: row.try_get("channel_id").unwrap_or_default(), + terminal_state: row.try_get("terminal_state").unwrap_or_default(), + payload_text: row.try_get("payload_text").unwrap_or_default(), + attempt_count: row.try_get("attempt_count").unwrap_or_default(), + }); + } + + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + Ok(claimed) + } + + /// Mark a terminal receipt as delivered. + /// + /// Returns true if this call transitioned the row to acked. + pub async fn ack_worker_delivery_receipt( + &self, + receipt_id: &str, + ) -> crate::error::Result { + let updated = sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'acked', \ + acked_at = CURRENT_TIMESTAMP, \ + updated_at = CURRENT_TIMESTAMP, \ + last_error = NULL \ + WHERE id = ? AND status != 'acked'", + ) + .bind(receipt_id) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); + + Ok(updated > 0) + } + + /// Record a delivery failure and schedule the next retry (or terminal failure). 
+    pub async fn fail_worker_delivery_receipt_attempt(
+        &self,
+        receipt_id: &str,
+        error: &str,
+    ) -> crate::error::Result<WorkerDeliveryRetryOutcome> {
+        let row = sqlx::query(
+            "SELECT status, attempt_count \
+             FROM worker_delivery_receipts \
+             WHERE id = ?",
+        )
+        .bind(receipt_id)
+        .fetch_optional(&self.pool)
+        .await
+        .map_err(|db_error| anyhow::anyhow!(db_error))?
+        .ok_or_else(|| anyhow::anyhow!("worker delivery receipt not found: {receipt_id}"))?;
+
+        let current_status: String = row.try_get("status").unwrap_or_default();
+        let current_attempts: i64 = row.try_get("attempt_count").unwrap_or_default();
+
+        if current_status == "acked" {
+            return Ok(WorkerDeliveryRetryOutcome {
+                status: "acked".to_string(),
+                attempt_count: current_attempts,
+                next_attempt_at: None,
+            });
+        }
+
+        let attempt_count = current_attempts + 1;
+        if attempt_count >= WORKER_RECEIPT_MAX_ATTEMPTS {
+            sqlx::query(
+                "UPDATE worker_delivery_receipts \
+                 SET status = 'failed', \
+                     attempt_count = ?, \
+                     last_error = ?, \
+                     updated_at = CURRENT_TIMESTAMP \
+                 WHERE id = ?",
+            )
+            .bind(attempt_count)
+            .bind(error)
+            .bind(receipt_id)
+            .execute(&self.pool)
+            .await
+            .map_err(|db_error| anyhow::anyhow!(db_error))?;
+
+            return Ok(WorkerDeliveryRetryOutcome {
+                status: "failed".to_string(),
+                attempt_count,
+                next_attempt_at: None,
+            });
+        }
+
+        let delay_secs = worker_receipt_backoff_secs(attempt_count).unwrap_or(300);
+        sqlx::query(
+            "UPDATE worker_delivery_receipts \
+             SET status = 'pending', \
+                 attempt_count = ?, \
+                 last_error = ?, \
+                 next_attempt_at = datetime('now', '+' || ? 
|| ' seconds'), \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ?", + ) + .bind(attempt_count) + .bind(error) + .bind(delay_secs) + .bind(receipt_id) + .execute(&self.pool) + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; + + let next_attempt_at = chrono::Utc::now() + .checked_add_signed(chrono::TimeDelta::seconds(delay_secs)) + .map(|timestamp| timestamp.to_rfc3339()); + + Ok(WorkerDeliveryRetryOutcome { + status: "pending".to_string(), + attempt_count, + next_attempt_at, + }) + } + + /// Load worker delivery receipt stats for a channel. + pub async fn load_worker_delivery_receipt_stats( + &self, + channel_id: &str, + ) -> crate::error::Result { + let row = sqlx::query( + "SELECT \ + SUM(CASE WHEN status IN ('pending', 'sending') THEN 1 ELSE 0 END) AS pending_count, \ + SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) AS failed_count \ + FROM worker_delivery_receipts \ + WHERE channel_id = ? \ + AND kind = ?", + ) + .bind(channel_id) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .fetch_one(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let pending = row.try_get::("pending_count").unwrap_or(0).max(0) as u64; + let failed = row.try_get::("failed_count").unwrap_or(0).max(0) as u64; + + Ok(WorkerDeliveryReceiptStats { pending, failed }) + } + + /// Delete old terminal delivery receipts that are no longer actionable. + /// + /// Keeps `pending` and `sending` rows intact, and only removes terminal rows + /// (`acked`, `failed`) older than the configured retention period. + pub async fn prune_worker_delivery_receipts(&self) -> crate::error::Result { + let deleted = sqlx::query( + "DELETE FROM worker_delivery_receipts \ + WHERE status IN ('acked', 'failed') \ + AND julianday(updated_at) < julianday('now', '-' || ? || ' days')", + ) + .bind(WORKER_RECEIPT_RETENTION_DAYS) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))? 
+ .rows_affected(); + + Ok(deleted) + } + + /// Close orphaned branch and worker runs from a previous process lifetime. + /// + /// This is called on startup before channels begin handling messages. Any + /// rows with NULL `completed_at` cannot be resumed and should be marked + /// terminal so timelines and analytics stay accurate. + pub async fn close_orphaned_runs(&self) -> crate::error::Result<(u64, u64, u64)> { + let worker_result = sqlx::query( + "UPDATE worker_runs \ + SET status = 'failed', \ + result = COALESCE(result, 'Worker interrupted by restart before completion.'), \ + completed_at = CURRENT_TIMESTAMP \ + WHERE completed_at IS NULL", + ) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let branch_result = sqlx::query( + "UPDATE branch_runs \ + SET conclusion = COALESCE(conclusion, 'Branch interrupted by restart before completion.'), \ + completed_at = CURRENT_TIMESTAMP \ + WHERE completed_at IS NULL", + ) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let receipt_result = sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'pending', \ + next_attempt_at = CURRENT_TIMESTAMP, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE status = 'sending'", + ) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + Ok(( + worker_result.rows_affected(), + branch_result.rows_affected(), + receipt_result.rows_affected(), + )) + } + /// Load a unified timeline for a channel: messages, branch runs, and worker runs /// interleaved chronologically (oldest first). 
/// @@ -598,3 +1040,419 @@ pub struct WorkerDetailRow { pub transcript_blob: Option>, pub tool_calls: i64, } + +#[cfg(test)] +mod tests { + use super::*; + use sqlx::sqlite::SqliteConnectOptions; + use std::sync::Arc; + + async fn connect_logger() -> ProcessRunLogger { + let options = SqliteConnectOptions::new() + .in_memory(true) + .create_if_missing(true); + let pool = sqlx::pool::PoolOptions::::new() + .max_connections(1) + .connect_with(options) + .await + .expect("in-memory SQLite"); + sqlx::migrate!("./migrations") + .run(&pool) + .await + .expect("migrations"); + ProcessRunLogger::new(pool) + } + + #[tokio::test] + async fn worker_terminal_receipt_claim_ack_and_stats() { + let logger = connect_logger().await; + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "done", + "Background task completed: finished indexing", + ) + .await + .expect("upsert receipt"); + + let initial_stats = logger + .load_worker_delivery_receipt_stats(channel_id.as_ref()) + .await + .expect("load initial stats"); + assert_eq!(initial_stats.pending, 1); + assert_eq!(initial_stats.failed, 0); + + let claimed = logger + .claim_due_worker_terminal_receipts(&channel_id, 8) + .await + .expect("claim due receipts"); + assert_eq!(claimed.len(), 1); + assert_eq!(claimed[0].id, receipt_id); + assert_eq!(claimed[0].terminal_state, "done"); + assert_eq!(claimed[0].attempt_count, 0); + + let acked_now = logger + .ack_worker_delivery_receipt(&receipt_id) + .await + .expect("ack receipt"); + assert!(acked_now); + + let acked_again = logger + .ack_worker_delivery_receipt(&receipt_id) + .await + .expect("idempotent ack"); + assert!(!acked_again); + + let final_stats = logger + .load_worker_delivery_receipt_stats(channel_id.as_ref()) + .await + .expect("load final stats"); + assert_eq!(final_stats.pending, 0); + assert_eq!(final_stats.failed, 0); + } + + 
#[tokio::test] + async fn worker_terminal_receipt_failure_retries_then_fails() { + let logger = connect_logger().await; + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "failed", + "Background task failed: network timeout", + ) + .await + .expect("upsert receipt"); + + let first_outcome = logger + .fail_worker_delivery_receipt_attempt(&receipt_id, "temporary send failure") + .await + .expect("record first failure"); + assert_eq!(first_outcome.status, "pending"); + assert_eq!(first_outcome.attempt_count, 1); + assert!(first_outcome.next_attempt_at.is_some()); + + sqlx::query( + "UPDATE worker_delivery_receipts \ + SET next_attempt_at = CURRENT_TIMESTAMP \ + WHERE id = ?", + ) + .bind(&receipt_id) + .execute(&logger.pool) + .await + .expect("advance retry deadline"); + + let claimed = logger + .claim_due_worker_terminal_receipts(&channel_id, 8) + .await + .expect("claim receipt after retry scheduling"); + assert_eq!(claimed.len(), 1); + assert_eq!(claimed[0].attempt_count, 1); + + for attempt in 2..=WORKER_RECEIPT_MAX_ATTEMPTS { + let outcome = logger + .fail_worker_delivery_receipt_attempt(&receipt_id, "adapter unavailable") + .await + .expect("record retry failure"); + assert_eq!(outcome.attempt_count, attempt); + if attempt < WORKER_RECEIPT_MAX_ATTEMPTS { + assert_eq!(outcome.status, "pending"); + assert!(outcome.next_attempt_at.is_some()); + } else { + assert_eq!(outcome.status, "failed"); + assert!(outcome.next_attempt_at.is_none()); + } + } + + let stats = logger + .load_worker_delivery_receipt_stats(channel_id.as_ref()) + .await + .expect("load retry stats"); + assert_eq!(stats.pending, 0); + assert_eq!(stats.failed, 1); + } + + #[tokio::test] + async fn close_orphaned_runs_requeues_sending_receipts() { + let logger = connect_logger().await; + let receipt_id = "receipt-test"; + + sqlx::query( + "INSERT INTO 
worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at) \ + VALUES (?, ?, ?, ?, 'sending', ?, ?, CURRENT_TIMESTAMP)", + ) + .bind(receipt_id) + .bind(uuid::Uuid::new_v4().to_string()) + .bind("channel:test") + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("done") + .bind("Background task completed: done") + .execute(&logger.pool) + .await + .expect("insert sending receipt"); + + let (_, _, recovered_receipts) = logger + .close_orphaned_runs() + .await + .expect("recover orphaned runs"); + assert_eq!(recovered_receipts, 1); + + let status: String = + sqlx::query_scalar("SELECT status FROM worker_delivery_receipts WHERE id = ?") + .bind(receipt_id) + .fetch_one(&logger.pool) + .await + .expect("load receipt status"); + assert_eq!(status, "pending"); + } + + #[tokio::test] + async fn claim_due_worker_terminal_receipts_any_claims_multiple_channels() { + let logger = connect_logger().await; + let channel_a: ChannelId = Arc::from("discord:1:100"); + let channel_b: ChannelId = Arc::from("discord:1:200"); + + logger + .upsert_worker_terminal_receipt( + &channel_a, + uuid::Uuid::new_v4(), + "done", + "Background task completed: channel a", + ) + .await + .expect("upsert channel a receipt"); + logger + .upsert_worker_terminal_receipt( + &channel_b, + uuid::Uuid::new_v4(), + "done", + "Background task completed: channel b", + ) + .await + .expect("upsert channel b receipt"); + + let claimed = logger + .claim_due_worker_terminal_receipts_any(10) + .await + .expect("claim due receipts across channels"); + assert_eq!(claimed.len(), 2); + assert!( + claimed + .iter() + .any(|receipt| receipt.channel_id == channel_a.as_ref()) + ); + assert!( + claimed + .iter() + .any(|receipt| receipt.channel_id == channel_b.as_ref()) + ); + } + + #[tokio::test] + async fn worker_terminal_receipt_cancelled_claim_ack_round_trip() { + let logger = connect_logger().await; + let channel_id: ChannelId = Arc::from("channel:test"); + let 
worker_id = uuid::Uuid::new_v4(); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "cancelled", + "Background task was cancelled.", + ) + .await + .expect("upsert cancelled receipt"); + + let claimed = logger + .claim_due_worker_terminal_receipts(&channel_id, 8) + .await + .expect("claim cancelled receipt"); + assert_eq!(claimed.len(), 1); + assert_eq!(claimed[0].id, receipt_id); + assert_eq!(claimed[0].terminal_state, "cancelled"); + assert_eq!(claimed[0].payload_text, "Background task was cancelled."); + + let acked = logger + .ack_worker_delivery_receipt(&receipt_id) + .await + .expect("ack cancelled receipt"); + assert!(acked); + + let stats = logger + .load_worker_delivery_receipt_stats(channel_id.as_ref()) + .await + .expect("load stats"); + assert_eq!(stats.pending, 0); + assert_eq!(stats.failed, 0); + } + + #[tokio::test] + async fn cancelled_receipt_delivery_failure_retries_then_acks() { + let logger = connect_logger().await; + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "cancelled", + "Background task was cancelled.", + ) + .await + .expect("upsert cancelled receipt"); + + let first_claim = logger + .claim_due_worker_terminal_receipts(&channel_id, 8) + .await + .expect("first claim"); + assert_eq!(first_claim.len(), 1); + assert_eq!(first_claim[0].id, receipt_id); + assert_eq!(first_claim[0].attempt_count, 0); + + let retry = logger + .fail_worker_delivery_receipt_attempt(&receipt_id, "adapter unavailable") + .await + .expect("record first delivery failure"); + assert_eq!(retry.status, "pending"); + assert_eq!(retry.attempt_count, 1); + assert!(retry.next_attempt_at.is_some()); + + sqlx::query( + "UPDATE worker_delivery_receipts \ + SET next_attempt_at = CURRENT_TIMESTAMP \ + WHERE id = ?", + ) + .bind(&receipt_id) + .execute(&logger.pool) + .await + .expect("advance 
retry deadline"); + + let second_claim = logger + .claim_due_worker_terminal_receipts(&channel_id, 8) + .await + .expect("second claim after retry"); + assert_eq!(second_claim.len(), 1); + assert_eq!(second_claim[0].id, receipt_id); + assert_eq!(second_claim[0].attempt_count, 1); + + let acked = logger + .ack_worker_delivery_receipt(&receipt_id) + .await + .expect("ack retried receipt"); + assert!(acked); + + let status: String = + sqlx::query_scalar("SELECT status FROM worker_delivery_receipts WHERE id = ?") + .bind(&receipt_id) + .fetch_one(&logger.pool) + .await + .expect("load receipt status"); + assert_eq!(status, "acked"); + + let stats = logger + .load_worker_delivery_receipt_stats(channel_id.as_ref()) + .await + .expect("load stats after ack"); + assert_eq!(stats.pending, 0); + assert_eq!(stats.failed, 0); + } + + #[tokio::test] + async fn prune_worker_delivery_receipts_deletes_only_old_terminal_rows() { + let logger = connect_logger().await; + let worker_old_acked = uuid::Uuid::new_v4().to_string(); + let worker_old_failed = uuid::Uuid::new_v4().to_string(); + let worker_old_pending = uuid::Uuid::new_v4().to_string(); + let worker_recent_acked = uuid::Uuid::new_v4().to_string(); + + sqlx::query( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at, updated_at) \ + VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, ?)", + ) + .bind("old-acked") + .bind(&worker_old_acked) + .bind("channel:test") + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("acked") + .bind("done") + .bind("Background task completed: old") + .bind("2000-01-01T00:00:00Z") + .execute(&logger.pool) + .await + .expect("insert old acked"); + + sqlx::query( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at, updated_at) \ + VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, ?)", + ) + .bind("old-failed") + .bind(&worker_old_failed) + 
.bind("channel:test") + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("failed") + .bind("failed") + .bind("Background task failed: old") + .bind("2000-01-01T00:00:00Z") + .execute(&logger.pool) + .await + .expect("insert old failed"); + + sqlx::query( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at, updated_at) \ + VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, ?)", + ) + .bind("old-pending") + .bind(&worker_old_pending) + .bind("channel:test") + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("pending") + .bind("done") + .bind("Background task completed: pending") + .bind("2000-01-01T00:00:00Z") + .execute(&logger.pool) + .await + .expect("insert old pending"); + + sqlx::query( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at, updated_at) \ + VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)", + ) + .bind("recent-acked") + .bind(&worker_recent_acked) + .bind("channel:test") + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("acked") + .bind("done") + .bind("Background task completed: recent") + .execute(&logger.pool) + .await + .expect("insert recent acked"); + + let deleted = logger + .prune_worker_delivery_receipts() + .await + .expect("prune receipts"); + assert_eq!(deleted, 2); + + let remaining: Vec = + sqlx::query_scalar("SELECT id FROM worker_delivery_receipts ORDER BY id ASC") + .fetch_all(&logger.pool) + .await + .expect("load remaining receipt ids"); + assert_eq!(remaining, vec!["old-pending", "recent-acked"]); + } +} diff --git a/src/cron/scheduler.rs b/src/cron/scheduler.rs index bf8b841d8..21a55d878 100644 --- a/src/cron/scheduler.rs +++ b/src/cron/scheduler.rs @@ -564,7 +564,7 @@ async fn run_cron_job(job: &CronJob, context: &CronContext) -> Result<()> { let channel_id: crate::ChannelId = Arc::from(format!("cron:{}", job.id).as_str()); // Create the outbound response channel to 
collect whatever the channel produces - let (response_tx, mut response_rx) = tokio::sync::mpsc::channel::(32); + let (response_tx, mut response_rx) = tokio::sync::mpsc::channel::(32); // Subscribe to the agent's event bus (the channel needs this for branch/worker events) let event_rx = context.deps.event_tx.subscribe(); @@ -615,10 +615,16 @@ async fn run_cron_job(job: &CronJob, context: &CronContext) -> Result<()> { loop { match tokio::time::timeout(timeout, response_rx.recv()).await { - Ok(Some(OutboundResponse::Text(text))) => { + Ok(Some(crate::OutboundEnvelope { + response: OutboundResponse::Text(text), + .. + })) => { collected_text.push(text); } - Ok(Some(OutboundResponse::RichMessage { text, .. })) => { + Ok(Some(crate::OutboundEnvelope { + response: OutboundResponse::RichMessage { text, .. }, + .. + })) => { collected_text.push(text); } Ok(Some(_)) => { diff --git a/src/lib.rs b/src/lib.rs index 0e28ff0a3..a123f204b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -380,6 +380,38 @@ pub enum OutboundResponse { Status(StatusUpdate), } +/// Internal wrapper for outbound channel routing. +/// +/// Carries an optional durable delivery receipt ID for messages that must be +/// acknowledged by the outbound adapter path. +#[derive(Debug, Clone)] +pub struct OutboundEnvelope { + pub response: OutboundResponse, + pub receipt_id: Option, +} + +impl OutboundEnvelope { + pub fn untracked(response: OutboundResponse) -> Self { + Self { + response, + receipt_id: None, + } + } + + pub fn tracked(response: OutboundResponse, receipt_id: String) -> Self { + Self { + response, + receipt_id: Some(receipt_id), + } + } +} + +impl From for OutboundEnvelope { + fn from(response: OutboundResponse) -> Self { + Self::untracked(response) + } +} + /// A generic rich-formatted card (maps to Embeds in Discord). 
#[derive(Debug, Clone, Serialize, Deserialize, Default, schemars::JsonSchema)] pub struct Card { @@ -503,4 +535,10 @@ pub enum StatusUpdate { worker_id: WorkerId, result: String, }, + /// Progress checkpoint from a running worker. + /// Intended for sparse, meaningful user-facing updates. + WorkerCheckpoint { + worker_id: WorkerId, + status: String, + }, } diff --git a/src/llm/model.rs b/src/llm/model.rs index c2a166dd0..daf5163d2 100644 --- a/src/llm/model.rs +++ b/src/llm/model.rs @@ -517,7 +517,7 @@ impl SpacebotModel { "messages": messages, }); - if let Some(max_tokens) = request.max_tokens { + if let Some(max_tokens) = positive_max_tokens(request.max_tokens) { body["max_tokens"] = serde_json::json!(max_tokens); } @@ -629,7 +629,7 @@ impl SpacebotModel { ); } - if !is_chatgpt_codex && let Some(max_tokens) = request.max_tokens { + if !is_chatgpt_codex && let Some(max_tokens) = positive_max_tokens(request.max_tokens) { body["max_output_tokens"] = serde_json::json!(max_tokens); } @@ -757,7 +757,7 @@ impl SpacebotModel { "messages": messages, }); - if let Some(max_tokens) = request.max_tokens { + if let Some(max_tokens) = positive_max_tokens(request.max_tokens) { body["max_tokens"] = serde_json::json!(max_tokens); } @@ -843,7 +843,7 @@ impl SpacebotModel { "messages": messages, }); - if let Some(max_tokens) = request.max_tokens { + if let Some(max_tokens) = positive_max_tokens(request.max_tokens) { body["max_tokens"] = serde_json::json!(max_tokens); } @@ -925,6 +925,10 @@ fn reverse_map_tool_names( } } +fn positive_max_tokens(max_tokens: Option) -> Option { + max_tokens.filter(|value| *value > 0) +} + fn tool_result_content_to_string(content: &OneOrMany) -> String { content .iter() @@ -1550,4 +1554,16 @@ mod tests { panic!("expected ToolCall"); } } + + #[test] + fn positive_max_tokens_omits_zero() { + assert_eq!(positive_max_tokens(None), None); + assert_eq!(positive_max_tokens(Some(0)), None); + } + + #[test] + fn positive_max_tokens_keeps_positive_values() { + 
assert_eq!(positive_max_tokens(Some(1)), Some(1)); + assert_eq!(positive_max_tokens(Some(2048)), Some(2048)); + } } diff --git a/src/main.rs b/src/main.rs index e1eec905c..8b48d8e2f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -126,6 +126,10 @@ struct ActiveChannel { _outbound_handle: tokio::task::JoinHandle<()>, } +const WORKER_RECEIPT_GLOBAL_DISPATCH_INTERVAL_SECS: u64 = 5; +const WORKER_RECEIPT_GLOBAL_DISPATCH_BATCH_SIZE: i64 = 32; +const WORKER_RECEIPT_PRUNE_INTERVAL_SECS: u64 = 60 * 60; + fn main() -> anyhow::Result<()> { rustls::crypto::ring::default_provider() .install_default() @@ -243,6 +247,159 @@ async fn cmd_stop() -> anyhow::Result<()> { Ok(()) } +fn spawn_worker_receipt_dispatch_loop( + agent_id: String, + process_run_logger: spacebot::conversation::history::ProcessRunLogger, + channel_store: spacebot::conversation::ChannelStore, + conversation_logger: spacebot::conversation::history::ConversationLogger, + messaging_manager: Arc, +) -> tokio::task::JoinHandle<()> { + tokio::spawn(async move { + let mut next_prune_at = tokio::time::Instant::now() + std::time::Duration::from_secs(5); + loop { + if tokio::time::Instant::now() >= next_prune_at { + match process_run_logger.prune_worker_delivery_receipts().await { + Ok(deleted) if deleted > 0 => { + tracing::info!( + agent_id = %agent_id, + deleted, + "pruned old worker delivery receipts" + ); + } + Ok(_) => {} + Err(error) => { + tracing::warn!( + agent_id = %agent_id, + %error, + "failed to prune worker delivery receipts" + ); + } + } + next_prune_at = tokio::time::Instant::now() + + std::time::Duration::from_secs(WORKER_RECEIPT_PRUNE_INTERVAL_SECS); + } + + let due = match process_run_logger + .claim_due_worker_terminal_receipts_any(WORKER_RECEIPT_GLOBAL_DISPATCH_BATCH_SIZE) + .await + { + Ok(receipts) => receipts, + Err(error) => { + tracing::warn!( + agent_id = %agent_id, + %error, + "global worker receipt dispatcher failed to claim receipts" + ); + Vec::new() + } + }; + + for receipt in due { + let 
delivery_result = async { + let channel_info = channel_store + .get(&receipt.channel_id) + .await + .map_err(|error| { + anyhow::anyhow!("failed to resolve channel info: {error}") + })? + .ok_or_else(|| { + anyhow::anyhow!( + "cannot deliver worker receipt: channel '{}' not found", + receipt.channel_id + ) + })?; + + let target = + spacebot::messaging::target::resolve_broadcast_target(&channel_info) + .ok_or_else(|| { + anyhow::anyhow!( + "cannot resolve broadcast target for channel '{}'", + receipt.channel_id + ) + })?; + + messaging_manager + .broadcast( + &target.adapter, + &target.target, + spacebot::OutboundResponse::Text(receipt.payload_text.clone()), + ) + .await + } + .await; + + match delivery_result { + Ok(()) => match process_run_logger + .ack_worker_delivery_receipt(&receipt.id) + .await + { + Ok(acked_now) => { + if acked_now { + let channel_id: spacebot::ChannelId = + Arc::from(receipt.channel_id.as_str()); + conversation_logger + .log_bot_message(&channel_id, &receipt.payload_text); + tracing::info!( + agent_id = %agent_id, + channel_id = %receipt.channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + "global worker receipt dispatcher delivered terminal receipt" + ); + } + } + Err(error) => { + tracing::warn!( + agent_id = %agent_id, + channel_id = %receipt.channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + %error, + "failed to ack globally delivered worker terminal receipt" + ); + } + }, + Err(error) => { + match process_run_logger + .fail_worker_delivery_receipt_attempt(&receipt.id, &error.to_string()) + .await + { + Ok(outcome) => { + tracing::warn!( + agent_id = %agent_id, + channel_id = %receipt.channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + attempt_count = outcome.attempt_count, + status = %outcome.status, + next_attempt_at = ?outcome.next_attempt_at, + %error, + "global worker receipt dispatcher failed to deliver terminal receipt" + ); + } + Err(update_error) => { + 
tracing::warn!( + agent_id = %agent_id, + channel_id = %receipt.channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + %update_error, + "failed to record global worker receipt delivery failure" + ); + } + } + } + } + } + + tokio::time::sleep(std::time::Duration::from_secs( + WORKER_RECEIPT_GLOBAL_DISPATCH_INTERVAL_SECS, + )) + .await; + } + }) +} + /// Stop if running, don't error if not. fn cmd_stop_if_running() { let paths = spacebot::daemon::DaemonPaths::from_default(); @@ -856,7 +1013,8 @@ async fn run( }; // Create outbound response channel - let (response_tx, mut response_rx) = mpsc::channel::(32); + let (response_tx, mut response_rx) = + mpsc::channel::(32); // Subscribe to the agent's event bus let event_rx = agent.deps.event_tx.subscribe(); @@ -930,12 +1088,35 @@ async fn run( let latest_message = Arc::new(tokio::sync::RwLock::new(message.clone())); let outbound_message = latest_message.clone(); let outbound_conversation_id = conversation_id.clone(); + let outbound_channel_id: spacebot::ChannelId = + Arc::from(outbound_conversation_id.clone()); + let outbound_process_logger = + spacebot::conversation::history::ProcessRunLogger::new( + agent.db.sqlite.clone(), + ); + let outbound_conversation_logger = + spacebot::conversation::history::ConversationLogger::new( + agent.db.sqlite.clone(), + ); let api_event_tx = api_state.event_tx.clone(); let sse_agent_id = agent_id.to_string(); let sse_channel_id = conversation_id.clone(); let outbound_agent_names = agent.deps.agent_names.clone(); let outbound_handle = tokio::spawn(async move { - while let Some(response) = response_rx.recv().await { + while let Some(envelope) = response_rx.recv().await { + let receipt_id = envelope.receipt_id.clone(); + let response = envelope.response; + let receipt_log_text = match &response { + spacebot::OutboundResponse::Text(text) => Some(text.clone()), + spacebot::OutboundResponse::RichMessage { text, .. 
} => { + Some(text.clone()) + } + spacebot::OutboundResponse::ThreadReply { text, .. } => { + Some(text.clone()) + } + _ => None, + }; + // Forward relevant events to SSE clients match &response { spacebot::OutboundResponse::Text(text) => { @@ -977,121 +1158,87 @@ async fn run( } let current_message = outbound_message.read().await.clone(); + let is_status_update = + matches!(response, spacebot::OutboundResponse::Status(_)); + let delivery_result = match response { + spacebot::OutboundResponse::Status(status) => { + messaging_for_outbound.send_status(¤t_message, status).await + } + response => { + tracing::info!( + conversation_id = %outbound_conversation_id, + "routing outbound response to messaging adapter" + ); + messaging_for_outbound.respond(¤t_message, response).await + } + }; - // Internal link channels: route replies back to the sender's link channel - if current_message.source == "internal" { - let reply_text = match &response { - spacebot::OutboundResponse::Text(t) => Some(t.clone()), - spacebot::OutboundResponse::RichMessage { text, .. } => Some(text.clone()), - spacebot::OutboundResponse::ThreadReply { text, .. 
} => Some(text.clone()), - spacebot::OutboundResponse::Status(_) => None, - _ => None, - }; - - if let Some(text) = reply_text { - let reply_to_agent = current_message.metadata - .get("reply_to_agent") - .and_then(|v| v.as_str()) - .map(String::from); - let reply_to_channel = current_message.metadata - .get("reply_to_channel") - .and_then(|v| v.as_str()) - .map(String::from); - - if let (Some(target_agent), Some(target_channel)) = (reply_to_agent, reply_to_channel) { - let agent_display = outbound_agent_names - .get(&sse_agent_id) - .cloned() - .unwrap_or_else(|| sse_agent_id.clone()); - - // Include the original sent message so the receiving - // agent's link channel can seed its history with context - let original_text = match ¤t_message.content { - spacebot::MessageContent::Text(t) => Some(t.clone()), - spacebot::MessageContent::Media { text, .. } => text.clone(), - _ => None, - }; - - let mut metadata = std::collections::HashMap::from([ - ("from_agent_id".into(), serde_json::json!(&sse_agent_id)), - ("reply_to_agent".into(), serde_json::json!(&sse_agent_id)), - ("reply_to_channel".into(), serde_json::json!(&outbound_conversation_id)), - ]); - if let Some(original) = original_text { - metadata.insert("original_sent_message".into(), serde_json::json!(original)); - } - // Propagate originating_channel and originating_source so both sides - // know where to route conclusions and which adapter to use. 
- if let Some(originating) = current_message.metadata.get("originating_channel") { - metadata.insert("originating_channel".into(), originating.clone()); - } - if let Some(source) = current_message.metadata.get("originating_source") { - metadata.insert("originating_source".into(), source.clone()); + if let Some(receipt_id) = receipt_id { + match &delivery_result { + Ok(()) => { + match outbound_process_logger + .ack_worker_delivery_receipt(&receipt_id) + .await + { + Ok(acked_now) => { + if acked_now { + if let Some(text) = receipt_log_text.as_deref() { + outbound_conversation_logger + .log_bot_message(&outbound_channel_id, text); + } + tracing::info!( + channel_id = %outbound_conversation_id, + receipt_id = %receipt_id, + "worker terminal receipt delivered" + ); + } + } + Err(error) => { + tracing::warn!( + %error, + channel_id = %outbound_conversation_id, + receipt_id = %receipt_id, + "failed to ack worker terminal receipt" + ); + } } - - let reply_message = spacebot::InboundMessage { - id: uuid::Uuid::new_v4().to_string(), - source: "internal".into(), - conversation_id: target_channel.clone(), - sender_id: sse_agent_id.clone(), - agent_id: Some(Arc::from(target_agent.as_str())), - content: spacebot::MessageContent::Text(text), - timestamp: chrono::Utc::now(), - metadata, - formatted_author: Some(format!("[{agent_display}]")), - }; - - if let Err(error) = messaging_for_outbound - .inject_message(reply_message) + } + Err(error) => { + match outbound_process_logger + .fail_worker_delivery_receipt_attempt( + &receipt_id, + &error.to_string(), + ) .await { - tracing::error!( - %error, - from = %sse_agent_id, - to = %target_agent, - "failed to route link channel reply" - ); - } else { - // Emit SSE event so the dashboard animates the edge - api_event_tx.send(spacebot::api::ApiEvent::AgentMessageSent { - from_agent_id: sse_agent_id.clone(), - to_agent_id: target_agent.clone(), - link_id: target_channel.clone(), - channel_id: target_channel.clone(), - }).ok(); - - 
tracing::info!( - from = %sse_agent_id, - to = %target_agent, - channel = %target_channel, - "routed link channel reply" - ); + Ok(outcome) => { + tracing::warn!( + channel_id = %outbound_conversation_id, + receipt_id = %receipt_id, + attempt_count = outcome.attempt_count, + status = %outcome.status, + next_attempt_at = ?outcome.next_attempt_at, + "worker terminal receipt delivery failed" + ); + } + Err(update_error) => { + tracing::warn!( + %update_error, + channel_id = %outbound_conversation_id, + receipt_id = %receipt_id, + "failed to record worker terminal receipt delivery failure" + ); + } } } } - continue; } - match response { - spacebot::OutboundResponse::Status(status) => { - if let Err(error) = messaging_for_outbound - .send_status(¤t_message, status) - .await - { - tracing::warn!(%error, "failed to send status update"); - } - } - response => { - tracing::info!( - conversation_id = %outbound_conversation_id, - "routing outbound response to messaging adapter" - ); - if let Err(error) = messaging_for_outbound - .respond(¤t_message, response) - .await - { - tracing::error!(%error, "failed to send outbound response"); - } + if let Err(error) = delivery_result { + if is_status_update { + tracing::warn!(%error, "failed to send status update"); + } else { + tracing::error!(%error, "failed to send outbound response"); } } } @@ -1372,6 +1519,27 @@ async fn initialize_agents( ) })?; + let process_run_logger = + spacebot::conversation::history::ProcessRunLogger::new(db.sqlite.clone()); + let (recovered_workers, recovered_branches, recovered_receipts) = process_run_logger + .close_orphaned_runs() + .await + .with_context(|| { + format!( + "failed to recover orphaned runs for agent '{}'", + agent_config.id + ) + })?; + if recovered_workers > 0 || recovered_branches > 0 || recovered_receipts > 0 { + tracing::warn!( + agent_id = %agent_config.id, + recovered_workers, + recovered_branches, + recovered_receipts, + "recovered orphaned process runs from previous startup" + 
); + } + // Per-agent settings store (redb-backed) let settings_path = agent_config.data_dir.join("settings.redb"); let settings_store = Arc::new( @@ -1657,6 +1825,20 @@ async fn initialize_agents( tracing::info!("messaging adapters started"); + // Start a global worker terminal receipt dispatcher for each agent so + // pending receipts are delivered even when no channel loop is active. + for (agent_id, agent) in agents.iter() { + let handle = spawn_worker_receipt_dispatch_loop( + agent_id.to_string(), + spacebot::conversation::history::ProcessRunLogger::new(agent.db.sqlite.clone()), + spacebot::conversation::ChannelStore::new(agent.db.sqlite.clone()), + spacebot::conversation::history::ConversationLogger::new(agent.db.sqlite.clone()), + messaging_manager.clone(), + ); + cortex_handles.push(handle); + tracing::info!(agent_id = %agent_id, "worker receipt dispatcher loop started"); + } + // Initialize cron schedulers for each agent let mut cron_stores_map = std::collections::HashMap::new(); let mut cron_schedulers_map = std::collections::HashMap::new(); diff --git a/src/messaging/discord.rs b/src/messaging/discord.rs index ab433dd3a..0c17c86e7 100644 --- a/src/messaging/discord.rs +++ b/src/messaging/discord.rs @@ -26,6 +26,8 @@ pub struct DiscordAdapter { bot_user_id: Arc>>, /// Maps InboundMessage.id to the Discord MessageId being edited during streaming. active_messages: Arc>>, + /// Per-channel progress message used for worker checkpoint edits. + progress_messages: Arc>>, /// Typing handles per message. Typing stops when the handle is dropped. 
typing_tasks: Arc>>, shard_manager: Arc>>>, @@ -39,6 +41,7 @@ impl DiscordAdapter { http: Arc::new(RwLock::new(None)), bot_user_id: Arc::new(RwLock::new(None)), active_messages: Arc::new(RwLock::new(HashMap::new())), + progress_messages: Arc::new(RwLock::new(HashMap::new())), typing_tasks: Arc::new(RwLock::new(HashMap::new())), shard_manager: Arc::new(RwLock::new(None)), } @@ -85,6 +88,57 @@ impl DiscordAdapter { .and_then(|value| value.as_u64()) .map(MessageId::new) } + + fn progress_message_key(message: &InboundMessage, worker_id: crate::WorkerId) -> String { + format!("{}:{worker_id}", Self::channel_key(message)) + } + + async fn upsert_progress_message( + &self, + message: &InboundMessage, + worker_id: crate::WorkerId, + content: &str, + ) -> anyhow::Result<()> { + let http = self.get_http().await?; + let channel_id = self.extract_channel_id(message)?; + let key = Self::progress_message_key(message, worker_id); + let display_text = if content.len() > 2000 { + let end = content.floor_char_boundary(1997); + format!("{}...", &content[..end]) + } else { + content.to_string() + }; + + let existing_id = self.progress_messages.read().await.get(&key).copied(); + if let Some(message_id) = existing_id { + let builder = EditMessage::new().content(display_text.clone()); + match channel_id.edit_message(&*http, message_id, builder).await { + Ok(_) => return Ok(()), + Err(error) => { + tracing::warn!(%error, "failed to edit progress message; creating a new one"); + } + } + } + + let reply_to = Self::extract_reply_message_id(message); + let mut builder = CreateMessage::new().content(display_text); + if let Some(reply_message_id) = reply_to { + builder = builder.reference_message((channel_id, reply_message_id)); + } + let sent = channel_id + .send_message(&*http, builder) + .await + .context("failed to send worker progress message")?; + self.progress_messages.write().await.insert(key, sent.id); + Ok(()) + } + + async fn clear_progress_message(&self, message: &InboundMessage, 
worker_id: crate::WorkerId) { + self.progress_messages + .write() + .await + .remove(&Self::progress_message_key(message, worker_id)); + } } impl Messaging for DiscordAdapter { @@ -371,7 +425,53 @@ impl Messaging for DiscordAdapter { .await .insert(Self::channel_key(message), typing); } - _ => { + StatusUpdate::WorkerStarted { worker_id, task } => { + self.stop_typing(message).await; + let text = format!( + "Background task `{}` started: {}", + short_worker_id(worker_id), + task + ); + if let Err(error) = self + .upsert_progress_message(message, worker_id, &text) + .await + { + tracing::debug!(%error, "failed to update discord progress message"); + } + } + StatusUpdate::WorkerCheckpoint { worker_id, status } => { + self.stop_typing(message).await; + let text = format!( + "Background task `{}`: {}", + short_worker_id(worker_id), + status + ); + if let Err(error) = self + .upsert_progress_message(message, worker_id, &text) + .await + { + tracing::debug!(%error, "failed to update discord progress message"); + } + } + StatusUpdate::WorkerCompleted { worker_id, result } => { + self.stop_typing(message).await; + let text = format!( + "Background task `{}` completed: {}", + short_worker_id(worker_id), + result + ); + if let Err(error) = self + .upsert_progress_message(message, worker_id, &text) + .await + { + tracing::debug!(%error, "failed to update discord progress message"); + } + self.clear_progress_message(message, worker_id).await; + } + StatusUpdate::StopTyping + | StatusUpdate::ToolStarted { .. } + | StatusUpdate::ToolCompleted { .. } + | StatusUpdate::BranchStarted { .. 
} => { self.stop_typing(message).await; } } @@ -542,6 +642,7 @@ impl Messaging for DiscordAdapter { async fn shutdown(&self) -> crate::Result<()> { self.typing_tasks.write().await.clear(); + self.progress_messages.write().await.clear(); if let Some(shard_manager) = self.shard_manager.read().await.as_ref() { shard_manager.shutdown_all().await; @@ -916,6 +1017,11 @@ async fn build_metadata( (metadata, formatted_author) } +fn short_worker_id(worker_id: crate::WorkerId) -> String { + let full = worker_id.to_string(); + full.chars().take(8).collect() +} + /// Split a message into chunks that fit within Discord's 2000 char limit. /// Tries to split at newlines, then spaces, then hard-cuts. fn split_message(text: &str, max_len: usize) -> Vec { @@ -1075,7 +1181,7 @@ fn build_poll( #[cfg(test)] mod tests { use super::*; - use crate::{Button, ButtonStyle, Card, CardField, InteractiveElements, Poll}; + use crate::{Button, ButtonStyle, Card, CardField, InteractiveElements, MessageContent, Poll}; #[test] fn test_build_embed_limits() { @@ -1133,4 +1239,34 @@ mod tests { let _ = build_poll(&poll); // Again, can't easily inspect CreatePoll fields, but we verify it runs. 
} + + #[test] + fn progress_message_key_is_scoped_per_worker() { + let worker_a = + uuid::Uuid::parse_str("11111111-1111-1111-1111-111111111111").expect("valid uuid"); + let worker_b = + uuid::Uuid::parse_str("22222222-2222-2222-2222-222222222222").expect("valid uuid"); + + let mut metadata = std::collections::HashMap::new(); + metadata.insert( + "discord_channel_id".to_string(), + serde_json::Value::from(42_u64), + ); + + let message = InboundMessage { + id: "msg-1".to_string(), + source: "discord".to_string(), + conversation_id: "discord:42".to_string(), + sender_id: "user-1".to_string(), + agent_id: None, + content: MessageContent::Text("hello".to_string()), + timestamp: chrono::Utc::now(), + metadata, + formatted_author: None, + }; + + let key_a = DiscordAdapter::progress_message_key(&message, worker_a); + let key_b = DiscordAdapter::progress_message_key(&message, worker_b); + assert_ne!(key_a, key_b, "workers in same channel need distinct keys"); + } } diff --git a/src/messaging/slack.rs b/src/messaging/slack.rs index 2aa1e3cbf..b16041935 100644 --- a/src/messaging/slack.rs +++ b/src/messaging/slack.rs @@ -809,7 +809,14 @@ impl Messaging for SlackAdapter { StatusUpdate::StopTyping => String::new(), // empty string clears the status StatusUpdate::ToolStarted { .. } => "Working…".to_string(), StatusUpdate::ToolCompleted { .. } => "Working…".to_string(), - _ => "Working…".to_string(), + StatusUpdate::WorkerStarted { task, .. } => { + format!("Starting: {}", truncate_status_text(task, 120)) + } + StatusUpdate::WorkerCheckpoint { status, .. } => truncate_status_text(status, 140), + StatusUpdate::WorkerCompleted { result, .. } => { + format!("Done: {}", truncate_status_text(result, 120)) + } + StatusUpdate::BranchStarted { .. 
} => "Branch started…".to_string(), }; let session = self.session(); @@ -1258,6 +1265,15 @@ fn markdown_content(text: impl Into) -> SlackMessageContent { } } +fn truncate_status_text(text: &str, max_chars: usize) -> String { + if text.len() <= max_chars { + return text.to_string(); + } + + let end = text.floor_char_boundary(max_chars.saturating_sub(3)); + format!("{}...", &text[..end]) +} + /// Extract `MessageContent` from an optional `SlackMessageContent`. fn extract_message_content(content: &Option) -> MessageContent { let Some(msg_content) = content else { diff --git a/src/messaging/webchat.rs b/src/messaging/webchat.rs index a016df1f4..6f18c1a7f 100644 --- a/src/messaging/webchat.rs +++ b/src/messaging/webchat.rs @@ -25,6 +25,9 @@ pub enum WebChatEvent { StreamEnd, ToolStarted { tool_name: String }, ToolCompleted { tool_name: String }, + WorkerStarted { worker_id: String, task: String }, + WorkerCheckpoint { worker_id: String, status: String }, + WorkerCompleted { worker_id: String, result: String }, StopTyping, Done, } @@ -117,7 +120,21 @@ impl Messaging for WebChatAdapter { StatusUpdate::StopTyping => WebChatEvent::StopTyping, StatusUpdate::ToolStarted { tool_name } => WebChatEvent::ToolStarted { tool_name }, StatusUpdate::ToolCompleted { tool_name } => WebChatEvent::ToolCompleted { tool_name }, - _ => return Ok(()), + StatusUpdate::WorkerStarted { worker_id, task } => WebChatEvent::WorkerStarted { + worker_id: worker_id.to_string(), + task, + }, + StatusUpdate::WorkerCheckpoint { worker_id, status } => { + WebChatEvent::WorkerCheckpoint { + worker_id: worker_id.to_string(), + status, + } + } + StatusUpdate::WorkerCompleted { worker_id, result } => WebChatEvent::WorkerCompleted { + worker_id: worker_id.to_string(), + result, + }, + StatusUpdate::BranchStarted { .. 
} => return Ok(()), }; let _ = tx.send(event).await; diff --git a/src/tools.rs b/src/tools.rs index 7d8d58690..014d25756 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -96,8 +96,7 @@ pub use worker_inspect::{ use crate::agent::channel::ChannelState; use crate::config::{BrowserConfig, RuntimeConfig}; use crate::memory::MemorySearch; -use crate::sandbox::Sandbox; -use crate::{AgentId, ChannelId, OutboundResponse, ProcessEvent, WorkerId}; +use crate::{AgentId, ChannelId, OutboundEnvelope, ProcessEvent, WorkerId}; use rig::tool::Tool as _; use rig::tool::server::{ToolServer, ToolServerHandle}; use std::path::PathBuf; @@ -190,7 +189,7 @@ pub fn truncate_output(value: &str, max_bytes: usize) -> String { pub async fn add_channel_tools( handle: &ToolServerHandle, state: ChannelState, - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, conversation_id: impl Into, skip_flag: SkipFlag, replied_flag: RepliedFlag, diff --git a/src/tools/browser.rs b/src/tools/browser.rs index f6e692f9b..0c008a692 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -24,9 +24,13 @@ use std::collections::HashMap; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; use std::path::PathBuf; use std::sync::Arc; +use std::time::Duration; use tokio::sync::Mutex; use tokio::task::JoinHandle; +/// Per-browser-action timeout to avoid hanging worker runs on Chrome/CDP stalls. +const BROWSER_ACTION_TIMEOUT_SECS: u64 = 45; + /// Validate that a URL is safe for the browser to navigate to. /// Blocks private/loopback IPs, link-local addresses, and cloud metadata endpoints /// to prevent server-side request forgery. 
@@ -465,6 +469,28 @@ impl Tool for BrowserTool { } impl BrowserTool { + async fn with_action_timeout( + action_name: &str, + action_future: F, + ) -> Result + where + F: std::future::Future>, + E: std::fmt::Display, + { + match tokio::time::timeout( + Duration::from_secs(BROWSER_ACTION_TIMEOUT_SECS), + action_future, + ) + .await + { + Ok(Ok(value)) => Ok(value), + Ok(Err(error)) => Err(BrowserError::new(format!("{action_name} failed: {error}"))), + Err(_) => Err(BrowserError::new(format!( + "{action_name} timed out after {BROWSER_ACTION_TIMEOUT_SECS}s" + ))), + } + } + async fn handle_launch(&self) -> Result { let mut state = self.state.lock().await; @@ -492,9 +518,8 @@ impl BrowserTool { "launching chrome" ); - let (browser, mut handler) = Browser::launch(chrome_config) - .await - .map_err(|error| BrowserError::new(format!("failed to launch browser: {error}")))?; + let (browser, mut handler) = + Self::with_action_timeout("browser launch", Browser::launch(chrome_config)).await?; let handler_task = tokio::spawn(async move { while handler.next().await.is_some() {} }); @@ -515,9 +540,7 @@ impl BrowserTool { let mut state = self.state.lock().await; let page = self.get_or_create_page(&mut state, Some(&url)).await?; - page.goto(&url) - .await - .map_err(|error| BrowserError::new(format!("navigation failed: {error}")))?; + Self::with_action_timeout("navigation", page.goto(&url)).await?; let title = page.get_title().await.ok().flatten(); let current_url = page.url().await.ok().flatten(); @@ -545,10 +568,7 @@ impl BrowserTool { validate_url(target_url)?; } - let page = browser - .new_page(target_url) - .await - .map_err(|error| BrowserError::new(format!("failed to open tab: {error}")))?; + let page = Self::with_action_timeout("open tab", browser.new_page(target_url)).await?; let target_id = page_target_id(&page); let title = page.get_title().await.ok().flatten(); @@ -989,10 +1009,7 @@ impl BrowserTool { .ok_or_else(|| BrowserError::new("browser not launched — call launch 
first"))?; let target_url = url.unwrap_or("about:blank"); - let page = browser - .new_page(target_url) - .await - .map_err(|error| BrowserError::new(format!("failed to create page: {error}")))?; + let page = Self::with_action_timeout("create page", browser.new_page(target_url)).await?; let target_id = page_target_id(&page); state.pages.insert(target_id.clone(), page); diff --git a/src/tools/cancel.rs b/src/tools/cancel.rs index ea1fd950c..e827834c0 100644 --- a/src/tools/cancel.rs +++ b/src/tools/cancel.rs @@ -100,7 +100,7 @@ impl Tool for CancelTool { .parse::() .map_err(|e| CancelError(format!("Invalid worker ID: {e}")))?; self.state - .cancel_worker(worker_id) + .cancel_worker(worker_id, args.reason.as_deref()) .await .map_err(CancelError)?; } diff --git a/src/tools/react.rs b/src/tools/react.rs index 6679d5edf..ce3aa4ef2 100644 --- a/src/tools/react.rs +++ b/src/tools/react.rs @@ -1,6 +1,6 @@ //! React tool for adding emoji reactions to messages (channel only). -use crate::OutboundResponse; +use crate::{OutboundEnvelope, OutboundResponse}; use rig::completion::ToolDefinition; use rig::tool::Tool; use schemars::JsonSchema; @@ -10,11 +10,11 @@ use tokio::sync::mpsc; /// Tool for reacting to messages with emoji. 
#[derive(Debug, Clone)] pub struct ReactTool { - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, } impl ReactTool { - pub fn new(response_tx: mpsc::Sender) -> Self { + pub fn new(response_tx: mpsc::Sender) -> Self { Self { response_tx } } } @@ -66,7 +66,7 @@ impl Tool for ReactTool { tracing::info!(emoji = %args.emoji, "react tool called"); self.response_tx - .send(OutboundResponse::Reaction(args.emoji.clone())) + .send(OutboundResponse::Reaction(args.emoji.clone()).into()) .await .map_err(|error| ReactError(format!("failed to send reaction: {error}")))?; diff --git a/src/tools/reply.rs b/src/tools/reply.rs index 3bec9e1c9..80e320119 100644 --- a/src/tools/reply.rs +++ b/src/tools/reply.rs @@ -2,7 +2,7 @@ use crate::conversation::ConversationLogger; -use crate::{ChannelId, OutboundResponse}; +use crate::{ChannelId, OutboundEnvelope, OutboundResponse}; use regex::Regex; use rig::completion::ToolDefinition; use rig::tool::Tool; @@ -40,7 +40,7 @@ pub fn new_replied_flag() -> RepliedFlag { /// tools once and shares them across calls. #[derive(Debug, Clone)] pub struct ReplyTool { - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, conversation_id: String, conversation_logger: ConversationLogger, channel_id: ChannelId, @@ -51,7 +51,7 @@ pub struct ReplyTool { impl ReplyTool { /// Create a new reply tool bound to a conversation's response channel. 
pub fn new( - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, conversation_id: impl Into, conversation_logger: ConversationLogger, channel_id: ChannelId, @@ -220,6 +220,31 @@ pub(crate) fn normalize_discord_mention_tokens(content: &str, source: &str) -> S normalized } +pub(crate) fn is_low_value_waiting_update(content: &str) -> bool { + let lowered = content.to_ascii_lowercase(); + + let spawned = lowered.contains("worker was spawned") + || lowered.contains("spawned a worker") + || lowered.contains("worker was started"); + let no_report = lowered.contains("hasn't reported back") + || lowered.contains("has not reported back") + || lowered.contains("hasn't come back") + || lowered.contains("has not come back"); + if spawned && no_report { + return true; + } + + let known_template = lowered.contains("still waiting on the research results") + || lowered.contains("still waiting on the worker results") + || lowered.contains("still waiting on the results") + || lowered.contains("still waiting for the worker results"); + if known_template { + return true; + } + + false +} + impl Tool for ReplyTool { const NAME: &'static str = "reply"; @@ -385,7 +410,7 @@ impl Tool for ReplyTool { }; self.response_tx - .send(response) + .send(response.into()) .await .map_err(|e| ReplyError(format!("failed to send reply: {e}")))?; @@ -404,7 +429,9 @@ impl Tool for ReplyTool { #[cfg(test)] mod tests { - use super::{normalize_discord_mention_tokens, sanitize_discord_user_id}; + use super::{ + is_low_value_waiting_update, normalize_discord_mention_tokens, sanitize_discord_user_id, + }; #[test] fn normalizes_broken_discord_mentions() { @@ -435,4 +462,16 @@ mod tests { let parsed = sanitize_discord_user_id(">234152400653385729").expect("should parse id"); assert_eq!(parsed, "234152400653385729"); } + + #[test] + fn suppresses_low_value_waiting_updates() { + let content = "Still waiting on the research results — the worker was spawned and hasn't reported back yet."; + 
assert!(is_low_value_waiting_update(content)); + assert!(!is_low_value_waiting_update( + "Still waiting on your approval before I run anything." + )); + assert!(!is_low_value_waiting_update( + "I found 3 key findings and linked the sources below." + )); + } } diff --git a/src/tools/send_file.rs b/src/tools/send_file.rs index 02437e829..a5f4e00a5 100644 --- a/src/tools/send_file.rs +++ b/src/tools/send_file.rs @@ -1,6 +1,6 @@ //! Send file tool for delivering file attachments to users (channel only). -use crate::OutboundResponse; +use crate::{OutboundEnvelope, OutboundResponse}; use rig::completion::ToolDefinition; use rig::tool::Tool; use schemars::JsonSchema; @@ -16,38 +16,12 @@ use tokio::sync::mpsc; /// File access is restricted to the agent's workspace boundary. #[derive(Debug, Clone)] pub struct SendFileTool { - response_tx: mpsc::Sender, - workspace: PathBuf, + response_tx: mpsc::Sender, } impl SendFileTool { - pub fn new(response_tx: mpsc::Sender, workspace: PathBuf) -> Self { - Self { - response_tx, - workspace, - } - } - - /// Validate that a path falls within the workspace boundary. - fn validate_workspace_path(&self, path: &std::path::Path) -> Result { - let workspace = &self.workspace; - - let canonical = path.canonicalize().map_err(|error| { - SendFileError(format!("can't resolve path '{}': {error}", path.display())) - })?; - let workspace_canonical = workspace - .canonicalize() - .unwrap_or_else(|_| workspace.clone()); - - if !canonical.starts_with(&workspace_canonical) { - return Err(SendFileError(format!( - "ACCESS DENIED: Path is outside the workspace boundary. 
\ - File operations are restricted to {}.", - workspace.display() - ))); - } - - Ok(canonical) + pub fn new(response_tx: mpsc::Sender) -> Self { + Self { response_tx } } } @@ -161,7 +135,7 @@ impl Tool for SendFileTool { }; self.response_tx - .send(response) + .send(response.into()) .await .map_err(|error| SendFileError(format!("failed to send file: {error}")))?; diff --git a/src/tools/skip.rs b/src/tools/skip.rs index 481b5fb8b..065f1ba49 100644 --- a/src/tools/skip.rs +++ b/src/tools/skip.rs @@ -5,7 +5,7 @@ //! instead of `reply`. The channel checks the skip flag after the LLM turn and //! suppresses any fallback text output. -use crate::OutboundResponse; +use crate::{OutboundEnvelope, OutboundResponse}; use rig::completion::ToolDefinition; use rig::tool::Tool; use schemars::JsonSchema; @@ -29,11 +29,11 @@ pub fn new_skip_flag() -> SkipFlag { #[derive(Debug, Clone)] pub struct SkipTool { flag: SkipFlag, - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, } impl SkipTool { - pub fn new(flag: SkipFlag, response_tx: mpsc::Sender) -> Self { + pub fn new(flag: SkipFlag, response_tx: mpsc::Sender) -> Self { Self { flag, response_tx } } } @@ -86,7 +86,7 @@ impl Tool for SkipTool { // Cancel the typing indicator so it doesn't linger let _ = self .response_tx - .send(OutboundResponse::Status(crate::StatusUpdate::StopTyping)) + .send(OutboundResponse::Status(crate::StatusUpdate::StopTyping).into()) .await; let reason = args.reason.as_deref().unwrap_or("no reason given"); From 072212b46df2f763afa3cf44f1e816fac795126f Mon Sep 17 00:00:00 2001 From: Victor Sumner Date: Tue, 24 Feb 2026 17:33:24 -0500 Subject: [PATCH 2/8] chore(api): trim worker progress interface surface --- docs/content/docs/(features)/workers.mdx | 1 - src/api/channels.rs | 46 ++---------------------- src/api/webchat.rs | 3 -- src/messaging/webchat.rs | 19 +--------- 4 files changed, 3 insertions(+), 66 deletions(-) diff --git a/docs/content/docs/(features)/workers.mdx 
b/docs/content/docs/(features)/workers.mdx index 1e1275f30..aa8f1b9e1 100644 --- a/docs/content/docs/(features)/workers.mdx +++ b/docs/content/docs/(features)/workers.mdx @@ -135,7 +135,6 @@ Terminal worker notices (`done`, `failed`, `timed_out`, `cancelled`) are queued - Successful delivery marks the receipt as acknowledged. - On process restart, in-flight (`sending`) receipts are re-queued so completion notices are not silently dropped. - Old terminal receipts (`acked`, `failed`) are pruned periodically to keep storage bounded. -- `/api/channels/status` includes `worker_delivery_receipts` counts (`pending`, `failed`) per channel for observability. ## Model Routing diff --git a/src/api/channels.rs b/src/api/channels.rs index 73c8285ac..4cc6bd87b 100644 --- a/src/api/channels.rs +++ b/src/api/channels.rs @@ -147,29 +147,7 @@ pub(super) async fn channel_status( let mut result = HashMap::new(); for (channel_id, status_block) in status_snapshot { let block = status_block.read().await; - if let Ok(mut value) = serde_json::to_value(&*block) { - if let Some(channel_state) = state_snapshot.get(&channel_id) { - match channel_state - .process_run_logger - .load_worker_delivery_receipt_stats(&channel_id) - .await - { - Ok(stats) => { - if let Some(object) = value.as_object_mut() { - if let Ok(stats_value) = serde_json::to_value(stats) { - object.insert("worker_delivery_receipts".to_string(), stats_value); - } - } - } - Err(error) => { - tracing::warn!( - %error, - channel_id = %channel_id, - "failed to load worker delivery receipt stats" - ); - } - } - } + if let Ok(value) = serde_json::to_value(&*block) { result.insert(channel_id, value); } } @@ -182,27 +160,7 @@ pub(super) async fn channel_status( } let block = channel_state.status_block.read().await; - if let Ok(mut value) = serde_json::to_value(&*block) { - match channel_state - .process_run_logger - .load_worker_delivery_receipt_stats(channel_id) - .await - { - Ok(stats) => { - if let Some(object) = 
value.as_object_mut() { - if let Ok(stats_value) = serde_json::to_value(stats) { - object.insert("worker_delivery_receipts".to_string(), stats_value); - } - } - } - Err(error) => { - tracing::warn!( - %error, - channel_id = %channel_id, - "failed to load worker delivery receipt stats" - ); - } - } + if let Ok(value) = serde_json::to_value(&*block) { result.insert(channel_id.clone(), value); } } diff --git a/src/api/webchat.rs b/src/api/webchat.rs index ffcb3b503..352d6d487 100644 --- a/src/api/webchat.rs +++ b/src/api/webchat.rs @@ -86,9 +86,6 @@ pub(super) async fn webchat_send( WebChatEvent::StreamEnd => "stream_end", WebChatEvent::ToolStarted { .. } => "tool_started", WebChatEvent::ToolCompleted { .. } => "tool_completed", - WebChatEvent::WorkerStarted { .. } => "worker_started", - WebChatEvent::WorkerCheckpoint { .. } => "worker_checkpoint", - WebChatEvent::WorkerCompleted { .. } => "worker_completed", WebChatEvent::StopTyping => "stop_typing", WebChatEvent::Done => "done", }; diff --git a/src/messaging/webchat.rs b/src/messaging/webchat.rs index 6f18c1a7f..a016df1f4 100644 --- a/src/messaging/webchat.rs +++ b/src/messaging/webchat.rs @@ -25,9 +25,6 @@ pub enum WebChatEvent { StreamEnd, ToolStarted { tool_name: String }, ToolCompleted { tool_name: String }, - WorkerStarted { worker_id: String, task: String }, - WorkerCheckpoint { worker_id: String, status: String }, - WorkerCompleted { worker_id: String, result: String }, StopTyping, Done, } @@ -120,21 +117,7 @@ impl Messaging for WebChatAdapter { StatusUpdate::StopTyping => WebChatEvent::StopTyping, StatusUpdate::ToolStarted { tool_name } => WebChatEvent::ToolStarted { tool_name }, StatusUpdate::ToolCompleted { tool_name } => WebChatEvent::ToolCompleted { tool_name }, - StatusUpdate::WorkerStarted { worker_id, task } => WebChatEvent::WorkerStarted { - worker_id: worker_id.to_string(), - task, - }, - StatusUpdate::WorkerCheckpoint { worker_id, status } => { - WebChatEvent::WorkerCheckpoint { - worker_id: 
worker_id.to_string(), - status, - } - } - StatusUpdate::WorkerCompleted { worker_id, result } => WebChatEvent::WorkerCompleted { - worker_id: worker_id.to_string(), - result, - }, - StatusUpdate::BranchStarted { .. } => return Ok(()), + _ => return Ok(()), }; let _ = tx.send(event).await; From d2b612b91b7cbf7f745b87fe538caf373fc40915 Mon Sep 17 00:00:00 2001 From: Victor Sumner Date: Tue, 24 Feb 2026 19:50:02 -0500 Subject: [PATCH 3/8] Harden worker delivery contracts and persist worker events --- docs/content/docs/(configuration)/config.mdx | 14 + docs/content/docs/(deployment)/roadmap.mdx | 2 + docs/content/docs/(features)/workers.mdx | 22 + .../20260224000002_worker_task_contracts.sql | 35 + ...l => 20260224000003_worker_tool_calls.sql} | 0 migrations/20260225000001_worker_events.sql | 24 + src/agent/channel.rs | 387 ++++++- src/api/agents.rs | 1 + src/api/workers.rs | 25 + src/config.rs | 68 ++ src/conversation.rs | 3 +- src/conversation/history.rs | 965 +++++++++++++++++- src/db.rs | 36 + src/main.rs | 118 ++- src/messaging/discord.rs | 30 +- src/messaging/manager.rs | 6 +- src/messaging/slack.rs | 11 +- src/messaging/telegram.rs | 21 +- src/messaging/traits.rs | 23 +- src/messaging/webchat.rs | 15 +- src/tools.rs | 1 + src/tools/conclude_link.rs | 8 +- src/tools/send_file.rs | 30 +- src/tools/worker_inspect.rs | 25 + 24 files changed, 1795 insertions(+), 75 deletions(-) create mode 100644 migrations/20260224000002_worker_task_contracts.sql rename migrations/{20260224000001_worker_tool_calls.sql => 20260224000003_worker_tool_calls.sql} (100%) create mode 100644 migrations/20260225000001_worker_events.sql diff --git a/docs/content/docs/(configuration)/config.mdx b/docs/content/docs/(configuration)/config.mdx index 21a78bfb9..745362618 100644 --- a/docs/content/docs/(configuration)/config.mdx +++ b/docs/content/docs/(configuration)/config.mdx @@ -84,6 +84,12 @@ background_threshold = 0.80 # background summarization aggressive_threshold = 0.85 # aggressive 
summarization emergency_threshold = 0.95 # drop oldest 50%, no LLM +# Deterministic worker task contract timing. +[defaults.worker_contract] +ack_secs = 5 +progress_secs = 45 +tick_secs = 2 + # Cortex (system observer) settings. [defaults.cortex] tick_interval_secs = 30 @@ -471,6 +477,14 @@ Map of model names to ordered fallback chains. Used when the primary model retur Thresholds are fractions of `context_window`. +### `[defaults.worker_contract]` + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| `ack_secs` | integer | 5 | Deadline to confirm a worker start was surfaced | +| `progress_secs` | integer | 45 | Deadline between meaningful worker progress updates | +| `tick_secs` | integer | 2 | Poll interval for worker contract deadline checks | + ### `[defaults.cortex]` | Key | Type | Default | Description | diff --git a/docs/content/docs/(deployment)/roadmap.mdx b/docs/content/docs/(deployment)/roadmap.mdx index 6711bdd2d..73ae3f63e 100644 --- a/docs/content/docs/(deployment)/roadmap.mdx +++ b/docs/content/docs/(deployment)/roadmap.mdx @@ -39,6 +39,8 @@ The full message-in → LLM → response-out pipeline is wired end-to-end across - **Tools** — 16 tools implement Rig's `Tool` trait with real logic (reply, branch, spawn_worker, route, cancel, skip, react, memory_save, memory_recall, set_status, shell, file, exec, browser, cron, web_search) - **Workspace containment** — file tool validates paths stay within workspace boundary, shell/exec tools block instance directory traversal, sensitive file access, and secret env var leakage - **Conversation persistence** — `ConversationLogger` with fire-and-forget SQLite writes, compaction archiving +- **Worker task contracts** — deterministic worker ack/progress/terminal deadlines with one-time SLA nudge and durable terminal convergence (`terminal_acked` / `terminal_failed`) +- **Worker event journal** — append-only `worker_events` persistence for started/status/tool/permission/question/completed 
lifecycle debugging - **Cron** — scheduler with timers, active hours, circuit breaker (3 failures → disable), creates real channels. CronTool wired into channel tool factory. - **Message routing** — full event loop with binding resolution, channel lifecycle, outbound routing - **Settings store** — redb key-value with WorkerLogMode diff --git a/docs/content/docs/(features)/workers.mdx b/docs/content/docs/(features)/workers.mdx index aa8f1b9e1..b8c97b62d 100644 --- a/docs/content/docs/(features)/workers.mdx +++ b/docs/content/docs/(features)/workers.mdx @@ -127,6 +127,16 @@ Worker runs are bounded by `worker_timeout_secs` (default `300`) as an inactivit If no progress arrives within the timeout window, Spacebot marks the worker as `timed_out`, records a terminal result, and removes it from active worker state so the channel can continue delegating work. +## Deterministic Task Contracts + +Each worker run now gets an internal task contract with three deadlines: + +- **Acknowledge deadline** — confirms the worker start was surfaced to the user-facing adapter. +- **Progress deadline** — expects a meaningful heartbeat before the deadline. +- **Terminal deadline** — tracks terminal delivery lifecycle until receipt ack/failure. + +If the acknowledge deadline is missed, Spacebot emits a synthesized "running" checkpoint. If the progress deadline is missed, it emits one synthesized "still working" nudge (one-time, no spam loop). Terminal receipt ack/failure then closes the contract as `terminal_acked` or `terminal_failed`. + ## Terminal Delivery Reliability Terminal worker notices (`done`, `failed`, `timed_out`, `cancelled`) are queued as durable delivery receipts before they are sent to the messaging adapter. @@ -136,6 +146,18 @@ Terminal worker notices (`done`, `failed`, `timed_out`, `cancelled`) are queued - On process restart, in-flight (`sending`) receipts are re-queued so completion notices are not silently dropped. 
- Old terminal receipts (`acked`, `failed`) are pruned periodically to keep storage bounded. +## Worker Event Journal + +Worker lifecycle updates are also written to an append-only `worker_events` table: + +- `started` with task + worker type +- `status` checkpoints +- `tool_started` / `tool_completed` +- `permission` / `question` +- `completed` with terminal summary + +This gives us durable debugging context even after in-memory status blocks are gone. The workers API and `worker_inspect` surface this timeline so long-running task behavior can be audited post-run. + ## Model Routing Workers default to `anthropic/claude-haiku-4.5-20250514`. Task-type overrides apply — for example, a `coding` task type routes to `anthropic/claude-sonnet-4-20250514`. Fallback chains are supported. All hot-reloadable. diff --git a/migrations/20260224000002_worker_task_contracts.sql b/migrations/20260224000002_worker_task_contracts.sql new file mode 100644 index 000000000..f21ccb1cb --- /dev/null +++ b/migrations/20260224000002_worker_task_contracts.sql @@ -0,0 +1,35 @@ +-- Deterministic worker task contracts. +-- +-- Tracks acknowledgement/progress/terminal guarantees for worker executions so +-- long-running tasks always provide bounded feedback and reach terminal states. 
+ +CREATE TABLE IF NOT EXISTS worker_task_contracts ( + id TEXT PRIMARY KEY, + agent_id TEXT NOT NULL, + channel_id TEXT NOT NULL, + worker_id TEXT NOT NULL UNIQUE, + task_summary TEXT NOT NULL, + state TEXT NOT NULL DEFAULT 'created', + ack_deadline_at TIMESTAMP NOT NULL, + progress_deadline_at TIMESTAMP NOT NULL, + terminal_deadline_at TIMESTAMP NOT NULL, + last_progress_at TIMESTAMP, + last_status_hash TEXT, + attempt_count INTEGER NOT NULL DEFAULT 0, + sla_nudge_sent INTEGER NOT NULL DEFAULT 0, + terminal_state TEXT, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE INDEX idx_worker_task_contracts_channel_state + ON worker_task_contracts(channel_id, state); + +CREATE INDEX idx_worker_task_contracts_ack_due + ON worker_task_contracts(state, ack_deadline_at); + +CREATE INDEX idx_worker_task_contracts_progress_due + ON worker_task_contracts(state, progress_deadline_at); + +CREATE INDEX idx_worker_task_contracts_terminal_due + ON worker_task_contracts(state, terminal_deadline_at); diff --git a/migrations/20260224000001_worker_tool_calls.sql b/migrations/20260224000003_worker_tool_calls.sql similarity index 100% rename from migrations/20260224000001_worker_tool_calls.sql rename to migrations/20260224000003_worker_tool_calls.sql diff --git a/migrations/20260225000001_worker_events.sql b/migrations/20260225000001_worker_events.sql new file mode 100644 index 000000000..f902f9c08 --- /dev/null +++ b/migrations/20260225000001_worker_events.sql @@ -0,0 +1,24 @@ +-- Durable worker event journal for debugging and UX timeline recovery. +-- +-- Captures lifecycle checkpoints (started/progress/tool activity/completed) as +-- append-only records tied to worker_runs. 
+ +CREATE TABLE IF NOT EXISTS worker_events ( + id TEXT PRIMARY KEY, + worker_id TEXT NOT NULL, + channel_id TEXT, + agent_id TEXT, + event_type TEXT NOT NULL, + payload_json TEXT, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (worker_id) REFERENCES worker_runs(id) ON DELETE CASCADE +); + +CREATE INDEX idx_worker_events_worker + ON worker_events(worker_id, created_at); + +CREATE INDEX idx_worker_events_channel + ON worker_events(channel_id, created_at); + +CREATE INDEX idx_worker_events_agent + ON worker_events(agent_id, created_at); diff --git a/src/agent/channel.rs b/src/agent/channel.rs index 43c55b217..c8aaf869e 100644 --- a/src/agent/channel.rs +++ b/src/agent/channel.rs @@ -41,6 +41,9 @@ const WORKER_CHECKPOINT_MIN_INTERVAL_SECS: u64 = 20; const WORKER_CHECKPOINT_MAX_CHARS: usize = 220; const WORKER_RECEIPT_DISPATCH_INTERVAL_SECS: u64 = 5; const WORKER_RECEIPT_DISPATCH_BATCH_SIZE: i64 = 8; +const WORKER_CONTRACT_ACK_BATCH_SIZE: i64 = 8; +const WORKER_CONTRACT_PROGRESS_BATCH_SIZE: i64 = 8; +const WORKER_CONTRACT_TERMINAL_BATCH_SIZE: i64 = 8; #[derive(Debug, Clone)] struct WorkerCheckpointState { @@ -112,6 +115,7 @@ impl ChannelState { channel_id: Some(self.channel_id.clone()), result: format!("Worker cancelled: {reason}."), notify: true, + success: false, }); Ok(()) } else if removed { @@ -135,6 +139,7 @@ impl ChannelState { channel_id: Some(self.channel_id.clone()), result: format!("Worker cancelled: {reason}."), notify: true, + success: false, }); Ok(()) } else { @@ -204,10 +209,22 @@ pub struct Channel { pending_retrigger_metadata: HashMap, /// Deadline for firing the pending retrigger (debounce timer). retrigger_deadline: Option, + /// Optional cross-agent messaging tool for linked agent conversations. + send_agent_message_tool: Option, + /// Number of turns processed in a link channel. + link_turn_count: u32, + /// Originating channel id propagated through link channels. 
+ originating_channel: Option, + /// Originating adapter source propagated through link channels. + originating_source: Option, + /// Set once a link conversation has been explicitly concluded. + link_concluded: bool, /// Per-worker checkpoint state used for status dedupe/throttling. worker_checkpoints: HashMap, /// Periodic deadline for checking due worker terminal delivery receipts. worker_receipt_dispatch_deadline: tokio::time::Instant, + /// Periodic deadline for deterministic worker task contract checks. + worker_contract_tick_deadline: tokio::time::Instant, } impl Channel { @@ -288,6 +305,9 @@ impl Channel { }; let self_tx = message_tx.clone(); + let worker_contract_tick_secs = (**deps.runtime_config.worker_contract.load()) + .tick_secs + .max(1); let channel = Self { id: id.clone(), title: None, @@ -311,9 +331,16 @@ impl Channel { pending_retrigger: false, pending_retrigger_metadata: HashMap::new(), retrigger_deadline: None, + send_agent_message_tool, + link_turn_count: 0, + originating_channel: None, + originating_source: None, + link_concluded: false, worker_checkpoints: HashMap::new(), worker_receipt_dispatch_deadline: tokio::time::Instant::now() + std::time::Duration::from_secs(WORKER_RECEIPT_DISPATCH_INTERVAL_SECS), + worker_contract_tick_deadline: tokio::time::Instant::now() + + std::time::Duration::from_secs(worker_contract_tick_secs), }; (channel, message_tx) @@ -338,6 +365,7 @@ impl Channel { self.coalesce_deadline, self.retrigger_deadline, Some(self.worker_receipt_dispatch_deadline), + Some(self.worker_contract_tick_deadline), ] .into_iter() .flatten() @@ -413,6 +441,15 @@ impl Channel { WORKER_RECEIPT_DISPATCH_INTERVAL_SECS, ); } + // Check worker task contract deadline + if self.worker_contract_tick_deadline <= now { + self.flush_due_worker_task_contract_deadlines().await; + let tick_secs = (**self.deps.runtime_config.worker_contract.load()) + .tick_secs + .max(1); + self.worker_contract_tick_deadline = tokio::time::Instant::now() + + 
std::time::Duration::from_secs(tick_secs); + } } else => break, } @@ -1715,8 +1752,38 @@ impl Channel { worker_type, .. } => { - run_logger.log_worker_started(channel_id.as_ref(), *worker_id, task); + run_logger.log_worker_started( + channel_id.as_ref(), + *worker_id, + task, + worker_type, + &self.deps.agent_id, + ); + let worker_contract_config = **self.deps.runtime_config.worker_contract.load(); + let terminal_secs = (**self.deps.runtime_config.cortex.load()) + .worker_timeout_secs + .max(1); let public_task_summary = summarize_worker_start_for_status(task); + if let Err(error) = run_logger + .upsert_worker_task_contract( + &self.deps.agent_id, + &self.id, + *worker_id, + &public_task_summary, + worker_contract_config.ack_secs.max(1), + worker_contract_config.progress_secs.max(1), + terminal_secs, + ) + .await + { + tracing::warn!( + %error, + channel_id = %self.id, + worker_id = %worker_id, + "failed to upsert worker task contract" + ); + } + self.worker_contract_tick_deadline = tokio::time::Instant::now(); if self.worker_is_user_visible(*worker_id).await { self.send_status_update(crate::StatusUpdate::WorkerStarted { worker_id: *worker_id, @@ -1738,10 +1805,146 @@ impl Channel { worker_id, status, .. } => { run_logger.log_worker_status(*worker_id, status); + let progress_secs = (**self.deps.runtime_config.worker_contract.load()) + .progress_secs + .max(1); + if let Err(error) = run_logger + .touch_worker_task_contract_progress(*worker_id, Some(status), progress_secs) + .await + { + tracing::warn!( + %error, + channel_id = %self.id, + worker_id = %worker_id, + "failed to refresh worker task contract progress" + ); + } if self.worker_is_user_visible(*worker_id).await { self.maybe_send_worker_checkpoint(*worker_id, status).await; } } + ProcessEvent::ToolStarted { + process_id: ProcessId::Worker(worker_id), + channel_id, + tool_name, + .. 
+ } if channel_id.as_ref() == Some(&self.id) => { + run_logger.log_worker_event( + *worker_id, + "tool_started", + serde_json::json!({ "tool_name": tool_name }), + ); + let progress_secs = (**self.deps.runtime_config.worker_contract.load()) + .progress_secs + .max(1); + if let Err(error) = run_logger + .touch_worker_task_contract_progress( + *worker_id, + Some(tool_name.as_str()), + progress_secs, + ) + .await + { + tracing::warn!( + %error, + channel_id = %self.id, + worker_id = %worker_id, + "failed to refresh worker task contract progress from tool event" + ); + } + } + ProcessEvent::ToolCompleted { + process_id: ProcessId::Worker(worker_id), + channel_id, + tool_name, + .. + } if channel_id.as_ref() == Some(&self.id) => { + run_logger.log_worker_event( + *worker_id, + "tool_completed", + serde_json::json!({ "tool_name": tool_name }), + ); + let progress_secs = (**self.deps.runtime_config.worker_contract.load()) + .progress_secs + .max(1); + if let Err(error) = run_logger + .touch_worker_task_contract_progress( + *worker_id, + Some(tool_name.as_str()), + progress_secs, + ) + .await + { + tracing::warn!( + %error, + channel_id = %self.id, + worker_id = %worker_id, + "failed to refresh worker task contract progress from tool event" + ); + } + } + ProcessEvent::WorkerPermission { + worker_id, + channel_id, + permission_id, + description, + .. 
+ } if channel_id.as_ref() == Some(&self.id) => { + run_logger.log_worker_event( + *worker_id, + "permission", + serde_json::json!({ + "permission_id": permission_id, + "description": description, + }), + ); + let progress_secs = (**self.deps.runtime_config.worker_contract.load()) + .progress_secs + .max(1); + if let Err(error) = run_logger + .touch_worker_task_contract_progress( + *worker_id, + Some(description.as_str()), + progress_secs, + ) + .await + { + tracing::warn!( + %error, + channel_id = %self.id, + worker_id = %worker_id, + "failed to refresh worker task contract progress from permission event" + ); + } + } + ProcessEvent::WorkerQuestion { + worker_id, + channel_id, + question_id, + .. + } if channel_id.as_ref() == Some(&self.id) => { + run_logger.log_worker_event( + *worker_id, + "question", + serde_json::json!({ + "question_id": question_id, + }), + ); + let progress_secs = (**self.deps.runtime_config.worker_contract.load()) + .progress_secs + .max(1); + if let Err(error) = run_logger + .touch_worker_task_contract_progress(*worker_id, None, progress_secs) + .await + { + tracing::warn!( + %error, + channel_id = %self.id, + worker_id = %worker_id, + "failed to refresh worker task contract progress from question event" + ); + } + } ProcessEvent::WorkerComplete { worker_id, result, @@ -1749,7 +1952,7 @@ impl Channel { success, .. 
} => { - run_logger.log_worker_completed(*worker_id, result); + run_logger.log_worker_completed(*worker_id, result, *success); self.worker_checkpoints.remove(worker_id); if *notify { self.send_status_update(crate::StatusUpdate::WorkerCompleted { @@ -1759,6 +1962,26 @@ impl Channel { .await; let terminal_state = classify_worker_terminal_state(result); + let terminal_secs = (**self.deps.runtime_config.cortex.load()) + .worker_timeout_secs + .max(1); + if let Err(error) = self + .state + .process_run_logger + .mark_worker_task_contract_terminal_pending( + *worker_id, + terminal_state, + terminal_secs, + ) + .await + { + tracing::warn!( + %error, + channel_id = %self.id, + worker_id = %worker_id, + "failed to mark worker contract terminal pending" + ); + } let payload_text = build_worker_terminal_receipt_payload(terminal_state, result); match self @@ -1933,6 +2156,117 @@ impl Channel { } } + async fn flush_due_worker_task_contract_deadlines(&mut self) { + let worker_contract_config = **self.deps.runtime_config.worker_contract.load(); + + let due_ack = match self + .state + .process_run_logger + .claim_due_worker_task_contract_ack_deadlines( + &self.id, + WORKER_CONTRACT_ACK_BATCH_SIZE, + worker_contract_config.ack_secs.max(1), + ) + .await + { + Ok(due) => due, + Err(error) => { + tracing::warn!( + %error, + channel_id = %self.id, + "failed to claim due worker task contract ack deadlines" + ); + Vec::new() + } + }; + + for due in due_ack { + if !self.worker_is_user_visible(due.worker_id).await { + if let Err(error) = self + .state + .process_run_logger + .mark_worker_task_contract_acknowledged(due.worker_id) + .await + { + tracing::warn!( + %error, + channel_id = %self.id, + worker_id = %due.worker_id, + "failed to auto-ack hidden worker task contract" + ); + } + continue; + } + let status = build_worker_ack_checkpoint(&due.task_summary, due.attempt_count); + self.send_status_update(crate::StatusUpdate::WorkerCheckpoint { + worker_id: due.worker_id, + status, + }) + 
.await; + } + + let due_progress = match self + .state + .process_run_logger + .claim_due_worker_task_contract_progress_deadlines( + &self.id, + WORKER_CONTRACT_PROGRESS_BATCH_SIZE, + ) + .await + { + Ok(due) => due, + Err(error) => { + tracing::warn!( + %error, + channel_id = %self.id, + "failed to claim due worker task contract progress deadlines" + ); + Vec::new() + } + }; + + for due in due_progress { + if !self.worker_is_user_visible(due.worker_id).await { + continue; + } + let status = build_worker_progress_sla_nudge(&due.task_summary); + self.send_status_update(crate::StatusUpdate::WorkerCheckpoint { + worker_id: due.worker_id, + status, + }) + .await; + } + + let due_terminal = match self + .state + .process_run_logger + .claim_due_worker_task_contract_terminal_deadlines( + &self.id, + WORKER_CONTRACT_TERMINAL_BATCH_SIZE, + ) + .await + { + Ok(due) => due, + Err(error) => { + tracing::warn!( + %error, + channel_id = %self.id, + "failed to claim due worker task contract terminal deadlines" + ); + Vec::new() + } + }; + + for due in due_terminal { + self.worker_checkpoints.remove(&due.worker_id); + tracing::warn!( + channel_id = %self.id, + worker_id = %due.worker_id, + "worker terminal deadline elapsed before adapter acknowledgement" + ); + } + } + async fn worker_is_user_visible(&self, worker_id: WorkerId) -> bool { let status_block = self.state.status_block.read().await; status_block @@ -2478,10 +2812,10 @@ where let outcome = if timeout_secs == 0 { match future.await { - Ok(text) => ("done", text, true), + Ok(text) => ("done", text, true, true), Err(error) => { tracing::error!(worker_id = %worker_id, %error, "worker failed"); - ("failed", format!("Worker failed: {error}"), true) + ("failed", format!("Worker failed: {error}"), true, false) } } } else { @@ -2498,10 +2832,10 @@ where tokio::select! 
{ result = &mut future => { let outcome = match result { - Ok(text) => ("done", text, true), + Ok(text) => ("done", text, true, true), Err(error) => { tracing::error!(worker_id = %worker_id, %error, "worker failed"); - ("failed", format!("Worker failed: {error}"), true) + ("failed", format!("Worker failed: {error}"), true, false) } }; break outcome; @@ -2538,12 +2872,13 @@ where "timed_out", format!("Worker timed out after {timeout_secs} seconds without progress."), true, + false, ); } } } }; - let (terminal_status, result_text, notify) = outcome; + let (terminal_status, result_text, notify, success) = outcome; #[cfg(feature = "metrics")] { let metrics = crate::telemetry::Metrics::global(); @@ -2803,6 +3138,22 @@ fn summarize_worker_start_for_status(task: &str) -> String { } } +fn build_worker_ack_checkpoint(task_summary: &str, attempt_count: i64) -> String { + let message = if attempt_count <= 1 { + format!("Acknowledged {task_summary}; running now.") + } else { + format!("Still running {task_summary}.") + }; + normalize_worker_checkpoint_status(&message) + .unwrap_or_else(|| "background task running".to_string()) +} + +fn build_worker_progress_sla_nudge(task_summary: &str) -> String { + let message = format!("Still working on {task_summary}. 
I will report back when complete."); + normalize_worker_checkpoint_status(&message) + .unwrap_or_else(|| "still working; I will report back when complete.".to_string()) +} + fn is_worker_terminal_failure(result: &str) -> bool { let trimmed = result.trim_start(); trimmed.starts_with("Worker failed:") @@ -3272,6 +3623,8 @@ mod tests { use super::WORKER_CHECKPOINT_MIN_INTERVAL_SECS; use super::WorkerCheckpointState; use super::apply_history_after_turn; + use super::build_worker_ack_checkpoint; + use super::build_worker_progress_sla_nudge; use super::build_worker_terminal_receipt_payload; use super::classify_worker_terminal_state; use super::is_worker_progress_event; @@ -3820,4 +4173,24 @@ mod tests { "public summary should not expose raw task content" ); } + + #[test] + fn worker_ack_checkpoint_is_deterministic() { + assert_eq!( + build_worker_ack_checkpoint("research task", 1), + "Acknowledged research task; running now." + ); + assert_eq!( + build_worker_ack_checkpoint("research task", 2), + "Still running research task." + ); + } + + #[test] + fn worker_progress_sla_nudge_is_deterministic() { + assert_eq!( + build_worker_progress_sla_nudge("analysis task"), + "Still working on analysis task. I will report back when complete." 
+ ); + } } diff --git a/src/api/agents.rs b/src/api/agents.rs index 154fe6d5d..c07924dc3 100644 --- a/src/api/agents.rs +++ b/src/api/agents.rs @@ -524,6 +524,7 @@ pub(super) async fn create_agent( memory_persistence: None, coalesce: None, ingestion: None, + worker_contract: None, cortex: None, warmup: None, browser: None, diff --git a/src/api/workers.rs b/src/api/workers.rs index 39fcc8d73..e176ed4bb 100644 --- a/src/api/workers.rs +++ b/src/api/workers.rs @@ -67,6 +67,15 @@ pub(super) struct WorkerDetailResponse { completed_at: Option, transcript: Option>, tool_calls: i64, + events: Vec, +} + +#[derive(Serialize)] +pub(super) struct WorkerEventItem { + id: String, + event_type: String, + payload_json: Option, + created_at: String, } /// List worker runs for an agent, with live status merged from StatusBlocks. @@ -163,6 +172,21 @@ pub(super) async fn worker_detail( }) .ok() }); + let events = logger + .list_worker_events(&query.worker_id, 200) + .await + .map_err(|error| { + tracing::warn!(%error, worker_id = %query.worker_id, "failed to list worker events"); + StatusCode::INTERNAL_SERVER_ERROR + })? 
+ .into_iter() + .map(|event| WorkerEventItem { + id: event.id, + event_type: event.event_type, + payload_json: event.payload_json, + created_at: event.created_at, + }) + .collect(); Ok(Json(WorkerDetailResponse { id: detail.id, @@ -176,5 +200,6 @@ pub(super) async fn worker_detail( completed_at: detail.completed_at, transcript, tool_calls: detail.tool_calls, + events, })) } diff --git a/src/config.rs b/src/config.rs index 098c78c23..5c28ab812 100644 --- a/src/config.rs +++ b/src/config.rs @@ -337,6 +337,7 @@ pub struct DefaultsConfig { pub memory_persistence: MemoryPersistenceConfig, pub coalesce: CoalesceConfig, pub ingestion: IngestionConfig, + pub worker_contract: WorkerContractConfig, pub cortex: CortexConfig, pub warmup: WarmupConfig, pub browser: BrowserConfig, @@ -365,6 +366,7 @@ impl std::fmt::Debug for DefaultsConfig { .field("memory_persistence", &self.memory_persistence) .field("coalesce", &self.coalesce) .field("ingestion", &self.ingestion) + .field("worker_contract", &self.worker_contract) .field("cortex", &self.cortex) .field("warmup", &self.warmup) .field("browser", &self.browser) @@ -567,6 +569,27 @@ impl Default for OpenCodeConfig { } } +/// Worker task contract timing configuration. +#[derive(Debug, Clone, Copy)] +pub struct WorkerContractConfig { + /// Deadline (seconds) to confirm a spawned worker has been acknowledged. + pub ack_secs: u64, + /// Deadline (seconds) between meaningful progress updates. + pub progress_secs: u64, + /// Polling interval (seconds) for contract deadline checks. + pub tick_secs: u64, +} + +impl Default for WorkerContractConfig { + fn default() -> Self { + Self { + ack_secs: 5, + progress_secs: 45, + tick_secs: 2, + } + } +} + /// Cortex configuration. 
#[derive(Debug, Clone, Copy)] pub struct CortexConfig { @@ -756,6 +779,7 @@ pub struct AgentConfig { pub memory_persistence: Option, pub coalesce: Option, pub ingestion: Option, + pub worker_contract: Option, pub cortex: Option, pub warmup: Option, pub browser: Option, @@ -806,6 +830,7 @@ pub struct ResolvedAgentConfig { pub memory_persistence: MemoryPersistenceConfig, pub coalesce: CoalesceConfig, pub ingestion: IngestionConfig, + pub worker_contract: WorkerContractConfig, pub cortex: CortexConfig, pub warmup: WarmupConfig, pub browser: BrowserConfig, @@ -832,6 +857,7 @@ impl Default for DefaultsConfig { memory_persistence: MemoryPersistenceConfig::default(), coalesce: CoalesceConfig::default(), ingestion: IngestionConfig::default(), + worker_contract: WorkerContractConfig::default(), cortex: CortexConfig::default(), warmup: WarmupConfig::default(), browser: BrowserConfig::default(), @@ -880,6 +906,7 @@ impl AgentConfig { .unwrap_or(defaults.memory_persistence), coalesce: self.coalesce.unwrap_or(defaults.coalesce), ingestion: self.ingestion.unwrap_or(defaults.ingestion), + worker_contract: self.worker_contract.unwrap_or(defaults.worker_contract), cortex: self.cortex.unwrap_or(defaults.cortex), warmup: self.warmup.unwrap_or(defaults.warmup), browser: self @@ -1712,6 +1739,7 @@ struct TomlDefaultsConfig { memory_persistence: Option, coalesce: Option, ingestion: Option, + worker_contract: Option, cortex: Option, warmup: Option, browser: Option, @@ -1764,6 +1792,13 @@ struct TomlIngestionConfig { chunk_size: Option, } +#[derive(Deserialize)] +struct TomlWorkerContractConfig { + ack_secs: Option, + progress_secs: Option, + tick_secs: Option, +} + #[derive(Deserialize)] struct TomlCompactionConfig { background_threshold: Option, @@ -1858,6 +1893,7 @@ struct TomlAgentConfig { memory_persistence: Option, coalesce: Option, ingestion: Option, + worker_contract: Option, cortex: Option, warmup: Option, browser: Option, @@ -2539,6 +2575,7 @@ impl Config { memory_persistence: 
None, coalesce: None, ingestion: None, + worker_contract: None, cortex: None, warmup: None, browser: None, @@ -3042,6 +3079,21 @@ impl Config { chunk_size: ig.chunk_size.unwrap_or(base_defaults.ingestion.chunk_size), }) .unwrap_or(base_defaults.ingestion), + worker_contract: toml + .defaults + .worker_contract + .map(|contract| WorkerContractConfig { + ack_secs: contract + .ack_secs + .unwrap_or(base_defaults.worker_contract.ack_secs), + progress_secs: contract + .progress_secs + .unwrap_or(base_defaults.worker_contract.progress_secs), + tick_secs: contract + .tick_secs + .unwrap_or(base_defaults.worker_contract.tick_secs), + }) + .unwrap_or(base_defaults.worker_contract), cortex: toml .defaults .cortex @@ -3237,6 +3289,17 @@ impl Config { .unwrap_or(defaults.ingestion.poll_interval_secs), chunk_size: ig.chunk_size.unwrap_or(defaults.ingestion.chunk_size), }), + worker_contract: a.worker_contract.map(|contract| WorkerContractConfig { + ack_secs: contract + .ack_secs + .unwrap_or(defaults.worker_contract.ack_secs), + progress_secs: contract + .progress_secs + .unwrap_or(defaults.worker_contract.progress_secs), + tick_secs: contract + .tick_secs + .unwrap_or(defaults.worker_contract.tick_secs), + }), cortex: a.cortex.map(|c| CortexConfig { tick_interval_secs: c .tick_interval_secs @@ -3330,6 +3393,7 @@ impl Config { memory_persistence: None, coalesce: None, ingestion: None, + worker_contract: None, cortex: None, warmup: None, browser: None, @@ -3600,6 +3664,7 @@ pub struct RuntimeConfig { pub memory_persistence: ArcSwap, pub coalesce: ArcSwap, pub ingestion: ArcSwap, + pub worker_contract: ArcSwap, pub max_turns: ArcSwap, pub branch_max_turns: ArcSwap, pub context_window: ArcSwap, @@ -3660,6 +3725,7 @@ impl RuntimeConfig { memory_persistence: ArcSwap::from_pointee(agent_config.memory_persistence), coalesce: ArcSwap::from_pointee(agent_config.coalesce), ingestion: ArcSwap::from_pointee(agent_config.ingestion), + worker_contract: 
ArcSwap::from_pointee(agent_config.worker_contract), max_turns: ArcSwap::from_pointee(agent_config.max_turns), branch_max_turns: ArcSwap::from_pointee(agent_config.branch_max_turns), context_window: ArcSwap::from_pointee(agent_config.context_window), @@ -3742,6 +3808,8 @@ impl RuntimeConfig { .store(Arc::new(resolved.memory_persistence)); self.coalesce.store(Arc::new(resolved.coalesce)); self.ingestion.store(Arc::new(resolved.ingestion)); + self.worker_contract + .store(Arc::new(resolved.worker_contract)); self.max_turns.store(Arc::new(resolved.max_turns)); self.branch_max_turns .store(Arc::new(resolved.branch_max_turns)); diff --git a/src/conversation.rs b/src/conversation.rs index bbfc5fac9..2e3abf3e6 100644 --- a/src/conversation.rs +++ b/src/conversation.rs @@ -7,6 +7,7 @@ pub mod worker_transcript; pub use channels::ChannelStore; pub use history::{ - ConversationLogger, ProcessRunLogger, TimelineItem, WorkerDetailRow, WorkerRunRow, + ConversationLogger, ProcessRunLogger, TimelineItem, WorkerDetailRow, WorkerEventRow, + WorkerRunRow, }; pub use worker_transcript::{ActionContent, TranscriptStep}; diff --git a/src/conversation/history.rs b/src/conversation/history.rs index 1086735fa..99f4e34e3 100644 --- a/src/conversation/history.rs +++ b/src/conversation/history.rs @@ -5,6 +5,7 @@ use crate::{BranchId, ChannelId, WorkerId}; use serde::Serialize; use sqlx::{Row as _, SqlitePool}; use std::collections::HashMap; +use std::hash::{Hash as _, Hasher as _}; /// Persists conversation messages (user and assistant) to SQLite. 
/// @@ -218,6 +219,13 @@ const WORKER_TERMINAL_RECEIPT_KIND: &str = "worker_terminal"; const WORKER_RECEIPT_MAX_ATTEMPTS: i64 = 6; const WORKER_RECEIPT_BACKOFF_SECS: [i64; 5] = [5, 15, 45, 120, 300]; const WORKER_RECEIPT_RETENTION_DAYS: i64 = 30; +const WORKER_CONTRACT_STATE_CREATED: &str = "created"; +const WORKER_CONTRACT_STATE_ACKED: &str = "acked"; +const WORKER_CONTRACT_STATE_PROGRESSING: &str = "progressing"; +const WORKER_CONTRACT_STATE_SLA_MISSED: &str = "sla_missed"; +const WORKER_CONTRACT_STATE_TERMINAL_PENDING: &str = "terminal_pending"; +const WORKER_CONTRACT_STATE_TERMINAL_ACKED: &str = "terminal_acked"; +const WORKER_CONTRACT_STATE_TERMINAL_FAILED: &str = "terminal_failed"; fn worker_receipt_backoff_secs(attempt_count: i64) -> Option { if attempt_count <= 0 { @@ -228,6 +236,12 @@ fn worker_receipt_backoff_secs(attempt_count: i64) -> Option { .copied() } +fn status_fingerprint(status: &str) -> String { + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + status.hash(&mut hasher); + format!("{:016x}", hasher.finish()) +} + #[derive(Debug, Clone)] pub struct PendingWorkerDeliveryReceipt { pub id: String, @@ -251,6 +265,24 @@ pub struct WorkerDeliveryRetryOutcome { pub next_attempt_at: Option, } +#[derive(Debug, Clone)] +pub struct DueWorkerTaskContractAck { + pub worker_id: WorkerId, + pub task_summary: String, + pub attempt_count: i64, +} + +#[derive(Debug, Clone)] +pub struct DueWorkerTaskContractProgress { + pub worker_id: WorkerId, + pub task_summary: String, +} + +#[derive(Debug, Clone)] +pub struct DueWorkerTaskContractTerminal { + pub worker_id: WorkerId, +} + /// Persists branch and worker run records for channel timeline history. /// /// All write methods are fire-and-forget, same pattern as ConversationLogger. 
@@ -264,6 +296,67 @@ impl ProcessRunLogger { Self { pool } } + fn log_worker_event_with_context( + &self, + worker_id: String, + channel_id: Option, + agent_id: Option, + event_type: String, + payload_json: Option, + ) { + let pool = self.pool.clone(); + + tokio::spawn(async move { + let event_id = uuid::Uuid::new_v4().to_string(); + if let Err(error) = sqlx::query( + "INSERT INTO worker_events \ + (id, worker_id, channel_id, agent_id, event_type, payload_json) \ + VALUES ( \ + ?, \ + ?, \ + COALESCE(?, (SELECT channel_id FROM worker_runs WHERE id = ?)), \ + COALESCE(?, (SELECT agent_id FROM worker_runs WHERE id = ?)), \ + ?, \ + ? \ + )", + ) + .bind(&event_id) + .bind(&worker_id) + .bind(&channel_id) + .bind(&worker_id) + .bind(&agent_id) + .bind(&worker_id) + .bind(&event_type) + .bind(&payload_json) + .execute(&pool) + .await + { + tracing::warn!( + %error, + worker_id = %worker_id, + event_type = %event_type, + "failed to persist worker event" + ); + } + }); + } + + /// Record a worker lifecycle event. Fire-and-forget. + pub fn log_worker_event( + &self, + worker_id: WorkerId, + event_type: &str, + payload: serde_json::Value, + ) { + self.log_worker_event_with_context( + worker_id.to_string(), + None, + None, + event_type.to_string(), + Some(payload.to_string()), + ); + } + /// Record a branch starting. Fire-and-forget. 
pub fn log_branch_started( &self, @@ -341,19 +434,45 @@ impl ProcessRunLogger { .await { tracing::warn!(%error, worker_id = %id, "failed to persist worker start"); + return; + } + + let payload_json = serde_json::json!({ + "task": task, + "worker_type": worker_type, + }) + .to_string(); + let event_id = uuid::Uuid::new_v4().to_string(); + + if let Err(error) = sqlx::query( + "INSERT INTO worker_events \ + (id, worker_id, channel_id, agent_id, event_type, payload_json) \ + VALUES (?, ?, ?, ?, 'started', ?)", + ) + .bind(&event_id) + .bind(&id) + .bind(&channel_id) + .bind(&agent_id) + .bind(&payload_json) + .execute(&pool) + .await + { + tracing::warn!(%error, worker_id = %id, "failed to persist worker start event"); } }); } /// Update a worker's status. Fire-and-forget. - /// Worker status text updates are transient — they're available via the - /// in-memory StatusBlock for live workers and don't need to be persisted. - /// The `status` column is reserved for the state enum (running/done/failed). - pub fn log_worker_status(&self, _worker_id: WorkerId, _status: &str) { - // Intentionally a no-op. Status text was previously written to the - // `status` column, overwriting the state enum with free-text like - // "Searching for weather in Germany" which broke badge rendering - // and status filtering. + /// Worker status text is kept in a separate append-only event table so the + /// worker_runs status enum remains queryable (`running`, `done`, etc.). + pub fn log_worker_status(&self, worker_id: WorkerId, status: &str) { + self.log_worker_event( + worker_id, + "status", + serde_json::json!({ + "status": status, + }), + ); } /// Record a worker completing with its result. Fire-and-forget. 
@@ -361,7 +480,7 @@ impl ProcessRunLogger { let pool = self.pool.clone(); let id = worker_id.to_string(); let result = result.to_string(); - let status = if success { "done" } else { "failed" }; + let success_int = if success { 1_i64 } else { 0_i64 }; tokio::spawn(async move { if let Err(error) = sqlx::query( @@ -372,7 +491,8 @@ impl ProcessRunLogger { WHEN ? LIKE 'Worker cancelled:%' THEN 'cancelled' \ WHEN ? LIKE 'Worker failed:%' THEN 'failed' \ WHEN ? LIKE 'Worker timed out after %' THEN 'timed_out' \ - ELSE 'done' \ + WHEN ? = 1 THEN 'done' \ + ELSE 'failed' \ END, \ completed_at = CURRENT_TIMESTAMP \ WHERE id = ?", @@ -381,15 +501,464 @@ impl ProcessRunLogger { .bind(&result) .bind(&result) .bind(&result) + .bind(success_int) .bind(&id) .execute(&pool) .await { tracing::warn!(%error, worker_id = %id, "failed to persist worker completion"); } + + let payload_json = serde_json::json!({ + "result": result, + "success": success, + }) + .to_string(); + let event_id = uuid::Uuid::new_v4().to_string(); + + if let Err(error) = sqlx::query( + "INSERT INTO worker_events \ + (id, worker_id, channel_id, agent_id, event_type, payload_json) \ + VALUES ( \ + ?, \ + ?, \ + (SELECT channel_id FROM worker_runs WHERE id = ?), \ + (SELECT agent_id FROM worker_runs WHERE id = ?), \ + 'completed', \ + ? \ + )", + ) + .bind(&event_id) + .bind(&id) + .bind(&id) + .bind(&id) + .bind(&payload_json) + .execute(&pool) + .await + { + tracing::warn!( + %error, + worker_id = %id, + "failed to persist worker completion event" + ); + } }); } + /// Create or refresh the deterministic task contract for a worker. 
+ pub async fn upsert_worker_task_contract( + &self, + agent_id: &crate::AgentId, + channel_id: &ChannelId, + worker_id: WorkerId, + task_summary: &str, + ack_secs: u64, + progress_secs: u64, + terminal_secs: u64, + ) -> crate::error::Result<()> { + let id = uuid::Uuid::new_v4().to_string(); + let worker_id = worker_id.to_string(); + let channel_id = channel_id.to_string(); + let status_hash = status_fingerprint(task_summary); + + sqlx::query( + "INSERT OR IGNORE INTO worker_task_contracts \ + (id, agent_id, channel_id, worker_id, task_summary, state, \ + ack_deadline_at, progress_deadline_at, terminal_deadline_at, \ + last_status_hash, created_at, updated_at) \ + VALUES (?, ?, ?, ?, ?, ?, \ + datetime('now', '+' || ? || ' seconds'), \ + datetime('now', '+' || ? || ' seconds'), \ + datetime('now', '+' || ? || ' seconds'), \ + ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)", + ) + .bind(&id) + .bind(agent_id.as_ref()) + .bind(&channel_id) + .bind(&worker_id) + .bind(task_summary) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(ack_secs as i64) + .bind(progress_secs as i64) + .bind(terminal_secs as i64) + .bind(&status_hash) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + sqlx::query( + "UPDATE worker_task_contracts \ + SET task_summary = ?, \ + state = ?, \ + ack_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + progress_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + terminal_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + last_status_hash = ?, \ + sla_nudge_sent = 0, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ? 
\ + AND state NOT IN (?, ?)", + ) + .bind(task_summary) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(ack_secs as i64) + .bind(progress_secs as i64) + .bind(terminal_secs as i64) + .bind(&status_hash) + .bind(&worker_id) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + Ok(()) + } + + /// Mark that a user-visible acknowledgement has been delivered for a worker. + pub async fn mark_worker_task_contract_acknowledged( + &self, + worker_id: WorkerId, + ) -> crate::error::Result<()> { + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = CASE \ + WHEN state IN (?, ?, ?) THEN state \ + ELSE ? \ + END, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ?", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(worker_id.to_string()) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + Ok(()) + } + + /// Refresh progress heartbeat information for a worker contract. + pub async fn touch_worker_task_contract_progress( + &self, + worker_id: WorkerId, + status: Option<&str>, + progress_secs: u64, + ) -> crate::error::Result<()> { + let status_hash = status.map(status_fingerprint); + + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = CASE \ + WHEN state IN (?, ?, ?, ?) THEN ? \ + ELSE state \ + END, \ + last_progress_at = CURRENT_TIMESTAMP, \ + progress_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + last_status_hash = COALESCE(?, last_status_hash), \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ? 
\ + AND state NOT IN (?, ?)", + ) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(WORKER_CONTRACT_STATE_SLA_MISSED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(progress_secs as i64) + .bind(status_hash) + .bind(worker_id.to_string()) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + Ok(()) + } + + /// Mark a worker contract as terminal pending while delivery receipts are in-flight. + pub async fn mark_worker_task_contract_terminal_pending( + &self, + worker_id: WorkerId, + terminal_state: &str, + terminal_secs: u64, + ) -> crate::error::Result<()> { + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + terminal_state = ?, \ + terminal_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ? \ + AND state NOT IN (?, ?)", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .bind(terminal_state) + .bind(terminal_secs as i64) + .bind(worker_id.to_string()) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + Ok(()) + } + + /// Claim workers whose acknowledgement deadline has expired. + pub async fn claim_due_worker_task_contract_ack_deadlines( + &self, + channel_id: &ChannelId, + limit: i64, + retry_secs: u64, + ) -> crate::error::Result> { + let channel_id = channel_id.to_string(); + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let rows = sqlx::query( + "SELECT id, worker_id, task_summary, attempt_count \ + FROM worker_task_contracts \ + WHERE channel_id = ? \ + AND state = ? 
\ + AND ack_deadline_at <= CURRENT_TIMESTAMP \ + ORDER BY ack_deadline_at ASC, created_at ASC \ + LIMIT ?", + ) + .bind(&channel_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(limit) + .fetch_all(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let mut due = Vec::with_capacity(rows.len()); + for row in rows { + let contract_id: String = row.try_get("id").unwrap_or_default(); + let worker_id_raw: String = row.try_get("worker_id").unwrap_or_default(); + let task_summary: String = row.try_get("task_summary").unwrap_or_default(); + let attempt_count: i64 = row.try_get("attempt_count").unwrap_or_default(); + + let updated = sqlx::query( + "UPDATE worker_task_contracts \ + SET ack_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + attempt_count = attempt_count + 1, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? \ + AND state = ? \ + AND ack_deadline_at <= CURRENT_TIMESTAMP", + ) + .bind(retry_secs as i64) + .bind(&contract_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); + + if updated == 0 { + continue; + } + + match uuid::Uuid::parse_str(&worker_id_raw) { + Ok(worker_id) => due.push(DueWorkerTaskContractAck { + worker_id, + task_summary, + attempt_count: attempt_count + 1, + }), + Err(error) => { + tracing::warn!( + %error, + worker_id = %worker_id_raw, + "skipping malformed worker task contract id" + ); + } + } + } + + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + Ok(due) + } + + /// Claim workers whose progress deadline has expired and have not been nudged yet. 
+ pub async fn claim_due_worker_task_contract_progress_deadlines( + &self, + channel_id: &ChannelId, + limit: i64, + ) -> crate::error::Result> { + let channel_id = channel_id.to_string(); + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let rows = sqlx::query( + "SELECT id, worker_id, task_summary \ + FROM worker_task_contracts \ + WHERE channel_id = ? \ + AND state IN (?, ?, ?) \ + AND sla_nudge_sent = 0 \ + AND progress_deadline_at <= CURRENT_TIMESTAMP \ + ORDER BY progress_deadline_at ASC, created_at ASC \ + LIMIT ?", + ) + .bind(&channel_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(limit) + .fetch_all(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let mut due = Vec::with_capacity(rows.len()); + for row in rows { + let contract_id: String = row.try_get("id").unwrap_or_default(); + let worker_id_raw: String = row.try_get("worker_id").unwrap_or_default(); + let task_summary: String = row.try_get("task_summary").unwrap_or_default(); + + let updated = sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + sla_nudge_sent = 1, \ + attempt_count = attempt_count + 1, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? \ + AND state IN (?, ?, ?) \ + AND sla_nudge_sent = 0 \ + AND progress_deadline_at <= CURRENT_TIMESTAMP", + ) + .bind(WORKER_CONTRACT_STATE_SLA_MISSED) + .bind(&contract_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))? 
+ .rows_affected(); + + if updated == 0 { + continue; + } + + match uuid::Uuid::parse_str(&worker_id_raw) { + Ok(worker_id) => due.push(DueWorkerTaskContractProgress { + worker_id, + task_summary, + }), + Err(error) => { + tracing::warn!( + %error, + worker_id = %worker_id_raw, + "skipping malformed worker task contract id" + ); + } + } + } + + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + Ok(due) + } + + /// Claim terminal-pending contracts whose delivery window elapsed. + /// + /// Overdue contracts are transitioned to `terminal_failed` and any pending + /// terminal delivery receipts are marked `failed` to stop retry churn. + pub async fn claim_due_worker_task_contract_terminal_deadlines( + &self, + channel_id: &ChannelId, + limit: i64, + ) -> crate::error::Result> { + let channel_id = channel_id.to_string(); + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let rows = sqlx::query( + "SELECT id, worker_id \ + FROM worker_task_contracts \ + WHERE channel_id = ? \ + AND state = ? \ + AND terminal_deadline_at <= CURRENT_TIMESTAMP \ + ORDER BY terminal_deadline_at ASC, created_at ASC \ + LIMIT ?", + ) + .bind(&channel_id) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .bind(limit) + .fetch_all(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let mut due = Vec::with_capacity(rows.len()); + for row in rows { + let contract_id: String = row.try_get("id").unwrap_or_default(); + let worker_id_raw: String = row.try_get("worker_id").unwrap_or_default(); + + let updated = sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + terminal_state = COALESCE(terminal_state, 'failed'), \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? \ + AND state = ? 
\ + AND terminal_deadline_at <= CURRENT_TIMESTAMP", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .bind(&contract_id) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); + + if updated == 0 { + continue; + } + + sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'failed', \ + last_error = ?, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ? \ + AND kind = ? \ + AND status IN ('pending', 'sending')", + ) + .bind("terminal deadline elapsed before adapter acknowledgement") + .bind(&worker_id_raw) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + match uuid::Uuid::parse_str(&worker_id_raw) { + Ok(worker_id) => due.push(DueWorkerTaskContractTerminal { worker_id }), + Err(error) => { + tracing::warn!( + %error, + worker_id = %worker_id_raw, + "skipping malformed worker task contract id" + ); + } + } + } + + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + Ok(due) + } + /// Create (or refresh) the durable terminal delivery receipt for a worker. /// /// One terminal receipt exists per worker (`kind = worker_terminal`). If the @@ -610,6 +1179,28 @@ impl ProcessRunLogger { .map_err(|error| anyhow::anyhow!(error))? .rows_affected(); + if updated > 0 { + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ( + SELECT worker_id FROM worker_delivery_receipts WHERE id = ? 
+ ) \ + AND state IN (?, ?, ?, ?, ?)", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(receipt_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(WORKER_CONTRACT_STATE_SLA_MISSED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + } + Ok(updated > 0) } @@ -658,6 +1249,26 @@ impl ProcessRunLogger { .await .map_err(|db_error| anyhow::anyhow!(db_error))?; + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ( + SELECT worker_id FROM worker_delivery_receipts WHERE id = ? + ) \ + AND state IN (?, ?, ?, ?, ?)", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .bind(receipt_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(WORKER_CONTRACT_STATE_SLA_MISSED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .execute(&self.pool) + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; + return Ok(WorkerDeliveryRetryOutcome { status: "failed".to_string(), attempt_count, @@ -743,7 +1354,7 @@ impl ProcessRunLogger { /// This is called on startup before channels begin handling messages. Any /// rows with NULL `completed_at` cannot be resumed and should be marked /// terminal so timelines and analytics stay accurate. 
- pub async fn close_orphaned_runs(&self) -> crate::error::Result<(u64, u64, u64)> { + pub async fn close_orphaned_runs(&self) -> crate::error::Result<(u64, u64, u64, u64)> { let worker_result = sqlx::query( "UPDATE worker_runs \ SET status = 'failed', \ @@ -776,10 +1387,25 @@ impl ProcessRunLogger { .await .map_err(|error| anyhow::anyhow!(error))?; + let contract_result = sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + terminal_state = COALESCE(terminal_state, 'failed'), \ + updated_at = CURRENT_TIMESTAMP \ + WHERE state NOT IN (?, ?)", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + Ok(( worker_result.rows_affected(), branch_result.rows_affected(), receipt_result.rows_affected(), + contract_result.rows_affected(), )) } @@ -1008,6 +1634,45 @@ impl ProcessRunLogger { tool_calls: row.try_get::("tool_calls").unwrap_or(0), })) } + + /// List recent worker events for a worker, oldest first. + pub async fn list_worker_events( + &self, + worker_id: &str, + limit: i64, + ) -> crate::error::Result> { + let rows = sqlx::query( + "SELECT id, worker_id, channel_id, agent_id, event_type, payload_json, created_at \ + FROM worker_events \ + WHERE worker_id = ? 
\ + ORDER BY created_at DESC \ + LIMIT ?", + ) + .bind(worker_id) + .bind(limit.clamp(1, 500)) + .fetch_all(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let mut events = rows + .into_iter() + .map(|row| WorkerEventRow { + id: row.try_get("id").unwrap_or_default(), + worker_id: row.try_get("worker_id").unwrap_or_default(), + channel_id: row.try_get("channel_id").ok(), + agent_id: row.try_get("agent_id").ok(), + event_type: row.try_get("event_type").unwrap_or_default(), + payload_json: row.try_get("payload_json").ok(), + created_at: row + .try_get::, _>("created_at") + .map(|t| t.to_rfc3339()) + .unwrap_or_default(), + }) + .collect::>(); + + events.reverse(); + Ok(events) + } } /// A worker run row without the transcript blob (for list queries). @@ -1041,6 +1706,18 @@ pub struct WorkerDetailRow { pub tool_calls: i64, } +/// A worker lifecycle event row. +#[derive(Debug, Clone, Serialize)] +pub struct WorkerEventRow { + pub id: String, + pub worker_id: String, + pub channel_id: Option, + pub agent_id: Option, + pub event_type: String, + pub payload_json: Option, + pub created_at: String, +} + #[cfg(test)] mod tests { use super::*; @@ -1063,6 +1740,70 @@ mod tests { ProcessRunLogger::new(pool) } + #[tokio::test] + async fn worker_event_journal_records_lifecycle_updates() { + let logger = connect_logger().await; + let agent_id: crate::AgentId = Arc::from("agent:test"); + let worker_id = uuid::Uuid::new_v4(); + let worker_id_text = worker_id.to_string(); + + logger.log_worker_started(None, worker_id, "research task", "builtin", &agent_id); + + let mut started_seen = false; + for _ in 0..20 { + let events = logger + .list_worker_events(&worker_id_text, 20) + .await + .expect("list worker events"); + if events.iter().any(|event| event.event_type == "started") { + started_seen = true; + break; + } + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + } + assert!(started_seen, "expected started event"); + + 
logger.log_worker_status(worker_id, "searching source material"); + logger.log_worker_event( + worker_id, + "tool_started", + serde_json::json!({ "tool_name": "web_search" }), + ); + logger.log_worker_completed(worker_id, "done", true); + + let mut events = Vec::new(); + for _ in 0..20 { + events = logger + .list_worker_events(&worker_id_text, 20) + .await + .expect("list worker events"); + if events.iter().any(|event| event.event_type == "status") + && events + .iter() + .any(|event| event.event_type == "tool_started") + && events.iter().any(|event| event.event_type == "completed") + { + break; + } + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + } + + assert!( + events.iter().any(|event| event.event_type == "status"), + "expected status event" + ); + assert!( + events + .iter() + .any(|event| event.event_type == "tool_started"), + "expected tool_started event" + ); + assert!( + events.iter().any(|event| event.event_type == "completed"), + "expected completed event" + ); + } + #[tokio::test] async fn worker_terminal_receipt_claim_ack_and_stats() { let logger = connect_logger().await; @@ -1199,11 +1940,12 @@ mod tests { .await .expect("insert sending receipt"); - let (_, _, recovered_receipts) = logger + let (_, _, recovered_receipts, recovered_contracts) = logger .close_orphaned_runs() .await .expect("recover orphaned runs"); assert_eq!(recovered_receipts, 1); + assert_eq!(recovered_contracts, 0); let status: String = sqlx::query_scalar("SELECT status FROM worker_delivery_receipts WHERE id = ?") @@ -1214,6 +1956,205 @@ mod tests { assert_eq!(status, "pending"); } + #[tokio::test] + async fn worker_task_contract_deadline_claims_and_terminal_ack_flow() { + let logger = connect_logger().await; + let agent_id: crate::AgentId = Arc::from("agent:test"); + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "research task", + 
0, + 0, + 60, + ) + .await + .expect("upsert contract"); + + let due_ack = logger + .claim_due_worker_task_contract_ack_deadlines(&channel_id, 10, 5) + .await + .expect("claim due ack deadlines"); + assert_eq!(due_ack.len(), 1); + assert_eq!(due_ack[0].worker_id, worker_id); + assert_eq!(due_ack[0].attempt_count, 1); + + logger + .mark_worker_task_contract_acknowledged(worker_id) + .await + .expect("mark acknowledged"); + logger + .touch_worker_task_contract_progress(worker_id, Some("indexing source data"), 30) + .await + .expect("touch progress"); + + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + progress_deadline_at = CURRENT_TIMESTAMP, \ + sla_nudge_sent = 0 \ + WHERE worker_id = ?", + ) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(worker_id.to_string()) + .execute(&logger.pool) + .await + .expect("force progress deadline"); + + let due_progress = logger + .claim_due_worker_task_contract_progress_deadlines(&channel_id, 10) + .await + .expect("claim due progress deadlines"); + assert_eq!(due_progress.len(), 1); + assert_eq!(due_progress[0].worker_id, worker_id); + + let due_progress_again = logger + .claim_due_worker_task_contract_progress_deadlines(&channel_id, 10) + .await + .expect("second progress claim should be empty"); + assert!(due_progress_again.is_empty()); + + logger + .mark_worker_task_contract_terminal_pending(worker_id, "done", 60) + .await + .expect("mark terminal pending"); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "done", + "Background task completed: done", + ) + .await + .expect("upsert receipt"); + let acked = logger + .ack_worker_delivery_receipt(&receipt_id) + .await + .expect("ack receipt"); + assert!(acked); + + let state: String = + sqlx::query_scalar("SELECT state FROM worker_task_contracts WHERE worker_id = ?") + .bind(worker_id.to_string()) + .fetch_one(&logger.pool) + .await + .expect("load contract state"); + assert_eq!(state, 
WORKER_CONTRACT_STATE_TERMINAL_ACKED); + } + + #[tokio::test] + async fn worker_task_contract_moves_to_terminal_failed_on_receipt_exhaustion() { + let logger = connect_logger().await; + let agent_id: crate::AgentId = Arc::from("agent:test"); + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "analysis task", + 5, + 45, + 60, + ) + .await + .expect("upsert contract"); + logger + .mark_worker_task_contract_terminal_pending(worker_id, "failed", 60) + .await + .expect("mark terminal pending"); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "failed", + "Background task failed: request error", + ) + .await + .expect("upsert receipt"); + + for _ in 0..WORKER_RECEIPT_MAX_ATTEMPTS { + let _ = logger + .fail_worker_delivery_receipt_attempt(&receipt_id, "adapter unavailable") + .await + .expect("record delivery failure"); + } + + let state: String = + sqlx::query_scalar("SELECT state FROM worker_task_contracts WHERE worker_id = ?") + .bind(worker_id.to_string()) + .fetch_one(&logger.pool) + .await + .expect("load contract state"); + assert_eq!(state, WORKER_CONTRACT_STATE_TERMINAL_FAILED); + } + + #[tokio::test] + async fn worker_task_contract_terminal_deadline_claim_marks_failed_and_stops_receipts() { + let logger = connect_logger().await; + let agent_id: crate::AgentId = Arc::from("agent:test"); + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "deadline task", + 5, + 45, + 1, + ) + .await + .expect("upsert contract"); + logger + .mark_worker_task_contract_terminal_pending(worker_id, "done", 0) + .await + .expect("mark terminal pending"); + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "done", + "Background task completed: 
done", + ) + .await + .expect("upsert receipt"); + + let due_terminal = logger + .claim_due_worker_task_contract_terminal_deadlines(&channel_id, 10) + .await + .expect("claim due terminal deadlines"); + assert_eq!(due_terminal.len(), 1); + assert_eq!(due_terminal[0].worker_id, worker_id); + + let state: String = + sqlx::query_scalar("SELECT state FROM worker_task_contracts WHERE worker_id = ?") + .bind(worker_id.to_string()) + .fetch_one(&logger.pool) + .await + .expect("load contract state"); + assert_eq!(state, WORKER_CONTRACT_STATE_TERMINAL_FAILED); + + let receipt_status: String = + sqlx::query_scalar("SELECT status FROM worker_delivery_receipts WHERE id = ?") + .bind(receipt_id) + .fetch_one(&logger.pool) + .await + .expect("load receipt status"); + assert_eq!(receipt_status, "failed"); + } + #[tokio::test] async fn claim_due_worker_terminal_receipts_any_claims_multiple_channels() { let logger = connect_logger().await; diff --git a/src/db.rs b/src/db.rs index ac41768af..5af0368b2 100644 --- a/src/db.rs +++ b/src/db.rs @@ -66,3 +66,39 @@ impl Db { // LanceDB and redb close automatically when dropped } } + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + #[test] + fn migration_versions_are_unique() { + let migrations_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("migrations"); + let entries = std::fs::read_dir(&migrations_dir).expect("read migrations directory"); + + let mut seen_versions = HashSet::new(); + for entry in entries { + let entry = entry.expect("read migration directory entry"); + let path = entry.path(); + if path.extension().and_then(|ext| ext.to_str()) != Some("sql") { + continue; + } + + let file_name = path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or_default(); + let (version, _) = file_name + .split_once('_') + .expect("migration filename should contain version prefix"); + assert!( + version.chars().all(|character| character.is_ascii_digit()), + "migration version should be numeric: {file_name}" + 
); + assert!( + seen_versions.insert(version.to_string()), + "duplicate migration version detected: {version} ({file_name})" + ); + } + } +} diff --git a/src/main.rs b/src/main.rs index 8b48d8e2f..dcccf4c0e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1101,7 +1101,6 @@ async fn run( let api_event_tx = api_state.event_tx.clone(); let sse_agent_id = agent_id.to_string(); let sse_channel_id = conversation_id.clone(); - let outbound_agent_names = agent.deps.agent_names.clone(); let outbound_handle = tokio::spawn(async move { while let Some(envelope) = response_rx.recv().await { let receipt_id = envelope.receipt_id.clone(); @@ -1158,24 +1157,94 @@ async fn run( } let current_message = outbound_message.read().await.clone(); + let acknowledged_worker_id = match &response { + spacebot::OutboundResponse::Status( + spacebot::StatusUpdate::WorkerStarted { worker_id, .. }, + ) + | spacebot::OutboundResponse::Status( + spacebot::StatusUpdate::WorkerCheckpoint { worker_id, .. }, + ) + | spacebot::OutboundResponse::Status( + spacebot::StatusUpdate::WorkerCompleted { worker_id, .. 
}, + ) => Some(*worker_id), + _ => None, + }; let is_status_update = matches!(response, spacebot::OutboundResponse::Status(_)); - let delivery_result = match response { + let (delivery_result, delivery_outcome) = match response { spacebot::OutboundResponse::Status(status) => { - messaging_for_outbound.send_status(¤t_message, status).await + match messaging_for_outbound + .send_status(¤t_message, status) + .await + { + Ok(outcome) => (Ok(()), outcome), + Err(error) => ( + Err(error), + spacebot::messaging::traits::DeliveryOutcome::NotSurfaced, + ), + } } response => { tracing::info!( conversation_id = %outbound_conversation_id, "routing outbound response to messaging adapter" ); - messaging_for_outbound.respond(¤t_message, response).await + ( + messaging_for_outbound + .respond(¤t_message, response) + .await, + spacebot::messaging::traits::DeliveryOutcome::Surfaced, + ) } }; + let status_surfaced = delivery_outcome.is_surfaced(); + + if let (Ok(()), Some(worker_id)) = + (&delivery_result, acknowledged_worker_id) + && status_surfaced + && let Err(error) = outbound_process_logger + .mark_worker_task_contract_acknowledged(worker_id) + .await + { + tracing::warn!( + %error, + channel_id = %outbound_conversation_id, + worker_id = %worker_id, + "failed to mark worker task contract acknowledged" + ); + } if let Some(receipt_id) = receipt_id { - match &delivery_result { - Ok(()) => { + if is_status_update && !status_surfaced { + match outbound_process_logger + .fail_worker_delivery_receipt_attempt( + &receipt_id, + "status update not surfaced by adapter", + ) + .await + { + Ok(outcome) => { + tracing::warn!( + channel_id = %outbound_conversation_id, + receipt_id = %receipt_id, + attempt_count = outcome.attempt_count, + status = %outcome.status, + next_attempt_at = ?outcome.next_attempt_at, + "worker terminal receipt was not surfaced; scheduled retry" + ); + } + Err(update_error) => { + tracing::warn!( + %update_error, + channel_id = %outbound_conversation_id, + receipt_id = 
%receipt_id, + "failed to record unsurfaced worker terminal receipt" + ); + } + } + } else { + match &delivery_result { + Ok(()) => { match outbound_process_logger .ack_worker_delivery_receipt(&receipt_id) .await @@ -1202,8 +1271,8 @@ async fn run( ); } } - } - Err(error) => { + }, + Err(error) => { match outbound_process_logger .fail_worker_delivery_receipt_attempt( &receipt_id, @@ -1233,6 +1302,7 @@ async fn run( } } } + } if let Err(error) = delivery_result { if is_status_update { @@ -1240,6 +1310,12 @@ async fn run( } else { tracing::error!(%error, "failed to send outbound response"); } + } else if is_status_update && !status_surfaced { + tracing::warn!( + channel_id = %outbound_conversation_id, + delivery_outcome = ?delivery_outcome, + "status update was accepted by adapter but not surfaced" + ); } } }); @@ -1521,21 +1597,27 @@ async fn initialize_agents( let process_run_logger = spacebot::conversation::history::ProcessRunLogger::new(db.sqlite.clone()); - let (recovered_workers, recovered_branches, recovered_receipts) = process_run_logger - .close_orphaned_runs() - .await - .with_context(|| { - format!( - "failed to recover orphaned runs for agent '{}'", - agent_config.id - ) - })?; - if recovered_workers > 0 || recovered_branches > 0 || recovered_receipts > 0 { + let (recovered_workers, recovered_branches, recovered_receipts, recovered_contracts) = + process_run_logger + .close_orphaned_runs() + .await + .with_context(|| { + format!( + "failed to recover orphaned runs for agent '{}'", + agent_config.id + ) + })?; + if recovered_workers > 0 + || recovered_branches > 0 + || recovered_receipts > 0 + || recovered_contracts > 0 + { tracing::warn!( agent_id = %agent_config.id, recovered_workers, recovered_branches, recovered_receipts, + recovered_contracts, "recovered orphaned process runs from previous startup" ); } diff --git a/src/messaging/discord.rs b/src/messaging/discord.rs index 0c17c86e7..97744f927 100644 --- a/src/messaging/discord.rs +++ 
b/src/messaging/discord.rs @@ -1,7 +1,7 @@ //! Discord messaging adapter using serenity. use crate::config::DiscordPermissions; -use crate::messaging::traits::{HistoryMessage, InboundStream, Messaging}; +use crate::messaging::traits::{DeliveryOutcome, HistoryMessage, InboundStream, Messaging}; use crate::{InboundMessage, MessageContent, OutboundResponse, StatusUpdate}; use anyhow::Context as _; @@ -380,7 +380,7 @@ impl Messaging for DiscordAdapter { self.active_messages.write().await.remove(&message.id); } OutboundResponse::Status(status) => { - self.send_status(message, status).await?; + let _ = self.send_status(message, status).await?; } // Slack-specific variants — graceful fallbacks for Discord OutboundResponse::RemoveReaction(_) => {} // no-op @@ -413,8 +413,8 @@ impl Messaging for DiscordAdapter { &self, message: &InboundMessage, status: StatusUpdate, - ) -> crate::Result<()> { - match status { + ) -> crate::Result { + let surfaced = match status { StatusUpdate::Thinking => { let http = self.get_http().await?; let channel_id = self.extract_channel_id(message)?; @@ -424,6 +424,7 @@ impl Messaging for DiscordAdapter { .write() .await .insert(Self::channel_key(message), typing); + true } StatusUpdate::WorkerStarted { worker_id, task } => { self.stop_typing(message).await; @@ -437,6 +438,9 @@ impl Messaging for DiscordAdapter { .await { tracing::debug!(%error, "failed to update discord progress message"); + false + } else { + true } } StatusUpdate::WorkerCheckpoint { worker_id, status } => { @@ -451,6 +455,9 @@ impl Messaging for DiscordAdapter { .await { tracing::debug!(%error, "failed to update discord progress message"); + false + } else { + true } } StatusUpdate::WorkerCompleted { worker_id, result } => { @@ -465,18 +472,25 @@ impl Messaging for DiscordAdapter { .await { tracing::debug!(%error, "failed to update discord progress message"); + false + } else { + self.clear_progress_message(message, worker_id).await; + true } - 
self.clear_progress_message(message, worker_id).await; } StatusUpdate::StopTyping | StatusUpdate::ToolStarted { .. } | StatusUpdate::ToolCompleted { .. } | StatusUpdate::BranchStarted { .. } => { self.stop_typing(message).await; + true } - } - - Ok(()) + }; + Ok(if surfaced { + DeliveryOutcome::Surfaced + } else { + DeliveryOutcome::NotSurfaced + }) } async fn broadcast(&self, target: &str, response: OutboundResponse) -> crate::Result<()> { diff --git a/src/messaging/manager.rs b/src/messaging/manager.rs index c08014676..5df68dd87 100644 --- a/src/messaging/manager.rs +++ b/src/messaging/manager.rs @@ -1,6 +1,8 @@ //! MessagingManager: Fan-in and routing for all adapters. -use crate::messaging::traits::{HistoryMessage, InboundStream, Messaging, MessagingDyn}; +use crate::messaging::traits::{ + DeliveryOutcome, HistoryMessage, InboundStream, Messaging, MessagingDyn, +}; use crate::{InboundMessage, OutboundResponse, StatusUpdate}; use anyhow::Context as _; @@ -217,7 +219,7 @@ impl MessagingManager { &self, message: &InboundMessage, status: StatusUpdate, - ) -> crate::Result<()> { + ) -> crate::Result { let adapters = self.adapters.read().await; let adapter = adapters .get(&message.source) diff --git a/src/messaging/slack.rs b/src/messaging/slack.rs index b16041935..b7e698d16 100644 --- a/src/messaging/slack.rs +++ b/src/messaging/slack.rs @@ -22,7 +22,7 @@ //! 
- DM broadcast via `conversations.open` use crate::config::{SlackCommandConfig, SlackPermissions}; -use crate::messaging::traits::{HistoryMessage, InboundStream, Messaging}; +use crate::messaging::traits::{DeliveryOutcome, HistoryMessage, InboundStream, Messaging}; use crate::{InboundMessage, MessageContent, OutboundResponse, StatusUpdate}; use anyhow::Context as _; @@ -787,7 +787,7 @@ impl Messaging for SlackAdapter { &self, message: &InboundMessage, status: StatusUpdate, - ) -> crate::Result<()> { + ) -> crate::Result { let thread_ts = match extract_thread_ts(message) { Some(ts) => ts, None => { @@ -796,12 +796,12 @@ impl Messaging for SlackAdapter { "skipping assistant.threads.setStatus — message has no thread_ts \ (typing indicators only work in Slack Assistant threads)" ); - return Ok(()); + return Ok(DeliveryOutcome::NotSurfaced); } }; let channel_id = match extract_channel_id(message) { Ok(id) => id, - Err(_) => return Ok(()), + Err(_) => return Ok(DeliveryOutcome::NotSurfaced), }; let status_text = match &status { @@ -830,9 +830,10 @@ impl Messaging for SlackAdapter { // Best-effort — don't propagate status errors into the main response pipeline. if let Err(err) = session.assistant_threads_set_status(&req).await { tracing::debug!(error = %err, "failed to set slack assistant thread status (non-fatal)"); + return Ok(DeliveryOutcome::NotSurfaced); } - Ok(()) + Ok(DeliveryOutcome::Surfaced) } async fn respond( diff --git a/src/messaging/telegram.rs b/src/messaging/telegram.rs index eab7994c2..9d8cd83c1 100644 --- a/src/messaging/telegram.rs +++ b/src/messaging/telegram.rs @@ -1,7 +1,7 @@ //! Telegram messaging adapter using teloxide. 
use crate::config::TelegramPermissions; -use crate::messaging::traits::{InboundStream, Messaging}; +use crate::messaging::traits::{DeliveryOutcome, InboundStream, Messaging}; use crate::{Attachment, InboundMessage, MessageContent, OutboundResponse, StatusUpdate}; use anyhow::Context as _; @@ -472,7 +472,7 @@ impl Messaging for TelegramAdapter { .remove(&message.conversation_id); } OutboundResponse::Status(status) => { - self.send_status(message, status).await?; + let _ = self.send_status(message, status).await?; } // Slack-specific variants — graceful fallbacks for Telegram OutboundResponse::RemoveReaction(_) => {} // no-op @@ -493,7 +493,7 @@ impl Messaging for TelegramAdapter { &self, message: &InboundMessage, status: StatusUpdate, - ) -> crate::Result<()> { + ) -> crate::Result { match status { StatusUpdate::Thinking => { let chat_id = self.extract_chat_id(message)?; @@ -520,13 +520,22 @@ impl Messaging for TelegramAdapter { .write() .await .insert(conversation_id, handle); + Ok(DeliveryOutcome::Surfaced) } - _ => { + StatusUpdate::StopTyping + | StatusUpdate::ToolStarted { .. } + | StatusUpdate::ToolCompleted { .. } + | StatusUpdate::BranchStarted { .. } => { self.stop_typing(&message.conversation_id).await; + Ok(DeliveryOutcome::Surfaced) + } + StatusUpdate::WorkerStarted { .. } + | StatusUpdate::WorkerCheckpoint { .. } + | StatusUpdate::WorkerCompleted { .. } => { + // Telegram adapter does not currently surface worker status updates. + Ok(DeliveryOutcome::NotSurfaced) } } - - Ok(()) } async fn broadcast(&self, target: &str, response: OutboundResponse) -> crate::Result<()> { diff --git a/src/messaging/traits.rs b/src/messaging/traits.rs index 1f5e10f00..12870cb44 100644 --- a/src/messaging/traits.rs +++ b/src/messaging/traits.rs @@ -8,6 +8,21 @@ use std::pin::Pin; /// Message stream type. pub type InboundStream = Pin + Send>>; +/// Result of attempting to deliver a status update. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DeliveryOutcome { + /// Adapter surfaced the status update to the user. + Surfaced, + /// Adapter accepted the call but did not surface user-visible output. + NotSurfaced, +} + +impl DeliveryOutcome { + pub fn is_surfaced(self) -> bool { + matches!(self, Self::Surfaced) + } +} + /// A message from platform history used for backfilling channel context. #[derive(Debug, Clone)] pub struct HistoryMessage { @@ -37,8 +52,8 @@ pub trait Messaging: Send + Sync + 'static { &self, _message: &InboundMessage, _status: StatusUpdate, - ) -> impl std::future::Future> + Send { - async { Ok(()) } + ) -> impl std::future::Future> + Send { + async { Ok(DeliveryOutcome::NotSurfaced) } } /// Broadcast a message. @@ -90,7 +105,7 @@ pub trait MessagingDyn: Send + Sync + 'static { &'a self, message: &'a InboundMessage, status: StatusUpdate, - ) -> Pin> + Send + 'a>>; + ) -> Pin> + Send + 'a>>; fn broadcast<'a>( &'a self, @@ -136,7 +151,7 @@ impl MessagingDyn for T { &'a self, message: &'a InboundMessage, status: StatusUpdate, - ) -> Pin> + Send + 'a>> { + ) -> Pin> + Send + 'a>> { Box::pin(Messaging::send_status(self, message, status)) } diff --git a/src/messaging/webchat.rs b/src/messaging/webchat.rs index a016df1f4..1d23b5aa6 100644 --- a/src/messaging/webchat.rs +++ b/src/messaging/webchat.rs @@ -4,7 +4,7 @@ //! Inbound messages are injected by the API handler via `MessagingManager::inject_message`, //! and outbound responses are routed to per-session channels consumed as SSE streams. 
-use crate::messaging::traits::{InboundStream, Messaging}; +use crate::messaging::traits::{DeliveryOutcome, InboundStream, Messaging}; use crate::{InboundMessage, OutboundResponse, StatusUpdate}; use std::collections::HashMap; @@ -106,10 +106,10 @@ impl Messaging for WebChatAdapter { &self, message: &InboundMessage, status: StatusUpdate, - ) -> crate::Result<()> { + ) -> crate::Result { let sessions = self.sessions.read().await; let Some(tx) = sessions.get(&message.conversation_id) else { - return Ok(()); + return Ok(DeliveryOutcome::NotSurfaced); }; let event = match status { @@ -117,11 +117,14 @@ impl Messaging for WebChatAdapter { StatusUpdate::StopTyping => WebChatEvent::StopTyping, StatusUpdate::ToolStarted { tool_name } => WebChatEvent::ToolStarted { tool_name }, StatusUpdate::ToolCompleted { tool_name } => WebChatEvent::ToolCompleted { tool_name }, - _ => return Ok(()), + _ => return Ok(DeliveryOutcome::NotSurfaced), }; - let _ = tx.send(event).await; - Ok(()) + Ok(if tx.send(event).await.is_ok() { + DeliveryOutcome::Surfaced + } else { + DeliveryOutcome::NotSurfaced + }) } async fn health_check(&self) -> crate::Result<()> { diff --git a/src/tools.rs b/src/tools.rs index 014d25756..397b90b01 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -96,6 +96,7 @@ pub use worker_inspect::{ use crate::agent::channel::ChannelState; use crate::config::{BrowserConfig, RuntimeConfig}; use crate::memory::MemorySearch; +use crate::sandbox::Sandbox; use crate::{AgentId, ChannelId, OutboundEnvelope, ProcessEvent, WorkerId}; use rig::tool::Tool as _; use rig::tool::server::{ToolServer, ToolServerHandle}; diff --git a/src/tools/conclude_link.rs b/src/tools/conclude_link.rs index 315a3f2d5..c1a1e079e 100644 --- a/src/tools/conclude_link.rs +++ b/src/tools/conclude_link.rs @@ -4,7 +4,7 @@ //! tool with a summary. The channel checks the flag after the LLM turn and //! routes the summary back to the originating channel as a system message. 
-use crate::OutboundResponse; +use crate::{OutboundEnvelope, OutboundResponse}; use rig::completion::ToolDefinition; use rig::tool::Tool; use schemars::JsonSchema; @@ -32,14 +32,14 @@ pub fn new_conclude_link() -> (ConcludeLinkFlag, ConcludeLinkSummary) { pub struct ConcludeLinkTool { flag: ConcludeLinkFlag, summary: ConcludeLinkSummary, - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, } impl ConcludeLinkTool { pub fn new( flag: ConcludeLinkFlag, summary: ConcludeLinkSummary, - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, ) -> Self { Self { flag, @@ -96,7 +96,7 @@ impl Tool for ConcludeLinkTool { let _ = self .response_tx - .send(OutboundResponse::Status(crate::StatusUpdate::StopTyping)) + .send(OutboundResponse::Status(crate::StatusUpdate::StopTyping).into()) .await; tracing::info!( diff --git a/src/tools/send_file.rs b/src/tools/send_file.rs index a5f4e00a5..f5ac47a27 100644 --- a/src/tools/send_file.rs +++ b/src/tools/send_file.rs @@ -17,11 +17,37 @@ use tokio::sync::mpsc; #[derive(Debug, Clone)] pub struct SendFileTool { response_tx: mpsc::Sender, + workspace: PathBuf, } impl SendFileTool { - pub fn new(response_tx: mpsc::Sender) -> Self { - Self { response_tx } + pub fn new(response_tx: mpsc::Sender, workspace: PathBuf) -> Self { + Self { + response_tx, + workspace, + } + } + + /// Validate that a path falls within the workspace boundary. + fn validate_workspace_path(&self, path: &std::path::Path) -> Result { + let workspace = &self.workspace; + + let canonical = path.canonicalize().map_err(|error| { + SendFileError(format!("can't resolve path '{}': {error}", path.display())) + })?; + let workspace_canonical = workspace + .canonicalize() + .unwrap_or_else(|_| workspace.clone()); + + if !canonical.starts_with(&workspace_canonical) { + return Err(SendFileError(format!( + "ACCESS DENIED: Path is outside the workspace boundary. 
\ + File operations are restricted to {}.", + workspace.display() + ))); + } + + Ok(canonical) } } diff --git a/src/tools/worker_inspect.rs b/src/tools/worker_inspect.rs index 960d52972..ad020a5ee 100644 --- a/src/tools/worker_inspect.rs +++ b/src/tools/worker_inspect.rs @@ -115,6 +115,31 @@ impl Tool for WorkerInspectTool { summary.push_str(&format!("\n### Result\n\n{result}\n")); } + let events = self + .run_logger + .list_worker_events(&worker_id, 50) + .await + .map_err(|e| WorkerInspectError(format!("Failed to load worker events: {e}")))?; + if !events.is_empty() { + summary.push_str("\n### Event Timeline\n\n"); + for event in events { + let payload = event + .payload_json + .as_deref() + .map(|json| truncate_utf8(json, 200)) + .unwrap_or_default(); + if payload.is_empty() { + summary.push_str(&format!("- [{}] {}\n", event.created_at, event.event_type)); + } else { + summary.push_str(&format!( + "- [{}] {} — {}\n", + event.created_at, event.event_type, payload + )); + } + } + summary.push('\n'); + } + if let Some(blob) = &detail.transcript_blob { match worker_transcript::deserialize_transcript(blob) { Ok(steps) => { From 3cf1d67cfb37611451499b6ca69c83d3e1cc5cb9 Mon Sep 17 00:00:00 2001 From: Victor Sumner Date: Tue, 24 Feb 2026 19:58:35 -0500 Subject: [PATCH 4/8] Fix clippy -Dwarnings violations in worker contract paths --- src/agent/channel.rs | 64 +++++++++++++++++++++++++++---------- src/conversation/history.rs | 47 ++++++++++++++++----------- 2 files changed, 77 insertions(+), 34 deletions(-) diff --git a/src/agent/channel.rs b/src/agent/channel.rs index c8aaf869e..396a62fb0 100644 --- a/src/agent/channel.rs +++ b/src/agent/channel.rs @@ -305,9 +305,7 @@ impl Channel { }; let self_tx = message_tx.clone(); - let worker_contract_tick_secs = (**deps.runtime_config.worker_contract.load()) - .tick_secs - .max(1); + let worker_contract_tick_secs = deps.runtime_config.worker_contract.load().tick_secs.max(1); let channel = Self { id: id.clone(), title: None, @@ 
-444,7 +442,11 @@ impl Channel { // Check worker task contract deadline if self.worker_contract_tick_deadline <= now { self.flush_due_worker_task_contract_deadlines().await; - let tick_secs = (**self.deps.runtime_config.worker_contract.load()) + let tick_secs = self + .deps + .runtime_config + .worker_contract + .load() .tick_secs .max(1); self.worker_contract_tick_deadline = tokio::time::Instant::now() @@ -1760,7 +1762,11 @@ impl Channel { &self.deps.agent_id, ); let worker_contract_config = **self.deps.runtime_config.worker_contract.load(); - let terminal_secs = (**self.deps.runtime_config.cortex.load()) + let terminal_secs = self + .deps + .runtime_config + .cortex + .load() .worker_timeout_secs .max(1); let public_task_summary = summarize_worker_start_for_status(task); @@ -1770,9 +1776,11 @@ impl Channel { &self.id, *worker_id, &public_task_summary, - worker_contract_config.ack_secs.max(1), - worker_contract_config.progress_secs.max(1), - terminal_secs, + crate::conversation::history::WorkerTaskContractTiming { + ack_secs: worker_contract_config.ack_secs.max(1), + progress_secs: worker_contract_config.progress_secs.max(1), + terminal_secs, + }, ) .await { @@ -1805,7 +1813,11 @@ impl Channel { worker_id, status, .. 
} => { run_logger.log_worker_status(*worker_id, status); - let progress_secs = (**self.deps.runtime_config.worker_contract.load()) + let progress_secs = self + .deps + .runtime_config + .worker_contract + .load() .progress_secs .max(1); if let Err(error) = run_logger @@ -1834,7 +1846,11 @@ impl Channel { "tool_started", serde_json::json!({ "tool_name": tool_name }), ); - let progress_secs = (**self.deps.runtime_config.worker_contract.load()) + let progress_secs = self + .deps + .runtime_config + .worker_contract + .load() .progress_secs .max(1); if let Err(error) = run_logger @@ -1864,7 +1880,11 @@ impl Channel { "tool_completed", serde_json::json!({ "tool_name": tool_name }), ); - let progress_secs = (**self.deps.runtime_config.worker_contract.load()) + let progress_secs = self + .deps + .runtime_config + .worker_contract + .load() .progress_secs .max(1); if let Err(error) = run_logger @@ -1898,7 +1918,11 @@ impl Channel { "description": description, }), ); - let progress_secs = (**self.deps.runtime_config.worker_contract.load()) + let progress_secs = self + .deps + .runtime_config + .worker_contract + .load() .progress_secs .max(1); if let Err(error) = run_logger @@ -1930,7 +1954,11 @@ impl Channel { "question_id": question_id, }), ); - let progress_secs = (**self.deps.runtime_config.worker_contract.load()) + let progress_secs = self + .deps + .runtime_config + .worker_contract + .load() .progress_secs .max(1); if let Err(error) = run_logger @@ -1962,7 +1990,11 @@ impl Channel { .await; let terminal_state = classify_worker_terminal_state(result); - let terminal_secs = (**self.deps.runtime_config.cortex.load()) + let terminal_secs = self + .deps + .runtime_config + .cortex + .load() .worker_timeout_secs .max(1); if let Err(error) = self @@ -2654,7 +2686,7 @@ pub async fn spawn_worker_from_state( state.deps.event_tx.clone(), state.deps.agent_id.clone(), Some(state.channel_id.clone()), - (**state.deps.runtime_config.cortex.load()).worker_timeout_secs, + 
state.deps.runtime_config.cortex.load().worker_timeout_secs, worker.run().instrument(worker_span), ); @@ -2750,7 +2782,7 @@ pub async fn spawn_opencode_worker_from_state( state.deps.event_tx.clone(), state.deps.agent_id.clone(), Some(state.channel_id.clone()), - (**state.deps.runtime_config.cortex.load()).worker_timeout_secs, + state.deps.runtime_config.cortex.load().worker_timeout_secs, async move { let result = worker.run().await?; Ok::(result.result_text) diff --git a/src/conversation/history.rs b/src/conversation/history.rs index 99f4e34e3..f40e710de 100644 --- a/src/conversation/history.rs +++ b/src/conversation/history.rs @@ -283,6 +283,13 @@ pub struct DueWorkerTaskContractTerminal { pub worker_id: WorkerId, } +#[derive(Debug, Clone, Copy)] +pub struct WorkerTaskContractTiming { + pub ack_secs: u64, + pub progress_secs: u64, + pub terminal_secs: u64, +} + /// Persists branch and worker run records for channel timeline history. /// /// All write methods are fire-and-forget, same pattern as ConversationLogger. 
@@ -552,9 +559,7 @@ impl ProcessRunLogger { channel_id: &ChannelId, worker_id: WorkerId, task_summary: &str, - ack_secs: u64, - progress_secs: u64, - terminal_secs: u64, + timing: WorkerTaskContractTiming, ) -> crate::error::Result<()> { let id = uuid::Uuid::new_v4().to_string(); let worker_id = worker_id.to_string(); @@ -578,9 +583,9 @@ impl ProcessRunLogger { .bind(&worker_id) .bind(task_summary) .bind(WORKER_CONTRACT_STATE_CREATED) - .bind(ack_secs as i64) - .bind(progress_secs as i64) - .bind(terminal_secs as i64) + .bind(timing.ack_secs as i64) + .bind(timing.progress_secs as i64) + .bind(timing.terminal_secs as i64) .bind(&status_hash) .execute(&self.pool) .await @@ -601,9 +606,9 @@ impl ProcessRunLogger { ) .bind(task_summary) .bind(WORKER_CONTRACT_STATE_CREATED) - .bind(ack_secs as i64) - .bind(progress_secs as i64) - .bind(terminal_secs as i64) + .bind(timing.ack_secs as i64) + .bind(timing.progress_secs as i64) + .bind(timing.terminal_secs as i64) .bind(&status_hash) .bind(&worker_id) .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) @@ -1969,9 +1974,11 @@ mod tests { &channel_id, worker_id, "research task", - 0, - 0, - 60, + WorkerTaskContractTiming { + ack_secs: 0, + progress_secs: 0, + terminal_secs: 60, + }, ) .await .expect("upsert contract"); @@ -2061,9 +2068,11 @@ mod tests { &channel_id, worker_id, "analysis task", - 5, - 45, - 60, + WorkerTaskContractTiming { + ack_secs: 5, + progress_secs: 45, + terminal_secs: 60, + }, ) .await .expect("upsert contract"); @@ -2111,9 +2120,11 @@ mod tests { &channel_id, worker_id, "deadline task", - 5, - 45, - 1, + WorkerTaskContractTiming { + ack_secs: 5, + progress_secs: 45, + terminal_secs: 1, + }, ) .await .expect("upsert contract"); From 63d44f55666bc224ce8678dad7b8755c6e2af337 Mon Sep 17 00:00:00 2001 From: Victor Sumner Date: Tue, 24 Feb 2026 20:20:32 -0500 Subject: [PATCH 5/8] Fix worker delivery contract edge cases and align docs --- docs/content/docs/(configuration)/config.mdx | 1 + 
docs/content/docs/(features)/workers.mdx | 6 +- migrations/20260225000001_worker_events.sql | 6 +- src/agent/channel.rs | 460 ++++++++++++------- src/api/workers.rs | 43 +- src/config.rs | 98 +++- src/conversation/history.rs | 303 +++++++++--- src/db.rs | 6 +- src/messaging/discord.rs | 2 +- src/messaging/webchat.rs | 5 +- src/tools/browser.rs | 86 ++-- src/tools/cancel.rs | 14 +- src/tools/conclude_link.rs | 6 +- 13 files changed, 709 insertions(+), 327 deletions(-) diff --git a/docs/content/docs/(configuration)/config.mdx b/docs/content/docs/(configuration)/config.mdx index 745362618..c054517ca 100644 --- a/docs/content/docs/(configuration)/config.mdx +++ b/docs/content/docs/(configuration)/config.mdx @@ -233,6 +233,7 @@ Most config values are hot-reloaded when their files change. Spacebot watches `c | `max_concurrent_branches` | Yes | Next branch spawn checks new limit | | Browser config | Yes | Next worker spawn uses new config | | Warmup config | Yes | Next warmup pass uses new values | +| `[defaults.worker_contract]` (`ack_secs`, `progress_secs`, `tick_secs`) | Yes | Runtime contract deadlines and polling update without restart | | Identity files (SOUL.md, etc.) | Yes | Next channel message renders new identity | | Skills (SKILL.md files) | Yes | Next message / worker spawn sees new skills | | Bindings | Yes | Next message routes using new bindings | diff --git a/docs/content/docs/(features)/workers.mdx b/docs/content/docs/(features)/workers.mdx index b8c97b62d..cfaba3fae 100644 --- a/docs/content/docs/(features)/workers.mdx +++ b/docs/content/docs/(features)/workers.mdx @@ -62,13 +62,15 @@ Workers don't get memory tools, channel tools, or branch tools. 
They can't talk ``` Running ──→ Done (fire-and-forget completed) -Running ──→ Failed (error or cancellation) +Running ──→ Failed (error) +Running ──→ Cancelled (cancelled by channel/system) Running ──→ WaitingForInput (interactive worker finished initial task) WaitingForInput ──→ Running (follow-up message received via route) WaitingForInput ──→ Failed (follow-up processing failed) +WaitingForInput ──→ Cancelled (cancelled by channel/system) ``` -`Done` and `Failed` are terminal. Illegal transitions are runtime errors. +`Done`, `Failed`, and `Cancelled` are terminal. Illegal transitions are runtime errors. ## Context and History diff --git a/migrations/20260225000001_worker_events.sql b/migrations/20260225000001_worker_events.sql index f902f9c08..f7d3baa00 100644 --- a/migrations/20260225000001_worker_events.sql +++ b/migrations/20260225000001_worker_events.sql @@ -14,11 +14,11 @@ CREATE TABLE IF NOT EXISTS worker_events ( FOREIGN KEY (worker_id) REFERENCES worker_runs(id) ON DELETE CASCADE ); -CREATE INDEX idx_worker_events_worker +CREATE INDEX IF NOT EXISTS idx_worker_events_worker ON worker_events(worker_id, created_at); -CREATE INDEX idx_worker_events_channel +CREATE INDEX IF NOT EXISTS idx_worker_events_channel ON worker_events(channel_id, created_at); -CREATE INDEX idx_worker_events_agent +CREATE INDEX IF NOT EXISTS idx_worker_events_agent ON worker_events(agent_id, created_at); diff --git a/src/agent/channel.rs b/src/agent/channel.rs index 396a62fb0..8f6166b49 100644 --- a/src/agent/channel.rs +++ b/src/agent/channel.rs @@ -78,6 +78,49 @@ pub struct ChannelState { } impl ChannelState { + fn send_worker_terminal_events( + &self, + worker_id: WorkerId, + status: &str, + result: String, + success: bool, + ) { + if let Err(error) = self.deps.event_tx.send(crate::ProcessEvent::WorkerStatus { + agent_id: self.deps.agent_id.clone(), + worker_id, + channel_id: Some(self.channel_id.clone()), + status: status.to_string(), + }) { + tracing::warn!( + %error, + 
channel_id = %self.channel_id, + worker_id = %worker_id, + status, + "failed to emit worker terminal status event" + ); + } + if let Err(error) = self + .deps + .event_tx + .send(crate::ProcessEvent::WorkerComplete { + agent_id: self.deps.agent_id.clone(), + worker_id, + channel_id: Some(self.channel_id.clone()), + result, + notify: true, + success, + }) + { + tracing::warn!( + %error, + channel_id = %self.channel_id, + worker_id = %worker_id, + success, + "failed to emit worker terminal completion event" + ); + } + } + /// Cancel a running worker by aborting its tokio task and cleaning up state. /// Returns an error message if the worker is not found. pub async fn cancel_worker( @@ -100,47 +143,42 @@ impl ChannelState { .map(str::trim) .filter(|value| !value.is_empty()) .unwrap_or("cancelled by request"); - let _ = self.deps.event_tx.send(crate::ProcessEvent::WorkerStatus { - agent_id: self.deps.agent_id.clone(), - worker_id, - channel_id: Some(self.channel_id.clone()), - status: "cancelled".to_string(), - }); - let _ = self - .deps - .event_tx - .send(crate::ProcessEvent::WorkerComplete { - agent_id: self.deps.agent_id.clone(), - worker_id, - channel_id: Some(self.channel_id.clone()), - result: format!("Worker cancelled: {reason}."), - notify: true, - success: false, - }); + + match handle.await { + Err(join_error) if join_error.is_cancelled() => { + self.send_worker_terminal_events( + worker_id, + "cancelled", + format!("Worker cancelled: {reason}."), + false, + ); + } + Err(join_error) => { + let failure = format!("Worker failed during cancellation: {join_error}"); + tracing::warn!( + %join_error, + worker_id = %worker_id, + channel_id = %self.channel_id, + "worker join failed after cancellation request" + ); + self.send_worker_terminal_events(worker_id, "failed", failure, false); + } + Ok(()) => { + tracing::debug!( + worker_id = %worker_id, + channel_id = %self.channel_id, + "worker finished before cancellation took effect" + ); + } + } Ok(()) } else if 
removed { // Worker was in active_workers but had no handle (shouldn't happen, but handle gracefully) - let reason = reason - .map(str::trim) - .filter(|value| !value.is_empty()) - .unwrap_or("cancelled by request"); - let _ = self.deps.event_tx.send(crate::ProcessEvent::WorkerStatus { - agent_id: self.deps.agent_id.clone(), - worker_id, - channel_id: Some(self.channel_id.clone()), - status: "cancelled".to_string(), - }); - let _ = self - .deps - .event_tx - .send(crate::ProcessEvent::WorkerComplete { - agent_id: self.deps.agent_id.clone(), - worker_id, - channel_id: Some(self.channel_id.clone()), - result: format!("Worker cancelled: {reason}."), - notify: true, - success: false, - }); + tracing::warn!( + worker_id = %worker_id, + channel_id = %self.channel_id, + "worker cancellation requested but no join handle was present" + ); Ok(()) } else { Err(format!("Worker {worker_id} not found")) @@ -1692,7 +1730,7 @@ impl Channel { let mut should_retrigger = false; let mut retrigger_metadata = std::collections::HashMap::new(); - let run_logger = &self.state.process_run_logger; + let run_logger = self.state.process_run_logger.clone(); match &event { ProcessEvent::BranchStarted { @@ -1761,7 +1799,7 @@ impl Channel { worker_type, &self.deps.agent_id, ); - let worker_contract_config = **self.deps.runtime_config.worker_contract.load(); + let worker_contract_config = self.deps.runtime_config.worker_contract.load(); let terminal_secs = self .deps .runtime_config @@ -1770,27 +1808,35 @@ impl Channel { .worker_timeout_secs .max(1); let public_task_summary = summarize_worker_start_for_status(task); - if let Err(error) = run_logger - .upsert_worker_task_contract( - &self.deps.agent_id, - &self.id, - *worker_id, - &public_task_summary, - crate::conversation::history::WorkerTaskContractTiming { - ack_secs: worker_contract_config.ack_secs.max(1), - progress_secs: worker_contract_config.progress_secs.max(1), - terminal_secs, - }, - ) - .await - { - tracing::warn!( - %error, - 
channel_id = %self.id, - worker_id = %worker_id, - "failed to upsert worker task contract" - ); - } + let timing = crate::conversation::history::WorkerTaskContractTiming { + ack_secs: worker_contract_config.ack_secs.max(1), + progress_secs: worker_contract_config.progress_secs.max(1), + terminal_secs, + }; + let run_logger = run_logger.clone(); + let agent_id = self.deps.agent_id.clone(); + let channel_id = self.id.clone(); + let event_worker_id = *worker_id; + let task_summary = public_task_summary.clone(); + tokio::spawn(async move { + if let Err(error) = run_logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + event_worker_id, + &task_summary, + timing, + ) + .await + { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %event_worker_id, + "failed to upsert worker task contract" + ); + } + }); self.worker_contract_tick_deadline = tokio::time::Instant::now(); if self.worker_is_user_visible(*worker_id).await { self.send_status_update(crate::StatusUpdate::WorkerStarted { @@ -1820,17 +1866,27 @@ impl Channel { .load() .progress_secs .max(1); - if let Err(error) = run_logger - .touch_worker_task_contract_progress(*worker_id, Some(status), progress_secs) - .await - { - tracing::warn!( - %error, - channel_id = %self.id, - worker_id = %worker_id, - "failed to refresh worker task contract progress" - ); - } + let run_logger = run_logger.clone(); + let channel_id = self.id.clone(); + let event_worker_id = *worker_id; + let status_text = status.clone(); + tokio::spawn(async move { + if let Err(error) = run_logger + .touch_worker_task_contract_progress( + event_worker_id, + Some(status_text.as_str()), + progress_secs, + ) + .await + { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %event_worker_id, + "failed to refresh worker task contract progress" + ); + } + }); if self.worker_is_user_visible(*worker_id).await { self.maybe_send_worker_checkpoint(*worker_id, status).await; } @@ -1853,21 +1909,27 @@ impl Channel { 
.load() .progress_secs .max(1); - if let Err(error) = run_logger - .touch_worker_task_contract_progress( - *worker_id, - Some(tool_name.as_str()), - progress_secs, - ) - .await - { - tracing::warn!( - %error, - channel_id = %self.id, - worker_id = %worker_id, - "failed to refresh worker task contract progress from tool event" - ); - } + let run_logger = run_logger.clone(); + let channel_id = self.id.clone(); + let event_worker_id = *worker_id; + let tool_name = tool_name.clone(); + tokio::spawn(async move { + if let Err(error) = run_logger + .touch_worker_task_contract_progress( + event_worker_id, + Some(tool_name.as_str()), + progress_secs, + ) + .await + { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %event_worker_id, + "failed to refresh worker task contract progress from tool event" + ); + } + }); } ProcessEvent::ToolCompleted { process_id: ProcessId::Worker(worker_id), @@ -1887,21 +1949,27 @@ impl Channel { .load() .progress_secs .max(1); - if let Err(error) = run_logger - .touch_worker_task_contract_progress( - *worker_id, - Some(tool_name.as_str()), - progress_secs, - ) - .await - { - tracing::warn!( - %error, - channel_id = %self.id, - worker_id = %worker_id, - "failed to refresh worker task contract progress from tool event" - ); - } + let run_logger = run_logger.clone(); + let channel_id = self.id.clone(); + let event_worker_id = *worker_id; + let tool_name = tool_name.clone(); + tokio::spawn(async move { + if let Err(error) = run_logger + .touch_worker_task_contract_progress( + event_worker_id, + Some(tool_name.as_str()), + progress_secs, + ) + .await + { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %event_worker_id, + "failed to refresh worker task contract progress from tool event" + ); + } + }); } ProcessEvent::WorkerPermission { worker_id, @@ -1925,21 +1993,27 @@ impl Channel { .load() .progress_secs .max(1); - if let Err(error) = run_logger - .touch_worker_task_contract_progress( - *worker_id, - 
Some(description.as_str()), - progress_secs, - ) - .await - { - tracing::warn!( - %error, - channel_id = %self.id, - worker_id = %worker_id, - "failed to refresh worker task contract progress from permission event" - ); - } + let run_logger = run_logger.clone(); + let channel_id = self.id.clone(); + let event_worker_id = *worker_id; + let description = description.clone(); + tokio::spawn(async move { + if let Err(error) = run_logger + .touch_worker_task_contract_progress( + event_worker_id, + Some(description.as_str()), + progress_secs, + ) + .await + { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %event_worker_id, + "failed to refresh worker task contract progress from permission event" + ); + } + }); } ProcessEvent::WorkerQuestion { worker_id, @@ -1961,17 +2035,22 @@ impl Channel { .load() .progress_secs .max(1); - if let Err(error) = run_logger - .touch_worker_task_contract_progress(*worker_id, None, progress_secs) - .await - { - tracing::warn!( - %error, - channel_id = %self.id, - worker_id = %worker_id, - "failed to refresh worker task contract progress from question event" - ); - } + let run_logger = run_logger.clone(); + let channel_id = self.id.clone(); + let event_worker_id = *worker_id; + tokio::spawn(async move { + if let Err(error) = run_logger + .touch_worker_task_contract_progress(event_worker_id, None, progress_secs) + .await + { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %event_worker_id, + "failed to refresh worker task contract progress from question event" + ); + } + }); } ProcessEvent::WorkerComplete { worker_id, @@ -1997,56 +2076,62 @@ impl Channel { .load() .worker_timeout_secs .max(1); - if let Err(error) = self - .state - .process_run_logger - .mark_worker_task_contract_terminal_pending( - *worker_id, - terminal_state, - terminal_secs, - ) - .await - { - tracing::warn!( - %error, - channel_id = %self.id, - worker_id = %worker_id, - "failed to mark worker contract terminal pending" - ); - } 
- let payload_text = - build_worker_terminal_receipt_payload(terminal_state, result); - match self - .state - .process_run_logger - .upsert_worker_terminal_receipt( - &self.id, - *worker_id, - terminal_state, - &payload_text, - ) - .await - { - Ok(receipt_id) => { - tracing::info!( - channel_id = %self.id, - worker_id = %worker_id, - receipt_id = %receipt_id, - terminal_state, - "queued worker terminal receipt" - ); - self.worker_receipt_dispatch_deadline = tokio::time::Instant::now(); - } - Err(error) => { + self.worker_receipt_dispatch_deadline = tokio::time::Instant::now(); + let run_logger = self.state.process_run_logger.clone(); + let channel_id = self.id.clone(); + let worker_id = *worker_id; + let terminal_state = terminal_state.to_string(); + let result_text = result.clone(); + tokio::spawn(async move { + if let Err(error) = run_logger + .mark_worker_task_contract_terminal_pending( + worker_id, + &terminal_state, + terminal_secs, + ) + .await + { tracing::warn!( %error, - channel_id = %self.id, + channel_id = %channel_id, worker_id = %worker_id, - terminal_state, - "failed to queue worker terminal receipt" + terminal_state = %terminal_state, + "failed to mark worker contract terminal pending" ); + return; } - } + + let payload_text = + build_worker_terminal_receipt_payload(&terminal_state, &result_text); + match run_logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + &terminal_state, + &payload_text, + ) + .await + { + Ok(receipt_id) => { + tracing::info!( + channel_id = %channel_id, + worker_id = %worker_id, + receipt_id = %receipt_id, + terminal_state = %terminal_state, + "queued worker terminal receipt" + ); + } + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %worker_id, + terminal_state = %terminal_state, + "failed to queue worker terminal receipt" + ); + } + } + }); } let mut workers = self.state.active_workers.write().await; @@ -2924,21 +3009,42 @@ where 
.observe(worker_start.elapsed().as_secs_f64()); } - let _ = event_tx.send(ProcessEvent::WorkerStatus { + if let Err(error) = event_tx.send(ProcessEvent::WorkerStatus { agent_id: agent_id.clone(), worker_id, channel_id: channel_id.clone(), status: terminal_status.to_string(), - }); + }) { + tracing::warn!( + %error, + agent_id = %agent_id, + worker_id = %worker_id, + channel_id = ?channel_id, + terminal_status, + "failed to send terminal worker status event" + ); + } - let _ = event_tx.send(ProcessEvent::WorkerComplete { + let result_len = result_text.len(); + let completion_channel_id = channel_id.clone(); + if let Err(error) = event_tx.send(ProcessEvent::WorkerComplete { agent_id, worker_id, channel_id, result: result_text, notify, success, - }); + }) { + tracing::warn!( + %error, + worker_id = %worker_id, + channel_id = ?completion_channel_id, + result_len, + notify, + success, + "failed to send worker completion event" + ); + } }) } diff --git a/src/api/workers.rs b/src/api/workers.rs index e176ed4bb..0469f04fb 100644 --- a/src/api/workers.rs +++ b/src/api/workers.rs @@ -52,6 +52,7 @@ pub(super) struct WorkerListItem { pub(super) struct WorkerDetailQuery { agent_id: String, worker_id: String, + limit: Option, } #[derive(Serialize)] @@ -74,7 +75,7 @@ pub(super) struct WorkerDetailResponse { pub(super) struct WorkerEventItem { id: String, event_type: String, - payload_json: Option, + payload_json: Option, created_at: String, } @@ -165,6 +166,12 @@ pub(super) async fn worker_detail( })? 
.ok_or(StatusCode::NOT_FOUND)?; + let event_limit = match query.limit { + Some(0) => return Err(StatusCode::BAD_REQUEST), + Some(limit) => limit.min(5_000) as i64, + None => 200, + }; + let transcript = detail.transcript_blob.as_deref().and_then(|blob| { worker_transcript::deserialize_transcript(blob) .map_err(|error| { @@ -173,18 +180,40 @@ .ok() }); let events = logger - .list_worker_events(&query.worker_id, 200) + .list_worker_events(&query.worker_id, event_limit) .await .map_err(|error| { tracing::warn!(%error, worker_id = %query.worker_id, "failed to list worker events"); StatusCode::INTERNAL_SERVER_ERROR })? .into_iter() - .map(|event| WorkerEventItem { - id: event.id, - event_type: event.event_type, - payload_json: event.payload_json, - created_at: event.created_at, + .map(|event| { + let payload_json = event.payload_json.as_deref().and_then(|payload| { + let trimmed = payload.trim(); + if trimmed.is_empty() { + return None; + } + + match serde_json::from_str::<serde_json::Value>(trimmed) { + Ok(value) => Some(value), + Err(error) => { + tracing::warn!( + %error, + worker_id = %query.worker_id, + event_id = %event.id, + "failed to parse worker event payload_json" + ); + None + } + } + }); + + WorkerEventItem { + id: event.id, + event_type: event.event_type, + payload_json, + created_at: event.created_at, + } }) .collect(); diff --git a/src/config.rs b/src/config.rs index 5c28ab812..c04002e0a 100644 --- a/src/config.rs +++ b/src/config.rs @@ -3007,6 +3007,11 @@ impl Config { .collect::<Result<Vec<_>>>()?; let base_defaults = DefaultsConfig::default(); + let resolve_nonzero_secs = |value: Option<u64>, fallback: u64| { + value + .and_then(|configured| (configured > 0).then_some(configured)) + .unwrap_or(fallback) + }; let defaults = DefaultsConfig { routing: resolve_routing(toml.defaults.routing, &base_defaults.routing), max_concurrent_branches: toml @@ -3083,15 +3088,18 @@ impl Config { .defaults .worker_contract .map(|contract| WorkerContractConfig { - ack_secs:
contract - .ack_secs - .unwrap_or(base_defaults.worker_contract.ack_secs), - progress_secs: contract - .progress_secs - .unwrap_or(base_defaults.worker_contract.progress_secs), - tick_secs: contract - .tick_secs - .unwrap_or(base_defaults.worker_contract.tick_secs), + ack_secs: resolve_nonzero_secs( + contract.ack_secs, + base_defaults.worker_contract.ack_secs, + ), + progress_secs: resolve_nonzero_secs( + contract.progress_secs, + base_defaults.worker_contract.progress_secs, + ), + tick_secs: resolve_nonzero_secs( + contract.tick_secs, + base_defaults.worker_contract.tick_secs, + ), }) .unwrap_or(base_defaults.worker_contract), cortex: toml @@ -3290,15 +3298,18 @@ impl Config { chunk_size: ig.chunk_size.unwrap_or(defaults.ingestion.chunk_size), }), worker_contract: a.worker_contract.map(|contract| WorkerContractConfig { - ack_secs: contract - .ack_secs - .unwrap_or(defaults.worker_contract.ack_secs), - progress_secs: contract - .progress_secs - .unwrap_or(defaults.worker_contract.progress_secs), - tick_secs: contract - .tick_secs - .unwrap_or(defaults.worker_contract.tick_secs), + ack_secs: resolve_nonzero_secs( + contract.ack_secs, + defaults.worker_contract.ack_secs, + ), + progress_secs: resolve_nonzero_secs( + contract.progress_secs, + defaults.worker_contract.progress_secs, + ), + tick_secs: resolve_nonzero_secs( + contract.tick_secs, + defaults.worker_contract.tick_secs, + ), }), cortex: a.cortex.map(|c| CortexConfig { tick_interval_secs: c @@ -5165,6 +5176,57 @@ id = "main" assert_eq!(provider.base_url, "http://remote-ollama:11434"); } + #[test] + fn worker_contract_zero_defaults_fallback_to_safe_defaults() { + let toml = r#" +[defaults.worker_contract] +ack_secs = 0 +progress_secs = 0 +tick_secs = 0 + +[[agents]] +id = "main" +"#; + let parsed: TomlConfig = toml::from_str(toml).expect("failed to parse test TOML"); + let config = Config::from_toml(parsed, PathBuf::from(".")).expect("failed to build Config"); + let defaults = WorkerContractConfig::default(); 
+ + assert_eq!(config.defaults.worker_contract.ack_secs, defaults.ack_secs); + assert_eq!( + config.defaults.worker_contract.progress_secs, + defaults.progress_secs + ); + assert_eq!( + config.defaults.worker_contract.tick_secs, + defaults.tick_secs + ); + } + + #[test] + fn worker_contract_zero_agent_override_falls_back_to_instance_defaults() { + let toml = r#" +[defaults.worker_contract] +ack_secs = 9 +progress_secs = 27 +tick_secs = 3 + +[[agents]] +id = "main" + +[agents.worker_contract] +ack_secs = 0 +progress_secs = 0 +tick_secs = 0 +"#; + let parsed: TomlConfig = toml::from_str(toml).expect("failed to parse test TOML"); + let config = Config::from_toml(parsed, PathBuf::from(".")).expect("failed to build Config"); + let resolved = config.agents[0].resolve(&config.instance_dir, &config.defaults); + + assert_eq!(resolved.worker_contract.ack_secs, 9); + assert_eq!(resolved.worker_contract.progress_secs, 27); + assert_eq!(resolved.worker_contract.tick_secs, 3); + } + #[test] fn test_warmup_defaults_applied_when_not_configured() { let toml = r#" diff --git a/src/conversation/history.rs b/src/conversation/history.rs index f40e710de..2a3893f66 100644 --- a/src/conversation/history.rs +++ b/src/conversation/history.rs @@ -594,7 +594,6 @@ impl ProcessRunLogger { sqlx::query( "UPDATE worker_task_contracts \ SET task_summary = ?, \ - state = ?, \ ack_deadline_at = datetime('now', '+' || ? || ' seconds'), \ progress_deadline_at = datetime('now', '+' || ? || ' seconds'), \ terminal_deadline_at = datetime('now', '+' || ? 
|| ' seconds'), \ @@ -605,7 +604,6 @@ impl ProcessRunLogger { AND state NOT IN (?, ?)", ) .bind(task_summary) - .bind(WORKER_CONTRACT_STATE_CREATED) .bind(timing.ack_secs as i64) .bind(timing.progress_secs as i64) .bind(timing.terminal_secs as i64) @@ -978,53 +976,43 @@ impl ProcessRunLogger { ) -> crate::error::Result { let worker_id = worker_id.to_string(); let channel_id = channel_id.to_string(); + let candidate_receipt_id = uuid::Uuid::new_v4().to_string(); - let existing = sqlx::query( - "SELECT id, status \ - FROM worker_delivery_receipts \ - WHERE worker_id = ? AND kind = ?", - ) - .bind(&worker_id) - .bind(WORKER_TERMINAL_RECEIPT_KIND) - .fetch_optional(&self.pool) - .await - .map_err(|error| anyhow::anyhow!(error))?; - - if let Some(row) = existing { - let receipt_id: String = row.try_get("id").unwrap_or_default(); - let status: String = row.try_get("status").unwrap_or_default(); - - if status != "acked" { - sqlx::query( - "UPDATE worker_delivery_receipts \ - SET channel_id = ?, \ - terminal_state = ?, \ - payload_text = ?, \ - status = 'pending', \ - last_error = NULL, \ - next_attempt_at = CURRENT_TIMESTAMP, \ - updated_at = CURRENT_TIMESTAMP \ - WHERE id = ?", - ) - .bind(&channel_id) - .bind(terminal_state) - .bind(payload_text) - .bind(&receipt_id) - .execute(&self.pool) - .await - .map_err(|error| anyhow::anyhow!(error))?; - } - - return Ok(receipt_id); - } - - let receipt_id = uuid::Uuid::new_v4().to_string(); sqlx::query( "INSERT INTO worker_delivery_receipts \ (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at) \ - VALUES (?, ?, ?, ?, 'pending', ?, ?, CURRENT_TIMESTAMP)", + VALUES (?, ?, ?, ?, 'pending', ?, ?, CURRENT_TIMESTAMP) \ + ON CONFLICT(worker_id, kind) DO UPDATE SET \ + channel_id = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.channel_id \ + ELSE excluded.channel_id \ + END, \ + terminal_state = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN 
worker_delivery_receipts.terminal_state \ + ELSE excluded.terminal_state \ + END, \ + payload_text = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.payload_text \ + ELSE excluded.payload_text \ + END, \ + status = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.status \ + ELSE 'pending' \ + END, \ + last_error = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.last_error \ + ELSE NULL \ + END, \ + next_attempt_at = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.next_attempt_at \ + ELSE CURRENT_TIMESTAMP \ + END, \ + updated_at = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.updated_at \ + ELSE CURRENT_TIMESTAMP \ + END", ) - .bind(&receipt_id) + .bind(&candidate_receipt_id) .bind(&worker_id) .bind(&channel_id) .bind(WORKER_TERMINAL_RECEIPT_KIND) @@ -1034,6 +1022,17 @@ impl ProcessRunLogger { .await .map_err(|error| anyhow::anyhow!(error))?; + let receipt_id: String = sqlx::query_scalar( + "SELECT id \ + FROM worker_delivery_receipts \ + WHERE worker_id = ? AND kind = ?", + ) + .bind(&worker_id) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .fetch_one(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + Ok(receipt_id) } @@ -1170,6 +1169,11 @@ impl ProcessRunLogger { &self, receipt_id: &str, ) -> crate::error::Result { + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; let updated = sqlx::query( "UPDATE worker_delivery_receipts \ SET status = 'acked', \ @@ -1179,7 +1183,7 @@ impl ProcessRunLogger { WHERE id = ? AND status != 'acked'", ) .bind(receipt_id) - .execute(&self.pool) + .execute(&mut *tx) .await .map_err(|error| anyhow::anyhow!(error))? 
.rows_affected(); @@ -1201,10 +1205,11 @@ impl ProcessRunLogger { .bind(WORKER_CONTRACT_STATE_PROGRESSING) .bind(WORKER_CONTRACT_STATE_SLA_MISSED) .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) - .execute(&self.pool) + .execute(&mut *tx) .await .map_err(|error| anyhow::anyhow!(error))?; } + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; Ok(updated > 0) } @@ -1215,13 +1220,18 @@ impl ProcessRunLogger { receipt_id: &str, error: &str, ) -> crate::error::Result { + let mut tx = self + .pool + .begin() + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; let row = sqlx::query( "SELECT status, attempt_count \ FROM worker_delivery_receipts \ WHERE id = ?", ) .bind(receipt_id) - .fetch_optional(&self.pool) + .fetch_optional(&mut *tx) .await .map_err(|db_error| anyhow::anyhow!(db_error))? .ok_or_else(|| anyhow::anyhow!("worker delivery receipt not found: {receipt_id}"))?; @@ -1230,6 +1240,9 @@ impl ProcessRunLogger { let current_attempts: i64 = row.try_get("attempt_count").unwrap_or_default(); if current_status == "acked" { + tx.commit() + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; return Ok(WorkerDeliveryRetryOutcome { status: "acked".to_string(), attempt_count: current_attempts, @@ -1250,7 +1263,7 @@ impl ProcessRunLogger { .bind(attempt_count) .bind(error) .bind(receipt_id) - .execute(&self.pool) + .execute(&mut *tx) .await .map_err(|db_error| anyhow::anyhow!(db_error))?; @@ -1270,9 +1283,12 @@ impl ProcessRunLogger { .bind(WORKER_CONTRACT_STATE_PROGRESSING) .bind(WORKER_CONTRACT_STATE_SLA_MISSED) .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) - .execute(&self.pool) + .execute(&mut *tx) .await .map_err(|db_error| anyhow::anyhow!(db_error))?; + tx.commit() + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; return Ok(WorkerDeliveryRetryOutcome { status: "failed".to_string(), @@ -1295,9 +1311,12 @@ impl ProcessRunLogger { .bind(error) .bind(delay_secs) .bind(receipt_id) - .execute(&self.pool) + .execute(&mut *tx) .await 
.map_err(|db_error| anyhow::anyhow!(db_error))?; + tx.commit() + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; let next_attempt_at = chrono::Utc::now() .checked_add_signed(chrono::TimeDelta::seconds(delay_secs)) @@ -1397,7 +1416,13 @@ impl ProcessRunLogger { SET state = ?, \ terminal_state = COALESCE(terminal_state, 'failed'), \ updated_at = CURRENT_TIMESTAMP \ - WHERE state NOT IN (?, ?)", + WHERE state NOT IN (?, ?) \ + AND NOT EXISTS ( \ + SELECT 1 \ + FROM worker_delivery_receipts \ + WHERE worker_delivery_receipts.worker_id = worker_task_contracts.worker_id \ + AND worker_delivery_receipts.status = 'acked' \ + )", ) .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) @@ -1646,6 +1671,11 @@ impl ProcessRunLogger { worker_id: &str, limit: i64, ) -> crate::error::Result> { + if limit <= 0 { + return Err(anyhow::anyhow!("invalid limit: must be > 0").into()); + } + let limit = limit.min(500); + let rows = sqlx::query( "SELECT id, worker_id, channel_id, agent_id, event_type, payload_json, created_at \ FROM worker_events \ @@ -1654,7 +1684,7 @@ impl ProcessRunLogger { LIMIT ?", ) .bind(worker_id) - .bind(limit.clamp(1, 500)) + .bind(limit) .fetch_all(&self.pool) .await .map_err(|error| anyhow::anyhow!(error))?; @@ -1809,6 +1839,16 @@ mod tests { ); } + #[tokio::test] + async fn list_worker_events_rejects_non_positive_limit() { + let logger = connect_logger().await; + let error = logger + .list_worker_events("worker:test", 0) + .await + .expect_err("non-positive limit should fail"); + assert!(error.to_string().contains("invalid limit")); + } + #[tokio::test] async fn worker_terminal_receipt_claim_ack_and_stats() { let logger = connect_logger().await; @@ -1861,6 +1901,63 @@ mod tests { assert_eq!(final_stats.failed, 0); } + #[tokio::test] + async fn worker_terminal_receipt_upsert_preserves_acked_rows() { + let logger = connect_logger().await; + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = 
uuid::Uuid::new_v4(); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "done", + "Background task completed: first payload", + ) + .await + .expect("upsert initial receipt"); + + sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'acked', \ + acked_at = CURRENT_TIMESTAMP, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ?", + ) + .bind(&receipt_id) + .execute(&logger.pool) + .await + .expect("mark receipt acked"); + + let second_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "failed", + "Background task failed: should not overwrite acked receipt", + ) + .await + .expect("upsert should preserve acked row"); + assert_eq!(second_id, receipt_id); + + let row = sqlx::query( + "SELECT status, terminal_state, payload_text \ + FROM worker_delivery_receipts \ + WHERE id = ?", + ) + .bind(&receipt_id) + .fetch_one(&logger.pool) + .await + .expect("load receipt row"); + let status: String = row.try_get("status").unwrap_or_default(); + let terminal_state: String = row.try_get("terminal_state").unwrap_or_default(); + let payload_text: String = row.try_get("payload_text").unwrap_or_default(); + + assert_eq!(status, "acked"); + assert_eq!(terminal_state, "done"); + assert_eq!(payload_text, "Background task completed: first payload"); + } + #[tokio::test] async fn worker_terminal_receipt_failure_retries_then_fails() { let logger = connect_logger().await; @@ -2055,6 +2152,56 @@ mod tests { assert_eq!(state, WORKER_CONTRACT_STATE_TERMINAL_ACKED); } + #[tokio::test] + async fn upsert_worker_task_contract_preserves_existing_state() { + let logger = connect_logger().await; + let agent_id: crate::AgentId = Arc::from("agent:test"); + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "first summary", + WorkerTaskContractTiming { + ack_secs: 5, + progress_secs: 45, + 
terminal_secs: 60, + }, + ) + .await + .expect("upsert initial contract"); + logger + .touch_worker_task_contract_progress(worker_id, Some("indexing"), 45) + .await + .expect("mark contract progressing"); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "updated summary", + WorkerTaskContractTiming { + ack_secs: 10, + progress_secs: 30, + terminal_secs: 120, + }, + ) + .await + .expect("refresh contract"); + + let state: String = + sqlx::query_scalar("SELECT state FROM worker_task_contracts WHERE worker_id = ?") + .bind(worker_id.to_string()) + .fetch_one(&logger.pool) + .await + .expect("load contract state"); + assert_eq!(state, WORKER_CONTRACT_STATE_PROGRESSING); + } + #[tokio::test] async fn worker_task_contract_moves_to_terminal_failed_on_receipt_exhaustion() { let logger = connect_logger().await; @@ -2166,6 +2313,58 @@ mod tests { assert_eq!(receipt_status, "failed"); } + #[tokio::test] + async fn close_orphaned_runs_does_not_fail_contract_with_acked_receipt() { + let logger = connect_logger().await; + let agent_id: crate::AgentId = Arc::from("agent:test"); + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "orphaned contract", + WorkerTaskContractTiming { + ack_secs: 5, + progress_secs: 45, + terminal_secs: 60, + }, + ) + .await + .expect("upsert contract"); + + sqlx::query( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at, acked_at) \ + VALUES (?, ?, ?, ?, 'acked', ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)", + ) + .bind(uuid::Uuid::new_v4().to_string()) + .bind(worker_id.to_string()) + .bind(channel_id.as_ref()) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("done") + .bind("Background task completed: already delivered") + .execute(&logger.pool) + .await + .expect("insert acked receipt"); + + let (_, _, _, 
recovered_contracts) = logger + .close_orphaned_runs() + .await + .expect("close orphaned runs"); + assert_eq!(recovered_contracts, 0); + + let state: String = + sqlx::query_scalar("SELECT state FROM worker_task_contracts WHERE worker_id = ?") + .bind(worker_id.to_string()) + .fetch_one(&logger.pool) + .await + .expect("load contract state"); + assert_eq!(state, WORKER_CONTRACT_STATE_CREATED); + } + #[tokio::test] async fn claim_due_worker_terminal_receipts_any_claims_multiple_channels() { let logger = connect_logger().await; diff --git a/src/db.rs b/src/db.rs index 5af0368b2..f003ec08c 100644 --- a/src/db.rs +++ b/src/db.rs @@ -80,7 +80,7 @@ mod tests { for entry in entries { let entry = entry.expect("read migration directory entry"); let path = entry.path(); - if path.extension().and_then(|ext| ext.to_str()) != Some("sql") { + if path.extension().and_then(|extension| extension.to_str()) != Some("sql") { continue; } @@ -91,6 +91,10 @@ mod tests { let (version, _) = file_name .split_once('_') .expect("migration filename should contain version prefix"); + assert!( + !version.is_empty(), + "migration version should not be empty: {file_name}" + ); assert!( version.chars().all(|character| character.is_ascii_digit()), "migration version should be numeric: {file_name}" diff --git a/src/messaging/discord.rs b/src/messaging/discord.rs index 97744f927..7acf18e4b 100644 --- a/src/messaging/discord.rs +++ b/src/messaging/discord.rs @@ -483,7 +483,7 @@ impl Messaging for DiscordAdapter { | StatusUpdate::ToolCompleted { .. } | StatusUpdate::BranchStarted { .. 
} => { self.stop_typing(message).await; - true + false } }; Ok(if surfaced { diff --git a/src/messaging/webchat.rs b/src/messaging/webchat.rs index 1d23b5aa6..267f63185 100644 --- a/src/messaging/webchat.rs +++ b/src/messaging/webchat.rs @@ -117,7 +117,10 @@ impl Messaging for WebChatAdapter { StatusUpdate::StopTyping => WebChatEvent::StopTyping, StatusUpdate::ToolStarted { tool_name } => WebChatEvent::ToolStarted { tool_name }, StatusUpdate::ToolCompleted { tool_name } => WebChatEvent::ToolCompleted { tool_name }, - _ => return Ok(DeliveryOutcome::NotSurfaced), + StatusUpdate::BranchStarted { .. } => return Ok(DeliveryOutcome::NotSurfaced), + StatusUpdate::WorkerStarted { .. } => return Ok(DeliveryOutcome::NotSurfaced), + StatusUpdate::WorkerCompleted { .. } => return Ok(DeliveryOutcome::NotSurfaced), + StatusUpdate::WorkerCheckpoint { .. } => return Ok(DeliveryOutcome::NotSurfaced), }; Ok(if tx.send(event).await.is_ok() { diff --git a/src/tools/browser.rs b/src/tools/browser.rs index 0c008a692..a1f56d04d 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -660,9 +660,7 @@ impl BrowserTool { .remove(&id) .ok_or_else(|| BrowserError::new(format!("no tab with target_id '{id}'")))?; - page.close() - .await - .map_err(|error| BrowserError::new(format!("failed to close tab: {error}")))?; + Self::with_action_timeout("close tab", page.close()).await?; if state.active_target.as_ref() == Some(&id) { state.active_target = state.pages.keys().next().cloned(); @@ -679,18 +677,17 @@ impl BrowserTool { let page = self.require_active_page(&state)?.clone(); // Enable accessibility domain if not already enabled - page.execute(AxEnableParams::default()) - .await - .map_err(|error| { - BrowserError::new(format!("failed to enable accessibility: {error}")) - })?; + Self::with_action_timeout( + "snapshot accessibility enable", + page.execute(AxEnableParams::default()), + ) + .await?; - let ax_tree = page - .execute(GetFullAxTreeParams::default()) - .await - .map_err(|error| 
{ - BrowserError::new(format!("failed to get accessibility tree: {error}")) - })?; + let ax_tree = Self::with_action_timeout( + "snapshot accessibility tree", + page.execute(GetFullAxTreeParams::default()), + ) + .await?; state.element_refs.clear(); state.next_ref = 0; @@ -778,10 +775,7 @@ impl BrowserTool { match act_kind { ActKind::Click => { let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .click() - .await - .map_err(|error| BrowserError::new(format!("click failed: {error}")))?; + Self::with_action_timeout("act click", element.click()).await?; Ok(BrowserOutput::success("Clicked element")) } ActKind::Type => { @@ -789,14 +783,8 @@ impl BrowserTool { return Err(BrowserError::new("text is required for act:type")); }; let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .click() - .await - .map_err(|error| BrowserError::new(format!("focus failed: {error}")))?; - element - .type_str(&text) - .await - .map_err(|error| BrowserError::new(format!("type failed: {error}")))?; + Self::with_action_timeout("act focus", element.click()).await?; + Self::with_action_timeout("act type", element.type_str(&text)).await?; Ok(BrowserOutput::success(format!( "Typed '{}' into element", truncate_for_display(&text, 50) @@ -808,36 +796,27 @@ impl BrowserTool { }; if element_ref.is_some() { let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .press_key(&key) - .await - .map_err(|error| BrowserError::new(format!("press_key failed: {error}")))?; + Self::with_action_timeout("act press_key", element.press_key(&key)).await?; } else { - dispatch_key_press(page, &key).await?; + Self::with_action_timeout("act press_key", dispatch_key_press(page, &key)) + .await?; } Ok(BrowserOutput::success(format!("Pressed key '{key}'"))) } ActKind::Hover => { let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .hover() - .await - .map_err(|error| 
BrowserError::new(format!("hover failed: {error}")))?; + Self::with_action_timeout("act hover", element.hover()).await?; Ok(BrowserOutput::success("Hovered over element")) } ActKind::ScrollIntoView => { let element = self.resolve_element_ref(&state, page, element_ref).await?; - element.scroll_into_view().await.map_err(|error| { - BrowserError::new(format!("scroll_into_view failed: {error}")) - })?; + Self::with_action_timeout("act scroll_into_view", element.scroll_into_view()) + .await?; Ok(BrowserOutput::success("Scrolled element into view")) } ActKind::Focus => { let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .focus() - .await - .map_err(|error| BrowserError::new(format!("focus failed: {error}")))?; + Self::with_action_timeout("act focus", element.focus()).await?; Ok(BrowserOutput::success("Focused element")) } } @@ -853,18 +832,17 @@ impl BrowserTool { let screenshot_data = if let Some(ref_id) = element_ref { let element = self.resolve_element_ref(&state, page, Some(ref_id)).await?; - element - .screenshot(CaptureScreenshotFormat::Png) - .await - .map_err(|error| BrowserError::new(format!("element screenshot failed: {error}")))? + Self::with_action_timeout( + "element screenshot", + element.screenshot(CaptureScreenshotFormat::Png), + ) + .await? } else { let params = ScreenshotParams::builder() .format(CaptureScreenshotFormat::Png) .full_page(full_page) .build(); - page.screenshot(params) - .await - .map_err(|error| BrowserError::new(format!("screenshot failed: {error}")))? + Self::with_action_timeout("page screenshot", page.screenshot(params)).await? 
}; // Save to disk @@ -916,10 +894,7 @@ impl BrowserTool { let state = self.state.lock().await; let page = self.require_active_page(&state)?; - let result = page - .evaluate(script) - .await - .map_err(|error| BrowserError::new(format!("evaluate failed: {error}")))?; + let result = Self::with_action_timeout("evaluate", page.evaluate(script)).await?; let value = result.value().cloned(); @@ -940,10 +915,7 @@ impl BrowserTool { let state = self.state.lock().await; let page = self.require_active_page(&state)?; - let html = page - .content() - .await - .map_err(|error| BrowserError::new(format!("failed to get page content: {error}")))?; + let html = Self::with_action_timeout("page content", page.content()).await?; let title = page.get_title().await.ok().flatten(); let url = page.url().await.ok().flatten(); diff --git a/src/tools/cancel.rs b/src/tools/cancel.rs index e827834c0..91dcbc568 100644 --- a/src/tools/cancel.rs +++ b/src/tools/cancel.rs @@ -107,11 +107,15 @@ impl Tool for CancelTool { other => return Err(CancelError(format!("Unknown process type: {other}"))), } - let message = if let Some(reason) = &args.reason { - format!( - "{} {} cancelled: {reason}", - args.process_type, args.process_id - ) + let message = if args.process_type == "worker" { + if let Some(reason) = &args.reason { + format!( + "{} {} cancelled: {reason}", + args.process_type, args.process_id + ) + } else { + format!("{} {} cancelled.", args.process_type, args.process_id) + } } else { format!("{} {} cancelled.", args.process_type, args.process_id) }; diff --git a/src/tools/conclude_link.rs b/src/tools/conclude_link.rs index c1a1e079e..d348eff01 100644 --- a/src/tools/conclude_link.rs +++ b/src/tools/conclude_link.rs @@ -94,10 +94,10 @@ impl Tool for ConcludeLinkTool { let summary_len = args.summary.len(); *self.summary.write().await = Some(args.summary); - let _ = self - .response_tx + self.response_tx .send(OutboundResponse::Status(crate::StatusUpdate::StopTyping).into()) - .await; + .await + 
.ok(); tracing::info!( summary_len, From 685bfdef58ce1a30c2773a313341e4109ec8ed81 Mon Sep 17 00:00:00 2001 From: Victor Sumner Date: Tue, 24 Feb 2026 20:26:44 -0500 Subject: [PATCH 6/8] Harden worker contract semantics and deterministic status hashing --- Cargo.lock | 1 + Cargo.toml | 1 + src/agent/channel.rs | 262 ++++++++++++++++-------------------- src/conversation/history.rs | 183 ++++++++++++++++++++----- 4 files changed, 266 insertions(+), 181 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 945796092..d6dad83c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8260,6 +8260,7 @@ dependencies = [ "emojis", "fastembed", "flate2", + "fnv", "futures", "hex", "ignore", diff --git a/Cargo.toml b/Cargo.toml index fd73aa72b..6360f79d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ fastembed = "4" # Encoding base64 = "0.22" hex = "0.4" +fnv = "1.0" # Compression flate2 = "1" diff --git a/src/agent/channel.rs b/src/agent/channel.rs index 8f6166b49..83cbf46e1 100644 --- a/src/agent/channel.rs +++ b/src/agent/channel.rs @@ -39,11 +39,31 @@ const WORKER_CHECKPOINT_MIN_INTERVAL_SECS: u64 = 20; /// Maximum length for user-facing checkpoint text. const WORKER_CHECKPOINT_MAX_CHARS: usize = 220; +/// How often terminal delivery receipts are drained from SQLite. +/// +/// Keep this small enough for low completion latency, but not so small that +/// the dispatcher loops too aggressively under idle load. const WORKER_RECEIPT_DISPATCH_INTERVAL_SECS: u64 = 5; +/// Max receipt rows to claim per dispatch pass. +/// +/// `i64` matches SQL bind/count types; conversion to `usize` only happens when +/// allocating local vectors from fetched row counts. const WORKER_RECEIPT_DISPATCH_BATCH_SIZE: i64 = 8; +/// Max contract rows to claim per acknowledgement deadline scan. +/// +/// `i64` is used for direct SQL LIMIT binding. const WORKER_CONTRACT_ACK_BATCH_SIZE: i64 = 8; +/// Max contract rows to claim per progress-SLA scan. 
+/// +/// Tune with `WORKER_CONTRACT_ACK_BATCH_SIZE` to avoid large burst writes. const WORKER_CONTRACT_PROGRESS_BATCH_SIZE: i64 = 8; +/// Max contract rows to claim per terminal deadline scan. +/// +/// Uses `i64` for SQL LIMIT compatibility; callers only cast when needed. const WORKER_CONTRACT_TERMINAL_BATCH_SIZE: i64 = 8; +const WORKER_FAILED_PREFIX: &str = "Worker failed:"; +const WORKER_TIMED_OUT_PREFIX: &str = "Worker timed out after "; +const WORKER_CANCELLED_PREFIX: &str = "Worker cancelled:"; #[derive(Debug, Clone)] struct WorkerCheckpointState { @@ -149,7 +169,7 @@ impl ChannelState { self.send_worker_terminal_events( worker_id, "cancelled", - format!("Worker cancelled: {reason}."), + format!("{WORKER_CANCELLED_PREFIX} {reason}."), false, ); } @@ -1859,34 +1879,12 @@ impl Channel { worker_id, status, .. } => { run_logger.log_worker_status(*worker_id, status); - let progress_secs = self - .deps - .runtime_config - .worker_contract - .load() - .progress_secs - .max(1); - let run_logger = run_logger.clone(); - let channel_id = self.id.clone(); - let event_worker_id = *worker_id; - let status_text = status.clone(); - tokio::spawn(async move { - if let Err(error) = run_logger - .touch_worker_task_contract_progress( - event_worker_id, - Some(status_text.as_str()), - progress_secs, - ) - .await - { - tracing::warn!( - %error, - channel_id = %channel_id, - worker_id = %event_worker_id, - "failed to refresh worker task contract progress" - ); - } - }); + self.spawn_worker_progress_refresh( + run_logger.clone(), + *worker_id, + Some(status.clone()), + "worker status", + ); if self.worker_is_user_visible(*worker_id).await { self.maybe_send_worker_checkpoint(*worker_id, status).await; } @@ -1897,39 +1895,14 @@ impl Channel { tool_name, .. 
} if channel_id.as_ref() == Some(&self.id) => { - run_logger.log_worker_event( + self.log_worker_event_and_refresh_progress( + run_logger.clone(), *worker_id, "tool_started", serde_json::json!({ "tool_name": tool_name }), + Some(tool_name.clone()), + "tool_started", ); - let progress_secs = self - .deps - .runtime_config - .worker_contract - .load() - .progress_secs - .max(1); - let run_logger = run_logger.clone(); - let channel_id = self.id.clone(); - let event_worker_id = *worker_id; - let tool_name = tool_name.clone(); - tokio::spawn(async move { - if let Err(error) = run_logger - .touch_worker_task_contract_progress( - event_worker_id, - Some(tool_name.as_str()), - progress_secs, - ) - .await - { - tracing::warn!( - %error, - channel_id = %channel_id, - worker_id = %event_worker_id, - "failed to refresh worker task contract progress from tool event" - ); - } - }); } ProcessEvent::ToolCompleted { process_id: ProcessId::Worker(worker_id), @@ -1937,39 +1910,14 @@ impl Channel { tool_name, .. } if channel_id.as_ref() == Some(&self.id) => { - run_logger.log_worker_event( + self.log_worker_event_and_refresh_progress( + run_logger.clone(), *worker_id, "tool_completed", serde_json::json!({ "tool_name": tool_name }), + Some(tool_name.clone()), + "tool_completed", ); - let progress_secs = self - .deps - .runtime_config - .worker_contract - .load() - .progress_secs - .max(1); - let run_logger = run_logger.clone(); - let channel_id = self.id.clone(); - let event_worker_id = *worker_id; - let tool_name = tool_name.clone(); - tokio::spawn(async move { - if let Err(error) = run_logger - .touch_worker_task_contract_progress( - event_worker_id, - Some(tool_name.as_str()), - progress_secs, - ) - .await - { - tracing::warn!( - %error, - channel_id = %channel_id, - worker_id = %event_worker_id, - "failed to refresh worker task contract progress from tool event" - ); - } - }); } ProcessEvent::WorkerPermission { worker_id, @@ -1978,42 +1926,17 @@ impl Channel { description, .. 
} if channel_id.as_ref() == Some(&self.id) => { - run_logger.log_worker_event( + self.log_worker_event_and_refresh_progress( + run_logger.clone(), *worker_id, "permission", serde_json::json!({ "permission_id": permission_id, "description": description, }), + Some(description.clone()), + "permission", ); - let progress_secs = self - .deps - .runtime_config - .worker_contract - .load() - .progress_secs - .max(1); - let run_logger = run_logger.clone(); - let channel_id = self.id.clone(); - let event_worker_id = *worker_id; - let description = description.clone(); - tokio::spawn(async move { - if let Err(error) = run_logger - .touch_worker_task_contract_progress( - event_worker_id, - Some(description.as_str()), - progress_secs, - ) - .await - { - tracing::warn!( - %error, - channel_id = %channel_id, - worker_id = %event_worker_id, - "failed to refresh worker task contract progress from permission event" - ); - } - }); } ProcessEvent::WorkerQuestion { worker_id, @@ -2021,36 +1944,16 @@ impl Channel { question_id, .. 
} if channel_id.as_ref() == Some(&self.id) => { - run_logger.log_worker_event( + self.log_worker_event_and_refresh_progress( + run_logger.clone(), *worker_id, "question", serde_json::json!({ "question_id": question_id, }), + None, + "question", ); - let progress_secs = self - .deps - .runtime_config - .worker_contract - .load() - .progress_secs - .max(1); - let run_logger = run_logger.clone(); - let channel_id = self.id.clone(); - let event_worker_id = *worker_id; - tokio::spawn(async move { - if let Err(error) = run_logger - .touch_worker_task_contract_progress(event_worker_id, None, progress_secs) - .await - { - tracing::warn!( - %error, - channel_id = %channel_id, - worker_id = %event_worker_id, - "failed to refresh worker task contract progress from question event" - ); - } - }); } ProcessEvent::WorkerComplete { worker_id, @@ -2194,6 +2097,60 @@ impl Channel { } } + fn spawn_worker_progress_refresh( + &self, + run_logger: ProcessRunLogger, + worker_id: WorkerId, + status_text: Option, + event_label: &'static str, + ) { + let progress_secs = self + .deps + .runtime_config + .worker_contract + .load() + .progress_secs + .max(1); + let channel_id = self.id.clone(); + + tokio::spawn(async move { + if let Err(error) = run_logger + .touch_worker_task_contract_progress( + worker_id, + status_text.as_deref(), + progress_secs, + ) + .await + { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %worker_id, + event_label, + "failed to refresh worker task contract progress" + ); + } + }); + } + + fn log_worker_event_and_refresh_progress( + &self, + run_logger: ProcessRunLogger, + worker_id: WorkerId, + event_label: &'static str, + payload: serde_json::Value, + status_text: Option, + progress_event_label: &'static str, + ) { + run_logger.log_worker_event(worker_id, event_label, payload); + self.spawn_worker_progress_refresh( + run_logger, + worker_id, + status_text, + progress_event_label, + ); + } + async fn maybe_send_worker_checkpoint(&mut self, 
worker_id: WorkerId, raw_status: &str) { let Some(status) = normalize_worker_checkpoint_status(raw_status) else { return; @@ -2932,7 +2889,12 @@ where Ok(text) => ("done", text, true, true), Err(error) => { tracing::error!(worker_id = %worker_id, %error, "worker failed"); - ("failed", format!("Worker failed: {error}"), true, false) + ( + "failed", + format!("{WORKER_FAILED_PREFIX} {error}"), + true, + false, + ) } } } else { @@ -2952,7 +2914,7 @@ where Ok(text) => ("done", text, true, true), Err(error) => { tracing::error!(worker_id = %worker_id, %error, "worker failed"); - ("failed", format!("Worker failed: {error}"), true, false) + ("failed", format!("{WORKER_FAILED_PREFIX} {error}"), true, false) } }; break outcome; @@ -2987,7 +2949,9 @@ where ); break ( "timed_out", - format!("Worker timed out after {timeout_secs} seconds without progress."), + format!( + "{WORKER_TIMED_OUT_PREFIX}{timeout_secs} seconds without progress." + ), true, false, ); @@ -3294,18 +3258,18 @@ fn build_worker_progress_sla_nudge(task_summary: &str) -> String { fn is_worker_terminal_failure(result: &str) -> bool { let trimmed = result.trim_start(); - trimmed.starts_with("Worker failed:") - || trimmed.starts_with("Worker timed out after ") - || trimmed.starts_with("Worker cancelled:") + trimmed.starts_with(WORKER_FAILED_PREFIX) + || trimmed.starts_with(WORKER_TIMED_OUT_PREFIX) + || trimmed.starts_with(WORKER_CANCELLED_PREFIX) } fn classify_worker_terminal_state(result: &str) -> &'static str { let trimmed = result.trim_start(); - if trimmed.starts_with("Worker failed:") { + if trimmed.starts_with(WORKER_FAILED_PREFIX) { "failed" - } else if trimmed.starts_with("Worker timed out after ") { + } else if trimmed.starts_with(WORKER_TIMED_OUT_PREFIX) { "timed_out" - } else if trimmed.starts_with("Worker cancelled:") { + } else if trimmed.starts_with(WORKER_CANCELLED_PREFIX) { "cancelled" } else { "done" diff --git a/src/conversation/history.rs b/src/conversation/history.rs index 
2a3893f66..4b58ac9aa 100644 --- a/src/conversation/history.rs +++ b/src/conversation/history.rs @@ -2,6 +2,7 @@ use crate::{BranchId, ChannelId, WorkerId}; +use fnv::FnvHasher; use serde::Serialize; use sqlx::{Row as _, SqlitePool}; use std::collections::HashMap; @@ -237,7 +238,7 @@ fn worker_receipt_backoff_secs(attempt_count: i64) -> Option { } fn status_fingerprint(status: &str) -> String { - let mut hasher = std::collections::hash_map::DefaultHasher::new(); + let mut hasher = FnvHasher::default(); status.hash(&mut hasher); format!("{:016x}", hasher.finish()) } @@ -626,7 +627,7 @@ impl ProcessRunLogger { sqlx::query( "UPDATE worker_task_contracts \ SET state = CASE \ - WHEN state IN (?, ?, ?) THEN state \ + WHEN state IN (?, ?, ?, ?, ?) THEN state \ ELSE ? \ END, \ updated_at = CURRENT_TIMESTAMP \ @@ -635,6 +636,8 @@ impl ProcessRunLogger { .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(WORKER_CONTRACT_STATE_SLA_MISSED) .bind(WORKER_CONTRACT_STATE_ACKED) .bind(worker_id.to_string()) .execute(&self.pool) @@ -744,10 +747,59 @@ impl ProcessRunLogger { let mut due = Vec::with_capacity(rows.len()); for row in rows { - let contract_id: String = row.try_get("id").unwrap_or_default(); - let worker_id_raw: String = row.try_get("worker_id").unwrap_or_default(); - let task_summary: String = row.try_get("task_summary").unwrap_or_default(); - let attempt_count: i64 = row.try_get("attempt_count").unwrap_or_default(); + let contract_id: String = match row.try_get("id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + column = "id", + "skipping malformed ack-deadline contract row" + ); + continue; + } + }; + let worker_id_raw: String = match row.try_get("worker_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + contract_id = %contract_id, 
+ column = "worker_id", + "skipping malformed ack-deadline contract row" + ); + continue; + } + }; + let task_summary: String = match row.try_get("task_summary") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + contract_id = %contract_id, + worker_id = %worker_id_raw, + column = "task_summary", + "skipping malformed ack-deadline contract row" + ); + continue; + } + }; + let attempt_count: i64 = match row.try_get("attempt_count") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + contract_id = %contract_id, + worker_id = %worker_id_raw, + column = "attempt_count", + "skipping malformed ack-deadline contract row" + ); + continue; + } + }; let updated = sqlx::query( "UPDATE worker_task_contracts \ @@ -824,9 +876,45 @@ impl ProcessRunLogger { let mut due = Vec::with_capacity(rows.len()); for row in rows { - let contract_id: String = row.try_get("id").unwrap_or_default(); - let worker_id_raw: String = row.try_get("worker_id").unwrap_or_default(); - let task_summary: String = row.try_get("task_summary").unwrap_or_default(); + let contract_id: String = match row.try_get("id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + column = "id", + "skipping malformed progress-deadline contract row" + ); + continue; + } + }; + let worker_id_raw: String = match row.try_get("worker_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + contract_id = %contract_id, + column = "worker_id", + "skipping malformed progress-deadline contract row" + ); + continue; + } + }; + let task_summary: String = match row.try_get("task_summary") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + contract_id = %contract_id, + worker_id = %worker_id_raw, + column = "task_summary", + "skipping malformed progress-deadline contract row" + ); + continue; + } + 
}; let updated = sqlx::query( "UPDATE worker_task_contracts \ @@ -906,8 +994,31 @@ impl ProcessRunLogger { let mut due = Vec::with_capacity(rows.len()); for row in rows { - let contract_id: String = row.try_get("id").unwrap_or_default(); - let worker_id_raw: String = row.try_get("worker_id").unwrap_or_default(); + let contract_id: String = match row.try_get("id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + column = "id", + "skipping malformed terminal-deadline contract row" + ); + continue; + } + }; + let worker_id_raw: String = match row.try_get("worker_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + contract_id = %contract_id, + column = "worker_id", + "skipping malformed terminal-deadline contract row" + ); + continue; + } + }; let updated = sqlx::query( "UPDATE worker_task_contracts \ @@ -978,7 +1089,7 @@ impl ProcessRunLogger { let channel_id = channel_id.to_string(); let candidate_receipt_id = uuid::Uuid::new_v4().to_string(); - sqlx::query( + let receipt_id: String = sqlx::query_scalar( "INSERT INTO worker_delivery_receipts \ (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at) \ VALUES (?, ?, ?, ?, 'pending', ?, ?, CURRENT_TIMESTAMP) \ @@ -1010,7 +1121,8 @@ impl ProcessRunLogger { updated_at = CASE \ WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.updated_at \ ELSE CURRENT_TIMESTAMP \ - END", + END \ + RETURNING id", ) .bind(&candidate_receipt_id) .bind(&worker_id) @@ -1018,17 +1130,6 @@ impl ProcessRunLogger { .bind(WORKER_TERMINAL_RECEIPT_KIND) .bind(terminal_state) .bind(payload_text) - .execute(&self.pool) - .await - .map_err(|error| anyhow::anyhow!(error))?; - - let receipt_id: String = sqlx::query_scalar( - "SELECT id \ - FROM worker_delivery_receipts \ - WHERE worker_id = ? 
AND kind = ?", - ) - .bind(&worker_id) - .bind(WORKER_TERMINAL_RECEIPT_KIND) .fetch_one(&self.pool) .await .map_err(|error| anyhow::anyhow!(error))?; @@ -1671,8 +1772,11 @@ impl ProcessRunLogger { worker_id: &str, limit: i64, ) -> crate::error::Result> { - if limit <= 0 { - return Err(anyhow::anyhow!("invalid limit: must be > 0").into()); + if limit == 0 { + return Ok(Vec::new()); + } + if limit < 0 { + return Err(anyhow::anyhow!("invalid limit: must be >= 0").into()); } let limit = limit.min(500); @@ -1680,7 +1784,7 @@ impl ProcessRunLogger { "SELECT id, worker_id, channel_id, agent_id, event_type, payload_json, created_at \ FROM worker_events \ WHERE worker_id = ? \ - ORDER BY created_at DESC \ + ORDER BY created_at DESC, id DESC \ LIMIT ?", ) .bind(worker_id) @@ -1840,12 +1944,18 @@ mod tests { } #[tokio::test] - async fn list_worker_events_rejects_non_positive_limit() { + async fn list_worker_events_zero_limit_returns_empty_and_negative_is_invalid() { let logger = connect_logger().await; - let error = logger + let events = logger .list_worker_events("worker:test", 0) .await - .expect_err("non-positive limit should fail"); + .expect("zero limit should return empty result"); + assert!(events.is_empty()); + + let error = logger + .list_worker_events("worker:test", -1) + .await + .expect_err("negative limit should fail"); assert!(error.to_string().contains("invalid limit")); } @@ -2238,11 +2348,20 @@ mod tests { .await .expect("upsert receipt"); - for _ in 0..WORKER_RECEIPT_MAX_ATTEMPTS { - let _ = logger + for attempt in 1..=WORKER_RECEIPT_MAX_ATTEMPTS { + let outcome = logger .fail_worker_delivery_receipt_attempt(&receipt_id, "adapter unavailable") .await .expect("record delivery failure"); + if attempt < WORKER_RECEIPT_MAX_ATTEMPTS { + assert_eq!(outcome.status, "pending"); + assert_eq!(outcome.attempt_count, attempt); + assert!(outcome.next_attempt_at.is_some()); + } else { + assert_eq!(outcome.status, "failed"); + assert_eq!(outcome.attempt_count, attempt); + 
assert!(outcome.next_attempt_at.is_none()); + } } let state: String = From fb59dd49ec60a83c97b8da614c291288169e40ee Mon Sep 17 00:00:00 2001 From: Victor Sumner Date: Tue, 24 Feb 2026 20:34:51 -0500 Subject: [PATCH 7/8] Address follow-up reliability and docs findings --- Cargo.lock | 1 - Cargo.toml | 1 - docs/content/docs/(configuration)/config.mdx | 2 + docs/content/docs/(features)/workers.mdx | 4 +- src/agent/channel.rs | 164 ++++++++++----- src/conversation/history.rs | 201 +++++++++++++++++-- src/db.rs | 4 + src/messaging/discord.rs | 10 +- src/tools/browser.rs | 28 ++- src/tools/cancel.rs | 19 +- 10 files changed, 342 insertions(+), 92 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d6dad83c6..945796092 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8260,7 +8260,6 @@ dependencies = [ "emojis", "fastembed", "flate2", - "fnv", "futures", "hex", "ignore", diff --git a/Cargo.toml b/Cargo.toml index 6360f79d4..fd73aa72b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,6 @@ fastembed = "4" # Encoding base64 = "0.22" hex = "0.4" -fnv = "1.0" # Compression flate2 = "1" diff --git a/docs/content/docs/(configuration)/config.mdx b/docs/content/docs/(configuration)/config.mdx index c054517ca..8b62ff9c6 100644 --- a/docs/content/docs/(configuration)/config.mdx +++ b/docs/content/docs/(configuration)/config.mdx @@ -486,6 +486,8 @@ Thresholds are fractions of `context_window`. | `progress_secs` | integer | 45 | Deadline between meaningful worker progress updates | | `tick_secs` | integer | 2 | Poll interval for worker contract deadline checks | +Setting `ack_secs`, `progress_secs`, or `tick_secs` to `0` is treated as unset and falls back to the resolved default for that scope. 
+ ### `[defaults.cortex]` | Key | Type | Default | Description | diff --git a/docs/content/docs/(features)/workers.mdx b/docs/content/docs/(features)/workers.mdx index cfaba3fae..36d0e7bc4 100644 --- a/docs/content/docs/(features)/workers.mdx +++ b/docs/content/docs/(features)/workers.mdx @@ -64,13 +64,15 @@ Workers don't get memory tools, channel tools, or branch tools. They can't talk Running ──→ Done (fire-and-forget completed) Running ──→ Failed (error) Running ──→ Cancelled (cancelled by channel/system) +Running ──→ TimedOut (inactivity timeout elapsed) Running ──→ WaitingForInput (interactive worker finished initial task) WaitingForInput ──→ Running (follow-up message received via route) WaitingForInput ──→ Failed (follow-up processing failed) WaitingForInput ──→ Cancelled (cancelled by channel/system) +WaitingForInput ──→ TimedOut (inactivity timeout elapsed) ``` -`Done`, `Failed`, and `Cancelled` are terminal. Illegal transitions are runtime errors. +`Done`, `Failed`, `Cancelled`, and `TimedOut` are terminal. Illegal transitions are runtime errors.
## Context and History diff --git a/src/agent/channel.rs b/src/agent/channel.rs index 83cbf46e1..5454dc93c 100644 --- a/src/agent/channel.rs +++ b/src/agent/channel.rs @@ -491,7 +491,7 @@ impl Channel { } // Check worker terminal receipt dispatch deadline if self.worker_receipt_dispatch_deadline <= now { - self.flush_due_worker_delivery_receipts().await; + self.flush_due_worker_delivery_receipts(); self.worker_receipt_dispatch_deadline = tokio::time::Instant::now() + std::time::Duration::from_secs( WORKER_RECEIPT_DISPATCH_INTERVAL_SECS, @@ -499,7 +499,7 @@ impl Channel { } // Check worker task contract deadline if self.worker_contract_tick_deadline <= now { - self.flush_due_worker_task_contract_deadlines().await; + self.flush_due_worker_task_contract_deadlines(); let tick_secs = self .deps .runtime_config @@ -2177,18 +2177,30 @@ impl Channel { ); } - async fn flush_due_worker_delivery_receipts(&mut self) { - let due = match self - .state - .process_run_logger - .claim_due_worker_terminal_receipts(&self.id, WORKER_RECEIPT_DISPATCH_BATCH_SIZE) + fn flush_due_worker_delivery_receipts(&self) { + let run_logger = self.state.process_run_logger.clone(); + let response_tx = self.response_tx.clone(); + let channel_id = self.id.clone(); + tokio::spawn(async move { + Self::flush_due_worker_delivery_receipts_task(run_logger, response_tx, channel_id) + .await; + }); + } + + async fn flush_due_worker_delivery_receipts_task( + run_logger: ProcessRunLogger, + response_tx: mpsc::Sender, + channel_id: ChannelId, + ) { + let due = match run_logger + .claim_due_worker_terminal_receipts(&channel_id, WORKER_RECEIPT_DISPATCH_BATCH_SIZE) .await { Ok(receipts) => receipts, Err(error) => { tracing::warn!( %error, - channel_id = %self.id, + channel_id = %channel_id, "failed to claim due worker terminal receipts" ); return; @@ -2203,24 +2215,22 @@ impl Channel { let message = OutboundResponse::Text(receipt.payload_text.clone()); let envelope = OutboundEnvelope::tracked(message, 
receipt.id.clone()); - if let Err(error) = self.response_tx.send(envelope).await { + if let Err(error) = response_tx.send(envelope).await { tracing::warn!( %error, - channel_id = %self.id, + channel_id = %channel_id, worker_id = %receipt.worker_id, receipt_id = %receipt.id, "failed to queue worker terminal receipt for outbound delivery" ); - if let Err(update_error) = self - .state - .process_run_logger + if let Err(update_error) = run_logger .fail_worker_delivery_receipt_attempt(&receipt.id, &error.to_string()) .await { tracing::warn!( %update_error, - channel_id = %self.id, + channel_id = %channel_id, worker_id = %receipt.worker_id, receipt_id = %receipt.id, "failed to mark worker terminal receipt send failure" @@ -2230,16 +2240,42 @@ impl Channel { } } - async fn flush_due_worker_task_contract_deadlines(&mut self) { - let worker_contract_config = **self.deps.runtime_config.worker_contract.load(); + fn flush_due_worker_task_contract_deadlines(&self) { + let run_logger = self.state.process_run_logger.clone(); + let response_tx = self.response_tx.clone(); + let channel_id = self.id.clone(); + let status_block = self.state.status_block.clone(); + let ack_secs = self + .deps + .runtime_config + .worker_contract + .load() + .ack_secs + .max(1); + tokio::spawn(async move { + Self::flush_due_worker_task_contract_deadlines_task( + run_logger, + response_tx, + channel_id, + status_block, + ack_secs, + ) + .await; + }); + } - let due_ack = match self - .state - .process_run_logger + async fn flush_due_worker_task_contract_deadlines_task( + run_logger: ProcessRunLogger, + response_tx: mpsc::Sender, + channel_id: ChannelId, + status_block: Arc>, + ack_secs: u64, + ) { + let due_ack = match run_logger .claim_due_worker_task_contract_ack_deadlines( - &self.id, + &channel_id, WORKER_CONTRACT_ACK_BATCH_SIZE, - worker_contract_config.ack_secs.max(1), + ack_secs, ) .await { @@ -2247,7 +2283,7 @@ impl Channel { Err(error) => { tracing::warn!( %error, - channel_id = %self.id, + 
channel_id = %channel_id, "failed to claim due worker task contract ack deadlines" ); Vec::new() @@ -2255,16 +2291,14 @@ impl Channel { }; for due in due_ack { - if !self.worker_is_user_visible(due.worker_id).await { - if let Err(error) = self - .state - .process_run_logger + if !Self::worker_is_user_visible_in_status_block(&status_block, due.worker_id).await { + if let Err(error) = run_logger .mark_worker_task_contract_acknowledged(due.worker_id) .await { tracing::warn!( %error, - channel_id = %self.id, + channel_id = %channel_id, worker_id = %due.worker_id, "failed to auto-ack hidden worker task contract" ); @@ -2272,18 +2306,27 @@ impl Channel { continue; } let status = build_worker_ack_checkpoint(&due.task_summary, due.attempt_count); - self.send_status_update(crate::StatusUpdate::WorkerCheckpoint { - worker_id: due.worker_id, - status, - }) - .await; + if let Err(error) = response_tx + .send(OutboundEnvelope::from(OutboundResponse::Status( + crate::StatusUpdate::WorkerCheckpoint { + worker_id: due.worker_id, + status, + }, + ))) + .await + { + tracing::debug!( + %error, + channel_id = %channel_id, + worker_id = %due.worker_id, + "failed to route worker ack checkpoint status update" + ); + } } - let due_progress = match self - .state - .process_run_logger + let due_progress = match run_logger .claim_due_worker_task_contract_progress_deadlines( - &self.id, + &channel_id, WORKER_CONTRACT_PROGRESS_BATCH_SIZE, ) .await @@ -2292,7 +2335,7 @@ impl Channel { Err(error) => { tracing::warn!( %error, - channel_id = %self.id, + channel_id = %channel_id, "failed to claim due worker task contract progress deadlines" ); Vec::new() @@ -2300,22 +2343,31 @@ impl Channel { }; for due in due_progress { - if !self.worker_is_user_visible(due.worker_id).await { + if !Self::worker_is_user_visible_in_status_block(&status_block, due.worker_id).await { continue; } let status = build_worker_progress_sla_nudge(&due.task_summary); - 
self.send_status_update(crate::StatusUpdate::WorkerCheckpoint { - worker_id: due.worker_id, - status, - }) - .await; + if let Err(error) = response_tx + .send(OutboundEnvelope::from(OutboundResponse::Status( + crate::StatusUpdate::WorkerCheckpoint { + worker_id: due.worker_id, + status, + }, + ))) + .await + { + tracing::debug!( + %error, + channel_id = %channel_id, + worker_id = %due.worker_id, + "failed to route worker progress checkpoint status update" + ); + } } - let due_terminal = match self - .state - .process_run_logger + let due_terminal = match run_logger .claim_due_worker_task_contract_terminal_deadlines( - &self.id, + &channel_id, WORKER_CONTRACT_TERMINAL_BATCH_SIZE, ) .await @@ -2324,7 +2376,7 @@ impl Channel { Err(error) => { tracing::warn!( %error, - channel_id = %self.id, + channel_id = %channel_id, "failed to claim due worker task contract terminal deadlines" ); Vec::new() @@ -2332,15 +2384,26 @@ impl Channel { }; for due in due_terminal { - self.worker_checkpoints.remove(&due.worker_id); tracing::warn!( - channel_id = %self.id, + channel_id = %channel_id, worker_id = %due.worker_id, "worker terminal deadline elapsed before adapter acknowledgement" ); } } + async fn worker_is_user_visible_in_status_block( + status_block: &Arc>, + worker_id: WorkerId, + ) -> bool { + let status_block = status_block.read().await; + status_block + .active_workers + .iter() + .find(|worker| worker.id == worker_id) + .is_some_and(|worker| worker.notify_on_complete) + } + async fn worker_is_user_visible(&self, worker_id: WorkerId) -> bool { let status_block = self.state.status_block.read().await; status_block @@ -2932,6 +2995,7 @@ where skipped, "worker timeout watcher lagged on event stream" ); + deadline = tokio::time::Instant::now() + timeout_duration; } Err(tokio::sync::broadcast::error::RecvError::Closed) => { tracing::warn!( diff --git a/src/conversation/history.rs b/src/conversation/history.rs index 4b58ac9aa..c7fd195bb 100644 --- a/src/conversation/history.rs +++ 
b/src/conversation/history.rs @@ -2,11 +2,10 @@ use crate::{BranchId, ChannelId, WorkerId}; -use fnv::FnvHasher; use serde::Serialize; +use sha2::{Digest as _, Sha256}; use sqlx::{Row as _, SqlitePool}; use std::collections::HashMap; -use std::hash::{Hash as _, Hasher as _}; /// Persists conversation messages (user and assistant) to SQLite. /// @@ -238,9 +237,9 @@ fn worker_receipt_backoff_secs(attempt_count: i64) -> Option { } fn status_fingerprint(status: &str) -> String { - let mut hasher = FnvHasher::default(); - status.hash(&mut hasher); - format!("{:016x}", hasher.finish()) + let mut hasher = Sha256::new(); + hasher.update(status.as_bytes()); + format!("{:x}", hasher.finalize()) } #[derive(Debug, Clone)] @@ -1172,7 +1171,18 @@ impl ProcessRunLogger { let mut claimed = Vec::with_capacity(rows.len()); for row in rows { - let receipt_id: String = row.try_get("id").unwrap_or_default(); + let receipt_id: String = match row.try_get("id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + column = "id", + "skipping malformed terminal receipt row" + ); + continue; + } + }; let updated = sqlx::query( "UPDATE worker_delivery_receipts \ SET status = 'sending', updated_at = CURRENT_TIMESTAMP \ @@ -1188,13 +1198,83 @@ impl ProcessRunLogger { continue; } + let worker_id: String = match row.try_get("worker_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + receipt_id = %receipt_id, + column = "worker_id", + "skipping malformed terminal receipt row" + ); + continue; + } + }; + let receipt_channel_id: String = match row.try_get("channel_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "channel_id", + "skipping malformed terminal receipt row" + ); + continue; + } + }; + let terminal_state: String = match row.try_get("terminal_state") { + Ok(value) => value, + 
Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "terminal_state", + "skipping malformed terminal receipt row" + ); + continue; + } + }; + let payload_text: String = match row.try_get("payload_text") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "payload_text", + "skipping malformed terminal receipt row" + ); + continue; + } + }; + let attempt_count: i64 = match row.try_get("attempt_count") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "attempt_count", + "skipping malformed terminal receipt row" + ); + continue; + } + }; + claimed.push(PendingWorkerDeliveryReceipt { id: receipt_id, - worker_id: row.try_get("worker_id").unwrap_or_default(), - channel_id: row.try_get("channel_id").unwrap_or_default(), - terminal_state: row.try_get("terminal_state").unwrap_or_default(), - payload_text: row.try_get("payload_text").unwrap_or_default(), - attempt_count: row.try_get("attempt_count").unwrap_or_default(), + worker_id, + channel_id: receipt_channel_id, + terminal_state, + payload_text, + attempt_count, }); } @@ -1233,7 +1313,17 @@ impl ProcessRunLogger { let mut claimed = Vec::with_capacity(rows.len()); for row in rows { - let receipt_id: String = row.try_get("id").unwrap_or_default(); + let receipt_id: String = match row.try_get("id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + column = "id", + "skipping malformed terminal receipt row (global claim)" + ); + continue; + } + }; let updated = sqlx::query( "UPDATE worker_delivery_receipts \ SET status = 'sending', updated_at = CURRENT_TIMESTAMP \ @@ -1249,13 +1339,78 @@ impl ProcessRunLogger { continue; } + let worker_id: String = match row.try_get("worker_id") { + Ok(value) => value, 
+ Err(error) => { + tracing::warn!( + %error, + receipt_id = %receipt_id, + column = "worker_id", + "skipping malformed terminal receipt row (global claim)" + ); + continue; + } + }; + let receipt_channel_id: String = match row.try_get("channel_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "channel_id", + "skipping malformed terminal receipt row (global claim)" + ); + continue; + } + }; + let terminal_state: String = match row.try_get("terminal_state") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "terminal_state", + "skipping malformed terminal receipt row (global claim)" + ); + continue; + } + }; + let payload_text: String = match row.try_get("payload_text") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "payload_text", + "skipping malformed terminal receipt row (global claim)" + ); + continue; + } + }; + let attempt_count: i64 = match row.try_get("attempt_count") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "attempt_count", + "skipping malformed terminal receipt row (global claim)" + ); + continue; + } + }; + claimed.push(PendingWorkerDeliveryReceipt { id: receipt_id, - worker_id: row.try_get("worker_id").unwrap_or_default(), - channel_id: row.try_get("channel_id").unwrap_or_default(), - terminal_state: row.try_get("terminal_state").unwrap_or_default(), - payload_text: row.try_get("payload_text").unwrap_or_default(), - attempt_count: row.try_get("attempt_count").unwrap_or_default(), + worker_id, + channel_id: receipt_channel_id, + terminal_state, + payload_text, + attempt_count, }); } @@ -1337,8 +1492,16 @@ impl ProcessRunLogger { .map_err(|db_error| anyhow::anyhow!(db_error))? 
.ok_or_else(|| anyhow::anyhow!("worker delivery receipt not found: {receipt_id}"))?; - let current_status: String = row.try_get("status").unwrap_or_default(); - let current_attempts: i64 = row.try_get("attempt_count").unwrap_or_default(); + let current_status: String = row.try_get("status").map_err(|decode_error| { + anyhow::anyhow!( + "failed to decode worker_delivery_receipts.status for {receipt_id}: {decode_error}" + ) + })?; + let current_attempts: i64 = row.try_get("attempt_count").map_err(|decode_error| { + anyhow::anyhow!( + "failed to decode worker_delivery_receipts.attempt_count for {receipt_id}: {decode_error}" + ) + })?; if current_status == "acked" { tx.commit() diff --git a/src/db.rs b/src/db.rs index f003ec08c..a418e54e0 100644 --- a/src/db.rs +++ b/src/db.rs @@ -104,5 +104,9 @@ mod tests { "duplicate migration version detected: {version} ({file_name})" ); } + assert!( + !seen_versions.is_empty(), + "no migrations found in migrations/" + ); } } diff --git a/src/messaging/discord.rs b/src/messaging/discord.rs index 7acf18e4b..350454a64 100644 --- a/src/messaging/discord.rs +++ b/src/messaging/discord.rs @@ -26,7 +26,8 @@ pub struct DiscordAdapter { bot_user_id: Arc>>, /// Maps InboundMessage.id to the Discord MessageId being edited during streaming. active_messages: Arc>>, - /// Per-channel progress message used for worker checkpoint edits. + /// Per-worker-per-channel progress message used for worker checkpoint edits. + /// Keys are generated by `progress_message_key()` as `{channel_id}:{worker_id}`. progress_messages: Arc>>, /// Typing handles per message. Typing stops when the handle is dropped. 
typing_tasks: Arc>>, @@ -108,14 +109,15 @@ impl DiscordAdapter { } else { content.to_string() }; + let mut progress_messages = self.progress_messages.write().await; - let existing_id = self.progress_messages.read().await.get(&key).copied(); - if let Some(message_id) = existing_id { + if let Some(message_id) = progress_messages.get(&key).copied() { let builder = EditMessage::new().content(display_text.clone()); match channel_id.edit_message(&*http, message_id, builder).await { Ok(_) => return Ok(()), Err(error) => { tracing::warn!(%error, "failed to edit progress message; creating a new one"); + progress_messages.remove(&key); } } } @@ -129,7 +131,7 @@ impl DiscordAdapter { .send_message(&*http, builder) .await .context("failed to send worker progress message")?; - self.progress_messages.write().await.insert(key, sent.id); + progress_messages.insert(key, sent.id); Ok(()) } diff --git a/src/tools/browser.rs b/src/tools/browser.rs index a1f56d04d..ff4cf36b1 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -131,6 +131,11 @@ fn is_v4_mapped_blocked(ip: Ipv6Addr) -> bool { /// Tool for browser automation (worker-only). #[derive(Debug, Clone)] pub struct BrowserTool { + /// Shared browser session state for this worker. + /// + /// Operations intentionally hold this mutex across long awaits (including + /// `with_action_timeout(...)` and `Browser::launch`) so actions are + /// serialized per worker and cannot interleave unpredictably. 
state: Arc>, config: BrowserConfig, screenshot_dir: PathBuf, @@ -783,8 +788,11 @@ impl BrowserTool { return Err(BrowserError::new("text is required for act:type")); }; let element = self.resolve_element_ref(&state, page, element_ref).await?; - Self::with_action_timeout("act focus", element.click()).await?; - Self::with_action_timeout("act type", element.type_str(&text)).await?; + Self::with_action_timeout("act type", async { + element.click().await?; + element.type_str(&text).await + }) + .await?; Ok(BrowserOutput::success(format!( "Typed '{}' into element", truncate_for_display(&text, 50) @@ -922,9 +930,10 @@ impl BrowserTool { // Truncate very large pages for LLM consumption let truncated = if html.len() > 100_000 { + let end = html.floor_char_boundary(100_000); format!( "{}... [truncated, {} bytes total]", - &html[..100_000], + &html[..end], html.len() ) } else { @@ -947,10 +956,12 @@ impl BrowserTool { async fn handle_close(&self) -> Result { let mut state = self.state.lock().await; - if let Some(mut browser) = state.browser.take() - && let Err(error) = browser.close().await - { - tracing::warn!(%error, "browser close returned error"); + if let Some(mut browser) = state.browser.take() { + let close_result = + Self::with_action_timeout("browser close", async { browser.close().await }).await; + if let Err(error) = close_result { + tracing::warn!(%error, "browser close returned error"); + } } state.pages.clear(); @@ -1094,6 +1105,7 @@ fn truncate_for_display(text: &str, max_len: usize) -> String { if text.len() <= max_len { text.to_string() } else { - format!("{}...", &text[..max_len]) + let end = text.floor_char_boundary(max_len); + format!("{}...", &text[..end]) } } diff --git a/src/tools/cancel.rs b/src/tools/cancel.rs index 91dcbc568..0cfa4ce84 100644 --- a/src/tools/cancel.rs +++ b/src/tools/cancel.rs @@ -107,15 +107,18 @@ impl Tool for CancelTool { other => return Err(CancelError(format!("Unknown process type: {other}"))), } + let display_reason = args + 
.reason + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + .unwrap_or("cancelled by request"); + let message = if args.process_type == "worker" { - if let Some(reason) = &args.reason { - format!( - "{} {} cancelled: {reason}", - args.process_type, args.process_id - ) - } else { - format!("{} {} cancelled.", args.process_type, args.process_id) - } + format!( + "{} {} cancelled: {display_reason}", + args.process_type, args.process_id + ) } else { format!("{} {} cancelled.", args.process_type, args.process_id) }; From 84a019d9f946aade311a9a612fbb209f97948f74 Mon Sep 17 00:00:00 2001 From: Victor Sumner Date: Tue, 24 Feb 2026 21:09:23 -0500 Subject: [PATCH 8/8] Harden outbound delivery paths and simplify routing flow --- docs/content/docs/(configuration)/config.mdx | 6 +- docs/content/docs/(features)/workers.mdx | 3 +- src/agent/channel.rs | 28 +- src/conversation/history.rs | 218 +++++-- src/main.rs | 588 +++++++++++++------ src/messaging/discord.rs | 53 +- src/tools/browser.rs | 6 + 7 files changed, 624 insertions(+), 278 deletions(-) diff --git a/docs/content/docs/(configuration)/config.mdx b/docs/content/docs/(configuration)/config.mdx index 8b62ff9c6..283a5e924 100644 --- a/docs/content/docs/(configuration)/config.mdx +++ b/docs/content/docs/(configuration)/config.mdx @@ -86,9 +86,9 @@ emergency_threshold = 0.95 # drop oldest 50%, no LLM # Deterministic worker task contract timing. [defaults.worker_contract] -ack_secs = 5 -progress_secs = 45 -tick_secs = 2 +ack_secs = 5 # seconds before first ack checkpoint +progress_secs = 45 # seconds between progress heartbeat nudges +tick_secs = 2 # scheduler tick interval for contract deadline checks # Cortex (system observer) settings. 
[defaults.cortex] diff --git a/docs/content/docs/(features)/workers.mdx b/docs/content/docs/(features)/workers.mdx index 36d0e7bc4..a253d1b2e 100644 --- a/docs/content/docs/(features)/workers.mdx +++ b/docs/content/docs/(features)/workers.mdx @@ -156,7 +156,8 @@ Worker lifecycle updates are also written to an append-only `worker_events` tabl - `started` with task + worker type - `status` checkpoints -- `tool_started` / `tool_completed` +- `tool_started` +- `tool_completed` - `permission` / `question` - `completed` with terminal summary diff --git a/src/agent/channel.rs b/src/agent/channel.rs index 5454dc93c..72b6b03e3 100644 --- a/src/agent/channel.rs +++ b/src/agent/channel.rs @@ -172,6 +172,7 @@ impl ChannelState { format!("{WORKER_CANCELLED_PREFIX} {reason}."), false, ); + Ok(()) } Err(join_error) => { let failure = format!("Worker failed during cancellation: {join_error}"); @@ -182,6 +183,7 @@ impl ChannelState { "worker join failed after cancellation request" ); self.send_worker_terminal_events(worker_id, "failed", failure, false); + Ok(()) } Ok(()) => { tracing::debug!( @@ -189,9 +191,11 @@ impl ChannelState { channel_id = %self.channel_id, "worker finished before cancellation took effect" ); + Err(format!( + "Worker {worker_id} finished before cancellation took effect" + )) } } - Ok(()) } else if removed { // Worker was in active_workers but had no handle (shouldn't happen, but handle gracefully) tracing::warn!( @@ -1482,12 +1486,19 @@ impl Channel { .tool_server_handle(self.tool_server.clone()) .build(); - let _ = self + if let Err(error) = self .response_tx .send(OutboundEnvelope::from(OutboundResponse::Status( crate::StatusUpdate::Thinking, ))) - .await; + .await + { + tracing::warn!( + %error, + channel_id = %self.id, + "failed to send thinking status update" + ); + } // Inject attachments as a user message before the text prompt if !attachment_content.is_empty() { @@ -1727,12 +1738,19 @@ impl Channel { } // Ensure typing indicator is always cleaned up, 
even on error paths - let _ = self + if let Err(error) = self .response_tx .send(OutboundEnvelope::from(OutboundResponse::Status( crate::StatusUpdate::StopTyping, ))) - .await; + .await + { + tracing::warn!( + %error, + channel_id = %self.id, + "failed to send stop-typing status update" + ); + } } /// Handle a process event (branch results, worker completions, status updates). diff --git a/src/conversation/history.rs b/src/conversation/history.rs index c7fd195bb..c6840a493 100644 --- a/src/conversation/history.rs +++ b/src/conversation/history.rs @@ -421,6 +421,7 @@ impl ProcessRunLogger { agent_id: &crate::AgentId, ) { let pool = self.pool.clone(); + let logger = self.clone(); let id = worker_id.to_string(); let channel_id = channel_id.map(|c| c.to_string()); let task = task.to_string(); @@ -449,23 +450,13 @@ impl ProcessRunLogger { "worker_type": worker_type, }) .to_string(); - let event_id = uuid::Uuid::new_v4().to_string(); - - if let Err(error) = sqlx::query( - "INSERT INTO worker_events \ - (id, worker_id, channel_id, agent_id, event_type, payload_json) \ - VALUES (?, ?, ?, ?, 'started', ?)", - ) - .bind(&event_id) - .bind(&id) - .bind(&channel_id) - .bind(&agent_id) - .bind(&payload_json) - .execute(&pool) - .await - { - tracing::warn!(%error, worker_id = %id, "failed to persist worker start event"); - } + logger.log_worker_event_with_context( + id.clone(), + channel_id.clone(), + Some(agent_id.clone()), + "started".to_string(), + Some(payload_json), + ); }); } @@ -485,6 +476,7 @@ impl ProcessRunLogger { /// Record a worker completing with its result. Fire-and-forget. 
pub fn log_worker_completed(&self, worker_id: WorkerId, result: &str, success: bool) { let pool = self.pool.clone(); + let logger = self.clone(); let id = worker_id.to_string(); let result = result.to_string(); let success_int = if success { 1_i64 } else { 0_i64 }; @@ -514,6 +506,7 @@ impl ProcessRunLogger { .await { tracing::warn!(%error, worker_id = %id, "failed to persist worker completion"); + return; } let payload_json = serde_json::json!({ @@ -521,34 +514,13 @@ impl ProcessRunLogger { "success": success, }) .to_string(); - let event_id = uuid::Uuid::new_v4().to_string(); - - if let Err(error) = sqlx::query( - "INSERT INTO worker_events \ - (id, worker_id, channel_id, agent_id, event_type, payload_json) \ - VALUES ( \ - ?, \ - ?, \ - (SELECT channel_id FROM worker_runs WHERE id = ?), \ - (SELECT agent_id FROM worker_runs WHERE id = ?), \ - 'completed', \ - ? \ - )", - ) - .bind(&event_id) - .bind(&id) - .bind(&id) - .bind(&id) - .bind(&payload_json) - .execute(&pool) - .await - { - tracing::warn!( - %error, - worker_id = %id, - "failed to persist worker completion event" - ); - } + logger.log_worker_event_with_context( + id.clone(), + None, + None, + "completed".to_string(), + Some(payload_json), + ); }); } @@ -1208,6 +1180,12 @@ impl ProcessRunLogger { column = "worker_id", "skipping malformed terminal receipt row" ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + &channel_id, + ) + .await; continue; } }; @@ -1222,6 +1200,12 @@ impl ProcessRunLogger { column = "channel_id", "skipping malformed terminal receipt row" ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + &channel_id, + ) + .await; continue; } }; @@ -1236,6 +1220,12 @@ impl ProcessRunLogger { column = "terminal_state", "skipping malformed terminal receipt row" ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + &channel_id, + ) + .await; continue; } }; @@ -1250,6 +1240,12 @@ impl ProcessRunLogger { column 
= "payload_text", "skipping malformed terminal receipt row" ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + &channel_id, + ) + .await; continue; } }; @@ -1264,6 +1260,12 @@ impl ProcessRunLogger { column = "attempt_count", "skipping malformed terminal receipt row" ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + &channel_id, + ) + .await; continue; } }; @@ -1348,6 +1350,12 @@ impl ProcessRunLogger { column = "worker_id", "skipping malformed terminal receipt row (global claim)" ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + "global", + ) + .await; continue; } }; @@ -1361,6 +1369,12 @@ impl ProcessRunLogger { column = "channel_id", "skipping malformed terminal receipt row (global claim)" ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + "global", + ) + .await; continue; } }; @@ -1374,6 +1388,12 @@ impl ProcessRunLogger { column = "terminal_state", "skipping malformed terminal receipt row (global claim)" ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + "global", + ) + .await; continue; } }; @@ -1387,6 +1407,12 @@ impl ProcessRunLogger { column = "payload_text", "skipping malformed terminal receipt row (global claim)" ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + "global", + ) + .await; continue; } }; @@ -1400,6 +1426,12 @@ impl ProcessRunLogger { column = "attempt_count", "skipping malformed terminal receipt row (global claim)" ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + "global", + ) + .await; continue; } }; @@ -1643,6 +1675,12 @@ impl ProcessRunLogger { /// rows with NULL `completed_at` cannot be resumed and should be marked /// terminal so timelines and analytics stay accurate. 
pub async fn close_orphaned_runs(&self) -> crate::error::Result<(u64, u64, u64, u64)> { + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + let worker_result = sqlx::query( "UPDATE worker_runs \ SET status = 'failed', \ @@ -1650,7 +1688,7 @@ impl ProcessRunLogger { completed_at = CURRENT_TIMESTAMP \ WHERE completed_at IS NULL", ) - .execute(&self.pool) + .execute(&mut *tx) .await .map_err(|error| anyhow::anyhow!(error))?; @@ -1660,7 +1698,7 @@ impl ProcessRunLogger { completed_at = CURRENT_TIMESTAMP \ WHERE completed_at IS NULL", ) - .execute(&self.pool) + .execute(&mut *tx) .await .map_err(|error| anyhow::anyhow!(error))?; @@ -1671,7 +1709,7 @@ impl ProcessRunLogger { updated_at = CURRENT_TIMESTAMP \ WHERE status = 'sending'", ) - .execute(&self.pool) + .execute(&mut *tx) .await .map_err(|error| anyhow::anyhow!(error))?; @@ -1691,10 +1729,12 @@ impl ProcessRunLogger { .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) - .execute(&self.pool) + .execute(&mut *tx) .await .map_err(|error| anyhow::anyhow!(error))?; + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + Ok(( worker_result.rows_affected(), branch_result.rows_affected(), @@ -1703,6 +1743,31 @@ impl ProcessRunLogger { )) } + async fn revert_claimed_terminal_receipt_to_pending( + tx: &mut sqlx::Transaction<'_, sqlx::Sqlite>, + receipt_id: &str, + scope: &str, + ) { + if let Err(error) = sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'pending', \ + next_attempt_at = CURRENT_TIMESTAMP, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? 
AND status = 'sending'", + ) + .bind(receipt_id) + .execute(&mut **tx) + .await + { + tracing::warn!( + %error, + receipt_id = %receipt_id, + scope, + "failed to revert malformed terminal receipt claim" + ); + } + } + /// Load a unified timeline for a channel: messages, branch runs, and worker runs /// interleaved chronologically (oldest first). /// @@ -1956,21 +2021,60 @@ impl ProcessRunLogger { .await .map_err(|error| anyhow::anyhow!(error))?; - let mut events = rows - .into_iter() - .map(|row| WorkerEventRow { - id: row.try_get("id").unwrap_or_default(), - worker_id: row.try_get("worker_id").unwrap_or_default(), + let mut events = Vec::with_capacity(rows.len()); + for row in rows { + let id: String = match row.try_get("id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + worker_id = %worker_id, + column = "id", + "skipping malformed worker event row" + ); + continue; + } + }; + let event_worker_id: String = match row.try_get("worker_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + worker_id = %worker_id, + event_id = %id, + column = "worker_id", + "skipping malformed worker event row" + ); + continue; + } + }; + let event_type: String = match row.try_get("event_type") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + worker_id = %worker_id, + event_id = %id, + column = "event_type", + "skipping malformed worker event row" + ); + continue; + } + }; + + events.push(WorkerEventRow { + id, + worker_id: event_worker_id, channel_id: row.try_get("channel_id").ok(), agent_id: row.try_get("agent_id").ok(), - event_type: row.try_get("event_type").unwrap_or_default(), + event_type, payload_json: row.try_get("payload_json").ok(), created_at: row .try_get::, _>("created_at") .map(|t| t.to_rfc3339()) .unwrap_or_default(), - }) - .collect::>(); + }); + } events.reverse(); Ok(events) diff --git a/src/main.rs b/src/main.rs index dcccf4c0e..19f0adf66 100644 --- a/src/main.rs +++ b/src/main.rs @@ -130,6 
+130,373 @@ const WORKER_RECEIPT_GLOBAL_DISPATCH_INTERVAL_SECS: u64 = 5; const WORKER_RECEIPT_GLOBAL_DISPATCH_BATCH_SIZE: i64 = 32; const WORKER_RECEIPT_PRUNE_INTERVAL_SECS: u64 = 60 * 60; +struct OutboundRouteContext<'a> { + messaging_for_outbound: &'a spacebot::messaging::MessagingManager, + current_message: &'a spacebot::InboundMessage, + outbound_conversation_id: &'a str, + outbound_agent_names: &'a HashMap, + sse_agent_id: &'a str, + api_event_tx: &'a tokio::sync::broadcast::Sender, +} + +struct RoutedOutboundResponse { + delivery_result: spacebot::Result<()>, + delivery_outcome: spacebot::messaging::traits::DeliveryOutcome, + status_surfaced: bool, + is_status_update: bool, + acknowledged_worker_id: Option, +} + +struct ReceiptDeliveryContext<'a> { + outbound_process_logger: &'a spacebot::conversation::history::ProcessRunLogger, + outbound_conversation_logger: &'a spacebot::conversation::history::ConversationLogger, + outbound_channel_id: &'a spacebot::ChannelId, + outbound_conversation_id: &'a str, +} + +fn outbound_response_text(response: &spacebot::OutboundResponse) -> Option { + match response { + spacebot::OutboundResponse::Text(text) + | spacebot::OutboundResponse::StreamChunk(text) + | spacebot::OutboundResponse::Ephemeral { text, .. } + | spacebot::OutboundResponse::ScheduledMessage { text, .. } + | spacebot::OutboundResponse::RichMessage { text, .. } + | spacebot::OutboundResponse::ThreadReply { text, .. } => Some(text.clone()), + _ => None, + } +} + +fn acknowledged_worker_id_from_response( + response: &spacebot::OutboundResponse, +) -> Option { + match response { + spacebot::OutboundResponse::Status(spacebot::StatusUpdate::WorkerStarted { + worker_id, + .. + }) + | spacebot::OutboundResponse::Status(spacebot::StatusUpdate::WorkerCheckpoint { + worker_id, + .. + }) + | spacebot::OutboundResponse::Status(spacebot::StatusUpdate::WorkerCompleted { + worker_id, + .. 
+ }) => Some(*worker_id), + _ => None, + } +} + +fn emit_outbound_sse_event( + api_event_tx: &tokio::sync::broadcast::Sender, + sse_agent_id: &str, + sse_channel_id: &str, + response: &spacebot::OutboundResponse, +) { + match response { + spacebot::OutboundResponse::Text(text) + | spacebot::OutboundResponse::RichMessage { text, .. } + | spacebot::OutboundResponse::ThreadReply { text, .. } => { + api_event_tx + .send(spacebot::api::ApiEvent::OutboundMessage { + agent_id: sse_agent_id.to_string(), + channel_id: sse_channel_id.to_string(), + text: text.clone(), + }) + .ok(); + } + spacebot::OutboundResponse::Status(spacebot::StatusUpdate::Thinking) => { + api_event_tx + .send(spacebot::api::ApiEvent::TypingState { + agent_id: sse_agent_id.to_string(), + channel_id: sse_channel_id.to_string(), + is_typing: true, + }) + .ok(); + } + spacebot::OutboundResponse::Status(spacebot::StatusUpdate::StopTyping) => { + api_event_tx + .send(spacebot::api::ApiEvent::TypingState { + agent_id: sse_agent_id.to_string(), + channel_id: sse_channel_id.to_string(), + is_typing: false, + }) + .ok(); + } + _ => {} + } +} + +async fn route_internal_link_reply( + context: &OutboundRouteContext<'_>, + response: &spacebot::OutboundResponse, +) -> spacebot::Result { + let Some(text) = outbound_response_text(response) else { + return Ok(spacebot::messaging::traits::DeliveryOutcome::NotSurfaced); + }; + + let reply_to_agent = context + .current_message + .metadata + .get("reply_to_agent") + .and_then(|value| value.as_str()) + .map(str::to_owned); + let reply_to_channel = context + .current_message + .metadata + .get("reply_to_channel") + .and_then(|value| value.as_str()) + .map(str::to_owned); + + let (Some(target_agent), Some(target_channel)) = (reply_to_agent, reply_to_channel) else { + return Err(spacebot::Error::Other(anyhow::anyhow!( + "internal link reply missing reply_to_agent/reply_to_channel metadata" + ))); + }; + + let agent_display = context + .outbound_agent_names + 
.get(context.sse_agent_id) + .cloned() + .unwrap_or_else(|| context.sse_agent_id.to_string()); + + let original_text = match &context.current_message.content { + spacebot::MessageContent::Text(text) => Some(text.clone()), + spacebot::MessageContent::Media { text, .. } => text.clone(), + _ => None, + }; + + let mut metadata = HashMap::new(); + metadata.insert( + "from_agent_id".to_string(), + serde_json::json!(context.sse_agent_id), + ); + metadata.insert( + "reply_to_agent".to_string(), + serde_json::json!(context.sse_agent_id), + ); + metadata.insert( + "reply_to_channel".to_string(), + serde_json::json!(context.outbound_conversation_id), + ); + if let Some(original) = original_text { + metadata.insert( + "original_sent_message".to_string(), + serde_json::json!(original), + ); + } + if let Some(originating) = context.current_message.metadata.get("originating_channel") { + metadata.insert("originating_channel".to_string(), originating.clone()); + } + if let Some(source) = context.current_message.metadata.get("originating_source") { + metadata.insert("originating_source".to_string(), source.clone()); + } + + let reply_message = spacebot::InboundMessage { + id: uuid::Uuid::new_v4().to_string(), + source: "internal".to_string(), + conversation_id: target_channel.clone(), + sender_id: context.sse_agent_id.to_string(), + agent_id: Some(Arc::from(target_agent.as_str())), + content: spacebot::MessageContent::Text(text), + timestamp: chrono::Utc::now(), + metadata, + formatted_author: Some(format!("[{agent_display}]")), + }; + + context + .messaging_for_outbound + .inject_message(reply_message) + .await?; + + context + .api_event_tx + .send(spacebot::api::ApiEvent::AgentMessageSent { + from_agent_id: context.sse_agent_id.to_string(), + to_agent_id: target_agent.clone(), + link_id: target_channel.clone(), + channel_id: target_channel.clone(), + }) + .ok(); + + tracing::info!( + from = %context.sse_agent_id, + to = %target_agent, + channel = %target_channel, + "routed link 
channel reply" + ); + + Ok(spacebot::messaging::traits::DeliveryOutcome::Surfaced) +} + +async fn route_outbound_response( + context: &OutboundRouteContext<'_>, + response: spacebot::OutboundResponse, +) -> RoutedOutboundResponse { + if context.current_message.source == "internal" { + let acknowledged_worker_id = acknowledged_worker_id_from_response(&response); + if matches!(response, spacebot::OutboundResponse::Status(_)) { + return RoutedOutboundResponse { + delivery_result: Ok(()), + delivery_outcome: spacebot::messaging::traits::DeliveryOutcome::Surfaced, + status_surfaced: true, + is_status_update: true, + acknowledged_worker_id, + }; + } + + let (delivery_result, delivery_outcome) = + match route_internal_link_reply(context, &response).await { + Ok(outcome) => (Ok(()), outcome), + Err(error) => ( + Err(error), + spacebot::messaging::traits::DeliveryOutcome::NotSurfaced, + ), + }; + let status_surfaced = delivery_outcome.is_surfaced(); + return RoutedOutboundResponse { + delivery_result, + delivery_outcome, + status_surfaced, + is_status_update: false, + acknowledged_worker_id, + }; + } + + let acknowledged_worker_id = acknowledged_worker_id_from_response(&response); + match response { + spacebot::OutboundResponse::Status(status) => { + let (delivery_result, delivery_outcome) = match context + .messaging_for_outbound + .send_status(context.current_message, status) + .await + { + Ok(outcome) => (Ok(()), outcome), + Err(error) => ( + Err(error), + spacebot::messaging::traits::DeliveryOutcome::NotSurfaced, + ), + }; + let status_surfaced = delivery_outcome.is_surfaced(); + RoutedOutboundResponse { + delivery_result, + delivery_outcome, + status_surfaced, + is_status_update: true, + acknowledged_worker_id, + } + } + response => { + tracing::info!( + conversation_id = %context.outbound_conversation_id, + "routing outbound response to messaging adapter" + ); + RoutedOutboundResponse { + delivery_result: context + .messaging_for_outbound + 
.respond(context.current_message, response) + .await, + delivery_outcome: spacebot::messaging::traits::DeliveryOutcome::Surfaced, + status_surfaced: true, + is_status_update: false, + acknowledged_worker_id, + } + } + } +} + +async fn handle_delivery_receipt( + context: &ReceiptDeliveryContext<'_>, + receipt_id: &str, + routed: &RoutedOutboundResponse, + receipt_log_text: Option<&str>, +) { + if routed.is_status_update && !routed.status_surfaced { + match context + .outbound_process_logger + .fail_worker_delivery_receipt_attempt( + receipt_id, + "status update not surfaced by adapter", + ) + .await + { + Ok(outcome) => { + tracing::warn!( + channel_id = %context.outbound_conversation_id, + receipt_id = %receipt_id, + attempt_count = outcome.attempt_count, + status = %outcome.status, + next_attempt_at = ?outcome.next_attempt_at, + "worker terminal receipt was not surfaced; scheduled retry" + ); + } + Err(update_error) => { + tracing::warn!( + %update_error, + channel_id = %context.outbound_conversation_id, + receipt_id = %receipt_id, + "failed to record unsurfaced worker terminal receipt" + ); + } + } + return; + } + + match &routed.delivery_result { + Ok(()) => match context + .outbound_process_logger + .ack_worker_delivery_receipt(receipt_id) + .await + { + Ok(acked_now) => { + if acked_now { + if let Some(text) = receipt_log_text { + context + .outbound_conversation_logger + .log_bot_message(context.outbound_channel_id, text); + } + tracing::info!( + channel_id = %context.outbound_conversation_id, + receipt_id = %receipt_id, + "worker terminal receipt delivered" + ); + } + } + Err(error) => { + tracing::warn!( + %error, + channel_id = %context.outbound_conversation_id, + receipt_id = %receipt_id, + "failed to ack worker terminal receipt" + ); + } + }, + Err(error) => match context + .outbound_process_logger + .fail_worker_delivery_receipt_attempt(receipt_id, &error.to_string()) + .await + { + Ok(outcome) => { + tracing::warn!( + channel_id = 
%context.outbound_conversation_id, + receipt_id = %receipt_id, + attempt_count = outcome.attempt_count, + status = %outcome.status, + next_attempt_at = ?outcome.next_attempt_at, + "worker terminal receipt delivery failed" + ); + } + Err(update_error) => { + tracing::warn!( + %update_error, + channel_id = %context.outbound_conversation_id, + receipt_id = %receipt_id, + "failed to record worker terminal receipt delivery failure" + ); + } + }, + } +} + fn main() -> anyhow::Result<()> { rustls::crypto::ring::default_provider() .install_default() @@ -1098,110 +1465,37 @@ async fn run( spacebot::conversation::history::ConversationLogger::new( agent.db.sqlite.clone(), ); + let outbound_agent_names = agent.deps.agent_names.clone(); let api_event_tx = api_state.event_tx.clone(); let sse_agent_id = agent_id.to_string(); let sse_channel_id = conversation_id.clone(); let outbound_handle = tokio::spawn(async move { while let Some(envelope) = response_rx.recv().await { - let receipt_id = envelope.receipt_id.clone(); + let receipt_id = envelope.receipt_id; let response = envelope.response; - let receipt_log_text = match &response { - spacebot::OutboundResponse::Text(text) => Some(text.clone()), - spacebot::OutboundResponse::RichMessage { text, .. } => { - Some(text.clone()) - } - spacebot::OutboundResponse::ThreadReply { text, .. } => { - Some(text.clone()) - } - _ => None, - }; + let receipt_log_text = outbound_response_text(&response); - // Forward relevant events to SSE clients - match &response { - spacebot::OutboundResponse::Text(text) => { - api_event_tx.send(spacebot::api::ApiEvent::OutboundMessage { - agent_id: sse_agent_id.clone(), - channel_id: sse_channel_id.clone(), - text: text.clone(), - }).ok(); - } - spacebot::OutboundResponse::RichMessage { text, .. 
} => { - api_event_tx.send(spacebot::api::ApiEvent::OutboundMessage { - agent_id: sse_agent_id.clone(), - channel_id: sse_channel_id.clone(), - text: text.clone(), - }).ok(); - } - spacebot::OutboundResponse::ThreadReply { text, .. } => { - api_event_tx.send(spacebot::api::ApiEvent::OutboundMessage { - agent_id: sse_agent_id.clone(), - channel_id: sse_channel_id.clone(), - text: text.clone(), - }).ok(); - } - spacebot::OutboundResponse::Status(spacebot::StatusUpdate::Thinking) => { - api_event_tx.send(spacebot::api::ApiEvent::TypingState { - agent_id: sse_agent_id.clone(), - channel_id: sse_channel_id.clone(), - is_typing: true, - }).ok(); - } - spacebot::OutboundResponse::Status(spacebot::StatusUpdate::StopTyping) => { - api_event_tx.send(spacebot::api::ApiEvent::TypingState { - agent_id: sse_agent_id.clone(), - channel_id: sse_channel_id.clone(), - is_typing: false, - }).ok(); - } - _ => {} - } + emit_outbound_sse_event( + &api_event_tx, + &sse_agent_id, + &sse_channel_id, + &response, + ); let current_message = outbound_message.read().await.clone(); - let acknowledged_worker_id = match &response { - spacebot::OutboundResponse::Status( - spacebot::StatusUpdate::WorkerStarted { worker_id, .. }, - ) - | spacebot::OutboundResponse::Status( - spacebot::StatusUpdate::WorkerCheckpoint { worker_id, .. }, - ) - | spacebot::OutboundResponse::Status( - spacebot::StatusUpdate::WorkerCompleted { worker_id, .. 
}, - ) => Some(*worker_id), - _ => None, - }; - let is_status_update = - matches!(response, spacebot::OutboundResponse::Status(_)); - let (delivery_result, delivery_outcome) = match response { - spacebot::OutboundResponse::Status(status) => { - match messaging_for_outbound - .send_status(¤t_message, status) - .await - { - Ok(outcome) => (Ok(()), outcome), - Err(error) => ( - Err(error), - spacebot::messaging::traits::DeliveryOutcome::NotSurfaced, - ), - } - } - response => { - tracing::info!( - conversation_id = %outbound_conversation_id, - "routing outbound response to messaging adapter" - ); - ( - messaging_for_outbound - .respond(¤t_message, response) - .await, - spacebot::messaging::traits::DeliveryOutcome::Surfaced, - ) - } + let route_context = OutboundRouteContext { + messaging_for_outbound: &messaging_for_outbound, + current_message: ¤t_message, + outbound_conversation_id: &outbound_conversation_id, + outbound_agent_names: &outbound_agent_names, + sse_agent_id: &sse_agent_id, + api_event_tx: &api_event_tx, }; - let status_surfaced = delivery_outcome.is_surfaced(); + let routed = route_outbound_response(&route_context, response).await; if let (Ok(()), Some(worker_id)) = - (&delivery_result, acknowledged_worker_id) - && status_surfaced + (&routed.delivery_result, routed.acknowledged_worker_id) + && routed.status_surfaced && let Err(error) = outbound_process_logger .mark_worker_task_contract_acknowledged(worker_id) .await @@ -1214,106 +1508,32 @@ async fn run( ); } - if let Some(receipt_id) = receipt_id { - if is_status_update && !status_surfaced { - match outbound_process_logger - .fail_worker_delivery_receipt_attempt( - &receipt_id, - "status update not surfaced by adapter", - ) - .await - { - Ok(outcome) => { - tracing::warn!( - channel_id = %outbound_conversation_id, - receipt_id = %receipt_id, - attempt_count = outcome.attempt_count, - status = %outcome.status, - next_attempt_at = ?outcome.next_attempt_at, - "worker terminal receipt was not surfaced; 
scheduled retry" - ); - } - Err(update_error) => { - tracing::warn!( - %update_error, - channel_id = %outbound_conversation_id, - receipt_id = %receipt_id, - "failed to record unsurfaced worker terminal receipt" - ); - } - } - } else { - match &delivery_result { - Ok(()) => { - match outbound_process_logger - .ack_worker_delivery_receipt(&receipt_id) - .await - { - Ok(acked_now) => { - if acked_now { - if let Some(text) = receipt_log_text.as_deref() { - outbound_conversation_logger - .log_bot_message(&outbound_channel_id, text); - } - tracing::info!( - channel_id = %outbound_conversation_id, - receipt_id = %receipt_id, - "worker terminal receipt delivered" - ); - } - } - Err(error) => { - tracing::warn!( - %error, - channel_id = %outbound_conversation_id, - receipt_id = %receipt_id, - "failed to ack worker terminal receipt" - ); - } - } - }, - Err(error) => { - match outbound_process_logger - .fail_worker_delivery_receipt_attempt( - &receipt_id, - &error.to_string(), - ) - .await - { - Ok(outcome) => { - tracing::warn!( - channel_id = %outbound_conversation_id, - receipt_id = %receipt_id, - attempt_count = outcome.attempt_count, - status = %outcome.status, - next_attempt_at = ?outcome.next_attempt_at, - "worker terminal receipt delivery failed" - ); - } - Err(update_error) => { - tracing::warn!( - %update_error, - channel_id = %outbound_conversation_id, - receipt_id = %receipt_id, - "failed to record worker terminal receipt delivery failure" - ); - } - } - } - } - } + if let Some(receipt_id) = receipt_id.as_deref() { + let receipt_context = ReceiptDeliveryContext { + outbound_process_logger: &outbound_process_logger, + outbound_conversation_logger: &outbound_conversation_logger, + outbound_channel_id: &outbound_channel_id, + outbound_conversation_id: &outbound_conversation_id, + }; + handle_delivery_receipt( + &receipt_context, + receipt_id, + &routed, + receipt_log_text.as_deref(), + ) + .await; } - if let Err(error) = delivery_result { - if is_status_update { + 
if let Err(error) = &routed.delivery_result { + if routed.is_status_update { tracing::warn!(%error, "failed to send status update"); } else { tracing::error!(%error, "failed to send outbound response"); } - } else if is_status_update && !status_surfaced { + } else if routed.is_status_update && !routed.status_surfaced { tracing::warn!( channel_id = %outbound_conversation_id, - delivery_outcome = ?delivery_outcome, + delivery_outcome = ?routed.delivery_outcome, "status update was accepted by adapter but not surfaced" ); } diff --git a/src/messaging/discord.rs b/src/messaging/discord.rs index 350454a64..c93ee347c 100644 --- a/src/messaging/discord.rs +++ b/src/messaging/discord.rs @@ -141,6 +141,28 @@ impl DiscordAdapter { .await .remove(&Self::progress_message_key(message, worker_id)); } + + async fn handle_worker_progress( + &self, + message: &InboundMessage, + worker_id: crate::WorkerId, + text: String, + clear_on_success: bool, + ) -> bool { + self.stop_typing(message).await; + if let Err(error) = self + .upsert_progress_message(message, worker_id, &text) + .await + { + tracing::debug!(%error, "failed to update discord progress message"); + false + } else { + if clear_on_success { + self.clear_progress_message(message, worker_id).await; + } + true + } + } } impl Messaging for DiscordAdapter { @@ -429,56 +451,31 @@ impl Messaging for DiscordAdapter { true } StatusUpdate::WorkerStarted { worker_id, task } => { - self.stop_typing(message).await; let text = format!( "Background task `{}` started: {}", short_worker_id(worker_id), task ); - if let Err(error) = self - .upsert_progress_message(message, worker_id, &text) + self.handle_worker_progress(message, worker_id, text, false) .await - { - tracing::debug!(%error, "failed to update discord progress message"); - false - } else { - true - } } StatusUpdate::WorkerCheckpoint { worker_id, status } => { - self.stop_typing(message).await; let text = format!( "Background task `{}`: {}", short_worker_id(worker_id), status ); - 
if let Err(error) = self - .upsert_progress_message(message, worker_id, &text) + self.handle_worker_progress(message, worker_id, text, false) .await - { - tracing::debug!(%error, "failed to update discord progress message"); - false - } else { - true - } } StatusUpdate::WorkerCompleted { worker_id, result } => { - self.stop_typing(message).await; let text = format!( "Background task `{}` completed: {}", short_worker_id(worker_id), result ); - if let Err(error) = self - .upsert_progress_message(message, worker_id, &text) + self.handle_worker_progress(message, worker_id, text, true) .await - { - tracing::debug!(%error, "failed to update discord progress message"); - false - } else { - self.clear_progress_message(message, worker_id).await; - true - } } StatusUpdate::StopTyping | StatusUpdate::ToolStarted { .. } diff --git a/src/tools/browser.rs b/src/tools/browser.rs index ff4cf36b1..09eeb792e 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -955,12 +955,14 @@ impl BrowserTool { async fn handle_close(&self) -> Result { let mut state = self.state.lock().await; + let mut close_error: Option = None; if let Some(mut browser) = state.browser.take() { let close_result = Self::with_action_timeout("browser close", async { browser.close().await }).await; if let Err(error) = close_result { tracing::warn!(%error, "browser close returned error"); + close_error = Some(error); } } @@ -970,6 +972,10 @@ impl BrowserTool { state.next_ref = 0; state._handler_task = None; + if let Some(error) = close_error { + return Err(error); + } + tracing::info!("browser closed"); Ok(BrowserOutput::success("Browser closed")) }