From 6a53306610f9d60ad05903c9cfef6d53bd7af49d Mon Sep 17 00:00:00 2001 From: Wojtek Majewski Date: Mon, 12 Jan 2026 10:30:48 +0100 Subject: [PATCH] docs: add 0.13.2 release notes with stalled task recovery guide --- pkgs/website/astro.config.mjs | 4 + .../src/content/docs/build/retrying-steps.mdx | 60 +++++--- .../docs/concepts/worker-lifecycle.mdx | 22 +-- .../website/src/content/docs/deploy/index.mdx | 15 ++ .../content/docs/deploy/monitor-execution.mdx | 44 +++++- .../deploy/troubleshooting-stalled-tasks.mdx | 118 +++++++++++++++ ...stalled-task-recovery-and-config-fixes.mdx | 71 +++++++++ .../configuration/step-execution.mdx | 135 +++++++++++------- 8 files changed, 383 insertions(+), 86 deletions(-) create mode 100644 pkgs/website/src/content/docs/deploy/troubleshooting-stalled-tasks.mdx create mode 100644 pkgs/website/src/content/docs/news/pgflow-0-13-2-stalled-task-recovery-and-config-fixes.mdx diff --git a/pkgs/website/astro.config.mjs b/pkgs/website/astro.config.mjs index 8f85d3bf7..3da4cebf5 100644 --- a/pkgs/website/astro.config.mjs +++ b/pkgs/website/astro.config.mjs @@ -336,6 +336,10 @@ export default defineConfig({ label: 'Troubleshooting connections', link: '/deploy/troubleshooting-connections/', }, + { + label: 'Troubleshooting stalled tasks', + link: '/deploy/troubleshooting-stalled-tasks/', + }, { label: 'Prune records', link: '/deploy/prune-records/' }, { label: 'Tune deployed flows', diff --git a/pkgs/website/src/content/docs/build/retrying-steps.mdx b/pkgs/website/src/content/docs/build/retrying-steps.mdx index 29c60acd5..13ac3f599 100644 --- a/pkgs/website/src/content/docs/build/retrying-steps.mdx +++ b/pkgs/website/src/content/docs/build/retrying-steps.mdx @@ -5,12 +5,13 @@ sidebar: order: 25 --- -import { Aside, CardGrid, LinkCard } from "@astrojs/starlight/components"; +import { Aside, CardGrid, LinkCard } from '@astrojs/starlight/components'; Configure retry behavior based on step reliability characteristics. 
Set conservative flow-level defaults and override per-step as needed. For detailed information about each configuration option, see the [Step Execution Options](/reference/configuration/step-execution/) reference. @@ -22,12 +23,14 @@ Not all failures should retry. Understanding the difference helps you configure ### Transient Failures Temporary problems that might succeed on retry: + - Network timeouts - Rate limiting (429 responses) - Temporary service unavailability (503 responses) - Database connection issues Configure with retries: + ```typescript .step({ slug: 'fetchExternalData', @@ -39,12 +42,14 @@ Configure with retries: ### Permanent Failures Problems that will never succeed on retry: + - Invalid input format (malformed email, negative numbers) - Missing required fields - Business rule violations - Schema validation errors Configure without retries: + ```typescript .step({ slug: 'validInput', @@ -56,11 +61,15 @@ Configure without retries: ``` For detailed guidance on validation patterns, see [Validation Steps](/build/validation-steps/). 
@@ -78,25 +87,35 @@ When different steps have different reliability requirements: ```typescript new Flow({ slug: 'dataPipeline', - maxAttempts: 3, // Sensible defaults + maxAttempts: 3, // Sensible defaults baseDelay: 1, }) - .step({ - slug: 'validateInput', - maxAttempts: 1, // No retries - validation should not fail - }, validateHandler) - .step({ - slug: 'fetchExternal', - maxAttempts: 5, // External API might be flaky - baseDelay: 10, // Longer delays for external service - }, fetchHandler) - .step({ - slug: 'saveResults', - // Use flow defaults - }, saveHandler) + .step( + { + slug: 'validateInput', + maxAttempts: 1, // No retries - validation should not fail + }, + validateHandler + ) + .step( + { + slug: 'fetchExternal', + maxAttempts: 5, // External API might be flaky + baseDelay: 10, // Longer delays for external service + }, + fetchHandler + ) + .step( + { + slug: 'saveResults', + // Use flow defaults + }, + saveHandler + ); ``` **Why this approach:** + - Set reasonable flow-level defaults - Override only where needed - Validation steps need no retries (fail immediately on bad input) @@ -131,4 +150,9 @@ new Flow({ href="/deploy/tune-flow-config/" description="Adjust configuration for production flows without redeploying" /> + diff --git a/pkgs/website/src/content/docs/concepts/worker-lifecycle.mdx b/pkgs/website/src/content/docs/concepts/worker-lifecycle.mdx index 580aea349..7300b3a1e 100644 --- a/pkgs/website/src/content/docs/concepts/worker-lifecycle.mdx +++ b/pkgs/website/src/content/docs/concepts/worker-lifecycle.mdx @@ -5,7 +5,7 @@ sidebar: order: 30 --- -import { Aside, CardGrid, LinkCard } from "@astrojs/starlight/components"; +import { Aside, CardGrid, LinkCard } from '@astrojs/starlight/components'; pgflow workers are designed to be resilient. They poll for tasks, send heartbeats, and automatically restart when they stop. 
@@ -55,6 +55,7 @@ restart -> startup: "starts new\nworker" ## Why Workers Stop Edge Functions have execution time limits: + - Free tier: 150 seconds - Paid tier: 400 seconds @@ -97,13 +98,13 @@ You don't need to manually curl or restart anything after the initial setup. ## Local vs Production Behavior -| Aspect | Local | Production | -|---------------------|--------------------------------|-----------------------------------------| -| Cron interval | 1 second | 1 second | -| HTTP request | Always for enabled functions | Only if not enough active workers | -| Debounce | Bypassed | Applied (prevents too frequent pings) | -| Active worker check | Bypassed | Required (enabled, non-deprecated) | -| Detection | Automatic (is_local) | Automatic | +| Aspect | Local | Production | +| ------------------- | ---------------------------- | ------------------------------------- | +| Cron interval | 1 second | 1 second | +| HTTP request | Always for enabled functions | Only if not enough active workers | +| Debounce | Bypassed | Applied (prevents too frequent pings) | +| Active worker check | Bypassed | Required (enabled, non-deprecated) | +| Detection | Automatic (is_local) | Automatic | ## Related @@ -123,4 +124,9 @@ You don't need to manually curl or restart anything after the initial setup. 
href="/build/local-development/" description="Tips for local development" /> + diff --git a/pkgs/website/src/content/docs/deploy/index.mdx b/pkgs/website/src/content/docs/deploy/index.mdx index 079d8a513..3a0eb1a90 100644 --- a/pkgs/website/src/content/docs/deploy/index.mdx +++ b/pkgs/website/src/content/docs/deploy/index.mdx @@ -52,6 +52,21 @@ Learn how to deploy pgflow to production, monitor workflow execution, and mainta /> +## Troubleshoot + + + + + + ## Maintain diff --git a/pkgs/website/src/content/docs/deploy/monitor-execution.mdx b/pkgs/website/src/content/docs/deploy/monitor-execution.mdx index b7787d686..3e147e017 100644 --- a/pkgs/website/src/content/docs/deploy/monitor-execution.mdx +++ b/pkgs/website/src/content/docs/deploy/monitor-execution.mdx @@ -5,7 +5,14 @@ sidebar: order: 10 --- -import { Aside, Steps, Tabs, CardGrid, LinkCard, FileTree } from "@astrojs/starlight/components"; +import { + Aside, + Steps, + Tabs, + CardGrid, + LinkCard, + FileTree, +} from '@astrojs/starlight/components'; This guide explains how to monitor your pgflow flows during and after execution using SQL queries. 
@@ -31,6 +38,7 @@ run_id | flow_slug | status | input | output ``` Run statuses include: + - `started`: The run has been created and is executing steps - `completed`: All steps have completed successfully - `failed`: One or more steps have failed after max retries @@ -61,6 +69,7 @@ final_step | created | 2 | 1 | null ``` Step statuses include: + - `created`: The step has been created but may be waiting for dependencies - `started`: The step has started execution (all dependencies are complete) - `completed`: The step has completed successfully @@ -95,6 +104,7 @@ run_id | step_slug | status | attempts_count | message_id | queued_at ``` Active task statuses: + - `queued`: Task is ready to run, waiting for a worker to claim it - `started`: Task is currently being processed by a worker (with `started_at` timestamp and `worker_id`) @@ -139,18 +149,21 @@ AND ss.status = 'failed'; To start flows from TypeScript applications and stream real-time progress updates, see [Start Flows from TypeScript Client](/build/starting-flows/typescript-client/). This page focuses on SQL-based monitoring for debugging and operations. + ## View step dependencies @@ -172,8 +185,29 @@ GROUP BY steps.step_slug; ## Next steps - - - - + + + + + diff --git a/pkgs/website/src/content/docs/deploy/troubleshooting-stalled-tasks.mdx b/pkgs/website/src/content/docs/deploy/troubleshooting-stalled-tasks.mdx new file mode 100644 index 000000000..d38eabca7 --- /dev/null +++ b/pkgs/website/src/content/docs/deploy/troubleshooting-stalled-tasks.mdx @@ -0,0 +1,118 @@ +--- +title: Troubleshooting Stalled Tasks +description: Diagnose and recover tasks stuck in 'started' status +sidebar: + order: 126 +--- + +import { Aside, CardGrid, LinkCard } from '@astrojs/starlight/components'; + +## What are stalled tasks? + +Tasks get stuck in `started` status when workers die mid-processing. This is common in serverless environments where functions can be terminated unexpectedly. 
When this happens, the message keeps cycling in the queue but can't be picked up because the task status filter excludes it. This wastes resources and blocks flow progress. + +## Automatic Recovery + +pgflow automatically recovers stalled tasks via a cron job that runs every 15 seconds: + +1. **Detection** - Identifies tasks stuck in `started` status longer than their step timeout + 30 second buffer +2. **Requeue** - Resets task status to `queued` and clears `started_at` +3. **Tracking** - Increments `requeued_count` and sets `last_requeued_at` for observability +4. **Limit** - After 3 requeue attempts, archives the message and marks the task with `permanently_stalled_at` timestamp + + + +## Find Currently Stalled Tasks + +To find tasks that are currently stalled (before automatic recovery kicks in): + +```sql +SELECT + r.flow_slug, + st.step_slug, + st.run_id, + st.status, + st.started_at, + st.requeued_count, + now() - st.started_at AS stuck_duration +FROM pgflow.step_tasks st +JOIN pgflow.runs r ON r.run_id = st.run_id +WHERE st.status = 'started' + AND st.started_at < now() - interval '5 minutes' +ORDER BY st.started_at; +``` + +## Find Tasks That Exceeded Max Requeues + +Tasks that have been requeued 3 or more times need manual investigation: + +```sql +SELECT + r.flow_slug, + r.run_id, + st.step_slug, + st.requeued_count, + st.permanently_stalled_at, + st.last_requeued_at +FROM pgflow.step_tasks st +JOIN pgflow.runs r ON r.run_id = st.run_id +WHERE st.permanently_stalled_at IS NOT NULL +ORDER BY st.permanently_stalled_at DESC; +``` + +These tasks have a persistent issue causing repeated stalls. Check your step handler logs and input data to determine the root cause. + +## Customize Cron Interval + +The stalled task recovery cron runs every 15 seconds by default. 
To change the interval: + +```sql +SELECT pgflow.setup_requeue_stalled_tasks_cron('30 seconds'); +``` + + + +## Related + + + + + + + + + diff --git a/pkgs/website/src/content/docs/news/pgflow-0-13-2-stalled-task-recovery-and-config-fixes.mdx b/pkgs/website/src/content/docs/news/pgflow-0-13-2-stalled-task-recovery-and-config-fixes.mdx new file mode 100644 index 000000000..0aa048014 --- /dev/null +++ b/pkgs/website/src/content/docs/news/pgflow-0-13-2-stalled-task-recovery-and-config-fixes.mdx @@ -0,0 +1,71 @@ +--- +draft: false +title: 'pgflow 0.13.2: Stalled Task Recovery and Config Fixes' +description: 'Automatic recovery for stalled tasks and fix for maxPgConnections config' +date: 2026-01-12 +authors: + - jumski +tags: + - bugfix + - patch +featured: false +--- + +This patch release fixes two issues: tasks getting stuck when workers crash, and the `maxPgConnections` config being ignored. + +## Stalled Tasks Automatic Recovery + +### The Problem + +When a worker crashes or is terminated unexpectedly, tasks can get stuck in `started` status indefinitely. These "stalled" tasks never complete and block flow progress. This was reported in [#586](https://github.com/pgflow-dev/pgflow/issues/586). + +### The Solution + +A new `pgflow.requeue_stalled_tasks()` function automatically detects and recovers stalled tasks: + +- Runs via cron job every 15 seconds +- Identifies tasks stuck in `started` status beyond their timeout + 30s buffer +- Requeues them back to `queued` status (up to 3 times) +- After 3 requeue attempts, archives the message and marks the task with a `permanently_stalled_at` timestamp for manual investigation + +The cron job is set up automatically via migration. For more details, see the [Troubleshooting Stalled Tasks](/deploy/troubleshooting-stalled-tasks/) guide. + +### Visibility Timeout Increase + +The default visibility timeout was increased from 2s to 5s to reduce the likelihood of tasks appearing stalled during normal processing delays. 
+ +## Did This Affect You? + +Run this query to check if you had stalled tasks before upgrading: + +```sql +SELECT count(*) +FROM pgflow.step_tasks +WHERE status = 'started' + AND started_at < now() - interval '5 minutes'; +``` + +If the count is greater than zero, those tasks will now be automatically recovered. + +To find tasks that exceeded the max requeue limit (for manual investigation): + +```sql +SELECT r.run_id, r.flow_slug, st.step_slug, st.requeued_count, st.permanently_stalled_at +FROM pgflow.step_tasks st +JOIN pgflow.runs r ON r.run_id = st.run_id +WHERE st.permanently_stalled_at IS NOT NULL; +``` + +## Connection Config Fix + +### The Bug + +The `maxPgConnections` configuration option was being ignored when passed to `createFlowWorker()`. + +### The Fix + +The config is now properly passed through the connection chain with a default of 4 connections. + +## Credits + +Thanks to [matz](https://github.com/matz) for reporting issue [#586](https://github.com/pgflow-dev/pgflow/issues/586)! diff --git a/pkgs/website/src/content/docs/reference/configuration/step-execution.mdx b/pkgs/website/src/content/docs/reference/configuration/step-execution.mdx index bbda0fa1b..d2f886257 100644 --- a/pkgs/website/src/content/docs/reference/configuration/step-execution.mdx +++ b/pkgs/website/src/content/docs/reference/configuration/step-execution.mdx @@ -5,12 +5,13 @@ sidebar: order: 20 --- -import { Aside } from "@astrojs/starlight/components"; +import { Aside } from '@astrojs/starlight/components'; These settings are defined in your TypeScript flow code and compiled into SQL migrations. They control how individual steps are executed, delayed, and retried. Set defaults at the flow level, override for specific steps. Step-level options are `null` by default, inheriting from flow-level settings. ## Default Configuration @@ -18,14 +19,15 @@ After deployment, you can update these settings without recompiling your flow. 
S ```typescript new Flow({ slug: 'myFlow', - maxAttempts: 3, // max retry attempts before marking as failed - baseDelay: 1, // initial retry delay in seconds - timeout: 60 // visibility timeout in seconds + maxAttempts: 3, // max retry attempts before marking as failed + baseDelay: 1, // initial retry delay in seconds + timeout: 60, // visibility timeout in seconds // Note: startDelay is step-level only, not available as a default at flow level -}) +}); ``` ## `maxAttempts` + **Type:** `number` **Default:** `3` @@ -35,11 +37,12 @@ The maximum number of times a task will be attempted before being marked as perm // Flow level new Flow({ slug: 'myFlow', maxAttempts: 5 }) -// Step level (overrides flow default) -.step({ slug: 'myStep', maxAttempts: 7 }, handler) + // Step level (overrides flow default) + .step({ slug: 'myStep', maxAttempts: 7 }, handler); ``` ## `baseDelay` + **Type:** `number` **Default:** `1` @@ -49,11 +52,12 @@ The initial delay (in seconds) before the first retry. pgflow uses exponential b // Flow level new Flow({ slug: 'myFlow', baseDelay: 2 }) -// Step level (overrides flow default) -.step({ slug: 'myStep', baseDelay: 10 }, handler) + // Step level (overrides flow default) + .step({ slug: 'myStep', baseDelay: 10 }, handler); ``` ## `timeout` + **Type:** `number` **Default:** `60` @@ -61,29 +65,34 @@ The visibility timeout (in seconds) - how long a task remains invisible to other ```ts // Flow level new Flow({ slug: 'myFlow', timeout: 120 }) -// Step level (overrides flow default) -.step({ slug: 'myStep', timeout: 300 }, handler) + // Step level (overrides flow default) + .step({ slug: 'myStep', timeout: 300 }, handler); ``` ## `startDelay` + **Type:** `number` **Default:** `0` @@ -112,6 +121,7 @@ Time 40: Step C starts (waits 10s after B completes) This results in 40+ seconds of delays, not the expected 10s. 
**Better alternatives:** + - **Need uniform delays?** Use a constant as shown below - **Rate limiting?** Use worker's `maxConcurrent` setting - **Debug delays?** Add only to specific steps you're debugging @@ -123,9 +133,10 @@ To apply the same delay to multiple steps, use a constant: ```typescript const RATE_LIMIT_DELAY = 2; flow - .step({ slug: "apiCall1", startDelay: RATE_LIMIT_DELAY }, handler1) - .step({ slug: "apiCall2", startDelay: RATE_LIMIT_DELAY }, handler2) + .step({ slug: 'apiCall1', startDelay: RATE_LIMIT_DELAY }, handler1) + .step({ slug: 'apiCall2', startDelay: RATE_LIMIT_DELAY }, handler2); ``` + ## Configuration Examples @@ -137,12 +148,12 @@ When all steps can use the same configuration: ```typescript new Flow({ slug: 'myFlow', - maxAttempts: 3, // Default for all steps - baseDelay: 1, // Default for all steps - timeout: 60 // Default for all steps + maxAttempts: 3, // Default for all steps + baseDelay: 1, // Default for all steps + timeout: 60, // Default for all steps }) - .step({ slug: 'step1' }, handler1) // Uses flow defaults - .step({ slug: 'step2' }, handler2) // Uses flow defaults + .step({ slug: 'step1' }, handler1) // Uses flow defaults + .step({ slug: 'step2' }, handler2); // Uses flow defaults ``` ### Mixed Configuration @@ -152,25 +163,34 @@ Override flow defaults for specific steps that need different behavior: ```typescript new Flow({ slug: 'analyzeData', - maxAttempts: 3, // Flow defaults + maxAttempts: 3, // Flow defaults baseDelay: 1, - timeout: 60 + timeout: 60, }) - .step({ - slug: 'fetchData', - // Uses all flow defaults - }, fetchHandler) - .step({ - slug: 'processData', - maxAttempts: 5, // Override: more retries - timeout: 300 // Override: needs more time - // baseDelay uses flow default (1) - }, processHandler) - .step({ - slug: 'callApi', - baseDelay: 10, // Override: longer initial delay - // maxAttempts and timeout use flow defaults - }, apiHandler) + .step( + { + slug: 'fetchData', + // Uses all flow defaults + }, + 
fetchHandler + ) + .step( + { + slug: 'processData', + maxAttempts: 5, // Override: more retries + timeout: 300, // Override: needs more time + // baseDelay uses flow default (1) + }, + processHandler + ) + .step( + { + slug: 'callApi', + baseDelay: 10, // Override: longer initial delay + // maxAttempts and timeout use flow defaults + }, + apiHandler + ); ``` ## Retry Behavior @@ -182,7 +202,11 @@ delay = baseDelay * 2^attemptCount ``` ### Retry Delay Examples @@ -190,18 +214,19 @@ Unlike [Background Jobs Mode](/get-started/faq/#what-are-the-two-edge-worker-mod Here's how retry delays grow with different base delays: | Attempt | Delay (baseDelay: 2s) | Delay (baseDelay: 5s) | Delay (baseDelay: 10s) | -|---------|----------------------|----------------------|------------------------| -| 1 | 2s | 5s | 10s | -| 2 | 4s | 10s | 20s | -| 3 | 8s | 20s | 40s | -| 4 | 16s | 40s | 80s | -| 5 | 32s | 80s | 160s | -| 6 | 64s | 160s | 320s | -| 7 | 128s | 320s | 640s | +| ------- | --------------------- | --------------------- | ---------------------- | +| 1 | 2s | 5s | 10s | +| 2 | 4s | 10s | 20s | +| 3 | 8s | 20s | 40s | +| 4 | 16s | 40s | 80s | +| 5 | 32s | 80s | 160s | +| 6 | 64s | 160s | 320s | +| 7 | 128s | 320s | 640s | ### When Tasks Fail Permanently A task is marked as permanently failed when: + - It has been attempted `maxAttempts` times - Each attempt resulted in an error - The task status changes from `queued` to `failed`