From 6a53306610f9d60ad05903c9cfef6d53bd7af49d Mon Sep 17 00:00:00 2001
From: Wojtek Majewski <wojciech.majewski@pm.me>
Date: Mon, 12 Jan 2026 10:30:48 +0100
Subject: [PATCH] docs: add 0.13.2 release notes with stalled task recovery
 guide

---
 pkgs/website/astro.config.mjs                 |   4 +
 .../src/content/docs/build/retrying-steps.mdx |  60 +++++---
 .../docs/concepts/worker-lifecycle.mdx        |  22 +--
 .../website/src/content/docs/deploy/index.mdx |  15 ++
 .../content/docs/deploy/monitor-execution.mdx |  44 +++++-
 .../deploy/troubleshooting-stalled-tasks.mdx  | 118 +++++++++++++++
 ...stalled-task-recovery-and-config-fixes.mdx |  71 +++++++++
 .../configuration/step-execution.mdx          | 135 +++++++++++-------
 8 files changed, 383 insertions(+), 86 deletions(-)
 create mode 100644 pkgs/website/src/content/docs/deploy/troubleshooting-stalled-tasks.mdx
 create mode 100644 pkgs/website/src/content/docs/news/pgflow-0-13-2-stalled-task-recovery-and-config-fixes.mdx
diff --git a/pkgs/website/astro.config.mjs b/pkgs/website/astro.config.mjs
index 8f85d3bf7..3da4cebf5 100644
--- a/pkgs/website/astro.config.mjs
+++ b/pkgs/website/astro.config.mjs
@@ -336,6 +336,10 @@ export default defineConfig({
                       label: 'Troubleshooting connections',
                       link: '/deploy/troubleshooting-connections/',
                     },
+                    {
+                      label: 'Troubleshooting stalled tasks',
+                      link: '/deploy/troubleshooting-stalled-tasks/',
+                    },
                     { label: 'Prune records', link: '/deploy/prune-records/' },
                     {
                       label: 'Tune deployed flows',
diff --git a/pkgs/website/src/content/docs/build/retrying-steps.mdx b/pkgs/website/src/content/docs/build/retrying-steps.mdx
index 29c60acd5..13ac3f599 100644
--- a/pkgs/website/src/content/docs/build/retrying-steps.mdx
+++ b/pkgs/website/src/content/docs/build/retrying-steps.mdx
@@ -5,12 +5,13 @@ sidebar:
   order: 25
 ---
 
-import { Aside, CardGrid, LinkCard } from "@astrojs/starlight/components";
+import { Aside, CardGrid, LinkCard } from '@astrojs/starlight/components';
 
 Configure retry behavior based on step reliability characteristics. Set conservative flow-level defaults and override per-step as needed.
 
 <Aside type="tip">
-For scheduling delays between steps, see [Delaying Steps](/build/delaying-steps/).
+  For scheduling delays between steps, see [Delaying
+  Steps](/build/delaying-steps/).
 </Aside>
 
 For detailed information about each configuration option, see the [Step Execution Options](/reference/configuration/step-execution/) reference.
@@ -22,12 +23,14 @@ Not all failures should retry. Understanding the difference helps you configure
 ### Transient Failures
 
 Temporary problems that might succeed on retry:
+
 - Network timeouts
 - Rate limiting (429 responses)
 - Temporary service unavailability (503 responses)
 - Database connection issues
 
 Configure with retries:
+
 ```typescript
 .step({
   slug: 'fetchExternalData',
@@ -39,12 +42,14 @@ Configure with retries:
 ### Permanent Failures
 
 Problems that will never succeed on retry:
+
 - Invalid input format (malformed email, negative numbers)
 - Missing required fields
 - Business rule violations
 - Schema validation errors
 
 Configure without retries:
+
 ```typescript
 .step({
   slug: 'validInput',
@@ -56,11 +61,15 @@ Configure without retries:
 ```
 
 <Aside type="note">
-`maxAttempts: 1` means "run once, do not retry". If the step fails, it fails immediately without retry attempts.
+  `maxAttempts: 1` means "run once, do not retry". If the step fails, it fails
+  immediately without retry attempts.
 </Aside>
 
 <Aside type="caution" title="Current Limitation">
-pgflow does not distinguish between transient and permanent failures automatically. All exceptions trigger retry logic based on `maxAttempts`. Use `maxAttempts: 1` for steps that perform validation or other operations that should fail fast.
+  pgflow does not distinguish between transient and permanent failures
+  automatically. All exceptions trigger retry logic based on `maxAttempts`. Use
+  `maxAttempts: 1` for steps that perform validation or other operations that
+  should fail fast.
 </Aside>
 
 For detailed guidance on validation patterns, see [Validation Steps](/build/validation-steps/).
@@ -78,25 +87,35 @@ When different steps have different reliability requirements:
 ```typescript
 new Flow({
   slug: 'dataPipeline',
-  maxAttempts: 3,     // Sensible defaults
+  maxAttempts: 3, // Sensible defaults
   baseDelay: 1,
 })
-  .step({
-    slug: 'validateInput',
-    maxAttempts: 1,   // No retries - validation should not fail
-  }, validateHandler)
-  .step({
-    slug: 'fetchExternal',
-    maxAttempts: 5,   // External API might be flaky
-    baseDelay: 10,    // Longer delays for external service
-  }, fetchHandler)
-  .step({
-    slug: 'saveResults',
-    // Use flow defaults
-  }, saveHandler)
+  .step(
+    {
+      slug: 'validateInput',
+      maxAttempts: 1, // No retries - validation should fail fast
+    },
+    validateHandler
+  )
+  .step(
+    {
+      slug: 'fetchExternal',
+      maxAttempts: 5, // External API might be flaky
+      baseDelay: 10, // Longer delays for external service
+    },
+    fetchHandler
+  )
+  .step(
+    {
+      slug: 'saveResults',
+      // Use flow defaults
+    },
+    saveHandler
+  );
 ```
 
 **Why this approach:**
+
 - Set reasonable flow-level defaults
 - Override only where needed
 - Validation steps need no retries (fail immediately on bad input)
@@ -131,4 +150,9 @@ new Flow({
     href="/deploy/tune-flow-config/"
     description="Adjust configuration for production flows without redeploying"
   />
+  <LinkCard
+    title="Troubleshooting Stalled Tasks"
+    href="/deploy/troubleshooting-stalled-tasks/"
+    description="Recover tasks stuck when workers crash mid-processing"
+  />
 </CardGrid>
diff --git a/pkgs/website/src/content/docs/concepts/worker-lifecycle.mdx b/pkgs/website/src/content/docs/concepts/worker-lifecycle.mdx
index 580aea349..7300b3a1e 100644
--- a/pkgs/website/src/content/docs/concepts/worker-lifecycle.mdx
+++ b/pkgs/website/src/content/docs/concepts/worker-lifecycle.mdx
@@ -5,7 +5,7 @@ sidebar:
   order: 30
 ---
 
-import { Aside, CardGrid, LinkCard } from "@astrojs/starlight/components";
+import { Aside, CardGrid, LinkCard } from '@astrojs/starlight/components';
 
 pgflow workers are designed to be resilient. They poll for tasks, send heartbeats, and automatically restart when they stop.
 
@@ -55,6 +55,7 @@ restart -> startup: "starts new\nworker"
 ## Why Workers Stop
 
 Edge Functions have execution time limits:
+
 - Free tier: 150 seconds
 - Paid tier: 400 seconds
 
@@ -97,13 +98,13 @@ You don't need to manually curl or restart anything after the initial setup.
 
 ## Local vs Production Behavior
 
-| Aspect              | Local                          | Production                              |
-|---------------------|--------------------------------|-----------------------------------------|
-| Cron interval       | 1 second                       | 1 second                                |
-| HTTP request        | Always for enabled functions   | Only if not enough active workers       |
-| Debounce            | Bypassed                       | Applied (prevents too frequent pings)   |
-| Active worker check | Bypassed                       | Required (enabled, non-deprecated)      |
-| Detection           | Automatic (is_local)           | Automatic                               |
+| Aspect              | Local                        | Production                            |
+| ------------------- | ---------------------------- | ------------------------------------- |
+| Cron interval       | 1 second                     | 1 second                              |
+| HTTP request        | Always for enabled functions | Only if not enough active workers     |
+| Debounce            | Bypassed                     | Applied (prevents too frequent pings) |
+| Active worker check | Bypassed                     | Required (enabled, non-deprecated)    |
+| Detection           | Automatic (is_local)         | Automatic                             |
 
 ## Related
 
@@ -123,4 +124,9 @@ You don't need to manually curl or restart anything after the initial setup.
     href="/build/local-development/"
     description="Tips for local development"
   />
+  <LinkCard
+    title="Troubleshooting Stalled Tasks"
+    href="/deploy/troubleshooting-stalled-tasks/"
+    description="Recover tasks left behind when workers terminate unexpectedly"
+  />
 </CardGrid>
diff --git a/pkgs/website/src/content/docs/deploy/index.mdx b/pkgs/website/src/content/docs/deploy/index.mdx
index 079d8a513..3a0eb1a90 100644
--- a/pkgs/website/src/content/docs/deploy/index.mdx
+++ b/pkgs/website/src/content/docs/deploy/index.mdx
@@ -52,6 +52,21 @@ Learn how to deploy pgflow to production, monitor workflow execution, and mainta
   />
 </CardGrid>
 
+## Troubleshoot
+
+<CardGrid>
+  <LinkCard
+    title="Connection issues"
+    href="/deploy/troubleshooting-connections/"
+    description="Diagnose and fix common database connection problems"
+  />
+  <LinkCard
+    title="Stalled tasks"
+    href="/deploy/troubleshooting-stalled-tasks/"
+    description="Diagnose and recover tasks stuck in 'started' status"
+  />
+</CardGrid>
+
 ## Maintain
 
 <CardGrid>
diff --git a/pkgs/website/src/content/docs/deploy/monitor-execution.mdx b/pkgs/website/src/content/docs/deploy/monitor-execution.mdx
index b7787d686..3e147e017 100644
--- a/pkgs/website/src/content/docs/deploy/monitor-execution.mdx
+++ b/pkgs/website/src/content/docs/deploy/monitor-execution.mdx
@@ -5,7 +5,14 @@ sidebar:
   order: 10
 ---
 
-import { Aside, Steps, Tabs, CardGrid, LinkCard, FileTree } from "@astrojs/starlight/components";
+import {
+  Aside,
+  Steps,
+  Tabs,
+  CardGrid,
+  LinkCard,
+  FileTree,
+} from '@astrojs/starlight/components';
 
 This guide explains how to monitor your pgflow flows during and after execution using SQL queries.
 
@@ -31,6 +38,7 @@ run_id        | flow_slug    | status    | input                | output
 ```
 
 Run statuses include:
+
 - `started`: The run has been created and is executing steps
 - `completed`: All steps have completed successfully
 - `failed`: One or more steps have failed after max retries
@@ -61,6 +69,7 @@ final_step   | created   | 2              | 1              | null
 ```
 
 Step statuses include:
+
 - `created`: The step has been created but may be waiting for dependencies
 - `started`: The step has started execution (all dependencies are complete)
 - `completed`: The step has completed successfully
@@ -95,6 +104,7 @@ run_id   | step_slug    | status  | attempts_count | message_id | queued_at
 ```
 
 Active task statuses:
+
 - `queued`: Task is ready to run, waiting for a worker to claim it
 - `started`: Task is currently being processed by a worker (with `started_at` timestamp and `worker_id`)
 
@@ -139,18 +149,21 @@ AND ss.status = 'failed';
 To start flows from TypeScript applications and stream real-time progress updates, see [Start Flows from TypeScript Client](/build/starting-flows/typescript-client/).
 
 This page focuses on SQL-based monitoring for debugging and operations.
+
 </Aside>
 
 <Aside type="tip" title="Flow Visualization">
 Applications typically create dashboards to visualize flows and their execution status.
 
 pgflow stores all the information needed to build rich visualizations of your flow execution, including:
+
 - Step dependencies
 - Execution times
 - Retry attempts
 - Inputs and outputs
 
 This data is available through SQL queries to the pgflow schema tables.
+
 </Aside>
 
 ## View step dependencies
@@ -172,8 +185,29 @@ GROUP BY steps.step_slug;
 ## Next steps
 
 <CardGrid>
-  <LinkCard title="Start Flows from TypeScript Client" href="/build/starting-flows/typescript-client/" description="Start flows from TypeScript apps and stream real-time progress updates"/>
-  <LinkCard title="Organize Flow code" href="/build/organize-flow-code/" description="Learn how to structure your pgflow code for maintainability and reusability"/>
-  <LinkCard title="Tune deployed flows" href="/deploy/tune-flow-config/" description="Adjust retry behavior and timeouts for production flows"/>
-  <LinkCard title="Version your Flows" href="/build/version-flows/" description="Learn how to safely update your flows without breaking existing runs"/>
+  <LinkCard
+    title="Start Flows from TypeScript Client"
+    href="/build/starting-flows/typescript-client/"
+    description="Start flows from TypeScript apps and stream real-time progress updates"
+  />
+  <LinkCard
+    title="Organize Flow code"
+    href="/build/organize-flow-code/"
+    description="Learn how to structure your pgflow code for maintainability and reusability"
+  />
+  <LinkCard
+    title="Tune deployed flows"
+    href="/deploy/tune-flow-config/"
+    description="Adjust retry behavior and timeouts for production flows"
+  />
+  <LinkCard
+    title="Version your Flows"
+    href="/build/version-flows/"
+    description="Learn how to safely update your flows without breaking existing runs"
+  />
+  <LinkCard
+    title="Troubleshoot stalled tasks"
+    href="/deploy/troubleshooting-stalled-tasks/"
+    description="Diagnose and recover tasks stuck in 'started' status"
+  />
 </CardGrid>
diff --git a/pkgs/website/src/content/docs/deploy/troubleshooting-stalled-tasks.mdx b/pkgs/website/src/content/docs/deploy/troubleshooting-stalled-tasks.mdx
new file mode 100644
index 000000000..d38eabca7
--- /dev/null
+++ b/pkgs/website/src/content/docs/deploy/troubleshooting-stalled-tasks.mdx
@@ -0,0 +1,118 @@
+---
+title: Troubleshooting Stalled Tasks
+description: Diagnose and recover tasks stuck in 'started' status
+sidebar:
+  order: 126
+---
+
+import { Aside, CardGrid, LinkCard } from '@astrojs/starlight/components';
+
+## What are stalled tasks?
+
+Tasks get stuck in `started` status when workers die mid-processing. This is common in serverless environments where functions can be terminated unexpectedly. When this happens, the message keeps cycling in the queue but can't be picked up because the task status filter excludes it. This wastes resources and blocks flow progress.
+
+## Automatic Recovery
+
+pgflow automatically recovers stalled tasks via a cron job that runs every 15 seconds:
+
+1. **Detection** - Identifies tasks stuck in `started` status longer than their step timeout plus a 30-second buffer
+2. **Requeue** - Resets task status to `queued` and clears `started_at`
+3. **Tracking** - Increments `requeued_count` and sets `last_requeued_at` for observability
+4. **Limit** - After 3 requeue attempts, archives the message and marks the task with a `permanently_stalled_at` timestamp
+
+<Aside type="note" title="Why limit requeues?">
+  The 3-attempt limit prevents infinite recovery loops. If a task keeps stalling
+  after 3 requeues, it likely has a deeper issue (e.g., input data that crashes
+  the handler, external service permanently down). The task's status remains
+  `started`, but the `permanently_stalled_at` timestamp excludes the task
+  from reprocessing, so you can investigate the root cause rather than
+  endlessly retrying.
+</Aside>
+
+## Find Currently Stalled Tasks
+
+To find tasks that are currently stalled (before automatic recovery kicks in):
+
+```sql
+SELECT
+  r.flow_slug,
+  st.step_slug,
+  st.run_id,
+  st.status,
+  st.started_at,
+  st.requeued_count,
+  now() - st.started_at AS stuck_duration
+FROM pgflow.step_tasks st
+JOIN pgflow.runs r ON r.run_id = st.run_id
+WHERE st.status = 'started'
+  AND st.started_at < now() - interval '5 minutes'
+ORDER BY st.started_at;
+```
+
+## Find Tasks That Exceeded Max Requeues
+
+Tasks that have used up all 3 requeue attempts are marked with `permanently_stalled_at` and need manual investigation:
+
+```sql
+SELECT
+  r.flow_slug,
+  r.run_id,
+  st.step_slug,
+  st.requeued_count,
+  st.permanently_stalled_at,
+  st.last_requeued_at
+FROM pgflow.step_tasks st
+JOIN pgflow.runs r ON r.run_id = st.run_id
+WHERE st.permanently_stalled_at IS NOT NULL
+ORDER BY st.permanently_stalled_at DESC;
+```
+
+These tasks have a persistent issue causing repeated stalls. Check your step handler logs and input data to determine the root cause.
+
+## Customize Cron Interval
+
+The stalled task recovery cron runs every 15 seconds by default. To change the interval:
+
+```sql
+SELECT pgflow.setup_requeue_stalled_tasks_cron('30 seconds');
+```
+
+<Aside type="tip">
+  The function is idempotent - calling it replaces any existing job with the new
+  interval.
+</Aside>
+
+## Related
+
+<CardGrid>
+  <LinkCard
+    title="Monitor Execution"
+    href="/deploy/monitor-execution/"
+    description="Track flow runs and step status"
+  />
+  <LinkCard
+    title="Worker Management"
+    href="/deploy/worker-management/"
+    description="Understand worker lifecycle and registration"
+  />
+  <LinkCard
+    title="Monitor Workers Health"
+    href="/deploy/monitor-workers-health/"
+    description="Check worker logs and heartbeats"
+  />
+  <LinkCard
+    title="Step Execution Options"
+    href="/reference/configuration/step-execution/"
+    description="Configure timeout and retry behavior"
+  />
+  <LinkCard
+    title="Retrying Steps"
+    href="/build/retrying-steps/"
+    description="Handle transient failures with automatic retries"
+  />
+  <LinkCard
+    title="Worker Lifecycle"
+    href="/concepts/worker-lifecycle/"
+    description="Understand why workers start and stop"
+  />
+</CardGrid>
diff --git a/pkgs/website/src/content/docs/news/pgflow-0-13-2-stalled-task-recovery-and-config-fixes.mdx b/pkgs/website/src/content/docs/news/pgflow-0-13-2-stalled-task-recovery-and-config-fixes.mdx
new file mode 100644
index 000000000..0aa048014
--- /dev/null
+++ b/pkgs/website/src/content/docs/news/pgflow-0-13-2-stalled-task-recovery-and-config-fixes.mdx
@@ -0,0 +1,71 @@
+---
+draft: false
+title: 'pgflow 0.13.2: Stalled Task Recovery and Config Fixes'
+description: 'Automatic recovery for stalled tasks and fix for maxPgConnections config'
+date: 2026-01-12
+authors:
+  - jumski
+tags:
+  - bugfix
+  - patch
+featured: false
+---
+
+This patch release fixes two issues: tasks getting stuck when workers crash, and the `maxPgConnections` config being ignored.
+
+## Stalled Tasks Automatic Recovery
+
+### The Problem
+
+When a worker crashes or is terminated unexpectedly, tasks can get stuck in `started` status indefinitely. These "stalled" tasks never complete and block flow progress. This was reported in [#586](https://github.com/pgflow-dev/pgflow/issues/586).
+
+### The Solution
+
+A new `pgflow.requeue_stalled_tasks()` function automatically detects and recovers stalled tasks:
+
+- Runs via cron job every 15 seconds
+- Identifies tasks stuck in `started` status beyond their timeout + 30s buffer
+- Requeues them back to `queued` status (up to 3 times)
+- After 3 requeue attempts, archives the message and marks the task with a `permanently_stalled_at` timestamp for manual investigation
+
+The cron job is set up automatically via migration. For more details, see the [Troubleshooting Stalled Tasks](/deploy/troubleshooting-stalled-tasks/) guide.
+
+### Visibility Timeout Increase
+
+Default visibility timeout increased from 2s to 5s to reduce the likelihood of tasks appearing stalled during normal processing delays.
+
+## Did This Affect You?
+
+Run this query to check if you had stalled tasks before upgrading:
+
+```sql
+SELECT count(*)
+FROM pgflow.step_tasks
+WHERE status = 'started'
+  AND started_at < now() - interval '5 minutes';
+```
+
+If the count is greater than zero, those tasks will now be automatically recovered.
+
+To find tasks that exceeded the max requeue limit (for manual investigation):
+
+```sql
+SELECT r.run_id, r.flow_slug, st.step_slug, st.requeued_count, st.permanently_stalled_at
+FROM pgflow.step_tasks st
+JOIN pgflow.runs r ON r.run_id = st.run_id
+WHERE st.permanently_stalled_at IS NOT NULL;
+```
+
+## Connection Config Fix
+
+### The Bug
+
+The `maxPgConnections` configuration option was being ignored when passed to `createFlowWorker()`.
+
+### The Fix
+
+The config is now properly passed through the connection chain with a default of 4 connections.
+
+## Credits
+
+Thanks to [matz](https://github.com/matz) for reporting issue [#586](https://github.com/pgflow-dev/pgflow/issues/586)!
diff --git a/pkgs/website/src/content/docs/reference/configuration/step-execution.mdx b/pkgs/website/src/content/docs/reference/configuration/step-execution.mdx
index bbda0fa1b..d2f886257 100644
--- a/pkgs/website/src/content/docs/reference/configuration/step-execution.mdx
+++ b/pkgs/website/src/content/docs/reference/configuration/step-execution.mdx
@@ -5,12 +5,13 @@ sidebar:
   order: 20
 ---
 
-import { Aside } from "@astrojs/starlight/components";
+import { Aside } from '@astrojs/starlight/components';
 
 These settings are defined in your TypeScript flow code and compiled into SQL migrations. They control how individual steps are executed, delayed, and retried. Set defaults at the flow level, override for specific steps. Step-level options are `null` by default, inheriting from flow-level settings.
 
 <Aside type="tip">
-After deployment, you can update these settings without recompiling your flow. See [Tune Deployed Flows](/deploy/tune-flow-config/) for details.
+  After deployment, you can update these settings without recompiling your flow.
+  See [Tune Deployed Flows](/deploy/tune-flow-config/) for details.
 </Aside>
 
 ## Default Configuration
@@ -18,14 +19,15 @@ After deployment, you can update these settings without recompiling your flow. S
 ```typescript
 new Flow({
   slug: 'myFlow',
-  maxAttempts: 3,    // max retry attempts before marking as failed
-  baseDelay: 1,      // initial retry delay in seconds
-  timeout: 60        // visibility timeout in seconds
+  maxAttempts: 3, // max retry attempts before marking as failed
+  baseDelay: 1, // initial retry delay in seconds
+  timeout: 60, // visibility timeout in seconds
   // Note: startDelay is step-level only, not available as a default at flow level
-})
+});
 ```
 
 ## `maxAttempts`
+
 **Type:** `number`
 **Default:** `3`
 
@@ -35,11 +37,12 @@ The maximum number of times a task will be attempted before being marked as perm
 // Flow level
 new Flow({ slug: 'myFlow', maxAttempts: 5 })
 
-// Step level (overrides flow default)
-.step({ slug: 'myStep', maxAttempts: 7 }, handler)
+  // Step level (overrides flow default)
+  .step({ slug: 'myStep', maxAttempts: 7 }, handler);
 ```
 
 ## `baseDelay`
+
 **Type:** `number`
 **Default:** `1`
 
@@ -49,11 +52,12 @@ The initial delay (in seconds) before the first retry. pgflow uses exponential b
 // Flow level
 new Flow({ slug: 'myFlow', baseDelay: 2 })
 
-// Step level (overrides flow default)
-.step({ slug: 'myStep', baseDelay: 10 }, handler)
+  // Step level (overrides flow default)
+  .step({ slug: 'myStep', baseDelay: 10 }, handler);
 ```
 
 ## `timeout`
+
 **Type:** `number`
 **Default:** `60`
 
@@ -61,29 +65,34 @@ The visibility timeout (in seconds) - how long a task remains invisible to other
 
 <Aside type="caution" title="Timeout and Task Processing">
 Set `timeout` higher than your task's maximum processing time.
-  <details>
-  <summary>
-  Here's why:
-  </summary>
-  - When a worker picks up a task, it becomes invisible for `timeout` seconds
-  - If processing takes longer than `timeout`, the task becomes visible again
-  - Other workers can then pick up and process the same task
-  - This leads to duplicate processing
-  - For example: with `timeout: 30` and a task that takes 45 seconds, the task could be processed twice
-  </details>
+
+<details>
+<summary>
+Here's why:
+</summary>
+- When a worker picks up a task, it becomes invisible for `timeout` seconds
+- If processing takes longer than `timeout`, the task becomes visible again
+- Other workers can then pick up and process the same task
+- This leads to duplicate processing
+- For example: with `timeout: 30` and a task that takes 45 seconds, the task could be processed twice
+</details>
 
 Currently, pgflow uses timeout only for visibility. In the future, the Edge Worker will also use it to terminate tasks that exceed their timeout.
+
+If a worker crashes during processing, pgflow automatically recovers stalled tasks. See [Troubleshooting Stalled Tasks](/deploy/troubleshooting-stalled-tasks/) for details.
+
 </Aside>
 
 ```ts
 // Flow level
 new Flow({ slug: 'myFlow', timeout: 120 })
 
-// Step level (overrides flow default)
-.step({ slug: 'myStep', timeout: 300 }, handler)
+  // Step level (overrides flow default)
+  .step({ slug: 'myStep', timeout: 300 }, handler);
 ```
 
 ## `startDelay`
+
 **Type:** `number`
 **Default:** `0`
 
@@ -112,6 +121,7 @@ Time 40:  Step C starts (waits 10s after B completes)
 This results in 40+ seconds of delays, not the expected 10s.
 
 **Better alternatives:**
+
 - **Need uniform delays?** Use a constant as shown below
 - **Rate limiting?** Use worker's `maxConcurrent` setting
 - **Debug delays?** Add only to specific steps you're debugging
@@ -123,9 +133,10 @@ To apply the same delay to multiple steps, use a constant:
 ```typescript
 const RATE_LIMIT_DELAY = 2;
 flow
-  .step({ slug: "apiCall1", startDelay: RATE_LIMIT_DELAY }, handler1)
-  .step({ slug: "apiCall2", startDelay: RATE_LIMIT_DELAY }, handler2)
+  .step({ slug: 'apiCall1', startDelay: RATE_LIMIT_DELAY }, handler1)
+  .step({ slug: 'apiCall2', startDelay: RATE_LIMIT_DELAY }, handler2);
 ```
+
 </Aside>
 
 ## Configuration Examples
@@ -137,12 +148,12 @@ When all steps can use the same configuration:
 ```typescript
 new Flow({
   slug: 'myFlow',
-  maxAttempts: 3,    // Default for all steps
-  baseDelay: 1,      // Default for all steps
-  timeout: 60        // Default for all steps
+  maxAttempts: 3, // Default for all steps
+  baseDelay: 1, // Default for all steps
+  timeout: 60, // Default for all steps
 })
-  .step({ slug: 'step1' }, handler1)  // Uses flow defaults
-  .step({ slug: 'step2' }, handler2)  // Uses flow defaults
+  .step({ slug: 'step1' }, handler1) // Uses flow defaults
+  .step({ slug: 'step2' }, handler2); // Uses flow defaults
 ```
 
 ### Mixed Configuration
@@ -152,25 +163,34 @@ Override flow defaults for specific steps that need different behavior:
 ```typescript
 new Flow({
   slug: 'analyzeData',
-  maxAttempts: 3,    // Flow defaults
+  maxAttempts: 3, // Flow defaults
   baseDelay: 1,
-  timeout: 60
+  timeout: 60,
 })
-  .step({
-    slug: 'fetchData',
-    // Uses all flow defaults
-  }, fetchHandler)
-  .step({
-    slug: 'processData',
-    maxAttempts: 5,    // Override: more retries
-    timeout: 300       // Override: needs more time
-    // baseDelay uses flow default (1)
-  }, processHandler)
-  .step({
-    slug: 'callApi',
-    baseDelay: 10,     // Override: longer initial delay
-    // maxAttempts and timeout use flow defaults
-  }, apiHandler)
+  .step(
+    {
+      slug: 'fetchData',
+      // Uses all flow defaults
+    },
+    fetchHandler
+  )
+  .step(
+    {
+      slug: 'processData',
+      maxAttempts: 5, // Override: more retries
+      timeout: 300, // Override: needs more time
+      // baseDelay uses flow default (1)
+    },
+    processHandler
+  )
+  .step(
+    {
+      slug: 'callApi',
+      baseDelay: 10, // Override: longer initial delay
+      // maxAttempts and timeout use flow defaults
+    },
+    apiHandler
+  );
 ```
 
 ## Retry Behavior
@@ -182,7 +202,11 @@ delay = baseDelay * 2^attemptCount
 ```
 
 <Aside type="note">
-Unlike [Background Jobs Mode](/get-started/faq/#what-are-the-two-edge-worker-modes) which supports a `maxDelay` cap, Flow Mode retry delays are not capped yet. Delays will continue to double with each attempt, at most `maxAttempts`-times, after which the step and flow are failed permanently
+  Unlike [Background Jobs
+  Mode](/get-started/faq/#what-are-the-two-edge-worker-modes) which supports a
+  `maxDelay` cap, Flow Mode retry delays are not capped yet. Delays will
+  continue to double with each attempt, up to `maxAttempts` times, after which
+  the step and flow fail permanently.
 </Aside>
 
 ### Retry Delay Examples
@@ -190,18 +214,19 @@ Unlike [Background Jobs Mode](/get-started/faq/#what-are-the-two-edge-worker-mod
 Here's how retry delays grow with different base delays:
 
 | Attempt | Delay (baseDelay: 2s) | Delay (baseDelay: 5s) | Delay (baseDelay: 10s) |
-|---------|----------------------|----------------------|------------------------|
-| 1       | 2s                   | 5s                   | 10s                    |
-| 2       | 4s                   | 10s                  | 20s                    |
-| 3       | 8s                   | 20s                  | 40s                    |
-| 4       | 16s                  | 40s                  | 80s                    |
-| 5       | 32s                  | 80s                  | 160s                   |
-| 6       | 64s                  | 160s                 | 320s                   |
-| 7       | 128s                 | 320s                 | 640s                   |
+| ------- | --------------------- | --------------------- | ---------------------- |
+| 1       | 2s                    | 5s                    | 10s                    |
+| 2       | 4s                    | 10s                   | 20s                    |
+| 3       | 8s                    | 20s                   | 40s                    |
+| 4       | 16s                   | 40s                   | 80s                    |
+| 5       | 32s                   | 80s                   | 160s                   |
+| 6       | 64s                   | 160s                  | 320s                   |
+| 7       | 128s                  | 320s                  | 640s                   |
 
 ### When Tasks Fail Permanently
 
 A task is marked as permanently failed when:
+
 - It has been attempted `maxAttempts` times
 - Each attempt resulted in an error
 - The task status changes from `queued` to `failed`