Neurostep · Neurostep · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026
diff --git a/README.md b/README.md
@@ -12,6 +12,7 @@ It provides declarative cluster management through custom resources, enabling us
 - **Multiple Worker Pools**: Different sizing, node selectors, and tolerations per workload profile
 - **Scale-to-Zero**: Snapshot workers can scale to zero when no initial loads are running
 - **Automatic Lifecycle Management**: OwnerReferences enable automatic garbage collection on CR deletion
+- **Maintenance Mode Integration**: Gracefully pauses mirrors before upgrades and resumes them afterward using PeerDB's maintenance workflows
 
 ## Getting Started
 

diff --git a/api/v1alpha1/peerdbcluster_types.go b/api/v1alpha1/peerdbcluster_types.go
@@ -41,6 +41,8 @@ const (
 	ConditionUpgradeInProgress = "UpgradeInProgress"
 	// ConditionBackupSafe indicates whether it is safe to take a backup (no rolling restarts or upgrades in progress).
 	ConditionBackupSafe = "BackupSafe"
+	// ConditionMaintenanceMode indicates PeerDB maintenance mode is active.
+	ConditionMaintenanceMode = "MaintenanceMode"
 )
 
 // Reason constants for status conditions.
@@ -105,6 +107,16 @@ const (
 	ReasonBackupSafe = "BackupSafe"
 	// ReasonBackupUnsafe indicates the cluster is not safe for backup (upgrade or rollout in progress).
 	ReasonBackupUnsafe = "BackupUnsafe"
+	// ReasonMaintenanceStarting indicates maintenance mode is being activated.
+	ReasonMaintenanceStarting = "MaintenanceStarting"
+	// ReasonMaintenanceActive indicates maintenance mode is active.
+	ReasonMaintenanceActive = "MaintenanceActive"
+	// ReasonMaintenanceEnding indicates maintenance mode is being deactivated.
+	ReasonMaintenanceEnding = "MaintenanceEnding"
+	// ReasonMaintenanceComplete indicates maintenance mode has been deactivated.
+	ReasonMaintenanceComplete = "MaintenanceComplete"
+	// ReasonMaintenanceFailed indicates a maintenance mode job failed.
+	ReasonMaintenanceFailed = "MaintenanceFailed"
 )
 
 // Annotation constants for PeerDBCluster.
@@ -344,6 +356,23 @@ type InitSpec struct {
 	TemporalSearchAttributes *InitJobSpec `json:"temporalSearchAttributes,omitempty"`
 }
 
+// MaintenanceSpec configures PeerDB maintenance mode for upgrades.
+// When enabled, the operator triggers PeerDB's maintenance workflows
+// to gracefully pause mirrors before upgrading and resume them after.
+type MaintenanceSpec struct {
+	// image overrides the default flow-maintenance container image.
+	// +optional
+	Image *string `json:"image,omitempty"`
+	// backoffLimit is the number of retries before marking the maintenance job as failed.
+	// +kubebuilder:validation:Minimum=0
+	// +kubebuilder:default=4
+	// +optional
+	BackoffLimit *int32 `json:"backoffLimit,omitempty"`
+	// resources defines compute resource requirements for the maintenance job container.
+	// +optional
+	Resources *corev1.ResourceRequirements `json:"resources,omitempty"`
+}
+
 // UpgradePolicy controls whether the operator automatically performs version upgrades.
 // +kubebuilder:validation:Enum=Automatic;Manual
 type UpgradePolicy string
@@ -373,14 +402,16 @@ type MaintenanceWindow struct {
 type UpgradePhase string
 
 const (
-	UpgradePhaseComplete UpgradePhase = "Complete"
-	UpgradePhaseWaiting  UpgradePhase = "Waiting"
-	UpgradePhaseBlocked  UpgradePhase = "Blocked"
-	UpgradePhaseConfig   UpgradePhase = "Config"
-	UpgradePhaseInitJobs UpgradePhase = "InitJobs"
-	UpgradePhaseFlowAPI  UpgradePhase = "FlowAPI"
-	UpgradePhaseServer   UpgradePhase = "PeerDBServer"
-	UpgradePhaseUI       UpgradePhase = "UI"
+	UpgradePhaseComplete         UpgradePhase = "Complete"
+	UpgradePhaseWaiting          UpgradePhase = "Waiting"
+	UpgradePhaseBlocked          UpgradePhase = "Blocked"
+	UpgradePhaseStartMaintenance UpgradePhase = "StartMaintenance"
+	UpgradePhaseConfig           UpgradePhase = "Config"
+	UpgradePhaseInitJobs         UpgradePhase = "InitJobs"
+	UpgradePhaseFlowAPI          UpgradePhase = "FlowAPI"
+	UpgradePhaseServer           UpgradePhase = "PeerDBServer"
+	UpgradePhaseUI               UpgradePhase = "UI"
+	UpgradePhaseEndMaintenance   UpgradePhase = "EndMaintenance"
 )
 
 // UpgradeStatus tracks the progress of a version upgrade.
@@ -446,6 +477,11 @@ type PeerDBClusterSpec struct {
 	// Only used when upgradePolicy is Automatic.
 	// +optional
 	MaintenanceWindow *MaintenanceWindow `json:"maintenanceWindow,omitempty"`
+	// maintenance configures PeerDB maintenance mode for graceful upgrades.
+	// When configured, the operator runs maintenance workflows to pause mirrors
+	// before upgrading and resume them after.
+	// +optional
+	Maintenance *MaintenanceSpec `json:"maintenance,omitempty"`
 }
 
 // PeerDBClusterStatus defines the observed state of PeerDBCluster.

diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
diff --git a/config/crd/bases/peerdb.peerdb.io_peerdbclusters.yaml b/config/crd/bases/peerdb.peerdb.io_peerdbclusters.yaml
@@ -812,6 +812,84 @@ spec:
                         type: object
                     type: object
                 type: object
+              maintenance:
+                description: |-
+                  maintenance configures PeerDB maintenance mode for graceful upgrades.
+                  When configured, the operator runs maintenance workflows to pause mirrors
+                  before upgrading and resume them after.
+                properties:
+                  backoffLimit:
+                    default: 4
+                    description: backoffLimit is the number of retries before marking
+                      the maintenance job as failed.
+                    format: int32
+                    minimum: 0
+                    type: integer
+                  image:
+                    description: image overrides the default flow-maintenance container
+                      image.
+                    type: string
+                  resources:
+                    description: resources defines compute resource requirements for
+                      the maintenance job container.
+                    properties:
+                      claims:
+                        description: |-
+                          Claims lists the names of resources, defined in spec.resourceClaims,
+                          that are used by this container.
+
+                          This field depends on the
+                          DynamicResourceAllocation feature gate.
+
+                          This field is immutable. It can only be set for containers.
+                        items:
+                          description: ResourceClaim references one entry in PodSpec.ResourceClaims.
+                          properties:
+                            name:
+                              description: |-
+                                Name must match the name of one entry in pod.spec.resourceClaims of
+                                the Pod where this field is used. It makes that resource available
+                                inside a container.
+                              type: string
+                            request:
+                              description: |-
+                                Request is the name chosen for a request in the referenced claim.
+                                If empty, everything from the claim is made available, otherwise
+                                only the result of this request.
+                              type: string
+                          required:
+                          - name
+                          type: object
+                        type: array
+                        x-kubernetes-list-map-keys:
+                        - name
+                        x-kubernetes-list-type: map
+                      limits:
+                        additionalProperties:
+                          anyOf:
+                          - type: integer
+                          - type: string
+                          pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                          x-kubernetes-int-or-string: true
+                        description: |-
+                          Limits describes the maximum amount of compute resources allowed.
+                          More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+                        type: object
+                      requests:
+                        additionalProperties:
+                          anyOf:
+                          - type: integer
+                          - type: string
+                          pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                          x-kubernetes-int-or-string: true
+                        description: |-
+                          Requests describes the minimum amount of compute resources required.
+                          If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
+                          otherwise to an implementation-defined value. Requests cannot exceed Limits.
+                          More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+                        type: object
+                    type: object
+                type: object
               maintenanceWindow:
                 description: |-
                   maintenanceWindow defines a daily time window during which upgrades may start.

diff --git a/docs/api-reference/v1alpha1.md b/docs/api-reference/v1alpha1.md
@@ -32,6 +32,7 @@ This document describes all Custom Resource Definitions (CRDs) managed by the Pe
 | `paused` | `bool` | No | `false` | When true, the operator stops reconciling this cluster. |
 | `upgradePolicy` | [`UpgradePolicy`](#upgradepolicy) | No | `Automatic` | Controls how version upgrades are applied. Enum: `Automatic`, `Manual`. |
 | `maintenanceWindow` | [`MaintenanceWindow`](#maintenancewindow) | No | — | Time window for automatic upgrades. Only used when `upgradePolicy` is `Automatic`. |
+| `maintenance` | [`MaintenanceSpec`](#maintenancespec) | No | — | Configures PeerDB maintenance mode for graceful upgrades. When set, the operator pauses mirrors before upgrading and resumes them after. |
 
 ### PeerDBClusterStatus
 
@@ -205,6 +206,16 @@ Defines a time window during which automatic upgrades may be applied.
 | `end` | `string` | **Yes** | — | End time in 24-hour `HH:MM` format. |
 | `timeZone` | `*string` | No | `UTC` | IANA timezone name (e.g., `America/New_York`). |
 
+### MaintenanceSpec
+
+Configuration for PeerDB maintenance mode during upgrades. When configured, the operator runs maintenance Jobs (`ghcr.io/peerdb-io/flow-maintenance`) to gracefully pause all mirrors before upgrading and resume them after.
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `image` | `*string` | No | `ghcr.io/peerdb-io/flow-maintenance:stable-{version}` | Container image override for the maintenance Job. |
+| `backoffLimit` | `*int32` | No | `4` | Number of retries before marking the maintenance Job as failed (min: 0). |
+| `resources` | `*ResourceRequirements` | No | — | CPU/memory resource requests and limits for the maintenance Job container. |
+
 ### UpgradePolicy
 
 `string` enum controlling how version upgrades are applied.
@@ -232,7 +243,7 @@ Tracks the state of a rolling version upgrade.
 |-------|------|-------------|
 | `fromVersion` | `string` | The version being upgraded from. |
 | `toVersion` | `string` | The version being upgraded to. |
-| `phase` | `UpgradePhase` | Current upgrade phase. Values: `Complete`, `Waiting`, `Blocked`, `Config`, `InitJobs`, `FlowAPI`, `PeerDBServer`, `UI`. |
+| `phase` | `UpgradePhase` | Current upgrade phase. Values: `Complete`, `Waiting`, `Blocked`, `StartMaintenance`, `Config`, `InitJobs`, `FlowAPI`, `PeerDBServer`, `UI`, `EndMaintenance`. |
 | `startedAt` | `*metav1.Time` | Timestamp when the upgrade started. |
 | `message` | `string` | Human-readable message about the upgrade state. |
 
@@ -361,6 +372,7 @@ The following condition types are used in `PeerDBCluster` status:
 | `Degraded` | Set to `True` when one or more components are unhealthy but the cluster is partially operational. |
 | `UpgradeInProgress` | Set to `True` when a version upgrade is in progress. |
 | `BackupSafe` | Whether it is safe to take a backup. `True` when no upgrade or rolling restart is in progress. `False` with reason `BackupInProgress` when the `peerdb.io/backup-in-progress` annotation is set, or `BackupUnsafe` when an upgrade/rollout is active. |
+| `MaintenanceMode` | Set to `True` when PeerDB maintenance mode is active (mirrors are paused for an upgrade). Set to `False` with reason `MaintenanceComplete` after mirrors are resumed. |
 
 ### Annotations
 
@@ -383,9 +395,11 @@ The `UpgradeStatus.phase` field tracks progress through a rolling upgrade:
 |-------|-------------|
 | `Waiting` | Upgrade is pending (e.g., waiting for a maintenance window). |
 | `Blocked` | Upgrade is blocked (e.g., manual policy requires acknowledgement). |
+| `StartMaintenance` | Running the StartMaintenance Job to pause mirrors before upgrade. |
 | `Config` | Updating shared ConfigMap and configuration. |
 | `InitJobs` | Re-running init jobs if needed. |
 | `FlowAPI` | Rolling out the Flow API Deployment. |
 | `PeerDBServer` | Rolling out the PeerDB Server Deployment. |
 | `UI` | Rolling out the PeerDB UI Deployment. |
+| `EndMaintenance` | Running the EndMaintenance Job to resume mirrors after upgrade. |
 | `Complete` | Upgrade finished successfully. |
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -37,6 +37,7 @@ flowchart TB
         UISvc["PeerDB UI\nService :3000"]
         NSJob["Temporal NS\nRegister Job"]
         SAJob["Search Attr\nJob"]
+        MaintJob["Maintenance\nJobs"]
     end
 
     subgraph ManagedByWorker["Owned by PeerDBWorkerPool"]
@@ -58,6 +59,7 @@ flowchart TB
     CC --> UISvc
     CC --> NSJob
     CC --> SAJob
+    CC --> MaintJob
 
     WC -->|"reads cluster config"| PeerDBCluster
     WC --> WorkerDep
@@ -121,9 +123,11 @@ A single CRD would force all scaling decisions through one reconciler and one sp
 
 1. **Dependency validation** — Check catalog password Secret exists before proceeding
 2. **Shared infrastructure** — ServiceAccount → ConfigMap (connection config)
-3. **Init jobs** — Idempotent Temporal setup jobs; cluster waits for completion
-4. **Components** — Flow API → PeerDB Server → UI (Deployments + Services)
-5. **Status rollup** — Individual conditions aggregate into overall `Ready` condition
+3. **Start maintenance** — If `spec.maintenance` is set, run the StartMaintenance Job to pause mirrors (upgrade only)
+4. **Init jobs** — Idempotent Temporal setup jobs; cluster waits for completion
+5. **Components** — Flow API → PeerDB Server → UI (Deployments + Services)
+6. **End maintenance** — If `spec.maintenance` is set, run the EndMaintenance Job to resume mirrors (upgrade only)
+7. **Status rollup** — Individual conditions aggregate into overall `Ready` condition
 
 All managed resources have **OwnerReferences** set to the parent CR, enabling automatic garbage collection on deletion without custom finalizers.
 
@@ -154,7 +158,8 @@ internal/
     ├── ui.go                    # PeerDB UI Deployment + Service
     ├── flow_worker.go           # Flow Worker Deployment
     ├── snapshot_worker.go       # Snapshot Worker StatefulSet + headless Service
-    └── init_jobs.go             # Temporal init Jobs
+    ├── init_jobs.go             # Temporal init Jobs
+    └── maintenance_jobs.go      # Maintenance mode Jobs
 
 config/
 ├── crd/bases/                   # Generated CRD manifests

diff --git a/docs/runbooks/safe-upgrade.md b/docs/runbooks/safe-upgrade.md
@@ -70,10 +70,11 @@ For more control, use the manual upgrade policy:
 The controller enforces a specific rollout order to minimize disruption:
 
 ```
-ConfigMap/Secrets → Init Jobs → Flow API → PeerDB Server → UI
+[StartMaintenance →] ConfigMap/Secrets → Init Jobs → Flow API → PeerDB Server → UI [→ EndMaintenance]
 ```
 
 Each step must complete successfully before the next begins. This ensures:
+- Mirrors are gracefully paused before any component restarts (when `spec.maintenance` is configured).
 - Configuration is propagated before any component restarts.
 - The Flow API (gRPC backend) is ready before the Server and UI that depend on it.
 - The UI is upgraded last since it's the least critical component.
@@ -102,6 +103,48 @@ spec:
 - Remove or omit `maintenanceWindow` to allow upgrades at any time.
 - If `timeZone` is not specified, it defaults to UTC.
 
+## Maintenance Mode
+
+PeerDB has a built-in maintenance mode that gracefully pauses all running mirrors before an upgrade and resumes them after. The operator integrates this via Kubernetes Jobs:
+
+```yaml
+apiVersion: peerdb.peerdb.io/v1alpha1
+kind: PeerDBCluster
+metadata:
+  name: peerdb
+spec:
+  version: "v0.37.0"
+  maintenance: {}
+  # ... rest of spec
+```
+
+When `spec.maintenance` is set, the upgrade flow becomes:
+
+1. **StartMaintenance** — A Job runs using the `flow-maintenance` image with the `start` command. This triggers PeerDB's `StartMaintenance` Temporal workflow, which waits for running snapshots, enables maintenance mode (`PEERDB_MAINTENANCE_MODE_ENABLED`), and pauses all running mirrors.
+2. **Normal upgrade** — Config, init jobs, Flow API, Server, and UI are rolled out in order.
+3. **EndMaintenance** — A Job runs with the `end` command, resuming all previously paused mirrors and disabling maintenance mode.
+
+While maintenance mode is active, mirrors cannot be created or mutated through PeerDB.
+
+### Customizing the Maintenance Job
+
+```yaml
+spec:
+  maintenance:
+    image: "custom-registry/flow-maintenance:v1.0.0"  # Override image
+    backoffLimit: 6                                     # Retry count
+    resources:
+      requests:
+        cpu: "100m"
+        memory: "128Mi"
+```
+
+If a maintenance Job fails, the operator deletes it and retries automatically. A `Degraded` condition is set; to check the current maintenance state, inspect the `MaintenanceMode` condition:
+
+```bash
+kubectl get peerdbcluster <name> -o jsonpath='{.status.conditions}' | jq '.[] | select(.type=="MaintenanceMode")'
+```
+
 ## Monitoring Upgrade Progress
 
 ### Quick Status
@@ -140,8 +183,10 @@ Example output:
 | `FlowAPI` | Rolling out Flow API Deployment |
 | `PeerDBServer` | Rolling out PeerDB Server Deployment |
 | `UI` | Rolling out UI Deployment |
+| `EndMaintenance` | Running EndMaintenance Job (resuming mirrors) |
 | `Complete` | Upgrade finished successfully |
 | `Blocked` | Upgrade blocked — dependencies are unhealthy |
+| `StartMaintenance` | Running StartMaintenance Job (pausing mirrors) |
 
 ### Watch Upgrade Events