From 6ee40bddc1041ae5355bb9e7a296758e6fe15394 Mon Sep 17 00:00:00 2001 From: Yishuai Li Date: Tue, 10 Mar 2026 12:36:53 +0800 Subject: [PATCH 1/5] metrics: add PD_leader_service_stuck alert for etcd-leader without PD service When service_member_role drops to 0 and never recovers while etcd_server_is_leader stays stable, no existing alert fires: - PD_leader_lease_drop_without_failover requires service_member_role==1 at eval time and changes>=2 (persistent drop gives changes==1, value=0) - PD_leader_change detects TSO-save handoff; with no active PD leader, no saves are emitted and the count stays below threshold - All cluster-health alerts rely on pd_cluster_status/pd_regions_status, which are only emitted by the active PD leader Add PD_leader_service_stuck (critical, for:1m) that fires when the etcd leader node's PD service layer is not serving as PD leader. Normal failovers are naturally excluded: when etcd leadership transfers, the departing node's etcd_server_is_leader drops to 0, making the join condition false without any extra suppression logic. 
Add two promtool unit tests: - pd-leader-service-stuck: positive case fires after 1m of stuck state - pd-leader-service-stuck-suppressed-by-failover: normal failover with matching etcd+PD leadership transfer stays silent Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Yishuai Li --- metrics/alertmanager/pd.rules.yml | 14 +++++++++ tests/alertmanager/pd.rules.test.yml | 47 ++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/metrics/alertmanager/pd.rules.yml b/metrics/alertmanager/pd.rules.yml index b746f94db95..680df4966d8 100644 --- a/metrics/alertmanager/pd.rules.yml +++ b/metrics/alertmanager/pd.rules.yml @@ -153,6 +153,20 @@ groups: value: '{{ $value }}' summary: PD leader lease dropped without failover + - alert: PD_leader_service_stuck + expr: | + (service_member_role{job="pd",service="PD"} == 0) + and on(instance,job) (etcd_server_is_leader{job="pd"} == 1) + for: 1m + labels: + env: ENV_LABELS_ENV + level: critical + expr: (service_member_role{job="pd",service="PD"} == 0) and on(instance,job) (etcd_server_is_leader{job="pd"} == 1) + annotations: + description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, PD service is not the PD leader while being the embedded etcd leader; values:{{ $value }}' + value: '{{ $value }}' + summary: PD leader service is stuck in non-leader state + - alert: PD_cluster_store_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 for: 1m diff --git a/tests/alertmanager/pd.rules.test.yml b/tests/alertmanager/pd.rules.test.yml index cfd87ff4ac7..dca92ea81cd 100644 --- a/tests/alertmanager/pd.rules.test.yml +++ b/tests/alertmanager/pd.rules.test.yml @@ -114,3 +114,50 @@ tests: - eval_time: 12m alertname: PD_leader_lease_drop_without_failover exp_alerts: [] + + - interval: 15s + name: pd-leader-service-stuck + input_series: + # PD service layer drops at minute 3 and never recovers. 
+ - series: 'service_member_role{job="pd",service="PD",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + + # Embedded etcd leader stays stable on pd-1 throughout. + - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + + alert_rule_test: + - eval_time: 6m + alertname: PD_leader_service_stuck + exp_alerts: + - exp_labels: + env: ENV_LABELS_ENV + level: critical + job: pd + service: PD + instance: pd-1 + expr: (service_member_role{job="pd",service="PD"} == 0) and on(instance,job) (etcd_server_is_leader{job="pd"} == 1) + exp_annotations: + summary: 'PD leader service is stuck in non-leader state' + description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD service is not the PD leader while being the embedded etcd leader; values:0' + value: '0' + + - interval: 15s + name: pd-leader-service-stuck-suppressed-by-failover + input_series: + # Normal failover: pd-1 loses both PD and etcd leadership at minute 5. + - series: 'service_member_role{job="pd",service="PD",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + + # pd-2 takes over both roles. 
+ - series: 'service_member_role{job="pd",service="PD",instance="pd-2"}' + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'etcd_server_is_leader{job="pd",instance="pd-2"}' + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + + alert_rule_test: + - eval_time: 12m + alertname: PD_leader_service_stuck + exp_alerts: [] From b595d4114efb2375d71f75e4d1c281ece3a73b69 Mon Sep 17 00:00:00 2001 From: Yishuai Li Date: Tue, 10 Mar 2026 12:42:16 +0800 Subject: [PATCH 2/5] Apply suggestion from @Copilot Signed-off-by: Yishuai Li Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/alertmanager/pd.rules.test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/alertmanager/pd.rules.test.yml b/tests/alertmanager/pd.rules.test.yml index dca92ea81cd..39bb158d962 100644 --- a/tests/alertmanager/pd.rules.test.yml +++ b/tests/alertmanager/pd.rules.test.yml @@ -136,7 +136,7 @@ tests: job: pd service: PD instance: pd-1 - expr: (service_member_role{job="pd",service="PD"} == 0) and on(instance,job) (etcd_server_is_leader{job="pd"} == 1) + expr: '(service_member_role{job="pd",service="PD"} == 0) and on(instance,job) (etcd_server_is_leader{job="pd"} == 1)' exp_annotations: summary: 'PD leader service is stuck in non-leader state' description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD service is not the PD leader while being the embedded etcd leader; values:0' From 80a030419a9289991968257f351655636429af35 Mon Sep 17 00:00:00 2001 From: Yishuai Li Date: Tue, 10 Mar 2026 13:16:24 +0800 Subject: [PATCH 3/5] tests: add eval-time checks and staggered-failover case for PD_leader_service_stuck MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 
alert_rule_test entries at 5m, 5m30s, and 6m to the instant-failover suppression test to assert the rule stays silent while and immediately after the handoff is occurring. Add pd-leader-service-stuck-staggered-failover: pd-1 service_member_role flips to 0 thirty seconds before etcd_server_is_leader transfers. The transient window (2 evaluations × 15s = 30s) is shorter than for:1m's required 4 consecutive evaluations, so the alert must not fire. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Yishuai Li --- tests/alertmanager/pd.rules.test.yml | 43 ++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/alertmanager/pd.rules.test.yml b/tests/alertmanager/pd.rules.test.yml index 39bb158d962..6ab2ba0eea4 100644 --- a/tests/alertmanager/pd.rules.test.yml +++ b/tests/alertmanager/pd.rules.test.yml @@ -158,6 +158,49 @@ tests: values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' alert_rule_test: + # Both roles transfer simultaneously; condition is immediately false after minute 5. + - eval_time: 5m + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 5m30s + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 6m + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 12m + alertname: PD_leader_service_stuck + exp_alerts: [] + + - interval: 15s + name: pd-leader-service-stuck-staggered-failover + input_series: + # pd-1 PD service drops at minute 5 (30s before etcd leadership transfers). + - series: 'service_member_role{job="pd",service="PD",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + # pd-1 etcd leadership transfers 30s later at minute 5:30. 
+ - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + + # pd-2 takes over both roles at minute 5:30. + - series: 'service_member_role{job="pd",service="PD",instance="pd-2"}' + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'etcd_server_is_leader{job="pd",instance="pd-2"}' + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + + alert_rule_test: + # During 5:00–5:15 pd-1 has service=0 and etcd=1 (transient 30s window). + # for:1m requires 4 consecutive evals (4×15s); only 2 pass before etcd flips. + - eval_time: 5m + alertname: PD_leader_service_stuck + exp_alerts: [] + # At 5:30 pd-1 etcd drops; transient was too short to satisfy for:1m. 
+ - eval_time: 5m30s + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 6m + alertname: PD_leader_service_stuck + exp_alerts: [] - eval_time: 12m alertname: PD_leader_service_stuck exp_alerts: [] From e03fe8e5b879f3d4dbb26dd99986c2adadfdf9d5 Mon Sep 17 00:00:00 2001 From: Yishuai Li Date: Tue, 10 Mar 2026 13:25:05 +0800 Subject: [PATCH 4/5] Apply suggestion from @coderabbitai[bot] Signed-off-by: Yishuai Li Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- tests/alertmanager/pd.rules.test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/alertmanager/pd.rules.test.yml b/tests/alertmanager/pd.rules.test.yml index 6ab2ba0eea4..d72cd29e849 100644 --- a/tests/alertmanager/pd.rules.test.yml +++ b/tests/alertmanager/pd.rules.test.yml @@ -194,6 +194,9 @@ tests: - eval_time: 5m alertname: PD_leader_service_stuck exp_alerts: [] + - eval_time: 5m15s + alertname: PD_leader_service_stuck + exp_alerts: [] # At 5:30 pd-1 etcd drops; transient was too short to satisfy for:1m. - eval_time: 5m30s alertname: PD_leader_service_stuck From f3183596e6707e69bd7d95fd2734e19ba1852d37 Mon Sep 17 00:00:00 2001 From: Yishuai Li Date: Tue, 10 Mar 2026 13:33:17 +0800 Subject: [PATCH 5/5] tests: tighten PD_leader_service_stuck boundary and fix pd-2 fixture accuracy Add eval_time: 3m45s (PENDING, no alert) and eval_time: 4m (exactly at for:1m boundary, FIRING) to the pd-leader-service-stuck test so the suite enforces the hold time precisely rather than only checking a late eval. Remove the service_member_role{instance="pd-2"} series with leading zeros from pd-leader-service-stuck-suppressed-by-failover and pd-leader-service-stuck-staggered-failover. In practice the metric is only initialized when a node first wins PD leadership (server.go:1797 Set(1)); followers that have never been leader do not export it at all. Keeping the series absent before takeover makes the fixture match the real metric cardinality. 
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Yishuai Li --- tests/alertmanager/pd.rules.test.yml | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/tests/alertmanager/pd.rules.test.yml b/tests/alertmanager/pd.rules.test.yml index d72cd29e849..820e83bb7d0 100644 --- a/tests/alertmanager/pd.rules.test.yml +++ b/tests/alertmanager/pd.rules.test.yml @@ -127,6 +127,24 @@ tests: values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' alert_rule_test: + # Condition first true at 3m; for:1m means PENDING until 4m. + - eval_time: 3m45s + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 4m + alertname: PD_leader_service_stuck + exp_alerts: + - exp_labels: + env: ENV_LABELS_ENV + level: critical + job: pd + service: PD + instance: pd-1 + expr: '(service_member_role{job="pd",service="PD"} == 0) and on(instance,job) (etcd_server_is_leader{job="pd"} == 1)' + exp_annotations: + summary: 'PD leader service is stuck in non-leader state' + description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD service is not the PD leader while being the embedded etcd leader; values:0' + value: '0' - eval_time: 6m alertname: PD_leader_service_stuck exp_alerts: @@ -151,9 +169,8 @@ tests: - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' - # pd-2 takes over both roles. 
- - series: 'service_member_role{job="pd",service="PD",instance="pd-2"}' - values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + # pd-2 takes over etcd leadership; service_member_role is absent on pd-2 + # before it wins PD leadership (the metric is only initialized on first win). - series: 'etcd_server_is_leader{job="pd",instance="pd-2"}' values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' @@ -182,9 +199,8 @@ tests: - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' - # pd-2 takes over both roles at minute 5:30. - - series: 'service_member_role{job="pd",service="PD",instance="pd-2"}' - values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + # pd-2 takes over both roles at minute 5:30; service_member_role is absent + # on pd-2 before it wins PD leadership. - series: 'etcd_server_is_leader{job="pd",instance="pd-2"}' values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'