diff --git a/metrics/alertmanager/pd.rules.yml b/metrics/alertmanager/pd.rules.yml index b746f94db95..680df4966d8 100644 --- a/metrics/alertmanager/pd.rules.yml +++ b/metrics/alertmanager/pd.rules.yml @@ -153,6 +153,20 @@ groups: value: '{{ $value }}' summary: PD leader lease dropped without failover + - alert: PD_leader_service_stuck + expr: | + (service_member_role{job="pd",service="PD"} == 0) + and on(instance,job) (etcd_server_is_leader{job="pd"} == 1) + for: 1m + labels: + env: ENV_LABELS_ENV + level: critical + expr: (service_member_role{job="pd",service="PD"} == 0) and on(instance,job) (etcd_server_is_leader{job="pd"} == 1) + annotations: + description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, PD service is not the PD leader while being the embedded etcd leader; values:{{ $value }}' + value: '{{ $value }}' + summary: PD leader service is stuck in non-leader state + - alert: PD_cluster_store_space_used_more_than_80% expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80 for: 1m diff --git a/tests/alertmanager/pd.rules.test.yml b/tests/alertmanager/pd.rules.test.yml index cfd87ff4ac7..820e83bb7d0 100644 --- a/tests/alertmanager/pd.rules.test.yml +++ b/tests/alertmanager/pd.rules.test.yml @@ -114,3 +114,112 @@ tests: - eval_time: 12m alertname: PD_leader_lease_drop_without_failover exp_alerts: [] + + - interval: 15s + name: pd-leader-service-stuck + input_series: + # PD service role on pd-1: 12 samples of 1 at 15s spacing, then 0 from 3m on; it never recovers. + - series: 'service_member_role{job="pd",service="PD",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + + # Embedded etcd leader stays stable on pd-1 throughout. 
+ - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + + alert_rule_test: + # Condition first true at 3m; for:1m keeps the alert pending (so exp_alerts is empty at 3m45s) until it fires at 4m. + - eval_time: 3m45s + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 4m + alertname: PD_leader_service_stuck + exp_alerts: + - exp_labels: + env: ENV_LABELS_ENV + level: critical + job: pd + service: PD + instance: pd-1 + expr: '(service_member_role{job="pd",service="PD"} == 0) and on(instance,job) (etcd_server_is_leader{job="pd"} == 1)' + exp_annotations: + summary: 'PD leader service is stuck in non-leader state' + description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD service is not the PD leader while being the embedded etcd leader; values:0' + value: '0' + - eval_time: 6m + alertname: PD_leader_service_stuck + exp_alerts: + - exp_labels: + env: ENV_LABELS_ENV + level: critical + job: pd + service: PD + instance: pd-1 + expr: '(service_member_role{job="pd",service="PD"} == 0) and on(instance,job) (etcd_server_is_leader{job="pd"} == 1)' + exp_annotations: + summary: 'PD leader service is stuck in non-leader state' + description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD service is not the PD leader while being the embedded etcd leader; values:0' + value: '0' + + - interval: 15s + name: pd-leader-service-stuck-suppressed-by-failover + input_series: + # Normal failover: pd-1 loses both PD and etcd leadership at minute 5. 
+ - series: 'service_member_role{job="pd",service="PD",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + + # pd-2 takes over etcd leadership; service_member_role is absent on pd-2 + # before it wins PD leadership (the metric is only initialized on first win). + - series: 'etcd_server_is_leader{job="pd",instance="pd-2"}' + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + + alert_rule_test: + # Both pd-1 series flip to 0 in the same sample at 5m, so the condition is already false at 5m and stays false. + - eval_time: 5m + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 5m30s + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 6m + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 12m + alertname: PD_leader_service_stuck + exp_alerts: [] + + - interval: 15s + name: pd-leader-service-stuck-staggered-failover + input_series: + # pd-1 PD service drops at minute 5 (30s before etcd leadership transfers). + - series: 'service_member_role{job="pd",service="PD",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + # pd-1 etcd leadership transfers 30s later at minute 5:30. 
+ - series: 'etcd_server_is_leader{job="pd",instance="pd-1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + + # pd-2 takes over etcd leadership at minute 5:30; no service_member_role series + # is supplied for pd-2, so only pd-1 can ever satisfy the rule in this test. + - series: 'etcd_server_is_leader{job="pd",instance="pd-2"}' + values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + + alert_rule_test: + # At the 5:00 and 5:15 samples pd-1 has service=0 and etcd=1 (transient 30s window). + # for:1m needs the condition to hold over 5 consecutive evals (4 intervals, assuming a 15s evaluation_interval - TODO confirm); only 2 pass before etcd flips. + - eval_time: 5m + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 5m15s + alertname: PD_leader_service_stuck + exp_alerts: [] + # At 5:30 pd-1 etcd drops; transient was too short to satisfy for:1m. + - eval_time: 5m30s + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 6m + alertname: PD_leader_service_stuck + exp_alerts: [] + - eval_time: 12m + alertname: PD_leader_service_stuck + exp_alerts: []