Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions metrics/alertmanager/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,20 @@ groups:
value: '{{ $value }}'
summary: PD leader lease dropped without failover

# Alert: an instance reports service_member_role == 0 for service "PD" while
# the same (instance, job) still reports etcd_server_is_leader == 1.
# The condition must persist for the `for` duration (1m) before the alert fires.
- alert: PD_leader_service_stuck
expr: |
(service_member_role{job="pd",service="PD"} == 0)
and on(instance,job) (etcd_server_is_leader{job="pd"} == 1)
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
# Single-line copy of the rule expression, attached to the alert as a label.
# Keep it in sync with the `expr` above.
expr: (service_member_role{job="pd",service="PD"} == 0) and on(instance,job) (etcd_server_is_leader{job="pd"} == 1)
annotations:
# `{{ $value }}` renders the value of the matched sample; `{{ $labels.instance }}`
# renders the instance label of the firing series.
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, PD service is not the PD leader while being the embedded etcd leader; values:{{ $value }}'
value: '{{ $value }}'
summary: PD leader service is stuck in non-leader state
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about adding a summary to show the etcd leader is normal?


- alert: PD_cluster_store_space_used_more_than_80%
expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80
for: 1m
Expand Down
109 changes: 109 additions & 0 deletions tests/alertmanager/pd.rules.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,112 @@ tests:
- eval_time: 12m
alertname: PD_leader_lease_drop_without_failover
exp_alerts: []

# Positive case: pd-1 keeps etcd leadership while its PD service role drops
# to 0 and stays there, so PD_leader_service_stuck is expected to fire.
# Input samples are spaced 15s apart.
- interval: 15s
name: pd-leader-service-stuck
input_series:
# PD service role on pd-1 is 1 until the 3m sample, then 0 for the rest of the series.
- series: 'service_member_role{job="pd",service="PD",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'

# Embedded etcd leader flag stays 1 on pd-1 for the whole series.
- series: 'etcd_server_is_leader{job="pd",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

alert_rule_test:
# The condition first holds at 3m. With for:1m the alert is expected to be
# pending (no firing alerts) at 3m45s and firing from 4m onwards.
- eval_time: 3m45s
alertname: PD_leader_service_stuck
exp_alerts: []
- eval_time: 4m
alertname: PD_leader_service_stuck
exp_alerts:
# job/service/instance come from the service_member_role series (left side of `and`);
# env/level/expr are the labels declared on the rule.
- exp_labels:
env: ENV_LABELS_ENV
level: critical
job: pd
service: PD
instance: pd-1
expr: '(service_member_role{job="pd",service="PD"} == 0) and on(instance,job) (etcd_server_is_leader{job="pd"} == 1)'
exp_annotations:
summary: 'PD leader service is stuck in non-leader state'
description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD service is not the PD leader while being the embedded etcd leader; values:0'
value: '0'
# Still firing two minutes after the first firing evaluation, with identical labels and annotations.
- eval_time: 6m
alertname: PD_leader_service_stuck
exp_alerts:
- exp_labels:
env: ENV_LABELS_ENV
level: critical
job: pd
service: PD
instance: pd-1
expr: '(service_member_role{job="pd",service="PD"} == 0) and on(instance,job) (etcd_server_is_leader{job="pd"} == 1)'
exp_annotations:
summary: 'PD leader service is stuck in non-leader state'
description: 'cluster: ENV_LABELS_ENV, instance: pd-1, PD service is not the PD leader while being the embedded etcd leader; values:0'
value: '0'

# Negative case: pd-1 gives up the PD service role and etcd leadership in the
# same sample, so the rule condition never holds and no alert is expected.
# Input samples are spaced 15s apart.
- interval: 15s
name: pd-leader-service-stuck-suppressed-by-failover
input_series:
# Normal failover: both pd-1 series switch from 1 to 0 at the 5m sample.
- series: 'service_member_role{job="pd",service="PD",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
- series: 'etcd_server_is_leader{job="pd",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'

# pd-2 takes over etcd leadership at the 5m sample. No service_member_role
# series is defined for pd-2, so the left side of the rule's `and` has nothing
# to match on that instance (per the original author: the metric is only
# initialized on first win — not verifiable from this file).
- series: 'etcd_server_is_leader{job="pd",instance="pd-2"}'
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

alert_rule_test:
# Both pd-1 series flip together, so from 5m on pd-1 has service=0 with etcd=0
# and the condition is false at every checked time.
- eval_time: 5m
alertname: PD_leader_service_stuck
exp_alerts: []
- eval_time: 5m30s
alertname: PD_leader_service_stuck
exp_alerts: []
- eval_time: 6m
alertname: PD_leader_service_stuck
exp_alerts: []
- eval_time: 12m
alertname: PD_leader_service_stuck
exp_alerts: []

# Negative case: pd-1 drops the PD service role 30s before it loses etcd
# leadership. The rule condition holds only briefly, which is shorter than
# for:1m, so no alert is expected. Input samples are spaced 15s apart.
- interval: 15s
name: pd-leader-service-stuck-staggered-failover
input_series:
# pd-1 PD service role switches to 0 at the 5m sample (30s before etcd leadership transfers).
- series: 'service_member_role{job="pd",service="PD",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
# pd-1 etcd leader flag switches to 0 at the 5m30s sample.
- series: 'etcd_server_is_leader{job="pd",instance="pd-1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'

# pd-2 takes over etcd leadership at the 5m30s sample. No service_member_role
# series is defined for pd-2 in this test (per the original author: the metric
# is absent on pd-2 before it wins PD leadership — not verifiable from this file).
- series: 'etcd_server_is_leader{job="pd",instance="pd-2"}'
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

alert_rule_test:
# pd-1 has service=0 and etcd=1 only for the 5:00 and 5:15 samples (a transient 30s window).
# for:1m needs the condition to hold continuously for a full minute, so the
# alert can at most be pending here and never fires.
# NOTE(review): this assumes rule evaluation runs at the same 15s cadence;
# evaluation_interval is set outside this excerpt — confirm.
- eval_time: 5m
alertname: PD_leader_service_stuck
exp_alerts: []
- eval_time: 5m15s
alertname: PD_leader_service_stuck
exp_alerts: []
# At 5:30 the pd-1 etcd flag is 0, so the condition is false again before for:1m elapsed.
- eval_time: 5m30s
alertname: PD_leader_service_stuck
exp_alerts: []
- eval_time: 6m
alertname: PD_leader_service_stuck
exp_alerts: []
- eval_time: 12m
alertname: PD_leader_service_stuck
exp_alerts: []