-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.example.yaml
More file actions
160 lines (143 loc) · 4.92 KB
/
config.example.yaml
File metadata and controls
160 lines (143 loc) · 4.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Example Configuration File for Python-Script-Runner
# Copy to config.yaml and customize for your use case
# ============================================================================
# ALERT DEFINITIONS
# ============================================================================
alerts:
  # Each entry is one alert rule:
  #   name             - identifier of the rule
  #   condition        - expression over the metrics listed under
  #                      "COMMON METRIC CONDITIONS" at the end of this file
  #   channels         - where the alert is sent
  #   severity         - INFO, WARNING or CRITICAL
  #   throttle_seconds - presumably the minimum gap between repeats of the
  #                      same alert; confirm against the runner

  # Alert when peak CPU usage exceeds 80%
  - name: cpu_high
    condition: cpu_max > 80
    channels: [slack, stdout]
    severity: WARNING
    throttle_seconds: 300

  # Alert when peak memory exceeds 2048 MB (absolute threshold, not a rate)
  - name: memory_spike
    condition: memory_max_mb > 2048
    channels: [email, slack]
    severity: CRITICAL
    throttle_seconds: 600

  # Alert on script timeout risk (run time above 240 seconds)
  - name: timeout_risk
    condition: execution_time_seconds > 240
    channels: [slack]
    severity: INFO
    throttle_seconds: 300

  # Alert on script failure (non-zero exit code)
  - name: script_failed
    condition: exit_code != 0
    channels: [email, slack]
    severity: CRITICAL
    throttle_seconds: 300

  # Alert on high major page faults (potential memory pressure)
  - name: memory_pressure
    condition: page_faults_major > 100
    channels: [slack]
    severity: WARNING
    throttle_seconds: 300
# ============================================================================
# PERFORMANCE GATES FOR CI/CD
# ============================================================================
performance_gates:
  # Each entry names a metric and the upper bound it must stay under.
  # NOTE(review): `comparator` is `max` on three gates and `avg` on the
  # cpu_avg gate; its exact meaning is not shown in this file - confirm
  # against the runner before changing it.

  # Peak CPU should not exceed 90%
  - metric_name: cpu_max
    max_value: 90
    comparator: max

  # Peak memory should not exceed 1 GB (1024 MB)
  - metric_name: memory_max_mb
    max_value: 1024
    comparator: max

  # Execution time should not exceed 5 minutes (300 seconds)
  - metric_name: execution_time_seconds
    max_value: 300
    comparator: max

  # Average CPU should stay at or below 60%
  - metric_name: cpu_avg
    max_value: 60
    comparator: avg
# ============================================================================
# NOTIFICATION CHANNEL CONFIGURATIONS
# ============================================================================
notifications:
  # One sub-section per delivery channel. The values below are placeholders;
  # replace them before use and keep real credentials out of version control.

  # Slack webhook configuration
  slack:
    webhook_url: "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
    # You can use environment variables: ${SLACK_WEBHOOK_URL}

  # Email configuration (SMTP)
  email:
    smtp_server: "smtp.gmail.com"
    smtp_port: 587
    from: "alerts@example.com"
    # Recipient list
    to:
      - "team@example.com"
      - "ops@example.com"
    username: "your-email@gmail.com"
    # Use environment variable for password: ${GMAIL_APP_PASSWORD}
    password: "your-app-password-here"
    use_tls: true

  # Custom webhook endpoint
  webhook:
    url: "https://your-monitoring-system.com/api/alerts"
    # HTTP headers for the webhook request
    headers:
      Authorization: "Bearer YOUR_TOKEN_HERE"
      Content-Type: "application/json"
# ============================================================================
# ALERT EXAMPLES FOR DIFFERENT SCENARIOS
# ============================================================================
# Example: Long-running data processing job
# alerts:
#   - name: processing_slow
#     condition: execution_time_seconds > 1800  # 30 minutes
#     channels: [slack]
#     severity: INFO
#     throttle_seconds: 600
#
#   - name: processing_memory_issue
#     # NOTE: memory_delta is not in the "Available metrics" list below;
#     # confirm it is supported before using this rule.
#     condition: memory_delta > 500  # 500MB increase
#     channels: [email, slack]
#     severity: WARNING
#     throttle_seconds: 900
# Example: Machine Learning training job
# alerts:
#   # NOTE: despite its name, this rule checks CPU usage; no GPU metric is
#   # listed under "Available metrics" below.
#   - name: gpu_underutilized
#     condition: cpu_max < 10
#     channels: [slack]
#     severity: INFO
#     throttle_seconds: 300
#
#   - name: training_diverging
#     condition: execution_time_seconds > 3600 and memory_max_mb > 8192
#     channels: [email, slack]
#     severity: CRITICAL
#     throttle_seconds: 300
# Example: Test suite execution
# performance_gates:
#   - metric_name: execution_time_seconds
#     max_value: 600  # Tests should complete in 10 minutes
#   - metric_name: memory_max_mb
#     max_value: 512  # Tests should not use more than 512MB
#   - metric_name: exit_code
#     min_value: 0  # Must succeed
# ============================================================================
# COMMON METRIC CONDITIONS
# ============================================================================
# Available metrics for conditions:
# - execution_time_seconds: Total execution time
# - exit_code: Process exit code (0 = success)
# - success: Boolean success indicator
# - cpu_max, cpu_avg, cpu_min: CPU usage percentages
# - memory_max_mb, memory_avg_mb, memory_min_mb: Memory in MB
# - user_time_seconds: User time
# - system_time_seconds: System time
# - page_faults_major: Major page faults (page had to be read from disk)
# - page_faults_minor: Minor page faults (resolved without disk I/O)
# - voluntary_context_switches: Context switches requested
# - involuntary_context_switches: Context switches forced by kernel
# - stdout_lines: Number of output lines
# - stderr_lines: Number of error lines
#
# Examples of conditions:
# - cpu_max > 80
# - memory_max_mb > 2048
# - execution_time_seconds > 300 and success == false
# - page_faults_major > 50
# - exit_code != 0