Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions helm/templates/grafana-dashboard-traces.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ metadata:
name: {{ .Values.app }}-traces-dashboard
namespace: {{ .Release.Namespace }}
spec:
folder: {{ .Values.app }}-folder
folder: {{ .Release.Namespace | title }} Dashboards
instanceSelector:
matchLabels:
dashboards: "grafana"
Expand Down Expand Up @@ -42,22 +42,22 @@ spec:
{
"current": {
"selected": true,
"text": "{{ .Release.Namespace }}",
"value": "{{ .Release.Namespace }}"
"text": "{{ .Values.app }}",
"value": "{{ .Values.app }}"
},
"hide": 2,
"hide": 0,
"includeAll": false,
"label": "Namespace Pattern",
"label": "Service Name",
"multi": false,
"name": "namespace_pattern",
"name": "service_name",
"options": [
{
"selected": true,
"text": "{{ .Release.Namespace }}",
"value": "{{ .Release.Namespace }}"
"text": "{{ .Values.app }}",
"value": "{{ .Values.app }}"
}
],
"query": "{{ .Release.Namespace }}",
"query": "{{ .Values.app }}",
"refresh": 0,
"type": "custom"
}
Expand All @@ -81,7 +81,7 @@ spec:
"targets": [
{
"limit": 200,
"query": "{resource.k8s.namespace.name=~\"$namespace_pattern\"}",
"query": "{resource.service.name=~\"$service_name\"}",
"queryType": "traceql",
"refId": "A",
"tableType": "traces",
Expand Down
2 changes: 1 addition & 1 deletion helm/templates/grafana-datasource-rhoai.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ spec:
tlsSkipVerify: true
name: RHOAI Prometheus
type: prometheus
url: 'http://data-science-monitoringstack-prometheus.redhat-ods-monitoring.svc.cluster.local:9090'
url: {{ .Values.monitoring.prometheusUrl | quote }}
instanceSelector:
matchLabels:
dashboards: "grafana"
2 changes: 1 addition & 1 deletion helm/templates/grafana-datasource-tempo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ spec:
secureJsonData:
httpHeaderValue1: "redhat-ods-monitoring"
type: tempo
url: 'http://tempo-data-science-tempomonolithic.redhat-ods-monitoring.svc.cluster.local:3200'
url: {{ .Values.monitoring.tempoUrl | quote }}
instanceSelector:
matchLabels:
dashboards: "grafana"
3 changes: 1 addition & 2 deletions helm/templates/grafana-proxy-rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,9 @@ rules:
- subjectaccessreviews
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
kind: ClusterRoleBinding
metadata:
name: {{ .Values.app }}-grafana-proxy
namespace: {{ .Release.Namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
Expand Down
10 changes: 0 additions & 10 deletions helm/templates/grafana.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,4 @@
---
kind: Secret
apiVersion: v1
metadata:
name: {{ .Values.app }}-grafana-sa-token
namespace: {{ .Release.Namespace }}
annotations:
kubernetes.io/service-account.name: {{ .Values.app }}-grafana-sa
argocd.argoproj.io/sync-wave: "5"
type: kubernetes.io/service-account-token
---
apiVersion: v1
kind: ConfigMap
metadata:
Expand Down
91 changes: 91 additions & 0 deletions helm/templates/monitoring-dsci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{{- /*
monitoring-dsci.yaml

Pre-install / pre-upgrade hook that applies the desired spec.monitoring
settings to a DSCInitialization object. Rendered only when
.Values.monitoring.enable is truthy.

Documents, in hook-weight order:
  weight 5  - ServiceAccount, ClusterRole and ClusterRoleBinding used by the Job
  weight 10 - Job that pipes a DSCInitialization manifest into `kubectl apply`

The ClusterRole grants get/patch/update only, so the Job is not given
permission to create a DSCInitialization.

Every templated string scalar is quoted so that values which look like
numbers, booleans or null are not retyped by the YAML parser.
*/ -}}
{{- if .Values.monitoring.enable }}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: "{{ .Values.app }}-monitoring-patcher"
  namespace: {{ .Release.Namespace | quote }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "5"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded,hook-failed
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: "{{ .Values.app }}-monitoring-patcher"
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "5"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded,hook-failed
rules:
  # Read and modify DSCInitialization objects; no create/delete verbs.
  - apiGroups:
      - dscinitialization.opendatahub.io
    resources:
      - dscinitializations
    verbs:
      - get
      - patch
      - update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: "{{ .Values.app }}-monitoring-patcher"
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "5"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded,hook-failed
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: "{{ .Values.app }}-monitoring-patcher"
subjects:
  - kind: ServiceAccount
    name: "{{ .Values.app }}-monitoring-patcher"
    namespace: {{ .Release.Namespace | quote }}
---
apiVersion: batch/v1
kind: Job
metadata:
  name: "{{ .Values.app }}-monitoring-patcher"
  namespace: {{ .Release.Namespace | quote }}
  annotations:
    # Higher weight than the RBAC objects so they exist before the Job starts.
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "10"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded,hook-failed
spec:
  # Bound how long a failing patch can hold up the release. Both settings are
  # optional overrides under .Values.monitoring; note that Sprig's `default`
  # treats 0 as unset, so 0 falls back to the default shown here.
  backoffLimit: {{ .Values.monitoring.patcherBackoffLimit | default 3 }}
  activeDeadlineSeconds: {{ .Values.monitoring.patcherActiveDeadlineSeconds | default 240 }}
  template:
    spec:
      serviceAccountName: "{{ .Values.app }}-monitoring-patcher"
      restartPolicy: OnFailure
      containers:
        - name: patch-dsci
          image: {{ .Values.monitoring.patcherImage | quote }}
          command:
            - sh
            - -c
          # The heredoc delimiter is quoted ('EOF'), so the shell performs no
          # expansion on the manifest; only Helm templating fills it in.
          args:
            - |
              kubectl apply -f - <<'EOF'
              apiVersion: dscinitialization.opendatahub.io/v2
              kind: DSCInitialization
              metadata:
                name: {{ .Values.monitoring.dsciName | quote }}
              spec:
                monitoring:
                  managementState: Managed
                  namespace: {{ .Values.monitoring.namespace | quote }}
                  metrics:
                    replicas: {{ .Values.monitoring.metrics.replicas }}
                    storage:
                      retention: {{ .Values.monitoring.metrics.retention | quote }}
                      size: {{ .Values.monitoring.metrics.size | quote }}
                  traces:
                    sampleRatio: {{ .Values.monitoring.traces.sampleRatio | quote }}
                    storage:
                      backend: {{ .Values.monitoring.traces.storage.backend | quote }}
                      retention: {{ .Values.monitoring.traces.storage.retention | quote }}
                      size: {{ .Values.monitoring.traces.storage.size | quote }}
              EOF
{{- end }}
27 changes: 27 additions & 0 deletions helm/templates/otel-collector.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{{- /*
otel-collector.yaml

OpenTelemetryCollector that receives OTLP over gRPC and HTTP and forwards
traces to the OTLP/HTTP endpoint in .Values.monitoring.tempoOtlpHttpEndpoint.
Rendered only when both monitoring.enable and monitoring.deployCollector are
truthy.

NOTE(review): this chart also sets the DSCInitialization monitoring stack to
Managed (monitoring-dsci.yaml). If that operator provisions its own collector
under the same name and namespace, this resource would conflict with it -
confirm before enabling both.
*/ -}}
{{- if and .Values.monitoring.enable .Values.monitoring.deployCollector }}
---
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  # Quoted so numeric- or boolean-looking values stay strings.
  name: {{ .Values.monitoring.collectorName | quote }}
  namespace: {{ .Values.monitoring.namespace | quote }}
  annotations:
    argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
spec:
  mode: deployment
  replicas: {{ .Values.monitoring.collectorReplicas }}
  config:
    receivers:
      otlp:
        protocols:
          grpc: {}
          http: {}
    exporters:
      otlphttp:
        endpoint: {{ .Values.monitoring.tempoOtlpHttpEndpoint | quote }}
    service:
      pipelines:
        # Single traces pipeline: OTLP in, OTLP/HTTP out.
        traces:
          receivers:
            - otlp
          exporters:
            - otlphttp
{{- end }}
25 changes: 25 additions & 0 deletions helm/templates/tempo-monolithic.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{{- /*
tempo-monolithic.yaml

TempoMonolithic instance with OTLP ingestion enabled over gRPC and HTTP.
Rendered only when both monitoring.enable and monitoring.deployTempo are
truthy. storageClassName is emitted only when
monitoring.tempoStorageClassName is non-empty.

NOTE(review): the source this was recovered from had lost its indentation;
storageClassName is placed under spec.storage.traces next to backend/size -
confirm against the TempoMonolithic CRD in use.
*/ -}}
{{- if and .Values.monitoring.enable .Values.monitoring.deployTempo }}
---
apiVersion: tempo.grafana.com/v1alpha1
kind: TempoMonolithic
metadata:
  # Quoted so numeric- or boolean-looking values stay strings.
  name: {{ .Values.monitoring.tempoName | quote }}
  namespace: {{ .Values.monitoring.namespace | quote }}
  annotations:
    argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
spec:
  management: Managed
  ingestion:
    otlp:
      grpc:
        enabled: true
      http:
        enabled: true
  storage:
    traces:
      backend: {{ .Values.monitoring.tempoStorageBackend | quote }}
      size: {{ .Values.monitoring.tempoStorageSize | quote }}
      {{- with .Values.monitoring.tempoStorageClassName }}
      storageClassName: {{ . | quote }}
      {{- end }}
{{- end }}
29 changes: 28 additions & 1 deletion helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ milvus:
docsFolder: /data/docs

otelCollector:
enabled: false
enabled: true
endpoint: http://data-science-collector-collector-headless.redhat-ods-monitoring.svc.cluster.local:4318
protocol: http/protobuf
tracesEndpoint: http://data-science-collector-collector-headless.redhat-ods-monitoring.svc.cluster.local:4318/v1/traces
Expand All @@ -186,6 +186,33 @@ otelCollector:
metricExportInterval: 60000
telemetrySinks: otel_trace, otel_metric

monitoring:
  # Gates monitoring-dsci.yaml, otel-collector.yaml and tempo-monolithic.yaml.
  enable: true
  # Namespace written into the DSCInitialization monitoring spec and used for
  # the collector and Tempo resources.
  namespace: redhat-ods-monitoring

  # --- DSCInitialization patch job (monitoring-dsci.yaml) ---
  dsciName: default-dsci
  patcherImage: registry.redhat.io/openshift4/ose-cli:v4.14
  metrics:
    replicas: 1
    retention: 90d
    size: 50Gi
  traces:
    # Kept as a string; the template renders it with `quote`.
    sampleRatio: "1.0"
    storage:
      backend: pv
      retention: 2160h0m0s
      size: 100Gi

  # --- TempoMonolithic (tempo-monolithic.yaml) ---
  deployTempo: true
  tempoName: data-science-tempomonolithic
  tempoStorageBackend: pv
  tempoStorageSize: 10Gi
  # Empty string: the template omits storageClassName.
  tempoStorageClassName: ""

  # --- OpenTelemetryCollector (otel-collector.yaml) ---
  deployCollector: true
  collectorName: data-science-collector
  collectorReplicas: 1
  # OTLP/HTTP endpoint the collector exports traces to.
  tempoOtlpHttpEndpoint: 'http://tempo-data-science-tempomonolithic.redhat-ods-monitoring.svc.cluster.local:4318'

  # --- Grafana datasource URLs ---
  tempoUrl: 'http://tempo-data-science-tempomonolithic.redhat-ods-monitoring.svc.cluster.local:3200'
  prometheusUrl: 'http://data-science-monitoringstack-prometheus.redhat-ods-monitoring.svc.cluster.local:9090'

# Local models - deployed in the cluster (safe to commit to git)
# Remote models with API tokens should go in values-secrets.yaml as remoteModels
localModels:
Expand Down