Helm chart patch: make the Grafana Tempo/Prometheus datasource URLs
configurable, filter the traces dashboard by service name, bind the Grafana
proxy role cluster-wide, and add a DSCInitialization patch hook plus optional
OpenTelemetryCollector and TempoMonolithic templates.

Reviewer changes relative to the submitted patch: templated names, namespaces
and storage backends are rendered through `quote`; the patch Job no longer
carries the hook-failed delete policy; comments were added to the new files.
Context-line indentation was restored from a whitespace-collapsed copy and
should be checked against the target files before applying.

diff --git a/helm/templates/grafana-dashboard-traces.yaml b/helm/templates/grafana-dashboard-traces.yaml
index 8ce6505..97e9985 100644
--- a/helm/templates/grafana-dashboard-traces.yaml
+++ b/helm/templates/grafana-dashboard-traces.yaml
@@ -5,7 +5,7 @@ metadata:
   name: {{ .Values.app }}-traces-dashboard
   namespace: {{ .Release.Namespace }}
 spec:
-  folder: {{ .Values.app }}-folder
+  folder: {{ printf "%s Dashboards" (title .Release.Namespace) | quote }}
   instanceSelector:
     matchLabels:
       dashboards: "grafana"
@@ -42,22 +42,22 @@ spec:
           {
             "current": {
               "selected": true,
-              "text": "{{ .Release.Namespace }}",
-              "value": "{{ .Release.Namespace }}"
+              "text": "{{ .Values.app }}",
+              "value": "{{ .Values.app }}"
             },
-            "hide": 2,
+            "hide": 0,
             "includeAll": false,
-            "label": "Namespace Pattern",
+            "label": "Service Name",
             "multi": false,
-            "name": "namespace_pattern",
+            "name": "service_name",
             "options": [
               {
                 "selected": true,
-                "text": "{{ .Release.Namespace }}",
-                "value": "{{ .Release.Namespace }}"
+                "text": "{{ .Values.app }}",
+                "value": "{{ .Values.app }}"
               }
             ],
-            "query": "{{ .Release.Namespace }}",
+            "query": "{{ .Values.app }}",
             "refresh": 0,
             "type": "custom"
           }
@@ -81,7 +81,7 @@ spec:
           "targets": [
             {
               "limit": 200,
-              "query": "{resource.k8s.namespace.name=~\"$namespace_pattern\"}",
+              "query": "{resource.service.name=~\"$service_name\"}",
               "queryType": "traceql",
               "refId": "A",
               "tableType": "traces",
diff --git a/helm/templates/grafana-datasource-rhoai.yaml b/helm/templates/grafana-datasource-rhoai.yaml
index 52b33b2..1fafdea 100644
--- a/helm/templates/grafana-datasource-rhoai.yaml
+++ b/helm/templates/grafana-datasource-rhoai.yaml
@@ -13,7 +13,7 @@ spec:
       tlsSkipVerify: true
     name: RHOAI Prometheus
     type: prometheus
-    url: 'http://data-science-monitoringstack-prometheus.redhat-ods-monitoring.svc.cluster.local:9090'
+    url: {{ .Values.monitoring.prometheusUrl | quote }}
   instanceSelector:
     matchLabels:
       dashboards: "grafana"
diff --git a/helm/templates/grafana-datasource-tempo.yaml b/helm/templates/grafana-datasource-tempo.yaml
index c7a27d2..f2130a9 100644
--- a/helm/templates/grafana-datasource-tempo.yaml
+++ b/helm/templates/grafana-datasource-tempo.yaml
@@ -36,7 +36,7 @@ spec:
     secureJsonData:
       httpHeaderValue1: "redhat-ods-monitoring"
     type: tempo
-    url: 'http://tempo-data-science-tempomonolithic.redhat-ods-monitoring.svc.cluster.local:3200'
+    url: {{ .Values.monitoring.tempoUrl | quote }}
   instanceSelector:
     matchLabels:
       dashboards: "grafana"
diff --git a/helm/templates/grafana-proxy-rbac.yaml b/helm/templates/grafana-proxy-rbac.yaml
index 0261033..c7ccf78 100644
--- a/helm/templates/grafana-proxy-rbac.yaml
+++ b/helm/templates/grafana-proxy-rbac.yaml
@@ -18,10 +18,9 @@ rules:
       - subjectaccessreviews
 ---
 apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
+kind: ClusterRoleBinding
 metadata:
   name: {{ .Values.app }}-grafana-proxy
-  namespace: {{ .Release.Namespace }}
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
diff --git a/helm/templates/grafana.yaml b/helm/templates/grafana.yaml
index 8a544a3..70737e5 100644
--- a/helm/templates/grafana.yaml
+++ b/helm/templates/grafana.yaml
@@ -1,14 +1,4 @@
 ---
-kind: Secret
-apiVersion: v1
-metadata:
-  name: {{ .Values.app }}-grafana-sa-token
-  namespace: {{ .Release.Namespace }}
-  annotations:
-    kubernetes.io/service-account.name: {{ .Values.app }}-grafana-sa
-    argocd.argoproj.io/sync-wave: "5"
-type: kubernetes.io/service-account-token
----
 apiVersion: v1
 kind: ConfigMap
 metadata:
diff --git a/helm/templates/monitoring-dsci.yaml b/helm/templates/monitoring-dsci.yaml
new file mode 100644
index 0000000..a8a5329
--- /dev/null
+++ b/helm/templates/monitoring-dsci.yaml
@@ -0,0 +1,101 @@
+{{- if .Values.monitoring.enable }}
+---
+# Helm pre-install/pre-upgrade hook: identity used by the patch Job below.
+# The RBAC objects use hook weight 5 and the Job uses weight 10, so they are
+# created before the Job runs.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ .Values.app }}-monitoring-patcher
+  namespace: {{ .Release.Namespace }}
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "5"
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded,hook-failed
+---
+# Cluster-scoped permissions on DSCInitialization objects for the patch Job.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ .Values.app }}-monitoring-patcher
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "5"
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded,hook-failed
+rules:
+  - apiGroups:
+      - dscinitialization.opendatahub.io
+    resources:
+      - dscinitializations
+    verbs:
+      - get
+      - patch
+      - update
+---
+# Grants the ClusterRole above to the patcher ServiceAccount.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ .Values.app }}-monitoring-patcher
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "5"
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded,hook-failed
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ .Values.app }}-monitoring-patcher
+subjects:
+  - kind: ServiceAccount
+    name: {{ .Values.app }}-monitoring-patcher
+    namespace: {{ .Release.Namespace }}
+---
+# Applies the monitoring settings from .Values.monitoring to the
+# DSCInitialization named by .Values.monitoring.dsciName.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ .Values.app }}-monitoring-patcher
+  namespace: {{ .Release.Namespace }}
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "10"
+    # hook-failed is intentionally omitted here so a failed Job and its pod
+    # logs stay available for debugging; before-hook-creation removes the
+    # leftover Job on the next install/upgrade.
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
+spec:
+  template:
+    spec:
+      serviceAccountName: {{ .Values.app }}-monitoring-patcher
+      restartPolicy: OnFailure
+      containers:
+        - name: patch-dsci
+          image: {{ .Values.monitoring.patcherImage | quote }}
+          command:
+            - sh
+            - -c
+          args:
+            - |
+              cat <<'EOF' | kubectl apply -f -
+              apiVersion: dscinitialization.opendatahub.io/v2
+              kind: DSCInitialization
+              metadata:
+                name: {{ .Values.monitoring.dsciName | quote }}
+              spec:
+                monitoring:
+                  managementState: Managed
+                  namespace: {{ .Values.monitoring.namespace | quote }}
+                  metrics:
+                    replicas: {{ .Values.monitoring.metrics.replicas }}
+                    storage:
+                      retention: {{ .Values.monitoring.metrics.retention | quote }}
+                      size: {{ .Values.monitoring.metrics.size | quote }}
+                  traces:
+                    sampleRatio: {{ .Values.monitoring.traces.sampleRatio | quote }}
+                    storage:
+                      backend: {{ .Values.monitoring.traces.storage.backend | quote }}
+                      retention: {{ .Values.monitoring.traces.storage.retention | quote }}
+                      size: {{ .Values.monitoring.traces.storage.size | quote }}
+              EOF
+{{- end }}
diff --git a/helm/templates/otel-collector.yaml b/helm/templates/otel-collector.yaml
new file mode 100644
index 0000000..86dcab6
--- /dev/null
+++ b/helm/templates/otel-collector.yaml
@@ -0,0 +1,30 @@
+{{- if and .Values.monitoring.enable .Values.monitoring.deployCollector }}
+---
+# OpenTelemetryCollector that receives OTLP over gRPC and HTTP and forwards
+# traces to .Values.monitoring.tempoOtlpHttpEndpoint over OTLP/HTTP. Only a
+# traces pipeline is defined.
+apiVersion: opentelemetry.io/v1beta1
+kind: OpenTelemetryCollector
+metadata:
+  name: {{ .Values.monitoring.collectorName | quote }}
+  namespace: {{ .Values.monitoring.namespace | quote }}
+  annotations:
+    argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
+spec:
+  mode: deployment
+  replicas: {{ .Values.monitoring.collectorReplicas }}
+  config:
+    receivers:
+      otlp:
+        protocols:
+          grpc: {}
+          http: {}
+    exporters:
+      otlphttp:
+        endpoint: {{ .Values.monitoring.tempoOtlpHttpEndpoint | quote }}
+    service:
+      pipelines:
+        traces:
+          receivers: [otlp]
+          exporters: [otlphttp]
+{{- end }}
diff --git a/helm/templates/tempo-monolithic.yaml b/helm/templates/tempo-monolithic.yaml
new file mode 100644
index 0000000..04714e6
--- /dev/null
+++ b/helm/templates/tempo-monolithic.yaml
@@ -0,0 +1,27 @@
+{{- if and .Values.monitoring.enable .Values.monitoring.deployTempo }}
+---
+# TempoMonolithic instance with OTLP gRPC and HTTP ingestion enabled; storage
+# backend, size and optional storage class come from .Values.monitoring.
+apiVersion: tempo.grafana.com/v1alpha1
+kind: TempoMonolithic
+metadata:
+  name: {{ .Values.monitoring.tempoName | quote }}
+  namespace: {{ .Values.monitoring.namespace | quote }}
+  annotations:
+    argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
+spec:
+  management: Managed
+  ingestion:
+    otlp:
+      grpc:
+        enabled: true
+      http:
+        enabled: true
+  storage:
+    traces:
+      backend: {{ .Values.monitoring.tempoStorageBackend | quote }}
+      size: {{ .Values.monitoring.tempoStorageSize | quote }}
+      {{- if .Values.monitoring.tempoStorageClassName }}
+      storageClassName: {{ .Values.monitoring.tempoStorageClassName | quote }}
+      {{- end }}
+{{- end }}
diff --git a/helm/values.yaml b/helm/values.yaml
index 779ae62..8cb1c17 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -168,7 +168,7 @@ milvus:
   docsFolder: /data/docs
 
 otelCollector:
-  enabled: false
+  enabled: true
   endpoint: http://data-science-collector-collector-headless.redhat-ods-monitoring.svc.cluster.local:4318
   protocol: http/protobuf
   tracesEndpoint: http://data-science-collector-collector-headless.redhat-ods-monitoring.svc.cluster.local:4318/v1/traces
@@ -186,6 +186,43 @@ otelCollector:
   metricExportInterval: 60000
   telemetrySinks: otel_trace, otel_metric
 
+monitoring:
+  # Gates the DSCInitialization patch hook; deployTempo and deployCollector
+  # additionally gate the TempoMonolithic and OpenTelemetryCollector templates.
+  enable: true
+  dsciName: default-dsci
+  # Namespace written into the DSCInitialization monitoring spec and used for
+  # the TempoMonolithic and OpenTelemetryCollector resources.
+  namespace: redhat-ods-monitoring
+  patcherImage: registry.redhat.io/openshift4/ose-cli:v4.14
+  # NOTE(review): these default names match endpoints this chart already
+  # referenced before it deployed Tempo/the collector itself; confirm nothing
+  # else in the cluster manages resources with the same names.
+  deployTempo: true
+  tempoName: data-science-tempomonolithic
+  tempoStorageBackend: pv
+  tempoStorageSize: 10Gi
+  tempoStorageClassName: ""
+  # tempoOtlpHttpEndpoint, tempoUrl and prometheusUrl hard-code the namespace
+  # (and the Tempo URLs the tempoName) set above; change them together.
+  tempoOtlpHttpEndpoint: http://tempo-data-science-tempomonolithic.redhat-ods-monitoring.svc.cluster.local:4318
+  metrics:
+    replicas: 1
+    retention: 90d
+    size: 50Gi
+  traces:
+    # Kept as a quoted string; the hook template also renders it with `quote`.
+    sampleRatio: "1.0"
+    storage:
+      backend: pv
+      retention: 2160h0m0s
+      size: 100Gi
+  deployCollector: true
+  collectorName: data-science-collector
+  collectorReplicas: 1
+  tempoUrl: http://tempo-data-science-tempomonolithic.redhat-ods-monitoring.svc.cluster.local:3200
+  prometheusUrl: http://data-science-monitoringstack-prometheus.redhat-ods-monitoring.svc.cluster.local:9090
+
 # Local models - deployed in the cluster (safe to commit to git)
 # Remote models with API tokens should go in values-secrets.yaml as remoteModels
 localModels: