diff --git a/backend/graph-proxy/src/graphql/mod.rs b/backend/graph-proxy/src/graphql/mod.rs index 7d9c2d955..d303c3fc6 100644 --- a/backend/graph-proxy/src/graphql/mod.rs +++ b/backend/graph-proxy/src/graphql/mod.rs @@ -34,6 +34,7 @@ use axum_extra::{ use lazy_static::lazy_static; use opentelemetry::KeyValue; use std::fmt::Display; +use tracing::instrument; use workflow_templates::WorkflowTemplatesMutation; /// The root schema of the service @@ -60,6 +61,7 @@ pub struct NodeQuery; #[Object] impl NodeQuery { + #[instrument(name = "graph_proxy_node_query", skip(self, ctx))] async fn node(&self, ctx: &Context<'_>, id: ID) -> Option<Node> { let id_str = id.to_string(); let parts: Vec<&str> = id_str.split(':').collect(); diff --git a/charts/monitoring/staging-values.yaml b/charts/monitoring/staging-values.yaml index 0f2dc364f..9d21bcade 100644 --- a/charts/monitoring/staging-values.yaml +++ b/charts/monitoring/staging-values.yaml @@ -1,6 +1,14 @@ cluster: pollux grafana: + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + url: "http://{{ .Release.Name }}-prometheus-server:80" + isDefault: true ingress: enabled: true path: "/" diff --git a/charts/otel-collector/templates/role.yaml b/charts/otel-collector/templates/role.yaml index a63489c4a..99801d05e 100644 --- a/charts/otel-collector/templates/role.yaml +++ b/charts/otel-collector/templates/role.yaml @@ -5,5 +5,5 @@ metadata: name: opentelemetry-collector rules: - apiGroups: [""] - resources: ["pods", "services", "endpoints"] + resources: ["pods", "services", "endpoints", "namespaces", "nodes", "resourcequotas"] verbs: ["get", "list", "watch"] diff --git a/charts/otel-collector/values.yaml b/charts/otel-collector/values.yaml index 51bc953d8..fd7d25408 100644 --- a/charts/otel-collector/values.yaml +++ b/charts/otel-collector/values.yaml @@ -6,6 +6,20 @@ opentelemetry-collector: presets: kubernetesAttributes: enabled: true + clusterRole: + rules: + - apiGroups: [""] + 
resources: ["replicationcontrollers", "resourcequotas", "services", "endpoints"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments", "replicasets", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["get", "list", "watch"] ports: prometheus: enabled: true @@ -44,6 +58,9 @@ opentelemetry-collector: memory: 24Gi config: processors: + groupbyattrs: + keys: + - k8s.namespace.name batch: send_batch_size: 512 k8sattributes: @@ -119,6 +136,33 @@ opentelemetry-collector: - source_labels: [__meta_kubernetes_pod_phase] regex: Pending|Succeeded|Failed|Completed action: drop + - job_name: 'kubelet-resource-metrics' + scheme: https + tls_config: + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/$1/proxy/metrics/resource + - source_labels: [__meta_kubernetes_node_name] + target_label: node + k8s_cluster: + collection_interval: 30s + auth_type: serviceAccount + resource_attributes: + k8s.namespace.name: + enabled: true + allocatable_types_to_report: + - cpu + - memory otlp: protocols: grpc: @@ -147,8 +191,10 @@ opentelemetry-collector: receivers: - prometheus - otlp + - k8s_cluster processors: - k8sattributes + - groupbyattrs - memory_limiter - batch exporters: diff --git a/charts/workflows-cluster/staging-values.yaml b/charts/workflows-cluster/staging-values.yaml index c48fa0463..c204e0464 100644 --- a/charts/workflows-cluster/staging-values.yaml +++ 
b/charts/workflows-cluster/staging-values.yaml @@ -14,6 +14,9 @@ vcluster: deployment: replicas: 2 + serviceMonitor: + enabled: true + experimental: deploy: vcluster: