From 859353b3cb87d1c731e0da5aa33fb200ac03c991 Mon Sep 17 00:00:00 2001 From: Sze Ching Date: Tue, 17 Feb 2026 14:33:31 +0000 Subject: [PATCH 1/7] feat(graph-proxy): test --- backend/graph-proxy/src/graphql/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/graph-proxy/src/graphql/mod.rs b/backend/graph-proxy/src/graphql/mod.rs index 7d9c2d955..c47d45312 100644 --- a/backend/graph-proxy/src/graphql/mod.rs +++ b/backend/graph-proxy/src/graphql/mod.rs @@ -60,6 +60,7 @@ pub struct NodeQuery; #[Object] impl NodeQuery { + #[instrument(skip(self, ctx), fields(id = %id, graphql_operation = "node_query"))] async fn node(&self, ctx: &Context<'_>, id: ID) -> Option { let id_str = id.to_string(); let parts: Vec<&str> = id_str.split(':').collect(); From 0cb6eafdc916dec8f1800111f76767effaa33c44 Mon Sep 17 00:00:00 2001 From: Sze Ching Date: Tue, 17 Feb 2026 15:48:43 +0000 Subject: [PATCH 2/7] feat(graph-proxy): test --- backend/graph-proxy/src/graphql/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/graph-proxy/src/graphql/mod.rs b/backend/graph-proxy/src/graphql/mod.rs index c47d45312..2ec9cf73a 100644 --- a/backend/graph-proxy/src/graphql/mod.rs +++ b/backend/graph-proxy/src/graphql/mod.rs @@ -34,6 +34,7 @@ use axum_extra::{ use lazy_static::lazy_static; use opentelemetry::KeyValue; use std::fmt::Display; +use tracing::instrument; use workflow_templates::WorkflowTemplatesMutation; /// The root schema of the service From 791147e1bb5b538a901491eeff0a0e15352810de Mon Sep 17 00:00:00 2001 From: Sze Ching Date: Tue, 17 Feb 2026 16:24:58 +0000 Subject: [PATCH 3/7] feat(graph-proxy): test --- backend/graph-proxy/src/graphql/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/graph-proxy/src/graphql/mod.rs b/backend/graph-proxy/src/graphql/mod.rs index 2ec9cf73a..d303c3fc6 100644 --- a/backend/graph-proxy/src/graphql/mod.rs +++ b/backend/graph-proxy/src/graphql/mod.rs @@ -61,7 +61,7 @@ pub struct NodeQuery; #[Object] impl NodeQuery { - #[instrument(skip(self, ctx), fields(id = %id, graphql_operation = "node_query"))] + #[instrument(name = "graph_proxy_node_query", skip(self, ctx))] async fn node(&self, ctx: &Context<'_>, id: ID) -> Option { let id_str = id.to_string(); let parts: Vec<&str> = id_str.split(':').collect(); From 61015ca1d003a507f40ec74b8bc1126cac229cea Mon Sep 17 00:00:00 2001 From: Sze Ching Date: Thu, 19 Feb 2026 14:06:42 +0000 Subject: [PATCH 4/7] feat(charts): test otel --- charts/monitoring/staging-values.yaml | 8 +++++ charts/otel-collector/templates/role.yaml | 2 +- charts/otel-collector/values.yaml | 32 ++++++++++++++++++++ charts/workflows-cluster/staging-values.yaml | 3 ++ 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/charts/monitoring/staging-values.yaml b/charts/monitoring/staging-values.yaml index 0f2dc364f..9d21bcade 100644 --- a/charts/monitoring/staging-values.yaml +++ b/charts/monitoring/staging-values.yaml @@ -1,6 +1,14 @@ cluster: pollux grafana: + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + url: "http://{{ .Release.Name }}-prometheus-server:80" + isDefault: true ingress: enabled: true path: "/" diff --git a/charts/otel-collector/templates/role.yaml b/charts/otel-collector/templates/role.yaml index a63489c4a..99801d05e 100644 --- a/charts/otel-collector/templates/role.yaml +++ b/charts/otel-collector/templates/role.yaml @@ -5,5 +5,5 @@ metadata: name: opentelemetry-collector rules: - apiGroups: [""] - resources: ["pods", "services", "endpoints"] + resources: ["pods", "services", "endpoints", "namespaces", "nodes", "resourcequotas"] verbs: ["get", "list", "watch"] diff --git a/charts/otel-collector/values.yaml b/charts/otel-collector/values.yaml index 51bc953d8..ec2b4a07d 100644 --- a/charts/otel-collector/values.yaml +++ b/charts/otel-collector/values.yaml @@ -44,6 +44,9 @@ opentelemetry-collector: memory: 24Gi config: processors: + groupbyattrs: + keys: + - k8s.namespace.name batch: send_batch_size: 512 k8sattributes: @@ -119,6 +122,33 @@ opentelemetry-collector: - source_labels: [__meta_kubernetes_pod_phase] regex: Pending|Succeeded|Failed|Completed action: drop + - job_name: 'kubelet-resource-metrics' + scheme: https + tls_config: + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/$1/proxy/metrics/resource + - source_labels: [__meta_kubernetes_node_name] + target_label: node + k8s_cluster: + collection_interval: 30s + auth_type:serviceAccount + resource_attributes: + k8s.namespacee.name: + enabled: true + allocatable_types_to_report: + - cpu + - memory otlp: protocols: grpc: @@ -147,8 +177,10 @@ opentelemetry-collector: receivers: - prometheus - otlp + - k8s_cluster processors: - k8sattributes + - groupbyattrs - memory_limiter - batch exporters: diff --git a/charts/workflows-cluster/staging-values.yaml b/charts/workflows-cluster/staging-values.yaml index c48fa0463..c204e0464 100644 --- a/charts/workflows-cluster/staging-values.yaml +++ b/charts/workflows-cluster/staging-values.yaml @@ -14,6 +14,9 @@ vcluster: deployment: replicas: 2 + serviceMonitor: + enabled: true + experimental: deploy: vcluster: From 792aa375f90a583463a50d501abb6f05da551b64 Mon Sep 17 00:00:00 2001 From: Sze Ching Date: Thu, 19 Feb 2026 14:09:30 +0000 Subject: [PATCH 5/7] feat(charts): test otel --- charts/otel-collector/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/otel-collector/values.yaml b/charts/otel-collector/values.yaml index ec2b4a07d..a733b5793 100644 --- a/charts/otel-collector/values.yaml +++ b/charts/otel-collector/values.yaml @@ -142,7 +142,7 @@ opentelemetry-collector: target_label: node k8s_cluster: collection_interval: 30s - auth_type:serviceAccount + auth_type: serviceAccount resource_attributes: k8s.namespacee.name: enabled: true From edfec3e824bd020148bb8dafc477d25d56869cc6 Mon Sep 17 00:00:00 2001 From: Sze Ching Date: Thu, 19 Feb 2026 14:32:14 +0000 Subject: [PATCH 6/7] feat(charts): test otel --- charts/otel-collector/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/otel-collector/values.yaml b/charts/otel-collector/values.yaml index a733b5793..d8596e6d5 100644 --- a/charts/otel-collector/values.yaml +++ b/charts/otel-collector/values.yaml @@ -144,7 +144,7 @@ opentelemetry-collector: collection_interval: 30s auth_type: serviceAccount resource_attributes: - k8s.namespacee.name: + k8s.namespace.name: enabled: true allocatable_types_to_report: - cpu From 72418d2df7cfffda7e15451f81d2f4d28fe5a05a Mon Sep 17 00:00:00 2001 From: Sze Ching Date: Thu, 19 Feb 2026 14:55:19 +0000 Subject: [PATCH 7/7] feat(charts): test otel --- charts/otel-collector/values.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/charts/otel-collector/values.yaml b/charts/otel-collector/values.yaml index d8596e6d5..fd7d25408 100644 --- a/charts/otel-collector/values.yaml +++ b/charts/otel-collector/values.yaml @@ -6,6 +6,20 @@ opentelemetry-collector: presets: kubernetesAttributes: enabled: true + clusterRole: + rules: + - apiGroups: [""] + resources: ["replicationcontrollers", "resourcequotas", "services", "endpoints"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments", "replicasets", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["get", "list", "watch"] ports: prometheus: enabled: true