From 9920c1af8d5e8bb18291ecb816b92c984536cec0 Mon Sep 17 00:00:00 2001 From: Alexander Cristurean Date: Thu, 11 Sep 2025 18:04:23 +0200 Subject: [PATCH 1/6] initial pod chaos. Signed-off-by: Alexander Cristurean --- testsuite/chaos_mesh/__init__.py | 1 + testsuite/chaos_mesh/pod_chaos.py | 186 ++++++++++++++++++ .../custom_metrics_apiserver/__init__.py | 0 .../tests/singlecluster/chaos/__init__.py | 0 .../tests/singlecluster/chaos/conftest.py | 44 +++++ .../singlecluster/chaos/pod_kill/manifets.yml | 12 ++ .../chaos/pod_kill/pod_kill_test.py | 14 ++ 7 files changed, 257 insertions(+) create mode 100644 testsuite/chaos_mesh/__init__.py create mode 100644 testsuite/chaos_mesh/pod_chaos.py create mode 100644 testsuite/custom_metrics_apiserver/__init__.py create mode 100644 testsuite/tests/singlecluster/chaos/__init__.py create mode 100644 testsuite/tests/singlecluster/chaos/conftest.py create mode 100644 testsuite/tests/singlecluster/chaos/pod_kill/manifets.yml create mode 100644 testsuite/tests/singlecluster/chaos/pod_kill/pod_kill_test.py diff --git a/testsuite/chaos_mesh/__init__.py b/testsuite/chaos_mesh/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/testsuite/chaos_mesh/__init__.py @@ -0,0 +1 @@ + diff --git a/testsuite/chaos_mesh/pod_chaos.py b/testsuite/chaos_mesh/pod_chaos.py new file mode 100644 index 00000000..b78419e0 --- /dev/null +++ b/testsuite/chaos_mesh/pod_chaos.py @@ -0,0 +1,186 @@ +"""PodChaos object for simulating Pod faults in Kubernetes.""" + +from typing import Dict, List, Optional, Literal + +from testsuite.kubernetes import KubernetesObject, modify +from testsuite.kubernetes.client import KubernetesClient + + +class PodChaos(KubernetesObject): + """Represents PodChaos CR from Chaos Mesh. + + Supports the following fault types: + - Pod Failure: makes the Pod unavailable for a period of time + - Pod Kill: kills the specified Pod (requires ReplicaSet for recovery) + - Container Kill: kills specified containers in the target Pod + """ + + ACTIONS = Literal["pod-failure", "pod-kill", "container-kill"] + MODES = Literal["one", "all", "fixed", "fixed-percent", "random-max-percent"] + + @classmethod + def create_instance( + cls, + cluster: KubernetesClient, + name: str, + namespace: str = "kuadrant-system", + labels: Optional[Dict[str, str]] = None, + ): + """Creates base instance. + + Args: + cluster: Kubernetes cluster instance + name: Name of the PodChaos resource + namespace: Namespace where to create the PodChaos + labels: Optional labels for the resource + """ + model = { + "apiVersion": "chaos-mesh.org/v1alpha1", + "kind": "PodChaos", + "metadata": { + "name": name, + "namespace": namespace, + "labels": labels or {} + }, + "spec": { + "selector": { + "labelSelectors": {} + } + } + } + return cls(model, context=cluster.context) + + @modify + def set_action(self, action: ACTIONS): + """Set the chaos action. + + Args: + action: Type of chaos action (pod-failure, pod-kill, container-kill) + """ + self.model.spec.action = action + + @modify + def set_mode(self, mode: MODES, value: Optional[str] = None): + """Set the experiment mode. + + Args: + mode: Mode of the experiment: + - one: selecting a random Pod + - all: selecting all eligible Pods + - fixed: selecting specified number of eligible Pods + - fixed-percent: selecting specified percentage of Pods + - random-max-percent: selecting maximum percentage of Pods + value: Parameter for mode configuration (required for fixed/percentage modes) + """ + self.model.spec.mode = mode + if value is not None: + self.model.spec.value = value + + @modify + def set_selector(self, labels: Dict[str, str], namespaces: Optional[List[str]] = None): + """Set pod selector. + + Args: + labels: Label selectors to target pods + namespaces: Optional list of namespaces to target + """ + self.model.spec.selector.labelSelectors = labels + if namespaces: + self.model.spec.selector.namespaces = namespaces + + @modify + def set_container_names(self, containers: List[str]): + """Set target container names (required for container-kill action). + + Args: + containers: List of container names to target + """ + self.model.spec.containerNames = containers + + @modify + def set_grace_period(self, period: int): + """Set grace period for pod-kill action. + + Args: + period: Duration in seconds before deleting Pod + """ + self.model.spec.gracePeriod = period + + @modify + def set_duration(self, duration: str): + """Set experiment duration. + + Args: + duration: Duration string (e.g., "30s", "5m") + """ + self.model.spec.duration = duration + + def pod_failure( + self, + labels: Dict[str, str], + duration: str, + mode: MODES = "one", + value: Optional[str] = None, + namespaces: Optional[List[str]] = None + ): + """Configure for pod-failure chaos experiment. + + Args: + labels: Label selectors to target pods + duration: Duration string (e.g., "30s", "5m") + mode: Mode of the experiment + value: Optional value for fixed/percentage modes + namespaces: Optional list of namespaces to target + """ + self.set_action("pod-failure") + self.set_mode(mode, value) + self.set_selector(labels, namespaces) + self.set_duration(duration) + self.commit() + + def pod_kill( + self, + labels: Dict[str, str], + mode: MODES = "one", + value: Optional[str] = None, + namespaces: Optional[List[str]] = None, + grace_period: int = 0 + ): + """Configure for pod-kill chaos experiment. + + Args: + labels: Label selectors to target pods + mode: Mode of the experiment + value: Optional value for fixed/percentage modes + namespaces: Optional list of namespaces to target + grace_period: Duration in seconds before deleting Pod + """ + self.set_action("pod-kill") + self.set_mode(mode, value) + self.set_selector(labels, namespaces) + if grace_period > 0: + self.set_grace_period(grace_period) + self.commit() + + def container_kill( + self, + labels: Dict[str, str], + containers: List[str], + mode: MODES = "one", + value: Optional[str] = None, + namespaces: Optional[List[str]] = None + ): + """Configure for container-kill chaos experiment. + + Args: + labels: Label selectors to target pods + containers: List of container names to kill + mode: Mode of the experiment + value: Optional value for fixed/percentage modes + namespaces: Optional list of namespaces to target + """ + self.set_action("container-kill") + self.set_mode(mode, value) + self.set_selector(labels, namespaces) + self.set_container_names(containers) + self.commit() \ No newline at end of file diff --git a/testsuite/custom_metrics_apiserver/__init__.py b/testsuite/custom_metrics_apiserver/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/testsuite/tests/singlecluster/chaos/__init__.py b/testsuite/tests/singlecluster/chaos/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/testsuite/tests/singlecluster/chaos/conftest.py b/testsuite/tests/singlecluster/chaos/conftest.py new file mode 100644 index 00000000..35d6c864 --- /dev/null +++ b/testsuite/tests/singlecluster/chaos/conftest.py @@ -0,0 +1,44 @@ +"""Conftest for chaos testing.""" + +import pytest + +from testsuite.chaos_mesh import PodChaos + + +@pytest.fixture(scope="module") +def create_pod_chaos(request, cluster, blame): + """Creates and returns a PodChaos experiment. + + Args: + request: pytest request object + cluster: Kubernetes cluster instance + blame: Fixture to generate unique names + + Returns: + Callable: Function to create PodChaos instances + """ + def _create_pod_chaos(name, namespace="kuadrant-system"): + chaos = PodChaos.create_instance(cluster, blame(name), namespace=namespace) + request.addfinalizer(chaos.delete) + return chaos + + return _create_pod_chaos + + +@pytest.fixture(scope="module") +def operator_pod_chaos(create_pod_chaos): + """Creates a PodChaos experiment targeting the Kuadrant operator. + + Args: + create_pod_chaos: Factory fixture for PodChaos + + Returns: + PodChaos: Configured PodChaos instance + """ + chaos = create_pod_chaos("operator-kill") + chaos.container_kill( + labels={"app": "kuadrant"}, + containers=["manager"], + duration="10s" + ) + return chaos \ No newline at end of file diff --git a/testsuite/tests/singlecluster/chaos/pod_kill/manifets.yml b/testsuite/tests/singlecluster/chaos/pod_kill/manifets.yml new file mode 100644 index 00000000..ed5216e0 --- /dev/null +++ b/testsuite/tests/singlecluster/chaos/pod_kill/manifets.yml @@ -0,0 +1,12 @@ +apiVersion: chaos-mesh.org/v1alpha1 +kind: PodChaos +metadata: + name: container-kill-example + namespace: kuadrant-system +spec: + action: container-kill + mode: one + containerNames: ['manager'] + selector: + labelSelectors: + 'app': 'kuadrant' \ No newline at end of file diff --git a/testsuite/tests/singlecluster/chaos/pod_kill/pod_kill_test.py b/testsuite/tests/singlecluster/chaos/pod_kill/pod_kill_test.py new file mode 100644 index 00000000..6084e207 --- /dev/null +++ b/testsuite/tests/singlecluster/chaos/pod_kill/pod_kill_test.py @@ -0,0 +1,14 @@ +"""Test Kuadrant operator resilience when its container is killed.""" + +import pytest + +pytestmark = [pytest.mark.chaos, pytest.mark.disruptive, pytest.mark.kuadrant_only] + + +def test_operator_pod_kill(operator_pod_chaos, authorization): + """Test operator resilience when its container is killed.""" + # Wait for operator to recover and reconcile + assert authorization.wait_until_ready() + + # Verify operator is functioning by checking policy status + assert authorization.wait_until_enforced() \ No newline at end of file From 465b4ce6acf055c5e4eb795c831221a5d8fda512 Mon Sep 17 00:00:00 2001 From: Alexander Cristurean Date: Fri, 12 Sep 2025 18:12:15 +0200 Subject: [PATCH 2/6] wrapped up chaos test for killing container. Signed-off-by: Alexander Cristurean --- testsuite/chaos_mesh/__init__.py | 106 ++++++++++ testsuite/chaos_mesh/pod_chaos.py | 186 ------------------ testsuite/custom_metrics_apiserver/client.py | 26 ++- .../tests/singlecluster/chaos/conftest.py | 40 ++-- .../chaos/container_kill/container_kill.py | 33 ++++ .../singlecluster/chaos/pod_kill/manifets.yml | 12 -- .../chaos/pod_kill/pod_kill_test.py | 14 -- 7 files changed, 170 insertions(+), 247 deletions(-) delete mode 100644 testsuite/chaos_mesh/pod_chaos.py create mode 100644 testsuite/tests/singlecluster/chaos/container_kill/container_kill.py delete mode 100644 testsuite/tests/singlecluster/chaos/pod_kill/manifets.yml delete mode 100644 testsuite/tests/singlecluster/chaos/pod_kill/pod_kill_test.py diff --git a/testsuite/chaos_mesh/__init__.py b/testsuite/chaos_mesh/__init__.py index 8b137891..0ec85805 100644 --- a/testsuite/chaos_mesh/__init__.py +++ b/testsuite/chaos_mesh/__init__.py @@ -1 +1,107 @@ +"""PodChaos object for simulating Pod faults in Kubernetes.""" +from typing import Dict, List, Optional, Literal + +from testsuite.kubernetes import KubernetesObject, modify +from testsuite.kubernetes.client import KubernetesClient + + +class PodChaos(KubernetesObject): + """Represents PodChaos CR from Chaos Mesh.""" + + ACTIONS = Literal["pod-failure", "pod-kill", "container-kill"] + MODES = Literal["one", "all", "fixed", "fixed-percent", "random-max-percent"] + + @classmethod + def create_instance( + cls, + cluster: KubernetesClient, + name: str, + namespace: str = "kuadrant-system", + labels: Optional[Dict[str, str]] = None, + ): + """Creates base instance.""" + model = { + "apiVersion": "chaos-mesh.org/v1alpha1", + "kind": "PodChaos", + "metadata": {"name": name, "namespace": namespace, "labels": labels or {}}, + "spec": {"selector": {"labelSelectors": {}}}, + } + return cls(model, context=cluster.context) + + @modify + def set_action(self, action: ACTIONS): + """Set the chaos action.""" + self.model.spec.action = action + + @modify + def set_mode(self, mode: MODES, value: Optional[str] = None): + """Set the experiment mode.""" + self.model.spec.mode = mode + if value is not None: + self.model.spec.value = value + + @modify + def set_selector(self, labels: Dict[str, str], namespaces: Optional[List[str]] = None): + """Set pod selector.""" + self.model.spec.selector.labelSelectors = labels + if namespaces: + self.model.spec.selector.namespaces = namespaces + + @modify + def set_container_names(self, containers: List[str]): + """Set target container names.""" + self.model.spec.containerNames = containers + + @modify + def set_grace_period(self, period: int): + """Set grace period for pod-kill action.""" + self.model.spec.gracePeriod = period + + @modify + def set_duration(self, duration: str): + """Set experiment duration.""" + self.model.spec.duration = duration + + def pod_failure( + self, + labels: Dict[str, str], + duration: str, + mode: MODES = "one", + value: Optional[str] = None, + namespaces: Optional[List[str]] = None, + ): + """Configure for pod-failure chaos experiment.""" + self.set_action("pod-failure") + self.set_mode(mode, value) + self.set_selector(labels, namespaces) + self.set_duration(duration) + + def pod_kill( + self, + labels: Dict[str, str], + mode: MODES = "one", + value: Optional[str] = None, + namespaces: Optional[List[str]] = None, + grace_period: int = 0, + ): + """Configure for pod-kill chaos experiment.""" + self.set_action("pod-kill") + self.set_mode(mode, value) + self.set_selector(labels, namespaces) + if grace_period > 0: + self.set_grace_period(grace_period) + + def container_kill( + self, + labels: Dict[str, str], + containers: List[str], + mode: MODES = "one", + value: Optional[str] = None, + namespaces: Optional[List[str]] = None, + ): + """Configure for container-kill chaos experiment.""" + self.set_action("container-kill") + self.set_mode(mode, value) + self.set_selector(labels, namespaces) + self.set_container_names(containers) diff --git a/testsuite/chaos_mesh/pod_chaos.py b/testsuite/chaos_mesh/pod_chaos.py deleted file mode 100644 index b78419e0..00000000 --- a/testsuite/chaos_mesh/pod_chaos.py +++ /dev/null @@ -1,186 +0,0 @@ -"""PodChaos object for simulating Pod faults in Kubernetes.""" - -from typing import Dict, List, Optional, Literal - -from testsuite.kubernetes import KubernetesObject, modify -from testsuite.kubernetes.client import KubernetesClient - - -class PodChaos(KubernetesObject): - """Represents PodChaos CR from Chaos Mesh. - - Supports the following fault types: - - Pod Failure: makes the Pod unavailable for a period of time - - Pod Kill: kills the specified Pod (requires ReplicaSet for recovery) - - Container Kill: kills specified containers in the target Pod - """ - - ACTIONS = Literal["pod-failure", "pod-kill", "container-kill"] - MODES = Literal["one", "all", "fixed", "fixed-percent", "random-max-percent"] - - @classmethod - def create_instance( - cls, - cluster: KubernetesClient, - name: str, - namespace: str = "kuadrant-system", - labels: Optional[Dict[str, str]] = None, - ): - """Creates base instance. - - Args: - cluster: Kubernetes cluster instance - name: Name of the PodChaos resource - namespace: Namespace where to create the PodChaos - labels: Optional labels for the resource - """ - model = { - "apiVersion": "chaos-mesh.org/v1alpha1", - "kind": "PodChaos", - "metadata": { - "name": name, - "namespace": namespace, - "labels": labels or {} - }, - "spec": { - "selector": { - "labelSelectors": {} - } - } - } - return cls(model, context=cluster.context) - - @modify - def set_action(self, action: ACTIONS): - """Set the chaos action. - - Args: - action: Type of chaos action (pod-failure, pod-kill, container-kill) - """ - self.model.spec.action = action - - @modify - def set_mode(self, mode: MODES, value: Optional[str] = None): - """Set the experiment mode. - - Args: - mode: Mode of the experiment: - - one: selecting a random Pod - - all: selecting all eligible Pods - - fixed: selecting specified number of eligible Pods - - fixed-percent: selecting specified percentage of Pods - - random-max-percent: selecting maximum percentage of Pods - value: Parameter for mode configuration (required for fixed/percentage modes) - """ - self.model.spec.mode = mode - if value is not None: - self.model.spec.value = value - - @modify - def set_selector(self, labels: Dict[str, str], namespaces: Optional[List[str]] = None): - """Set pod selector. - - Args: - labels: Label selectors to target pods - namespaces: Optional list of namespaces to target - """ - self.model.spec.selector.labelSelectors = labels - if namespaces: - self.model.spec.selector.namespaces = namespaces - - @modify - def set_container_names(self, containers: List[str]): - """Set target container names (required for container-kill action). - - Args: - containers: List of container names to target - """ - self.model.spec.containerNames = containers - - @modify - def set_grace_period(self, period: int): - """Set grace period for pod-kill action. - - Args: - period: Duration in seconds before deleting Pod - """ - self.model.spec.gracePeriod = period - - @modify - def set_duration(self, duration: str): - """Set experiment duration. - - Args: - duration: Duration string (e.g., "30s", "5m") - """ - self.model.spec.duration = duration - - def pod_failure( - self, - labels: Dict[str, str], - duration: str, - mode: MODES = "one", - value: Optional[str] = None, - namespaces: Optional[List[str]] = None - ): - """Configure for pod-failure chaos experiment. - - Args: - labels: Label selectors to target pods - duration: Duration string (e.g., "30s", "5m") - mode: Mode of the experiment - value: Optional value for fixed/percentage modes - namespaces: Optional list of namespaces to target - """ - self.set_action("pod-failure") - self.set_mode(mode, value) - self.set_selector(labels, namespaces) - self.set_duration(duration) - self.commit() - - def pod_kill( - self, - labels: Dict[str, str], - mode: MODES = "one", - value: Optional[str] = None, - namespaces: Optional[List[str]] = None, - grace_period: int = 0 - ): - """Configure for pod-kill chaos experiment. - - Args: - labels: Label selectors to target pods - mode: Mode of the experiment - value: Optional value for fixed/percentage modes - namespaces: Optional list of namespaces to target - grace_period: Duration in seconds before deleting Pod - """ - self.set_action("pod-kill") - self.set_mode(mode, value) - self.set_selector(labels, namespaces) - if grace_period > 0: - self.set_grace_period(grace_period) - self.commit() - - def container_kill( - self, - labels: Dict[str, str], - containers: List[str], - mode: MODES = "one", - value: Optional[str] = None, - namespaces: Optional[List[str]] = None - ): - """Configure for container-kill chaos experiment. - - Args: - labels: Label selectors to target pods - containers: List of container names to kill - mode: Mode of the experiment - value: Optional value for fixed/percentage modes - namespaces: Optional list of namespaces to target - """ - self.set_action("container-kill") - self.set_mode(mode, value) - self.set_selector(labels, namespaces) - self.set_container_names(containers) - self.commit() \ No newline at end of file diff --git a/testsuite/custom_metrics_apiserver/client.py b/testsuite/custom_metrics_apiserver/client.py index 1334cb47..7655d5f8 100644 --- a/testsuite/custom_metrics_apiserver/client.py +++ b/testsuite/custom_metrics_apiserver/client.py @@ -1,23 +1,21 @@ -from urllib.parse import urljoin +"""Client for interacting with the Custom Metrics API Server. + +This module provides a client for writing metrics to the Custom Metrics API Server, +which can be used to set custom metrics for Kubernetes resources. +""" + import httpx class CustomMetricsApiServerClient(httpx.Client): - """Client for the Custom Metrics API Server""" - - def __init__(self, url: str): - return super().__init__(base_url=url, verify=False, headers={"Content-Type": "application/json"}) + """Client for the Custom Metrics API Server.""" - def write_metric(self, namespace: str, resource_type: str, name: str, metric: str, value: int): - """Write a metric value to the Custom Metrics API Server. + def __init__(self, url: str) -> None: + """Initialize the client with base URL and default headers""" + super().__init__(base_url=url, verify=False, headers={"Content-Type": "application/json"}) - Args: - namespace: The namespace of the resource - resource_type: The type of resource (e.g. 'pods', 'services') - name: The name of the resource - metric: The name of the metric - value: The value to set - """ + def write_metric(self, namespace: str, resource_type: str, name: str, metric: str, value: int) -> int: + """Write a metric value to the Custom Metrics API Server""" endpoint = f"/write-metrics/namespaces/{namespace}/{resource_type}/{name}/{metric}" response = self.post(endpoint, content=f"{value}") diff --git a/testsuite/tests/singlecluster/chaos/conftest.py b/testsuite/tests/singlecluster/chaos/conftest.py index 35d6c864..bc688860 100644 --- a/testsuite/tests/singlecluster/chaos/conftest.py +++ b/testsuite/tests/singlecluster/chaos/conftest.py @@ -1,22 +1,15 @@ """Conftest for chaos testing.""" import pytest +import openshift_client as oc from testsuite.chaos_mesh import PodChaos @pytest.fixture(scope="module") def create_pod_chaos(request, cluster, blame): - """Creates and returns a PodChaos experiment. - - Args: - request: pytest request object - cluster: Kubernetes cluster instance - blame: Fixture to generate unique names - - Returns: - Callable: Function to create PodChaos instances - """ + """Creates and returns a PodChaos experiment.""" + def _create_pod_chaos(name, namespace="kuadrant-system"): chaos = PodChaos.create_instance(cluster, blame(name), namespace=namespace) request.addfinalizer(chaos.delete) @@ -26,19 +19,24 @@ def _create_pod_chaos(name, namespace="kuadrant-system"): @pytest.fixture(scope="module") -def operator_pod_chaos(create_pod_chaos): - """Creates a PodChaos experiment targeting the Kuadrant operator. - - Args: - create_pod_chaos: Factory fixture for PodChaos - - Returns: - PodChaos: Configured PodChaos instance - """ +def kuadrant_operator_pod_chaos(create_pod_chaos): + """Creates a PodChaos experiment targeting the Kuadrant operator.""" chaos = create_pod_chaos("operator-kill") chaos.container_kill( labels={"app": "kuadrant"}, containers=["manager"], - duration="10s" ) - return chaos \ No newline at end of file + return chaos + + +@pytest.fixture(autouse=True) +def restart_operator(cluster): + """Restart the Kuadrant operator deployment after each test.""" + yield # Run the test first + + # After test, delete the pod to force a restart + kuadrant_system = cluster.change_project("kuadrant-system") + with kuadrant_system.context: + # Find and delete the operator pod + pod = oc.selector("pod", labels={"app": "kuadrant"}).object() + pod.delete() diff --git a/testsuite/tests/singlecluster/chaos/container_kill/container_kill.py b/testsuite/tests/singlecluster/chaos/container_kill/container_kill.py new file mode 100644 index 00000000..45ebdf09 --- /dev/null +++ b/testsuite/tests/singlecluster/chaos/container_kill/container_kill.py @@ -0,0 +1,33 @@ +"""Test Kuadrant operator resilience when its container is killed.""" + +import pytest +import openshift_client as oc + +pytestmark = [pytest.mark.disruptive, pytest.mark.kuadrant_only] + + +def test_kuadrant_operator_container_kill(cluster, operator_pod_chaos): + """Test operator resilience when its container is killed.""" + # Check actual operator labels first + kuadrant_system = cluster.change_project("kuadrant-system") + with kuadrant_system.context: + pod = oc.selector("pod", labels={"app": "kuadrant"}).object() + assert pod.status() == "Running" + + # Apply chaos + operator_pod_chaos.commit() + + # Get logs after recovery + with kuadrant_system.context: + pod = oc.selector("pod", labels={"app": "kuadrant"}).object() + logs = pod.logs() + + # Get the log content (first and only value in the dict) + log_content = next(iter(logs.values())) + + expected_error = "unable to start extension manager" + socket_error = "address already in use" + + assert ( + expected_error in log_content and socket_error in log_content + ), "Expected extension manager error about socket already in use not found in logs" diff --git a/testsuite/tests/singlecluster/chaos/pod_kill/manifets.yml b/testsuite/tests/singlecluster/chaos/pod_kill/manifets.yml deleted file mode 100644 index ed5216e0..00000000 --- a/testsuite/tests/singlecluster/chaos/pod_kill/manifets.yml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: chaos-mesh.org/v1alpha1 -kind: PodChaos -metadata: - name: container-kill-example - namespace: kuadrant-system -spec: - action: container-kill - mode: one - containerNames: ['manager'] - selector: - labelSelectors: - 'app': 'kuadrant' \ No newline at end of file diff --git a/testsuite/tests/singlecluster/chaos/pod_kill/pod_kill_test.py b/testsuite/tests/singlecluster/chaos/pod_kill/pod_kill_test.py deleted file mode 100644 index 6084e207..00000000 --- a/testsuite/tests/singlecluster/chaos/pod_kill/pod_kill_test.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Test Kuadrant operator resilience when its container is killed.""" - -import pytest - -pytestmark = [pytest.mark.chaos, pytest.mark.disruptive, pytest.mark.kuadrant_only] - - -def test_operator_pod_kill(operator_pod_chaos, authorization): - """Test operator resilience when its container is killed.""" - # Wait for operator to recover and reconcile - assert authorization.wait_until_ready() - - # Verify operator is functioning by checking policy status - assert authorization.wait_until_enforced() \ No newline at end of file From 8a34fdcbef7219594268ae227076070018a5a01d Mon Sep 17 00:00:00 2001 From: Alexander Cristurean Date: Wed, 17 Sep 2025 16:47:52 +0200 Subject: [PATCH 3/6] change test to assert on no errors. Signed-off-by: Alexander Cristurean --- .../tests/singlecluster/chaos/conftest.py | 6 +++ .../chaos/container_kill/container_kill.py | 39 ++++++++++++------- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/testsuite/tests/singlecluster/chaos/conftest.py b/testsuite/tests/singlecluster/chaos/conftest.py index bc688860..6008a281 100644 --- a/testsuite/tests/singlecluster/chaos/conftest.py +++ b/testsuite/tests/singlecluster/chaos/conftest.py @@ -40,3 +40,9 @@ def restart_operator(cluster): # Find and delete the operator pod pod = oc.selector("pod", labels={"app": "kuadrant"}).object() pod.delete() + + +@pytest.fixture(autouse=True) +def commit(): + """Override commit fixture to do nothing.""" + pass diff --git a/testsuite/tests/singlecluster/chaos/container_kill/container_kill.py b/testsuite/tests/singlecluster/chaos/container_kill/container_kill.py index 45ebdf09..1ca5883d 100644 --- a/testsuite/tests/singlecluster/chaos/container_kill/container_kill.py +++ b/testsuite/tests/singlecluster/chaos/container_kill/container_kill.py @@ -1,33 +1,44 @@ """Test Kuadrant operator resilience when its container is killed.""" +import json import pytest import openshift_client as oc pytestmark = [pytest.mark.disruptive, pytest.mark.kuadrant_only] -def test_kuadrant_operator_container_kill(cluster, operator_pod_chaos): +def test_operator_container_kill(cluster, kuadrant_operator_pod_chaos): """Test operator resilience when its container is killed.""" # Check actual operator labels first kuadrant_system = cluster.change_project("kuadrant-system") with kuadrant_system.context: pod = oc.selector("pod", labels={"app": "kuadrant"}).object() - assert pod.status() == "Running" + assert pod.model.status.phase == "Running" # Apply chaos - operator_pod_chaos.commit() + kuadrant_operator_pod_chaos.commit() # Get logs after recovery with kuadrant_system.context: pod = oc.selector("pod", labels={"app": "kuadrant"}).object() - logs = pod.logs() - - # Get the log content (first and only value in the dict) - log_content = next(iter(logs.values())) - - expected_error = "unable to start extension manager" - socket_error = "address already in use" - - assert ( - expected_error in log_content and socket_error in log_content - ), "Expected extension manager error about socket already in use not found in logs" + log_content = next(iter(pod.logs().values())) + + # Check each log line for errors + errors = [] + for line in log_content.splitlines(): + try: + log_entry = json.loads(line) + if log_entry.get("level") == "error": + error_details = { + "msg": log_entry.get("msg", "Unknown error"), + "error": log_entry.get("error"), + "stacktrace": log_entry.get("stacktrace"), + "timestamp": log_entry.get("ts") + } + # Remove None values for cleaner output + error_details = {k: v for k, v in error_details.items() if v is not None} + errors.append(error_details) + except json.JSONDecodeError: + continue # Skip non-JSON lines + + assert not errors, f"Found errors in operator logs: {errors}" \ No newline at end of file From 23b713d06f9de4ec51cd1971a6c7771daf3c9a92 Mon Sep 17 00:00:00 2001 From: Alexander Cristurean Date: Wed, 17 Sep 2025 16:51:44 +0200 Subject: [PATCH 4/6] reformat code. Signed-off-by: Alexander Cristurean --- .../singlecluster/chaos/container_kill/container_kill.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testsuite/tests/singlecluster/chaos/container_kill/container_kill.py b/testsuite/tests/singlecluster/chaos/container_kill/container_kill.py index 1ca5883d..8d8d09b5 100644 --- a/testsuite/tests/singlecluster/chaos/container_kill/container_kill.py +++ b/testsuite/tests/singlecluster/chaos/container_kill/container_kill.py @@ -33,7 +33,7 @@ def test_operator_container_kill(cluster, kuadrant_operator_pod_chaos): "msg": log_entry.get("msg", "Unknown error"), "error": log_entry.get("error"), "stacktrace": log_entry.get("stacktrace"), - "timestamp": log_entry.get("ts") + "timestamp": log_entry.get("ts"), } # Remove None values for cleaner output error_details = {k: v for k, v in error_details.items() if v is not None} @@ -41,4 +41,4 @@ def test_operator_container_kill(cluster, kuadrant_operator_pod_chaos): except json.JSONDecodeError: continue # Skip non-JSON lines - assert not errors, f"Found errors in operator logs: {errors}" \ No newline at end of file + assert not errors, f"Found errors in operator logs: {errors}" From 5ed59343d5a75ce49ff44e10a19140eefec15784 Mon Sep 17 00:00:00 2001 From: Alexander Cristurean Date: Wed, 17 Sep 2025 16:58:57 +0200 Subject: [PATCH 5/6] added description for commit in pod chaos. Signed-off-by: Alexander Cristurean --- testsuite/tests/singlecluster/chaos/conftest.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/testsuite/tests/singlecluster/chaos/conftest.py b/testsuite/tests/singlecluster/chaos/conftest.py index 6008a281..4ddb77f0 100644 --- a/testsuite/tests/singlecluster/chaos/conftest.py +++ b/testsuite/tests/singlecluster/chaos/conftest.py @@ -44,5 +44,11 @@ def restart_operator(cluster): @pytest.fixture(autouse=True) def commit(): - """Override commit fixture to do nothing.""" - pass + """ + Override commit fixture to do nothing. + + In chaos testing, we don't want the standard commit behavior that + ensures all components are ready and stable before tests run. + Chaos tests need to control component lifecycle themselves. + """ + pass # pylint: disable=unnecessary-pass From 8d98e5784c964b5542ff566a9efdf2f4c0368e4b Mon Sep 17 00:00:00 2001 From: Alexander Cristurean Date: Wed, 24 Sep 2025 09:31:30 +0200 Subject: [PATCH 6/6] reorgganized the chaos tests. Signed-off-by: Alexander Cristurean --- .../chaos/control_plane/conftest.py | 69 ++++++++ .../pod_chaos/test_operator_container_kill.py | 44 +++++ .../pod_chaos/test_operator_pod_kill.py | 78 +++++++++ .../test_oidc_provider_network.py | 113 +++++++++++++ .../chaos/data_plane/conftest.py | 107 +++++++++++++ .../test_redis_connection_chaos.py | 151 ++++++++++++++++++ 6 files changed, 562 insertions(+) create mode 100644 testsuite/tests/singlecluster/chaos/control_plane/conftest.py create mode 100644 testsuite/tests/singlecluster/chaos/control_plane/pod_chaos/test_operator_container_kill.py create mode 100644 testsuite/tests/singlecluster/chaos/control_plane/pod_chaos/test_operator_pod_kill.py create mode 100644 testsuite/tests/singlecluster/chaos/data_plane/authorino/network_chaos/test_oidc_provider_network.py create mode 100644 testsuite/tests/singlecluster/chaos/data_plane/conftest.py create mode 100644 testsuite/tests/singlecluster/chaos/data_plane/limitador/network_chaos/test_redis_connection_chaos.py diff --git a/testsuite/tests/singlecluster/chaos/control_plane/conftest.py b/testsuite/tests/singlecluster/chaos/control_plane/conftest.py new file mode 100644 index 00000000..317d96cb --- /dev/null +++ b/testsuite/tests/singlecluster/chaos/control_plane/conftest.py @@ -0,0 +1,69 @@ +"""Conftest for control plane chaos testing.""" + +import pytest + +@pytest.fixture(scope="module") +def kuadrant_operator_selector(): + """Selector for Kuadrant operator pods.""" + return {"app": "kuadrant"} + + +@pytest.fixture(scope="module") +def control_plane_namespace(): + """Namespace where control plane components run.""" + return "kuadrant-system" + + +@pytest.fixture(scope="module") +def operator_chaos_factory(create_pod_chaos, kuadrant_operator_selector): + """Factory fixture for creating operator chaos experiments.""" + def _create_operator_chaos(name, action, **kwargs): + chaos = create_pod_chaos(f"operator-{name}") + + if action == "container-kill": + chaos.container_kill( + labels=kuadrant_operator_selector, + containers=kwargs.get("containers", ["manager"]), + ) + elif action == "pod-kill": + chaos.pod_kill( + labels=kuadrant_operator_selector, + grace_period=kwargs.get("grace_period", 0), + ) + elif action == "pod-failure": + chaos.pod_failure( + labels=kuadrant_operator_selector, + ) + else: + raise ValueError(f"Unsupported action: {action}") + + return chaos + return _create_operator_chaos + + +@pytest.fixture(scope="module") +def operator_network_chaos(create_network_chaos, kuadrant_operator_selector): + """Creates NetworkChaos targeting the Kuadrant operator.""" + def _create_network_chaos(name, action="delay", **kwargs): + chaos = create_network_chaos(f"operator-network-{name}") + chaos.configure_network_chaos( + labels=kuadrant_operator_selector, + action=action, + **kwargs + ) + return chaos + return _create_network_chaos + + +@pytest.fixture(scope="module") +def operator_stress_chaos(create_stress_chaos, kuadrant_operator_selector): + """Creates StressChaos targeting the Kuadrant operator.""" + def _create_stress_chaos(name, stress_type="memory", **kwargs): + chaos = create_stress_chaos(f"operator-stress-{name}") + chaos.configure_stress( + labels=kuadrant_operator_selector, + stress_type=stress_type, + **kwargs + ) + return chaos + return _create_stress_chaos diff --git a/testsuite/tests/singlecluster/chaos/control_plane/pod_chaos/test_operator_container_kill.py b/testsuite/tests/singlecluster/chaos/control_plane/pod_chaos/test_operator_container_kill.py new file mode 100644 index 00000000..18adeb4c --- /dev/null +++ b/testsuite/tests/singlecluster/chaos/control_plane/pod_chaos/test_operator_container_kill.py @@ -0,0 +1,44 @@ +"""Test Kuadrant operator resilience when its container is killed.""" + +import json +import pytest +import openshift_client as oc + +pytestmark = [pytest.mark.disruptive, pytest.mark.kuadrant_only] + + +def test_operator_container_kill_basic(cluster, operator_chaos_factory): + """Test basic operator container kill and recovery.""" + kuadrant_system = cluster.change_project("kuadrant-system") + + # Verify operator is running + with kuadrant_system.context: + pod = oc.selector("pod", labels={"app": "kuadrant"}).object() + assert pod.model.status.phase == "Running" + + # Apply chaos - kill container + chaos = operator_chaos_factory("container-kill-basic", "container-kill") + chaos.commit() + + # Verify recovery and check logs + with kuadrant_system.context: + pod = oc.selector("pod", labels={"app": "kuadrant"}).object() + log_content = next(iter(pod.logs().values())) + + # Check for error-level logs + errors = [] + for line in log_content.splitlines(): + try: + log_entry = json.loads(line) + if log_entry.get("level") == "error": + error_details = { + "msg": log_entry.get("msg", "Unknown error"), + "error": log_entry.get("error"), + "timestamp": log_entry.get("ts"), + } + error_details = {k: v for k, v in error_details.items() if v is not None} + errors.append(error_details) + except json.JSONDecodeError: + continue + + assert not errors, f"Found errors in operator logs: {errors}" diff --git a/testsuite/tests/singlecluster/chaos/control_plane/pod_chaos/test_operator_pod_kill.py b/testsuite/tests/singlecluster/chaos/control_plane/pod_chaos/test_operator_pod_kill.py new file mode 100644 index 00000000..fe3bde58 --- /dev/null +++ b/testsuite/tests/singlecluster/chaos/control_plane/pod_chaos/test_operator_pod_kill.py @@ -0,0 +1,78 @@ +"""Test Kuadrant operator resilience with pod-kill chaos.""" + +import pytest +import openshift_client as oc + +pytestmark = [pytest.mark.disruptive, pytest.mark.kuadrant_only] + + +def test_operator_pod_kill_basic(cluster, operator_chaos_factory): + """Test basic operator pod kill and recovery.""" + kuadrant_system = cluster.change_project("kuadrant-system") + + # Verify operator is running + with kuadrant_system.context: + pod = oc.selector("pod", labels={"app": "kuadrant"}).object() + assert pod.model.status.phase == "Running" + original_pod_name = pod.model.metadata.name + + # Apply chaos - kill pod + chaos = operator_chaos_factory("pod-kill-basic", "pod-kill") + chaos.commit() + + # Verify new pod is created and running + with kuadrant_system.context: + pod = oc.selector("pod", labels={"app": "kuadrant"}).object() + assert pod.model.status.phase == "Running" + # Should be a different pod + assert pod.model.metadata.name != original_pod_name + + +def test_operator_pod_kill_with_grace_period(cluster, operator_chaos_factory): + """Test operator pod kill with custom grace period.""" + # Create chaos with 30s grace period + chaos = operator_chaos_factory("graceful-kill", "pod-kill", grace_period=30) + chaos.commit() + + kuadrant_system = cluster.change_project("kuadrant-system") + with kuadrant_system.context: + pod = oc.selector("pod", labels={"app": "kuadrant"}).object() + assert pod.model.status.phase == "Running" + + +def test_operator_pod_kill_immediate(cluster, operator_chaos_factory): + """Test operator pod kill with immediate termination.""" + # Create chaos with 0s grace period (immediate kill) + chaos = operator_chaos_factory("immediate-kill", "pod-kill", grace_period=0) + chaos.commit() + + kuadrant_system = cluster.change_project("kuadrant-system") + with kuadrant_system.context: + pod = oc.selector("pod", labels={"app": "kuadrant"}).object() + assert pod.model.status.phase == "Running" + + +def test_operator_pod_failure_recovery(cluster, operator_chaos_factory): + """Test operator recovery from pod failure.""" + kuadrant_system = cluster.change_project("kuadrant-system") + + # Apply chaos - make pod fail + chaos = operator_chaos_factory("pod-failure-recovery", "pod-failure") + chaos.commit() + + # Verify operator eventually recovers + with kuadrant_system.context: + pod = oc.selector("pod", labels={"app": "kuadrant"}).object() + assert pod.model.status.phase == "Running" + + +def test_operator_pod_failure_custom_duration(cluster, operator_chaos_factory): + """Test operator pod failure with custom duration.""" + # Create chaos with longer failure duration + chaos = operator_chaos_factory("long-failure", "pod-failure", duration="60s") + chaos.commit() + + kuadrant_system = cluster.change_project("kuadrant-system") + with kuadrant_system.context: + pod = oc.selector("pod", labels={"app": "kuadrant"}).object() + assert pod.model.status.phase == "Running" diff --git a/testsuite/tests/singlecluster/chaos/data_plane/authorino/network_chaos/test_oidc_provider_network.py b/testsuite/tests/singlecluster/chaos/data_plane/authorino/network_chaos/test_oidc_provider_network.py new file mode 100644 index 00000000..6ce94a62 --- /dev/null +++ b/testsuite/tests/singlecluster/chaos/data_plane/authorino/network_chaos/test_oidc_provider_network.py @@ -0,0 +1,113 @@ +"""Test Authorino resilience when OIDC provider network is disrupted.""" + +import pytest +import time + +pytestmark = [pytest.mark.disruptive, pytest.mark.authorino] + + +def test_authorino_oidc_network_delay( + cluster, authorino_network_chaos, oidc_provider, auth_policy_with_oidc +): + """Test Authorino behavior with delayed OIDC provider responses.""" + # Apply auth policy that uses OIDC + auth_policy_with_oidc.commit() + + # Create network delay to OIDC provider + chaos = authorino_network_chaos( + "oidc-delay", + action="delay", + external_targets=[oidc_provider.issuer_url], + delay="3s", + duration="60s" + ) + chaos.commit() + + # Test authentication with delay + start_time = time.time() + response = auth_policy_with_oidc.test_authentication() + end_time = time.time() + + # Should still work but be slower + assert response.status_code in [200, 401] # Auth decision made + assert end_time - start_time > 3.0 # Delay applied + + # Verify Authorino handles timeout gracefully + assert "timeout" not in response.headers.get("x-ext-auth-reason", "").lower() + + +def test_authorino_oidc_network_partition( + cluster, authorino_network_chaos, oidc_provider, auth_policy_with_oidc +): + """Test Authorino behavior when OIDC provider is unreachable.""" + auth_policy_with_oidc.commit() + + # Create network partition to OIDC provider + chaos = authorino_network_chaos( + "oidc-partition", + action="partition", + external_targets=[oidc_provider.issuer_url], + duration="30s" + ) + chaos.commit() + + # Test authentication during partition + response = auth_policy_with_oidc.test_authentication() + + # Should fail gracefully (not hang indefinitely) + assert response.status_code == 401 + assert "connection" in response.headers.get("x-ext-auth-reason", "").lower() + + +def test_authorino_oidc_intermittent_failures( + cluster, authorino_network_chaos, oidc_provider, auth_policy_with_oidc +): + """Test Authorino with intermittent OIDC provider failures.""" + auth_policy_with_oidc.commit() + + # Create intermittent network issues (50% packet loss) + chaos = authorino_network_chaos( + "oidc-intermittent", + action="loss", + external_targets=[oidc_provider.issuer_url], + loss_percent=50, + duration="45s" + ) + chaos.commit() + + # Test multiple authentication attempts + success_count = 0 + total_attempts = 10 + + for _ in range(total_attempts): + response = auth_policy_with_oidc.test_authentication() + if response.status_code == 200: + success_count += 1 + time.sleep(1) + + # Some should succeed, some should fail + assert 0 < success_count < total_attempts + print(f"Success rate: {success_count}/{total_attempts}") + + +def test_authorino_oidc_discovery_chaos( + cluster, authorino_network_chaos, oidc_provider, auth_policy_with_oidc +): + """Test Authorino when OIDC discovery endpoint is disrupted.""" + # Target specifically the .well-known/openid-configuration endpoint + discovery_url = f"{oidc_provider.issuer_url}/.well-known/openid-configuration" + + chaos = authorino_network_chaos( + "oidc-discovery-chaos", + action="delay", + external_targets=[discovery_url], + delay="10s", + duration="30s" + ) + chaos.commit() + + # Apply policy (this should trigger discovery) + auth_policy_with_oidc.commit() + + # Verify policy eventually becomes ready despite discovery delays + assert auth_policy_with_oidc.wait_for_ready(timeout=60) diff --git a/testsuite/tests/singlecluster/chaos/data_plane/conftest.py b/testsuite/tests/singlecluster/chaos/data_plane/conftest.py new file mode 100644 index 00000000..d492781d --- /dev/null +++ b/testsuite/tests/singlecluster/chaos/data_plane/conftest.py @@ -0,0 +1,107 @@ +"""Conftest for data plane chaos testing.""" + +import pytest + + +@pytest.fixture(scope="module") +def authorino_selector(): + """Selector for Authorino pods.""" + return {"app": "authorino"} + + +@pytest.fixture(scope="module") +def limitador_selector(): + """Selector for Limitador pods.""" + return {"app": "limitador"} + + +@pytest.fixture(scope="module") +def data_plane_namespace(): + """Namespace where data plane components run.""" + return "kuadrant-system" + + +@pytest.fixture(scope="module") +def authorino_chaos_factory(create_pod_chaos, authorino_selector): + """Factory fixture for creating Authorino chaos experiments.""" + def _create_authorino_chaos(name, action, **kwargs): + chaos = create_pod_chaos(f"authorino-{name}") + + if action == "container-kill": + chaos.container_kill( + labels=authorino_selector, + containers=kwargs.get("containers", ["authorino"]), + duration=kwargs.get("duration", "10s"), + ) + elif action == "pod-kill": + chaos.pod_kill( + labels=authorino_selector, + grace_period=kwargs.get("grace_period", 0), + ) + elif action == "pod-failure": + chaos.pod_failure( + labels=authorino_selector, + duration=kwargs.get("duration", "30s"), + ) + else: + raise ValueError(f"Unsupported action: {action}") + + return chaos + return _create_authorino_chaos + + +@pytest.fixture(scope="module") +def limitador_chaos_factory(create_pod_chaos, limitador_selector): + """Factory fixture for creating Limitador chaos experiments.""" + def _create_limitador_chaos(name, action, **kwargs): + chaos = create_pod_chaos(f"limitador-{name}") + + if action == "container-kill": + chaos.container_kill( + labels=limitador_selector, + containers=kwargs.get("containers", ["limitador"]), + duration=kwargs.get("duration", "10s"), + ) + elif action == "pod-kill": + chaos.pod_kill( + labels=limitador_selector, + grace_period=kwargs.get("grace_period", 0), + ) + elif action == "pod-failure": + chaos.pod_failure( + labels=limitador_selector, + duration=kwargs.get("duration", "30s"), + ) + else: + raise ValueError(f"Unsupported action: {action}") + + return chaos + return _create_limitador_chaos + + +@pytest.fixture(scope="module") +def authorino_network_chaos(create_network_chaos, authorino_selector): + """Creates NetworkChaos targeting Authorino.""" + def _create_network_chaos(name, action="delay", **kwargs): + chaos = create_network_chaos(f"authorino-network-{name}") + chaos.configure_network_chaos( + labels=authorino_selector, + action=action, + **kwargs + ) + return chaos + return _create_network_chaos + + +@pytest.fixture(scope="module") +def limitador_network_chaos(create_network_chaos, limitador_selector): + """Creates NetworkChaos targeting Limitador.""" + def _create_network_chaos(name, action="delay", **kwargs): + chaos = create_network_chaos(f"limitador-network-{name}") + chaos.configure_network_chaos( + labels=limitador_selector, + action=action, + **kwargs + ) + return chaos + return _create_network_chaos diff --git a/testsuite/tests/singlecluster/chaos/data_plane/limitador/network_chaos/test_redis_connection_chaos.py b/testsuite/tests/singlecluster/chaos/data_plane/limitador/network_chaos/test_redis_connection_chaos.py new file mode 100644 index 00000000..3354e360 --- /dev/null +++ b/testsuite/tests/singlecluster/chaos/data_plane/limitador/network_chaos/test_redis_connection_chaos.py @@ -0,0 +1,151 @@ +"""Test Limitador resilience when Redis connection is disrupted.""" + +import pytest +import time + +pytestmark = [pytest.mark.disruptive, pytest.mark.limitador] + + +def test_limitador_redis_network_delay( + cluster, limitador_network_chaos, redis_backend, rate_limit_policy +): + """Test Limitador behavior with delayed Redis responses.""" + rate_limit_policy.commit() + + # Create network delay to Redis + chaos = limitador_network_chaos( + "redis-delay", + action="delay", + external_targets=["redis.kuadrant-system.svc.cluster.local"], + delay="500ms", + duration="60s" + ) + chaos.commit() + + # Test rate limiting with delay + start_time = time.time() + response = rate_limit_policy.test_rate_limit() + end_time = time.time() + + # Should still work but be slower + assert response.status_code in [200, 429] # Rate limit decision made + assert end_time - start_time > 0.5 # Delay applied + + +def test_limitador_redis_network_partition( + cluster, limitador_network_chaos, redis_backend, rate_limit_policy +): + """Test Limitador behavior when Redis is unreachable.""" + rate_limit_policy.commit() + + # Create network partition to Redis + chaos = limitador_network_chaos( + "redis-partition", + action="partition", + external_targets=["redis.kuadrant-system.svc.cluster.local"], + duration="30s" + ) + chaos.commit() + + # Test rate limiting during partition + response = rate_limit_policy.test_rate_limit() + + # Should fail-open or fail-closed based on configuration + # This depends on Limitador's configuration + assert response.status_code in [200, 500, 503] + + +def test_limitador_redis_intermittent_connection( + cluster, limitador_network_chaos, redis_backend, rate_limit_policy +): + """Test Limitador with intermittent Redis connection issues.""" + rate_limit_policy.commit() + + # Create intermittent network issues (30% packet loss) + chaos = limitador_network_chaos( + "redis-intermittent", + action="loss", + external_targets=["redis.kuadrant-system.svc.cluster.local"], + loss_percent=30, + duration="45s" + ) + chaos.commit() + + # Test multiple rate limit attempts + responses = [] + for _ in range(20): + response = rate_limit_policy.test_rate_limit() + responses.append(response.status_code) + time.sleep(0.5) + + # Should have mixed results due to intermittent failures + unique_responses = set(responses) + assert len(unique_responses) > 1 # Should have different response codes + print(f"Response distribution: {dict(zip(*zip(*[(r, responses.count(r)) for r in unique_responses])))}") + + +def test_limitador_redis_high_latency( + cluster, limitador_network_chaos, redis_backend, rate_limit_policy +): + """Test Limitador with high Redis latency.""" + rate_limit_policy.commit() + + # Create high latency to Redis + chaos = limitador_network_chaos( + "redis-high-latency", + action="delay", + external_targets=["redis.kuadrant-system.svc.cluster.local"], + delay="2s", + jitter="500ms", + duration="60s" + ) + chaos.commit() + + # Test rate limiting under high latency + slow_responses = 0 + total_requests = 10 + + for _ in range(total_requests): + start_time = time.time() + response = rate_limit_policy.test_rate_limit() + end_time = time.time() + + if end_time - start_time > 1.5: # Accounting for jitter + slow_responses += 1 + + # Verify response is still valid + assert response.status_code in [200, 429, 500, 503] + + # Most responses should be slow due to Redis latency + assert slow_responses >= total_requests * 0.7 # At least 70% slow + + +def test_limitador_redis_connection_reset( + cluster, limitador_network_chaos, redis_backend, rate_limit_policy +): + """Test Limitador when Redis connections are reset.""" + rate_limit_policy.commit() + + # Create connection resets to Redis + chaos = limitador_network_chaos( + "redis-reset", + action="abort", + external_targets=["redis.kuadrant-system.svc.cluster.local"], + abort_percent=50, + duration="30s" + ) + chaos.commit() + + # Test rate limiting with connection resets + error_responses = 0 + total_requests = 15 + + for _ in range(total_requests): + response = rate_limit_policy.test_rate_limit() + if response.status_code >= 500: + error_responses += 1 + time.sleep(1) + + # Should have some errors due to connection resets + assert error_responses > 0 + print(f"Error rate: {error_responses}/{total_requests}")