From 972b259a66849551f7136f8c3a7efd15044db62f Mon Sep 17 00:00:00 2001 From: Damiano Donati Date: Sat, 6 Dec 2025 10:17:16 +0100 Subject: [PATCH 1/2] fix: e2e: storage init timeouts The issue was that GetAWSMachineTemplateByPrefix and DeleteAWSMachineTemplateByPrefix were calling Eventually() without explicit timeout and polling-interval arguments: Eventually(komega.List(templateList, client.InNamespace(namespace))).Should(Succeed(), ...) This uses Gomega's default timeout (1 second), which is far too short when the API server returns a transient "storage is (re)initializing" error (HTTP 429). The new cluster-api-provider-aws v2.10.0 introduces the v1beta2 API version, and while CRD storage is reinitializing during API version transitions, these transient errors are expected. The fix passes time.Minute and RetryShort to both Eventually calls, matching the pattern used by other functions in the same file (like GetAWSMachineTemplateByName at line 31). This gives the API server up to 1 minute to complete storage initialization, with 1-second retry intervals.
--- e2e/framework/machinetemplate.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/framework/machinetemplate.go b/e2e/framework/machinetemplate.go index cb470cf86..ca6fae8bb 100644 --- a/e2e/framework/machinetemplate.go +++ b/e2e/framework/machinetemplate.go @@ -56,7 +56,7 @@ func GetAWSMachineTemplateByPrefix(cl client.Client, prefix string, namespace st return nil, nil } templateList := &awsv1.AWSMachineTemplateList{} - Eventually(komega.List(templateList, client.InNamespace(namespace))).Should(Succeed(), "failed to list AWSMachineTemplates in namespace %s.", namespace) + Eventually(komega.List(templateList, client.InNamespace(namespace)), time.Minute, RetryShort).Should(Succeed(), "failed to list AWSMachineTemplates in namespace %s.", namespace) var matches []*awsv1.AWSMachineTemplate for i, t := range templateList.Items { @@ -81,7 +81,7 @@ func DeleteAWSMachineTemplateByPrefix(ctx context.Context, cl client.Client, pre return nil } templateList := &awsv1.AWSMachineTemplateList{} - Eventually(komega.List(templateList, client.InNamespace(namespace))).Should(Succeed(), "failed to list AWSMachineTemplates in namespace %s.", namespace) + Eventually(komega.List(templateList, client.InNamespace(namespace)), time.Minute, RetryShort).Should(Succeed(), "failed to list AWSMachineTemplates in namespace %s.", namespace) for i := range templateList.Items { if strings.HasPrefix(templateList.Items[i].Name, prefix) { From 88ea197ab4f0ef0140a12e12998d38ddf98067c7 Mon Sep 17 00:00:00 2001 From: Damiano Donati Date: Sat, 6 Dec 2025 10:53:32 +0100 Subject: [PATCH 2/2] fix: e2e: increase wait for replicas timeouts Observations: storage reinitialization errors continue for 15+ minutes during the v1beta2 transition; the cluster connection stabilized around 01:20:08 (~2 minutes in); the caches did eventually populate, but only intermittently. The 30-minute timeout provides more buffer for: storage reinitialization to complete; the CAPI MachineSet controller to process the scale-up; and Machine
provisioning to complete. Note: This is a short-term workaround. The root cause is the prolonged storage instability during the v1beta2 API transition in the AWS provider PR. --- e2e/machineset_migration_helpers.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/machineset_migration_helpers.go b/e2e/machineset_migration_helpers.go index a503eb983..699dc748c 100644 --- a/e2e/machineset_migration_helpers.go +++ b/e2e/machineset_migration_helpers.go @@ -173,12 +173,12 @@ func verifyMachinesetReplicas(machineSet client.Object, replicas int) { switch ms := machineSet.(type) { case *mapiv1beta1.MachineSet: By(fmt.Sprintf("Verifying MAPI MachineSet status.Replicas is %d", replicas)) - Eventually(komega.Object(ms), capiframework.WaitLong, capiframework.RetryLong).Should( + Eventually(komega.Object(ms), capiframework.WaitOverLong, capiframework.RetryLong).Should( HaveField("Status.Replicas", Equal(int32(replicas))), "Should have MAPI MachineSet %q replicas status eventually be %d", ms.Name, replicas) case *clusterv1beta1.MachineSet: By(fmt.Sprintf("Verifying CAPI MachineSet status.Replicas is %d", replicas)) - Eventually(komega.Object(ms), capiframework.WaitOverLong, capiframework.RetryLong).Should( + Eventually(komega.Object(ms), capiframework.WaitOverLong, capiframework.RetryLong).Should( HaveField("Status.Replicas", Equal(int32(replicas))), "Should have CAPI MachineSet %q replicas status eventually be %d", ms.Name, replicas) default: