From 032a1cb6606d1eb55c0dbb9fdc3f53c0e2257e8a Mon Sep 17 00:00:00 2001 From: Prashant Tak Date: Thu, 12 Mar 2026 15:04:29 +0530 Subject: [PATCH 1/3] Exclude terminating machines when managing machineset replicas --- pkg/controller/machineset.go | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index acd41e80e..5b04807d3 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -335,19 +335,28 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 return nil } - var machinesWithoutUpdateSuccessfulLabel []*v1alpha1.Machine + var ( + nonTerminatingMachines []*v1alpha1.Machine + machinesWithoutUpdateSuccessfulLabel []*v1alpha1.Machine + ) for _, m := range allMachines { + if m.Status.CurrentStatus.Phase != v1alpha1.MachineTerminating { + nonTerminatingMachines = append(nonTerminatingMachines, m) + } + } + + for _, m := range nonTerminatingMachines { if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] != v1alpha1.LabelValueNodeUpdateSuccessful { machinesWithoutUpdateSuccessfulLabel = append(machinesWithoutUpdateSuccessfulLabel, m) } } - allMachinesDiff := len(allMachines) - int(machineSet.Spec.Replicas) + nonTerminatingMachinesDiff := len(nonTerminatingMachines) - int(machineSet.Spec.Replicas) machinesWithoutUpdateSuccessfulLabelDiff := len(machinesWithoutUpdateSuccessfulLabel) - int(machineSet.Spec.Replicas) // During in-place updates, ScaleUps are disabled in the oldMachineSet and // in newMachineSet, its ReplicaCount would never increase before a machine from oldMachineSet is moved. // So no need for any special case during in-place updates. - if allMachinesDiff < 0 { + if nonTerminatingMachinesDiff < 0 { // If MachineSet is frozen and no deletion timestamp, don't process it if machineSet.Labels["freeze"] == "True" && machineSet.DeletionTimestamp == nil { klog.V(2).Infof("MachineSet %q is frozen, and hence not processing", machineSet.Name) @@ -361,20 +370,20 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 return nil } - allMachinesDiff *= -1 - if allMachinesDiff > BurstReplicas { - allMachinesDiff = BurstReplicas + nonTerminatingMachinesDiff *= -1 + if nonTerminatingMachinesDiff > BurstReplicas { + nonTerminatingMachinesDiff = BurstReplicas } // TODO: Track UIDs of creates just like deletes. The problem currently // is we'd need to wait on the result of a create to record the machine's // UID, which would require locking *across* the create, which will turn // into a performance bottleneck. We should generate a UID for the machine // beforehand and store it via ExpectCreations. - if err := c.expectations.ExpectCreations(machineSetKey, allMachinesDiff); err != nil { + if err := c.expectations.ExpectCreations(machineSetKey, nonTerminatingMachinesDiff); err != nil { // TODO: proper error handling needs to happen here klog.Errorf("failed expect creations for machineset %s: %v", machineSet.Name, err) } - klog.V(2).Infof("Too few replicas for MachineSet %s, need %d, creating %d", machineSet.Name, (machineSet.Spec.Replicas), allMachinesDiff) + klog.V(2).Infof("Too few replicas for MachineSet %s, need %d, creating %d", machineSet.Name, (machineSet.Spec.Replicas), nonTerminatingMachinesDiff) // Batch the machine creates. Batch sizes start at SlowStartInitialBatchSize // and double with each successful iteration in a kind of "slow start". // This handles attempts to start large numbers of machines that would @@ -383,7 +392,7 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 // prevented from spamming the API service with the machine create requests // after one of its machines fails. Conveniently, this also prevents the // event spam that those failures would generate. - successfulCreations, err := slowStartBatch(allMachinesDiff, SlowStartInitialBatchSize, func() error { + successfulCreations, err := slowStartBatch(nonTerminatingMachinesDiff, SlowStartInitialBatchSize, func() error { boolPtr := func(b bool) *bool { return &b } controllerRef := &metav1.OwnerReference{ APIVersion: controllerKindMachineSet.GroupVersion().String(), // #ToCheck @@ -410,7 +419,7 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 // Any skipped machines that we never attempted to start shouldn't be expected. // The skipped machines will be retried later. The next controller resync will // retry the slow start process. - if skippedMachines := allMachinesDiff - successfulCreations; skippedMachines > 0 { + if skippedMachines := nonTerminatingMachinesDiff - successfulCreations; skippedMachines > 0 { klog.V(2).Infof("Slow-start failure. Skipping creation of %d machines, decrementing expectations for %v %v/%v", skippedMachines, machineSet.Kind, machineSet.Namespace, machineSet.Name) for range skippedMachines { // Decrement the expected number of creates because the informer won't observe this machine From 73e4577500a59e793c98ffcd40e834a7684e33e5 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Thu, 12 Mar 2026 17:33:07 +0530 Subject: [PATCH 2/3] Remove unneeded slice for nonTerminatingMachines --- pkg/controller/machineset.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index 5b04807d3..842f1ee42 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -336,21 +336,18 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 } var ( - nonTerminatingMachines []*v1alpha1.Machine + numNonTerminatingMachines int machinesWithoutUpdateSuccessfulLabel []*v1alpha1.Machine ) for _, m := range allMachines { if m.Status.CurrentStatus.Phase != v1alpha1.MachineTerminating { - nonTerminatingMachines = append(nonTerminatingMachines, m) - } - } - - for _, m := range nonTerminatingMachines { - if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] != v1alpha1.LabelValueNodeUpdateSuccessful { - machinesWithoutUpdateSuccessfulLabel = append(machinesWithoutUpdateSuccessfulLabel, m) + numNonTerminatingMachines++ + if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] != v1alpha1.LabelValueNodeUpdateSuccessful { + machinesWithoutUpdateSuccessfulLabel = append(machinesWithoutUpdateSuccessfulLabel, m) + } } } - nonTerminatingMachinesDiff := len(nonTerminatingMachines) - int(machineSet.Spec.Replicas) + nonTerminatingMachinesDiff := numNonTerminatingMachines - int(machineSet.Spec.Replicas) machinesWithoutUpdateSuccessfulLabelDiff := len(machinesWithoutUpdateSuccessfulLabel) - int(machineSet.Spec.Replicas) // During in-place updates, ScaleUps are disabled in the oldMachineSet and From bca257978210994b68401fabaaa2e41332761d4d Mon Sep 17 00:00:00 2001 From: Prashant Tak Date: Fri, 13 Mar 2026 11:49:29 +0530 Subject: [PATCH 3/3] Add comment elucidating the need for exclusion of terminating machines --- pkg/controller/machineset.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index 842f1ee42..3508fb417 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -340,6 +340,13 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 machinesWithoutUpdateSuccessfulLabel []*v1alpha1.Machine ) for _, m := range allMachines { + // Machines that are terminating are skipped when computing the diff for the required + // machineSet replicas since either: + // 1. There was a scale-down event that led to the machine being marked for removal, + // in which case there's no need to process it for further iterations; or + // 2. This was a `failed` machine that was terminated in which case the machineSet + // replicas won't change and hence there's a new machine that should be spawned + // in place of the failed one. if m.Status.CurrentStatus.Phase != v1alpha1.MachineTerminating { numNonTerminatingMachines++ if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] != v1alpha1.LabelValueNodeUpdateSuccessful {