diff --git a/pkg/util/provider/app/app.go b/pkg/util/provider/app/app.go index 054976f9c..a4372d02e 100644 --- a/pkg/util/provider/app/app.go +++ b/pkg/util/provider/app/app.go @@ -38,6 +38,7 @@ import ( coreclientbuilder "github.com/gardener/machine-controller-manager/pkg/util/clientbuilder/core" machineclientbuilder "github.com/gardener/machine-controller-manager/pkg/util/clientbuilder/machine" machinecontroller "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecontroller" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" kubernetesinformers "k8s.io/client-go/informers" kubescheme "k8s.io/client-go/kubernetes/scheme" @@ -216,6 +217,9 @@ func StartControllers(s *options.MCServer, recorder record.EventRecorder, stop <-chan struct{}) error { + resourceExhaustedRetryPeriod := machineutils.RetryPeriod(s.ResourceExhaustedRetry.Duration) + klog.V(4).Infof("Configured ResourceExhaustedRetryPeriod=%s", time.Duration(resourceExhaustedRetryPeriod)) + klog.V(4).Info("Getting available resources") availableResources, err := getAvailableResources(controlCoreClientBuilder) if err != nil { @@ -297,6 +301,7 @@ func StartControllers(s *options.MCServer, s.NodeConditions, s.BootstrapTokenAuthExtraGroups, targetKubernetesVersion, + resourceExhaustedRetryPeriod, ) if err != nil { return err diff --git a/pkg/util/provider/app/options/options.go b/pkg/util/provider/app/options/options.go index 9e66b4b27..93ae6bf75 100644 --- a/pkg/util/provider/app/options/options.go +++ b/pkg/util/provider/app/options/options.go @@ -28,6 +28,7 @@ import ( "time" drain "github.com/gardener/machine-controller-manager/pkg/util/provider/drain" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" machineconfig "github.com/gardener/machine-controller-manager/pkg/util/provider/options" "github.com/spf13/pflag" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -116,6 +117,7 @@ func (s *MCServer) AddFlags(fs *pflag.FlagSet) { fs.StringVar(&s.NodeConditions, "node-conditions", s.NodeConditions, "List of comma-separated/case-sensitive node-conditions which when set to True will change machine to a failed state after MachineHealthTimeout duration. It may further be replaced with a new machine if the machine is backed by a machine-set object.") fs.StringVar(&s.BootstrapTokenAuthExtraGroups, "bootstrap-token-auth-extra-groups", s.BootstrapTokenAuthExtraGroups, "Comma-separated list of groups to set bootstrap token's \"auth-extra-groups\" field to") + fs.DurationVar(&s.ResourceExhaustedRetry.Duration, "resource-exhausted-retry", time.Duration(machineutils.LongRetry), "Retry duration used when machine creation fails with ResourceExhausted. Defaults to LongRetry.") logs.AddFlags(fs) // adds --v flag for log level. leaderelectionconfig.BindFlags(&s.LeaderElection, fs) @@ -190,6 +192,9 @@ func (s *MCServer) Validate() error { if s.ControlKubeconfig == "" && s.TargetKubeconfig == constants.TargetKubeconfigDisabledValue { errs = append(errs, fmt.Errorf("--control-kubeconfig cannot be empty if --target-kubeconfig=%s is specified", constants.TargetKubeconfigDisabledValue)) } + if s.ResourceExhaustedRetry.Duration < 0 { + errs = append(errs, fmt.Errorf("resource exhausted retry duration should be a non negative value: got: %v", s.ResourceExhaustedRetry.Duration)) + } return utilerrors.NewAggregate(errs) } diff --git a/pkg/util/provider/machinecontroller/controller.go b/pkg/util/provider/machinecontroller/controller.go index 76cbed046..896e43446 100644 --- a/pkg/util/provider/machinecontroller/controller.go +++ b/pkg/util/provider/machinecontroller/controller.go @@ -16,6 +16,7 @@ import ( "github.com/gardener/machine-controller-manager/pkg/util/permits" "github.com/gardener/machine-controller-manager/pkg/util/provider/drain" "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" "github.com/gardener/machine-controller-manager/pkg/util/provider/options" "github.com/gardener/machine-controller-manager/pkg/util/worker" @@ -73,6 +74,7 @@ func NewController( nodeConditions string, bootstrapTokenAuthExtraGroups string, targetKubernetesVersion *semver.Version, + resourceExhaustedRetry machineutils.RetryPeriod, ) (Controller, error) { const ( permitGiverStaleEntryTimeout = 1 * time.Hour @@ -121,6 +123,7 @@ func NewController( volumeAttachmentHandler: nil, permitGiver: permits.NewPermitGiver(permitGiverStaleEntryTimeout, janitorFreq), targetKubernetesVersion: targetKubernetesVersion, + resourceExhaustedRetry: resourceExhaustedRetry, } controller.internalExternalScheme = runtime.NewScheme() @@ -298,6 +301,8 @@ type controller struct { machineClassSynced cache.InformerSynced machineSynced cache.InformerSynced podSynced cache.InformerSynced + + resourceExhaustedRetry machineutils.RetryPeriod } func (dc *controller) Run(workers int, stopCh <-chan struct{}) { diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 76214eac7..4c328a38a 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -478,12 +478,13 @@ var _ = Describe("machine", func() { Describe("#triggerCreationFlow", func() { type setup struct { - machineClasses []*v1alpha1.MachineClass - machines []*v1alpha1.Machine - secrets []*corev1.Secret - nodes []*corev1.Node - fakeResourceActions *customfake.ResourceActions - noTargetCluster bool + machineClasses []*v1alpha1.MachineClass + machines []*v1alpha1.Machine + secrets []*corev1.Secret + nodes []*corev1.Node + fakeResourceActions *customfake.ResourceActions + noTargetCluster bool + resourceExhaustedRetry machineutils.RetryPeriod } type action struct { machine string @@ -542,6 +543,10 @@ var _ = Describe("machine", func() { waitForCacheSync(stop, controller) + if data.setup.resourceExhaustedRetry != 0 { + controller.resourceExhaustedRetry = data.setup.resourceExhaustedRetry + } + action := data.action machine, err := controller.controlMachineClient.Machines(objMeta.Namespace).Get(context.TODO(), action.machine, metav1.GetOptions{}) Expect(err).ToNot(HaveOccurred()) @@ -916,6 +921,59 @@ var _ = Describe("machine", func() { retry: machineutils.LongRetry, }, }), + Entry("Machine creation fails with CrashLoopBackOff due to resource exhaustion with configured retry period", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + Data: map[string][]byte{"userData": []byte("test")}, + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + }, + }, nil, nil, nil, nil, true, metav1.Now()), + resourceExhaustedRetry: machineutils.RetryPeriod(30 * time.Minute), + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: false, + Err: status.Error(codes.ResourceExhausted, "Provider does not have capacity to create VM"), + }, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machineClass", + }, + }, + }, &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineCrashLoopBackOff, + }, + LastOperation: v1alpha1.LastOperation{ + ErrorCode: codes.ResourceExhausted.String(), + }, + }, nil, nil, nil, true, metav1.Now()), + err: status.Error(codes.ResourceExhausted, "Provider does not have capacity to create VM"), + retry: machineutils.RetryPeriod(30 * time.Minute), + }, + }), Entry("Machine creation fails with Failure due to timeout", &data{ setup: setup{ secrets: []*corev1.Secret{ diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 36fa6f414..b081979c9 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -804,7 +804,10 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a if ok { switch machineErr.Code() { case codes.ResourceExhausted: - retryRequired = machineutils.LongRetry + if c.resourceExhaustedRetry == 0 { + c.resourceExhaustedRetry = machineutils.LongRetry + } + retryRequired = c.resourceExhaustedRetry lastKnownState = machine.Status.LastKnownState case codes.Unknown, codes.DeadlineExceeded, codes.Aborted, codes.Unavailable: retryRequired = machineutils.ShortRetry diff --git a/pkg/util/provider/options/types.go b/pkg/util/provider/options/types.go index d1be4c2c9..2c0172f9e 100644 --- a/pkg/util/provider/options/types.go +++ b/pkg/util/provider/options/types.go @@ -73,6 +73,8 @@ type MachineControllerConfiguration struct { //BootstrapTokenAuthExtraGroups is a comma-separated string of groups to set bootstrap token's "auth-extra-groups" field to. BootstrapTokenAuthExtraGroups string + + ResourceExhaustedRetry metav1.Duration } // SafetyOptions are used to configure the upper-limit and lower-limit