From bc632fb5762446652bd7b7e94af3f44ae1edd2a0 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Wed, 19 Nov 2025 15:08:16 -0500 Subject: [PATCH 01/13] chore: remove dangling v1alpha1 helm reference (#337) remove dangling v1alpha1 helm reference Signed-off-by: Wei Weng Co-authored-by: Wei Weng --- charts/member-agent/templates/crds/appliedworks.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/charts/member-agent/templates/crds/appliedworks.yaml b/charts/member-agent/templates/crds/appliedworks.yaml index 17d27ee5c..5d0bbc742 100644 --- a/charts/member-agent/templates/crds/appliedworks.yaml +++ b/charts/member-agent/templates/crds/appliedworks.yaml @@ -1,8 +1,4 @@ {{ $files := .Files }} -{{ if .Values.enableV1Alpha1APIs }} - {{ $files.Get "crdbases/multicluster.x-k8s.io_appliedworks.yaml" }} -{{ end }} ---- {{ if .Values.enableV1Beta1APIs }} {{ $files.Get "crdbases/placement.kubernetes-fleet.io_appliedworks.yaml" }} {{ end }} From 97d6f93223df294d4a825d27a00343dbdca2018c Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Wed, 19 Nov 2025 15:00:55 -0800 Subject: [PATCH 02/13] fix: fix the join script (#336) fix the script Signed-off-by: Ryan Zhang --- hack/membership/joinMC.sh | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/hack/membership/joinMC.sh b/hack/membership/joinMC.sh index 87478824b..7a4ac4224 100755 --- a/hack/membership/joinMC.sh +++ b/hack/membership/joinMC.sh @@ -2,6 +2,7 @@ # AKS Clusters and joins them onto the hub cluster. # Perform validation to ensure the script can run correctly. +export REGISTRY="${REGISTRY:-mcr.microsoft.com/aks/fleet}" if [ "$#" -lt 3 ]; then echo "Usage: $0 [ ...]" @@ -9,11 +10,6 @@ if [ "$#" -lt 3 ]; then fi export IMAGE_TAG="$1" -if [[ $(curl "https://api.github.com/repos/Azure/fleet/tags") != *"$1"* ]] > /dev/null 2>&1; then - echo "fleet image tag $1 does not exist" - exit 1 -fi - export HUB_CLUSTER="$2" if [[ ! $(kubectl config view -o jsonpath="{.contexts[?(@.context.cluster==\"$HUB_CLUSTER\")]}") ]] > /dev/null 2>&1; then echo "The cluster named $HUB_CLUSTER does not exist." @@ -100,13 +96,7 @@ fi # # Install the member agent helm chart on the member cluster. -# The variables below uses the Fleet images kept in the Microsoft Container Registry (MCR), -# and will retrieve the latest version from the Fleet GitHub repository. -# -# You can, however, build the Fleet images of your own; see the repository README for -# more information. echo "Retrieving image..." -export REGISTRY="mcr.microsoft.com/aks/fleet" export MEMBER_AGENT_IMAGE="member-agent" export REFRESH_TOKEN_IMAGE="${REFRESH_TOKEN_NAME:-refresh-token}" export OUTPUT_TYPE="${OUTPUT_TYPE:-type=docker}" @@ -121,11 +111,8 @@ echo "Creating secret..." kubectl delete secret hub-kubeconfig-secret --ignore-not-found=true kubectl create secret generic hub-kubeconfig-secret --from-literal=token=$TOKEN -echo "Uninstalling member-agent..." -helm uninstall member-agent --wait - echo "Installing member-agent..." 
-helm install member-agent charts/member-agent/ \ +helm upgrade --install member-agent charts/member-agent/ \ --set config.hubURL=$HUB_CLUSTER_ADDRESS \ --set image.repository=$REGISTRY/$MEMBER_AGENT_IMAGE \ --set image.tag=$IMAGE_TAG \ @@ -134,7 +121,7 @@ helm install member-agent charts/member-agent/ \ --set image.pullPolicy=Always \ --set refreshtoken.pullPolicy=Always \ --set config.memberClusterName=$MEMBER_CLUSTER \ - --set logVerbosity=8 \ + --set logVerbosity=5 \ --set namespace=fleet-system \ --set enableV1Beta1APIs=true From 7e69ea58bd4999933fc9ce50a20d2099ebf5d5c7 Mon Sep 17 00:00:00 2001 From: Arvind Thirumurugan Date: Wed, 19 Nov 2025 16:57:28 -0800 Subject: [PATCH 03/13] feat: Process clusters in parallel within stage in an updateRun (#323) --- apis/placement/v1beta1/stageupdate_types.go | 3 +- ...etes-fleet.io_clusterstagedupdateruns.yaml | 5 +- ...leet.io_clusterstagedupdatestrategies.yaml | 5 +- ....kubernetes-fleet.io_stagedupdateruns.yaml | 5 +- ...netes-fleet.io_stagedupdatestrategies.yaml | 5 +- pkg/controllers/updaterun/execution.go | 116 +++- pkg/controllers/updaterun/execution_test.go | 545 ++++++++++++++++++ pkg/controllers/updaterun/validation.go | 18 +- pkg/controllers/updaterun/validation_test.go | 6 +- .../api_validation_integration_test.go | 206 +++++++ test/e2e/cluster_staged_updaterun_test.go | 184 +++++- test/e2e/staged_updaterun_test.go | 179 ++++++ 12 files changed, 1223 insertions(+), 54 deletions(-) diff --git a/apis/placement/v1beta1/stageupdate_types.go b/apis/placement/v1beta1/stageupdate_types.go index fa92786d2..9491f3ec6 100644 --- a/apis/placement/v1beta1/stageupdate_types.go +++ b/apis/placement/v1beta1/stageupdate_types.go @@ -322,7 +322,8 @@ type StageConfig struct { // Defaults to 1. // +kubebuilder:default=1 // +kubebuilder:validation:XIntOrString - // +kubebuilder:validation:Pattern="^((100|[0-9]{1,2})%|[0-9]+)$" + // +kubebuilder:validation:Pattern="^(100|[1-9][0-9]?)%$" + // +kubebuilder:validation:XValidation:rule="self == null || type(self) != int || self >= 1",message="maxConcurrency must be at least 1" // +kubebuilder:validation:Optional MaxConcurrency *intstr.IntOrString `json:"maxConcurrency,omitempty"` diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml index a2ddf03ec..0bf83da28 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml @@ -2044,8 +2044,11 @@ spec: Fractional results are rounded down. A minimum of 1 update is enforced. If not specified, all clusters in the stage are updated sequentially (effectively maxConcurrency = 1). Defaults to 1. - pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + pattern: ^(100|[1-9][0-9]?)%$ x-kubernetes-int-or-string: true + x-kubernetes-validations: + - message: maxConcurrency must be at least 1 + rule: self == null || type(self) != int || self >= 1 name: description: The name of the stage. This MUST be unique within the same StagedUpdateStrategy. 
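For readers following the new maxConcurrency validation above (the `^(100|[1-9][0-9]?)%$` pattern combined with the `self >= 1` CEL rule), the standalone Go sketch below, which is not part of this patch, approximates which values the API server will accept. The helper name isValidMaxConcurrency and the sample values are illustrative assumptions; the authoritative checks remain the OpenAPI pattern and CEL rule in the CRDs.

    package main

    import (
    	"fmt"
    	"regexp"

    	"k8s.io/apimachinery/pkg/util/intstr"
    )

    // percentPattern mirrors the CRD pattern above: only "1%".."100%" are accepted
    // in string form (no leading zeros, no bare integers written as strings).
    var percentPattern = regexp.MustCompile(`^(100|[1-9][0-9]?)%$`)

    // isValidMaxConcurrency approximates the combined CRD validation: integer
    // values must be at least 1 (the CEL rule), string values must match the
    // percentage pattern.
    func isValidMaxConcurrency(v intstr.IntOrString) bool {
    	if v.Type == intstr.Int {
    		return v.IntVal >= 1
    	}
    	return percentPattern.MatchString(v.StrVal)
    }

    func main() {
    	samples := []intstr.IntOrString{
    		intstr.FromInt(3),         // valid: integer >= 1
    		intstr.FromInt(150),       // valid: integers above 100 are allowed
    		intstr.FromInt(0),         // invalid: rejected by the CEL rule
    		intstr.FromString("70%"),  // valid percentage
    		intstr.FromString("0%"),   // invalid: below 1%
    		intstr.FromString("101%"), // invalid: above 100%
    		intstr.FromString("50"),   // invalid: string form must be a percentage
    	}
    	for _, v := range samples {
    		fmt.Printf("%-6s -> valid=%v\n", v.String(), isValidMaxConcurrency(v))
    	}
    }

This matches the API validation integration tests added later in this patch, which accept integers of any size at or above 1 and percentages from 1% to 100%, and reject 0, negatives, "0%", "101%", and non-percentage strings.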
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml index 4d088a0ce..e4c5d099e 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml @@ -315,8 +315,11 @@ spec: Fractional results are rounded down. A minimum of 1 update is enforced. If not specified, all clusters in the stage are updated sequentially (effectively maxConcurrency = 1). Defaults to 1. - pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + pattern: ^(100|[1-9][0-9]?)%$ x-kubernetes-int-or-string: true + x-kubernetes-validations: + - message: maxConcurrency must be at least 1 + rule: self == null || type(self) != int || self >= 1 name: description: The name of the stage. This MUST be unique within the same StagedUpdateStrategy. diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml index 0db56cd16..979c73e99 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml @@ -964,8 +964,11 @@ spec: Fractional results are rounded down. A minimum of 1 update is enforced. If not specified, all clusters in the stage are updated sequentially (effectively maxConcurrency = 1). Defaults to 1. - pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + pattern: ^(100|[1-9][0-9]?)%$ x-kubernetes-int-or-string: true + x-kubernetes-validations: + - message: maxConcurrency must be at least 1 + rule: self == null || type(self) != int || self >= 1 name: description: The name of the stage. This MUST be unique within the same StagedUpdateStrategy. diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml index 898f92a88..6e1119657 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml @@ -177,8 +177,11 @@ spec: Fractional results are rounded down. A minimum of 1 update is enforced. If not specified, all clusters in the stage are updated sequentially (effectively maxConcurrency = 1). Defaults to 1. - pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + pattern: ^(100|[1-9][0-9]?)%$ x-kubernetes-int-or-string: true + x-kubernetes-validations: + - message: maxConcurrency must be at least 1 + rule: self == null || type(self) != int || self >= 1 name: description: The name of the stage. This MUST be unique within the same StagedUpdateStrategy. 
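A percentage maxConcurrency is resolved against the number of clusters in the stage by the calculateMaxConcurrencyValue helper added to execution.go below: it scales with intstr.GetScaledValueFromIntOrPercent, rounds down, and clamps the result to a minimum of 1. The following minimal standalone sketch of that arithmetic uses an assumed helper name, effectiveMaxConcurrency, and sample inputs taken from the unit tests later in this patch.

    package main

    import (
    	"fmt"

    	"k8s.io/apimachinery/pkg/util/intstr"
    )

    // effectiveMaxConcurrency mirrors the rounding behaviour of the
    // calculateMaxConcurrencyValue helper added in execution.go below:
    // percentages are scaled against the stage's cluster count, rounded down,
    // and clamped to a minimum of 1.
    func effectiveMaxConcurrency(maxConcurrency *intstr.IntOrString, clusterCount int) (int, error) {
    	v, err := intstr.GetScaledValueFromIntOrPercent(maxConcurrency, clusterCount, false)
    	if err != nil {
    		return 0, err
    	}
    	if v == 0 {
    		v = 1
    	}
    	return v, nil
    }

    func main() {
    	cases := []struct {
    		mc       intstr.IntOrString
    		clusters int
    	}{
    		{intstr.FromString("70%"), 3},  // 2.1 rounds down to 2
    		{intstr.FromString("33%"), 10}, // 3.3 rounds down to 3
    		{intstr.FromString("10%"), 1},  // 0.1 rounds down to 0, clamped to 1
    		{intstr.FromInt(3), 10},        // plain integers pass through unchanged
    	}
    	for _, tc := range cases {
    		v, err := effectiveMaxConcurrency(&tc.mc, tc.clusters)
    		fmt.Printf("maxConcurrency=%s clusters=%d -> %d (err=%v)\n", tc.mc.String(), tc.clusters, v, err)
    	}
    }
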
diff --git a/pkg/controllers/updaterun/execution.go b/pkg/controllers/updaterun/execution.go index 87f5eaf4b..198980749 100644 --- a/pkg/controllers/updaterun/execution.go +++ b/pkg/controllers/updaterun/execution.go @@ -22,12 +22,15 @@ import ( "fmt" "reflect" "strconv" + "strings" "time" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + utilerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" @@ -67,8 +70,12 @@ func (r *Reconciler) execute( updateRunStatus := updateRun.GetUpdateRunStatus() if updatingStageIndex < len(updateRunStatus.StagesStatus) { + maxConcurrency, err := calculateMaxConcurrencyValue(updateRunStatus, updatingStageIndex) + if err != nil { + return false, 0, err + } updatingStage := &updateRunStatus.StagesStatus[updatingStageIndex] - waitTime, execErr := r.executeUpdatingStage(ctx, updateRun, updatingStageIndex, toBeUpdatedBindings) + waitTime, execErr := r.executeUpdatingStage(ctx, updateRun, updatingStageIndex, toBeUpdatedBindings, maxConcurrency) if errors.Is(execErr, errStagedUpdatedAborted) { markStageUpdatingFailed(updatingStage, updateRun.GetGeneration(), execErr.Error()) return true, waitTime, execErr @@ -91,6 +98,7 @@ func (r *Reconciler) executeUpdatingStage( updateRun placementv1beta1.UpdateRunObj, updatingStageIndex int, toBeUpdatedBindings []placementv1beta1.BindingObj, + maxConcurrency int, ) (time.Duration, error) { updateRunStatus := updateRun.GetUpdateRunStatus() updateRunSpec := updateRun.GetUpdateRunSpec() @@ -105,25 +113,30 @@ func (r *Reconciler) executeUpdatingStage( bindingSpec := binding.GetBindingSpec() toBeUpdatedBindingsMap[bindingSpec.TargetCluster] = binding } - finishedClusterCount := 0 - // Go through each cluster in the stage and check if it's updated. - for i := range updatingStageStatus.Clusters { + finishedClusterCount := 0 + clusterUpdatingCount := 0 + var stuckClusterNames []string + var clusterUpdateErrors []error + // Go through each cluster in the stage and check if it's updating/succeeded/failed. + for i := 0; i < len(updatingStageStatus.Clusters) && clusterUpdatingCount < maxConcurrency; i++ { clusterStatus := &updatingStageStatus.Clusters[i] - clusterStartedCond := meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1beta1.ClusterUpdatingConditionStarted)) clusterUpdateSucceededCond := meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1beta1.ClusterUpdatingConditionSucceeded)) - if condition.IsConditionStatusFalse(clusterUpdateSucceededCond, updateRun.GetGeneration()) { - // The cluster is marked as failed to update. - failedErr := fmt.Errorf("the cluster `%s` in the stage %s has failed", clusterStatus.ClusterName, updatingStageStatus.StageName) - klog.ErrorS(failedErr, "The cluster has failed to be updated", "updateRun", updateRunRef) - return 0, fmt.Errorf("%w: %s", errStagedUpdatedAborted, failedErr.Error()) - } if condition.IsConditionStatusTrue(clusterUpdateSucceededCond, updateRun.GetGeneration()) { // The cluster has been updated successfully. finishedClusterCount++ continue } - // The cluster is either updating or not started yet. 
+ clusterUpdatingCount++ + if condition.IsConditionStatusFalse(clusterUpdateSucceededCond, updateRun.GetGeneration()) { + // The cluster is marked as failed to update, this cluster is counted as updating cluster since it's not finished to avoid processing more clusters than maxConcurrency in this round. + failedErr := fmt.Errorf("the cluster `%s` in the stage %s has failed", clusterStatus.ClusterName, updatingStageStatus.StageName) + klog.ErrorS(failedErr, "The cluster has failed to be updated", "updateRun", updateRunRef) + clusterUpdateErrors = append(clusterUpdateErrors, fmt.Errorf("%w: %s", errStagedUpdatedAborted, failedErr.Error())) + continue + } + // The cluster needs to be processed. + clusterStartedCond := meta.FindStatusCondition(clusterStatus.Conditions, string(placementv1beta1.ClusterUpdatingConditionStarted)) binding := toBeUpdatedBindingsMap[clusterStatus.ClusterName] if !condition.IsConditionStatusTrue(clusterStartedCond, updateRun.GetGeneration()) { // The cluster has not started updating yet. @@ -138,11 +151,13 @@ func (r *Reconciler) executeUpdatingStage( bindingSpec.ApplyStrategy = updateRunStatus.ApplyStrategy if err := r.Client.Update(ctx, binding); err != nil { klog.ErrorS(err, "Failed to update binding to be bound with the matching spec of the updateRun", "binding", klog.KObj(binding), "updateRun", updateRunRef) - return 0, controller.NewUpdateIgnoreConflictError(err) + clusterUpdateErrors = append(clusterUpdateErrors, controller.NewUpdateIgnoreConflictError(err)) + continue } klog.V(2).InfoS("Updated the status of a binding to bound", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef) if err := r.updateBindingRolloutStarted(ctx, binding, updateRun); err != nil { - return 0, err + clusterUpdateErrors = append(clusterUpdateErrors, err) + continue } } else { klog.V(2).InfoS("Found the first binding that is updating but the cluster status has not been updated", "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef) @@ -151,20 +166,24 @@ func (r *Reconciler) executeUpdatingStage( bindingSpec.State = placementv1beta1.BindingStateBound if err := r.Client.Update(ctx, binding); err != nil { klog.ErrorS(err, "Failed to update a binding to be bound", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef) - return 0, controller.NewUpdateIgnoreConflictError(err) + clusterUpdateErrors = append(clusterUpdateErrors, controller.NewUpdateIgnoreConflictError(err)) + continue } klog.V(2).InfoS("Updated the status of a binding to bound", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef) if err := r.updateBindingRolloutStarted(ctx, binding, updateRun); err != nil { - return 0, err + clusterUpdateErrors = append(clusterUpdateErrors, err) + continue } } else if !condition.IsConditionStatusTrue(meta.FindStatusCondition(binding.GetBindingStatus().Conditions, string(placementv1beta1.ResourceBindingRolloutStarted)), binding.GetGeneration()) { klog.V(2).InfoS("The binding is bound and up-to-date but the generation is updated by the scheduler, update rolloutStarted status again", "binding", klog.KObj(binding), "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef) if err := r.updateBindingRolloutStarted(ctx, binding, updateRun); err != nil { - return 
0, err + clusterUpdateErrors = append(clusterUpdateErrors, err) + continue } } else { if _, updateErr := checkClusterUpdateResult(binding, clusterStatus, updatingStageStatus, updateRun); updateErr != nil { - return clusterUpdatingWaitTime, updateErr + clusterUpdateErrors = append(clusterUpdateErrors, updateErr) + continue } } } @@ -172,8 +191,8 @@ func (r *Reconciler) executeUpdatingStage( if finishedClusterCount == 0 { markStageUpdatingStarted(updatingStageStatus, updateRun.GetGeneration()) } - // No need to continue as we only support one cluster updating at a time for now. - return clusterUpdatingWaitTime, nil + // Need to continue as we need to process at most maxConcurrency number of clusters in parallel. + continue } // Now the cluster has to be updating, the binding should point to the right resource snapshot and the binding should be bound. @@ -190,24 +209,35 @@ func (r *Reconciler) executeUpdatingStage( "bindingSpecInSync", inSync, "bindingState", bindingSpec.State, "bindingRolloutStarted", rolloutStarted, "binding", klog.KObj(binding), "updateRun", updateRunRef) markClusterUpdatingFailed(clusterStatus, updateRun.GetGeneration(), preemptedErr.Error()) - return 0, fmt.Errorf("%w: %s", errStagedUpdatedAborted, preemptedErr.Error()) + clusterUpdateErrors = append(clusterUpdateErrors, fmt.Errorf("%w: %s", errStagedUpdatedAborted, preemptedErr.Error())) + continue } finished, updateErr := checkClusterUpdateResult(binding, clusterStatus, updatingStageStatus, updateRun) + if updateErr != nil { + clusterUpdateErrors = append(clusterUpdateErrors, updateErr) + } if finished { finishedClusterCount++ - markUpdateRunProgressing(updateRun) - continue + // The cluster has finished successfully, we can process another cluster in this round. + clusterUpdatingCount-- } else { // If cluster update has been running for more than "updateRunStuckThreshold", mark the update run as stuck. timeElapsed := time.Since(clusterStartedCond.LastTransitionTime.Time) if timeElapsed > updateRunStuckThreshold { klog.V(2).InfoS("Time waiting for cluster update to finish passes threshold, mark the update run as stuck", "time elapsed", timeElapsed, "threshold", updateRunStuckThreshold, "cluster", clusterStatus.ClusterName, "stage", updatingStageStatus.StageName, "updateRun", updateRunRef) - markUpdateRunStuck(updateRun, updatingStageStatus.StageName, clusterStatus.ClusterName) + stuckClusterNames = append(stuckClusterNames, clusterStatus.ClusterName) } } - // No need to continue as we only support one cluster updating at a time for now. - return clusterUpdatingWaitTime, updateErr + } + + // After processing maxConcurrency number of cluster, check if we need to mark the update run as stuck or progressing. + aggregateUpdateRunStatus(updateRun, updatingStageStatus.StageName, stuckClusterNames) + + // Aggregate and return errors. + if len(clusterUpdateErrors) > 0 { + // Even though we aggregate errors, we can still check if one of the errors is a staged update aborted error by using errors.Is in the caller. + return 0, utilerrors.NewAggregate(clusterUpdateErrors) } if finishedClusterCount == len(updatingStageStatus.Clusters) { @@ -232,6 +262,7 @@ func (r *Reconciler) executeUpdatingStage( } return waitTime, nil } + // Some clusters are still updating. return clusterUpdatingWaitTime, nil } @@ -431,6 +462,35 @@ func (r *Reconciler) updateApprovalRequestAccepted(ctx context.Context, appReq p return nil } +// calculateMaxConcurrencyValue calculates the actual max concurrency value for a stage. 
+// It converts the IntOrString maxConcurrency (which can be an integer or percentage) to an integer value +// based on the total number of clusters in the stage. The value is rounded down with 1 at minimum. +func calculateMaxConcurrencyValue(status *placementv1beta1.UpdateRunStatus, stageIndex int) (int, error) { + specifiedMaxConcurrency := status.UpdateStrategySnapshot.Stages[stageIndex].MaxConcurrency + clusterCount := len(status.StagesStatus[stageIndex].Clusters) + // Round down the maxConcurrency to the number of clusters in the stage. + maxConcurrencyValue, err := intstr.GetScaledValueFromIntOrPercent(specifiedMaxConcurrency, clusterCount, false) + if err != nil { + return 0, err + } + // Handle the case where maxConcurrency is specified as percentage but results in 0 after scaling down. + if maxConcurrencyValue == 0 { + maxConcurrencyValue = 1 + } + return maxConcurrencyValue, nil +} + +// aggregateUpdateRunStatus aggregates the status of the update run based on the cluster update status. +// It marks the update run as stuck if any clusters are stuck, or as progressing if some clusters have finished updating. +func aggregateUpdateRunStatus(updateRun placementv1beta1.UpdateRunObj, stageName string, stuckClusterNames []string) { + if len(stuckClusterNames) > 0 { + markUpdateRunStuck(updateRun, stageName, strings.Join(stuckClusterNames, ", ")) + } else { + // If there is no stuck cluster but some progress has been made, mark the update run as progressing. + markUpdateRunProgressing(updateRun) + } +} + // isBindingSyncedWithClusterStatus checks if the binding is up-to-date with the cluster status. func isBindingSyncedWithClusterStatus(resourceSnapshotName string, updateRun placementv1beta1.UpdateRunObj, binding placementv1beta1.BindingObj, cluster *placementv1beta1.ClusterUpdatingStatus) bool { bindingSpec := binding.GetBindingSpec() @@ -544,14 +604,14 @@ func markUpdateRunProgressingIfNotWaitingOrStuck(updateRun placementv1beta1.Upda } // markUpdateRunStuck marks the updateRun as stuck in memory. -func markUpdateRunStuck(updateRun placementv1beta1.UpdateRunObj, stageName, clusterName string) { +func markUpdateRunStuck(updateRun placementv1beta1.UpdateRunObj, stageName, clusterNames string) { updateRunStatus := updateRun.GetUpdateRunStatus() meta.SetStatusCondition(&updateRunStatus.Conditions, metav1.Condition{ Type: string(placementv1beta1.StagedUpdateRunConditionProgressing), Status: metav1.ConditionFalse, ObservedGeneration: updateRun.GetGeneration(), Reason: condition.UpdateRunStuckReason, - Message: fmt.Sprintf("The updateRun is stuck waiting for cluster %s in stage %s to finish updating, please check placement status for potential errors", clusterName, stageName), + Message: fmt.Sprintf("The updateRun is stuck waiting for cluster(s) %s in stage %s to finish updating, please check placement status for potential errors", clusterNames, stageName), }) } diff --git a/pkg/controllers/updaterun/execution_test.go b/pkg/controllers/updaterun/execution_test.go index 2dbf4dcff..ce775ea50 100644 --- a/pkg/controllers/updaterun/execution_test.go +++ b/pkg/controllers/updaterun/execution_test.go @@ -17,12 +17,21 @@ limitations under the License. 
package updaterun import ( + "context" + "errors" + "strings" "testing" + "time" "github.com/google/go-cmp/cmp" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" "github.com/kubefleet-dev/kubefleet/pkg/utils/condition" @@ -398,3 +407,539 @@ func TestBuildApprovalRequestObject(t *testing.T) { }) } } + +// TODO(arvindth): Add more test cases to cover aggregate error scenarios both positive and negative cases. +func TestExecuteUpdatingStage_Error(t *testing.T) { + tests := []struct { + name string + updateRun *placementv1beta1.ClusterStagedUpdateRun + bindings []placementv1beta1.BindingObj + interceptorFunc *interceptor.Funcs + wantErr error + wantAbortErr bool + wantWaitTime time.Duration + }{ + { + name: "cluster update failed", + updateRun: &placementv1beta1.ClusterStagedUpdateRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-update-run", + Generation: 1, + }, + Spec: placementv1beta1.UpdateRunSpec{ + PlacementName: "test-placement", + ResourceSnapshotIndex: "1", + }, + Status: placementv1beta1.UpdateRunStatus{ + StagesStatus: []placementv1beta1.StageUpdatingStatus{ + { + StageName: "test-stage", + Clusters: []placementv1beta1.ClusterUpdatingStatus{ + { + ClusterName: "cluster-1", + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ClusterUpdatingConditionSucceeded), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: condition.ClusterUpdatingFailedReason, + Message: "cluster update failed", + }, + }, + }, + }, + }, + }, + UpdateStrategySnapshot: &placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "test-stage", + MaxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + }, + }, + }, + }, + }, + bindings: nil, + interceptorFunc: nil, + wantErr: errors.New("the cluster `cluster-1` in the stage test-stage has failed"), + wantAbortErr: true, + wantWaitTime: 0, + }, + { + name: "binding update failure", + updateRun: &placementv1beta1.ClusterStagedUpdateRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-update-run", + Generation: 1, + }, + Spec: placementv1beta1.UpdateRunSpec{ + PlacementName: "test-placement", + ResourceSnapshotIndex: "1", + }, + Status: placementv1beta1.UpdateRunStatus{ + StagesStatus: []placementv1beta1.StageUpdatingStatus{ + { + StageName: "test-stage", + Clusters: []placementv1beta1.ClusterUpdatingStatus{ + { + ClusterName: "cluster-1", + }, + }, + }, + }, + UpdateStrategySnapshot: &placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "test-stage", + MaxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + }, + }, + }, + }, + }, + bindings: []placementv1beta1.BindingObj{ + &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "binding-1", + Generation: 1, + }, + Spec: placementv1beta1.ResourceBindingSpec{ + TargetCluster: "cluster-1", + State: placementv1beta1.BindingStateScheduled, + }, + }, + }, + interceptorFunc: &interceptor.Funcs{ + Update: func(ctx context.Context, client client.WithWatch, obj client.Object, opts ...client.UpdateOption) error { + return errors.New("simulated update error") + }, + }, + wantErr: errors.New("simulated update 
error"), + wantWaitTime: 0, + }, + { + name: "binding preemption", + updateRun: &placementv1beta1.ClusterStagedUpdateRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-update-run", + Generation: 1, + }, + Spec: placementv1beta1.UpdateRunSpec{ + PlacementName: "test-placement", + ResourceSnapshotIndex: "1", + }, + Status: placementv1beta1.UpdateRunStatus{ + StagesStatus: []placementv1beta1.StageUpdatingStatus{ + { + StageName: "test-stage", + Clusters: []placementv1beta1.ClusterUpdatingStatus{ + { + ClusterName: "cluster-1", + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ClusterUpdatingConditionStarted), + Status: metav1.ConditionTrue, + ObservedGeneration: 1, + Reason: condition.ClusterUpdatingStartedReason, + }, + }, + }, + }, + }, + }, + UpdateStrategySnapshot: &placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "test-stage", + MaxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + }, + }, + }, + }, + }, + bindings: []placementv1beta1.BindingObj{ + &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "binding-1", + Generation: 1, + }, + Spec: placementv1beta1.ResourceBindingSpec{ + TargetCluster: "cluster-1", + ResourceSnapshotName: "wrong-snapshot", + State: placementv1beta1.BindingStateBound, + }, + Status: placementv1beta1.ResourceBindingStatus{ + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ResourceBindingRolloutStarted), + Status: metav1.ConditionTrue, + ObservedGeneration: 1, + }, + }, + }, + }, + }, + interceptorFunc: nil, + wantErr: errors.New("the binding of the updating cluster `cluster-1` in the stage `test-stage` is not up-to-date with the desired status"), + wantAbortErr: true, + wantWaitTime: 0, + }, + { + name: "binding synced but state not bound - update binding state fails", + updateRun: &placementv1beta1.ClusterStagedUpdateRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-update-run", + Generation: 1, + }, + Spec: placementv1beta1.UpdateRunSpec{ + PlacementName: "test-placement", + ResourceSnapshotIndex: "1", + }, + Status: placementv1beta1.UpdateRunStatus{ + ResourceSnapshotIndexUsed: "1", + StagesStatus: []placementv1beta1.StageUpdatingStatus{ + { + StageName: "test-stage", + Clusters: []placementv1beta1.ClusterUpdatingStatus{ + { + ClusterName: "cluster-1", + // No conditions - cluster has not started updating yet. + }, + }, + }, + }, + UpdateStrategySnapshot: &placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "test-stage", + MaxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + }, + }, + }, + }, + }, + bindings: []placementv1beta1.BindingObj{ + &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "binding-1", + Generation: 1, + }, + Spec: placementv1beta1.ResourceBindingSpec{ + TargetCluster: "cluster-1", + ResourceSnapshotName: "test-placement-1-snapshot", // Already synced. + State: placementv1beta1.BindingStateScheduled, // But not Bound yet. 
+ }, + }, + }, + interceptorFunc: &interceptor.Funcs{ + Update: func(ctx context.Context, client client.WithWatch, obj client.Object, opts ...client.UpdateOption) error { + return errors.New("failed to update binding state") + }, + }, + wantErr: errors.New("failed to update binding state"), + wantWaitTime: 0, + }, + { + name: "binding synced and bound but generation updated - update rolloutStarted fails", + updateRun: &placementv1beta1.ClusterStagedUpdateRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-update-run", + Generation: 1, + }, + Spec: placementv1beta1.UpdateRunSpec{ + PlacementName: "test-placement", + ResourceSnapshotIndex: "1", + }, + Status: placementv1beta1.UpdateRunStatus{ + ResourceSnapshotIndexUsed: "1", + StagesStatus: []placementv1beta1.StageUpdatingStatus{ + { + StageName: "test-stage", + Clusters: []placementv1beta1.ClusterUpdatingStatus{ + { + ClusterName: "cluster-1", + // No conditions - cluster has not started updating yet. + }, + }, + }, + }, + UpdateStrategySnapshot: &placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "test-stage", + MaxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + }, + }, + }, + }, + }, + bindings: []placementv1beta1.BindingObj{ + &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "binding-1", + Generation: 2, // Generation updated by scheduler. + }, + Spec: placementv1beta1.ResourceBindingSpec{ + TargetCluster: "cluster-1", + ResourceSnapshotName: "test-placement-1-snapshot", // Already synced. + State: placementv1beta1.BindingStateBound, // Already Bound. + }, + Status: placementv1beta1.ResourceBindingStatus{ + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ResourceBindingRolloutStarted), + Status: metav1.ConditionTrue, + ObservedGeneration: 1, // Old generation - needs update. + Reason: condition.RolloutStartedReason, + }, + }, + }, + }, + }, + interceptorFunc: &interceptor.Funcs{ + SubResourceUpdate: func(ctx context.Context, client client.Client, subResourceName string, obj client.Object, opts ...client.SubResourceUpdateOption) error { + // Fail the status update for rolloutStarted. + return errors.New("failed to update binding rolloutStarted status") + }, + }, + wantErr: errors.New("failed to update binding rolloutStarted status"), + wantWaitTime: 0, + }, + { + name: "binding synced, bound, rolloutStarted true, but binding has failed condition", + updateRun: &placementv1beta1.ClusterStagedUpdateRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-update-run", + Generation: 1, + }, + Spec: placementv1beta1.UpdateRunSpec{ + PlacementName: "test-placement", + ResourceSnapshotIndex: "1", + }, + Status: placementv1beta1.UpdateRunStatus{ + ResourceSnapshotIndexUsed: "1", + StagesStatus: []placementv1beta1.StageUpdatingStatus{ + { + StageName: "test-stage", + Clusters: []placementv1beta1.ClusterUpdatingStatus{ + { + ClusterName: "cluster-1", + // No conditions - cluster has not started updating yet. 
+ }, + }, + }, + }, + UpdateStrategySnapshot: &placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "test-stage", + MaxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + }, + }, + }, + }, + }, + bindings: []placementv1beta1.BindingObj{ + &placementv1beta1.ClusterResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "binding-1", + Generation: 1, + }, + Spec: placementv1beta1.ResourceBindingSpec{ + TargetCluster: "cluster-1", + ResourceSnapshotName: "test-placement-1-snapshot", // Already synced. + State: placementv1beta1.BindingStateBound, // Already Bound. + }, + Status: placementv1beta1.ResourceBindingStatus{ + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ResourceBindingRolloutStarted), + Status: metav1.ConditionTrue, + ObservedGeneration: 1, + Reason: condition.RolloutStartedReason, + }, + { + Type: string(placementv1beta1.ResourceBindingApplied), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: condition.ApplyFailedReason, + }, + }, + }, + }, + }, + interceptorFunc: nil, + wantErr: errors.New("cluster updating encountered an error at stage"), + wantWaitTime: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + scheme := runtime.NewScheme() + _ = placementv1beta1.AddToScheme(scheme) + + var fakeClient client.Client + objs := make([]client.Object, len(tt.bindings)) + for i := range tt.bindings { + objs[i] = tt.bindings[i] + } + if tt.interceptorFunc != nil { + fakeClient = interceptor.NewClient( + fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build(), + *tt.interceptorFunc, + ) + } else { + fakeClient = fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build() + } + + r := &Reconciler{ + Client: fakeClient, + } + + // Execute the stage. + waitTime, gotErr := r.executeUpdatingStage(ctx, tt.updateRun, 0, tt.bindings, 1) + + // Verify error expectation. + if (tt.wantErr != nil) != (gotErr != nil) { + t.Fatalf("executeUpdatingStage() want error: %v, got error: %v", tt.wantErr, gotErr) + } + + // Verify error message contains expected substring. + if tt.wantErr != nil && gotErr != nil { + if errors.Is(gotErr, errStagedUpdatedAborted) != tt.wantAbortErr { + t.Fatalf("executeUpdatingStage() want abort error: %v, got error: %v", tt.wantAbortErr, gotErr) + } + if !strings.Contains(gotErr.Error(), tt.wantErr.Error()) { + t.Fatalf("executeUpdatingStage() want error: %v, got error: %v", tt.wantErr, gotErr) + } + } + + // Verify wait time. 
+ if waitTime != tt.wantWaitTime { + t.Fatalf("executeUpdatingStage() want waitTime: %v, got waitTime: %v", tt.wantWaitTime, waitTime) + } + }) + } +} + +func TestCalculateMaxConcurrencyValue(t *testing.T) { + tests := []struct { + name string + maxConcurrency *intstr.IntOrString + clusterCount int + wantValue int + wantErr bool + }{ + { + name: "integer value - less than cluster count", + maxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 3}, + clusterCount: 10, + wantValue: 3, + wantErr: false, + }, + { + name: "integer value - equal to cluster count", + maxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 10}, + clusterCount: 10, + wantValue: 10, + wantErr: false, + }, + { + name: "integer value - greater than cluster count", + maxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 15}, + clusterCount: 10, + wantValue: 15, + wantErr: false, + }, + { + name: "percentage value - 50% with cluster count > 1", + maxConcurrency: &intstr.IntOrString{Type: intstr.String, StrVal: "50%"}, + clusterCount: 10, + wantValue: 5, + wantErr: false, + }, + { + name: "percentage value - non zero percentage with cluster count equal to 1", + maxConcurrency: &intstr.IntOrString{Type: intstr.String, StrVal: "10%"}, + clusterCount: 1, + wantValue: 1, + wantErr: false, + }, + { + name: "percentage value - 33% rounds down", + maxConcurrency: &intstr.IntOrString{Type: intstr.String, StrVal: "33%"}, + clusterCount: 10, + wantValue: 3, + wantErr: false, + }, + { + name: "percentage value - 100%", + maxConcurrency: &intstr.IntOrString{Type: intstr.String, StrVal: "100%"}, + clusterCount: 10, + wantValue: 10, + wantErr: false, + }, + { + name: "percentage value - 25% with 7 clusters", + maxConcurrency: &intstr.IntOrString{Type: intstr.String, StrVal: "25%"}, + clusterCount: 7, + wantValue: 1, + wantErr: false, + }, + { + name: "zero clusters", + maxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 3}, + clusterCount: 0, + wantValue: 3, + wantErr: false, + }, + { + name: "non-zero percentage with zero clusters", + maxConcurrency: &intstr.IntOrString{Type: intstr.String, StrVal: "50%"}, + clusterCount: 0, + wantValue: 1, + wantErr: false, + }, + { + name: "non-zero value as string without percentage with zero clusters", + maxConcurrency: &intstr.IntOrString{Type: intstr.String, StrVal: "50"}, + clusterCount: 0, + wantValue: 0, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + status := &placementv1beta1.UpdateRunStatus{ + StagesStatus: []placementv1beta1.StageUpdatingStatus{ + { + StageName: "test-stage", + Clusters: make([]placementv1beta1.ClusterUpdatingStatus, tt.clusterCount), + }, + }, + UpdateStrategySnapshot: &placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "test-stage", + MaxConcurrency: tt.maxConcurrency, + }, + }, + }, + } + + gotValue, gotErr := calculateMaxConcurrencyValue(status, 0) + + if (gotErr != nil) != tt.wantErr { + t.Fatalf("calculateMaxConcurrencyValue() error = %v, wantErr %v", gotErr, tt.wantErr) + } + + if gotValue != tt.wantValue { + t.Fatalf("calculateMaxConcurrencyValue() = %v, want %v", gotValue, tt.wantValue) + } + }) + } +} diff --git a/pkg/controllers/updaterun/validation.go b/pkg/controllers/updaterun/validation.go index 27d557b77..ffa5ea3c2 100644 --- a/pkg/controllers/updaterun/validation.go +++ b/pkg/controllers/updaterun/validation.go @@ -234,23 +234,7 @@ func validateClusterUpdatingStatus( return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, 
unexpectedErr.Error()) } updatingStageIndex = curStage - // Collect the updating clusters. - var updatingClusters []string - for j := range stageStatus.Clusters { - clusterStartedCond := meta.FindStatusCondition(stageStatus.Clusters[j].Conditions, string(placementv1beta1.ClusterUpdatingConditionStarted)) - clusterFinishedCond := meta.FindStatusCondition(stageStatus.Clusters[j].Conditions, string(placementv1beta1.ClusterUpdatingConditionSucceeded)) - if condition.IsConditionStatusTrue(clusterStartedCond, updateRun.GetGeneration()) && - !(condition.IsConditionStatusTrue(clusterFinishedCond, updateRun.GetGeneration()) || condition.IsConditionStatusFalse(clusterFinishedCond, updateRun.GetGeneration())) { - updatingClusters = append(updatingClusters, stageStatus.Clusters[j].ClusterName) - } - } - // We don't allow more than one clusters to be updating at the same time. - // TODO(wantjian): support multiple clusters updating at the same time. - if len(updatingClusters) > 1 { - unexpectedErr := controller.NewUnexpectedBehaviorError(fmt.Errorf("more than one cluster is updating in the stage `%s`, clusters: %v", stageStatus.StageName, updatingClusters)) - klog.ErrorS(unexpectedErr, "Detected more than one updating clusters in the stage", "updateRun", klog.KObj(updateRun)) - return -1, -1, fmt.Errorf("%w: %s", errStagedUpdatedAborted, unexpectedErr.Error()) - } + // TODO(arvindth): add validation to ensure updating cluster count should not exceed maxConcurrency. } return updatingStageIndex, lastFinishedStageIndex, nil } diff --git a/pkg/controllers/updaterun/validation_test.go b/pkg/controllers/updaterun/validation_test.go index 0f01168e3..d6ad8215d 100644 --- a/pkg/controllers/updaterun/validation_test.go +++ b/pkg/controllers/updaterun/validation_test.go @@ -145,7 +145,7 @@ func TestValidateClusterUpdatingStatus(t *testing.T) { wantLastFinishedStageIndex: -1, }, { - name: "determineUpdatignStage should return error if there are multiple clusters updating in an updating stage", + name: "determineUpdatignStage should not return error if there are multiple clusters updating in an updating stage", curStage: 0, updatingStageIndex: -1, lastFinishedStageIndex: -1, @@ -163,8 +163,8 @@ func TestValidateClusterUpdatingStatus(t *testing.T) { }, }, }, - wantErr: wrapErr(true, fmt.Errorf("more than one cluster is updating in the stage `test-stage`, clusters: [cluster-1 cluster-2]")), - wantUpdatingStageIndex: -1, + wantErr: nil, + wantUpdatingStageIndex: 0, wantLastFinishedStageIndex: -1, }, { diff --git a/test/apis/placement/v1beta1/api_validation_integration_test.go b/test/apis/placement/v1beta1/api_validation_integration_test.go index d8eeb7d0c..9d14e54f9 100644 --- a/test/apis/placement/v1beta1/api_validation_integration_test.go +++ b/test/apis/placement/v1beta1/api_validation_integration_test.go @@ -1282,6 +1282,86 @@ var _ = Describe("Test placement v1beta1 API validation", func() { Expect(strategy.Spec.Stages[0].BeforeStageTasks[0].WaitTime).Should(BeNil()) Expect(hubClient.Delete(ctx, &strategy)).Should(Succeed()) }) + + It("Should allow creation of ClusterStagedUpdateStrategy with MaxConcurrency as integer between 1-100", func() { + maxConcurrency := intstr.FromInt(70) + strategy := placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 
1), + MaxConcurrency: &maxConcurrency, + }, + }, + }, + } + Expect(hubClient.Create(ctx, &strategy)).Should(Succeed()) + Expect(*strategy.Spec.Stages[0].MaxConcurrency).Should(Equal(maxConcurrency)) + Expect(hubClient.Delete(ctx, &strategy)).Should(Succeed()) + }) + + It("Should allow creation of ClusterStagedUpdateStrategy with MaxConcurrency as integer greater than 100", func() { + maxConcurrency := intstr.FromInt(150) + strategy := placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + MaxConcurrency: &maxConcurrency, + }, + }, + }, + } + Expect(hubClient.Create(ctx, &strategy)).Should(Succeed()) + Expect(*strategy.Spec.Stages[0].MaxConcurrency).Should(Equal(maxConcurrency)) + Expect(hubClient.Delete(ctx, &strategy)).Should(Succeed()) + }) + + It("Should allow creation of ClusterStagedUpdateStrategy with MaxConcurrency as 1%", func() { + maxConcurrency := intstr.FromString("1%") + strategy := placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + MaxConcurrency: &maxConcurrency, + }, + }, + }, + } + Expect(hubClient.Create(ctx, &strategy)).Should(Succeed()) + Expect(*strategy.Spec.Stages[0].MaxConcurrency).Should(Equal(maxConcurrency)) + Expect(hubClient.Delete(ctx, &strategy)).Should(Succeed()) + }) + + It("Should allow creation of ClusterStagedUpdateStrategy with MaxConcurrency as 100%", func() { + maxConcurrency := intstr.FromString("100%") + strategy := placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + MaxConcurrency: &maxConcurrency, + }, + }, + }, + } + Expect(hubClient.Create(ctx, &strategy)).Should(Succeed()) + Expect(*strategy.Spec.Stages[0].MaxConcurrency).Should(Equal(maxConcurrency)) + Expect(hubClient.Delete(ctx, &strategy)).Should(Succeed()) + }) }) Context("Test ClusterStagedUpdateStrategy API validation - invalid cases", func() { @@ -1567,6 +1647,132 @@ var _ = Describe("Test placement v1beta1 API validation", func() { Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. 
Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("Too many: 2: must have at most 1 items")) }) + + It("Should deny creation of ClusterStagedUpdateStrategy with MaxConcurrency set to a negative value", func() { + maxConcurrency := intstr.FromInt(-1) + strategy := placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + MaxConcurrency: &maxConcurrency, + }, + }, + }, + } + err := hubClient.Create(ctx, &strategy) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("maxConcurrency must be at least 1")) + }) + + It("Should deny creation of ClusterStagedUpdateStrategy with MaxConcurrency set to 0", func() { + maxConcurrency := intstr.FromInt(0) + strategy := placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + MaxConcurrency: &maxConcurrency, + }, + }, + }, + } + err := hubClient.Create(ctx, &strategy) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("maxConcurrency must be at least 1")) + }) + + It("Should deny creation of ClusterStagedUpdateStrategy with MaxConcurrency set to '0'", func() { + maxConcurrency := intstr.FromString("0") + strategy := placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + MaxConcurrency: &maxConcurrency, + }, + }, + }, + } + err := hubClient.Create(ctx, &strategy) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. 
Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].maxConcurrency in body should match")) + }) + + It("Should deny creation of ClusterStagedUpdateStrategy with MaxConcurrency set to '50'", func() { + maxConcurrency := intstr.FromString("50") + strategy := placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + MaxConcurrency: &maxConcurrency, + }, + }, + }, + } + err := hubClient.Create(ctx, &strategy) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].maxConcurrency in body should match")) + }) + + It("Should deny creation of ClusterStagedUpdateStrategy with MaxConcurrency set to 0%", func() { + maxConcurrency := intstr.FromString("0%") + strategy := placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + MaxConcurrency: &maxConcurrency, + }, + }, + }, + } + err := hubClient.Create(ctx, &strategy) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].maxConcurrency in body should match")) + }) + + It("Should deny creation of ClusterStagedUpdateStrategy with MaxConcurrency set to 101%", func() { + maxConcurrency := intstr.FromString("101%") + strategy := placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(updateRunStrategyNameTemplate, GinkgoParallelProcess()), + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: fmt.Sprintf(updateRunStageNameTemplate, GinkgoParallelProcess(), 1), + MaxConcurrency: &maxConcurrency, + }, + }, + }, + } + err := hubClient.Create(ctx, &strategy) + var statusErr *k8sErrors.StatusError + Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create updateRunStrategy call produced error %s. 
Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("spec.stages\\[0\\].maxConcurrency in body should match")) + }) }) Context("Test ClusterApprovalRequest API validation - valid cases", func() { diff --git a/test/e2e/cluster_staged_updaterun_test.go b/test/e2e/cluster_staged_updaterun_test.go index 5f93a4254..8e8b28822 100644 --- a/test/e2e/cluster_staged_updaterun_test.go +++ b/test/e2e/cluster_staged_updaterun_test.go @@ -27,6 +27,7 @@ import ( apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" @@ -37,7 +38,8 @@ import ( const ( // The current stage wait between clusters are 15 seconds - updateRunEventuallyDuration = time.Minute + updateRunEventuallyDuration = time.Minute + updateRunParallelEventuallyDuration = 20 * time.Second resourceSnapshotIndex1st = "0" resourceSnapshotIndex2nd = "1" @@ -1439,6 +1441,186 @@ var _ = Describe("test CRP rollout with staged update run", func() { } }) }) + + Context("Test parallel cluster updates with maxConcurrency set to 3", Ordered, func() { + var strategy *placementv1beta1.ClusterStagedUpdateStrategy + updateRunName := fmt.Sprintf(clusterStagedUpdateRunNameWithSubIndexTemplate, GinkgoParallelProcess(), 0) + + BeforeAll(func() { + // Create a test namespace and a configMap inside it on the hub cluster. + createWorkResources() + + // Create the CRP with external rollout strategy. + crp := &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: crpName, + // Add a custom finalizer; this would allow us to better observe + // the behavior of the controllers. + Finalizers: []string{customDeletionBlockerFinalizer}, + }, + Spec: placementv1beta1.PlacementSpec{ + ResourceSelectors: workResourceSelector(), + Strategy: placementv1beta1.RolloutStrategy{ + Type: placementv1beta1.ExternalRolloutStrategyType, + }, + }, + } + Expect(hubClient.Create(ctx, crp)).To(Succeed(), "Failed to create CRP") + + // Create a strategy with a single stage selecting all 3 clusters with maxConcurrency specified. + strategy = &placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: strategyName, + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "parallel", + // Pick all clusters in a single stage. + LabelSelector: &metav1.LabelSelector{}, + MaxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 3}, + }, + }, + }, + } + Expect(hubClient.Create(ctx, strategy)).To(Succeed(), "Failed to create ClusterStagedUpdateStrategy") + }) + + AfterAll(func() { + // Remove the custom deletion blocker finalizer from the CRP. + ensureCRPAndRelatedResourcesDeleted(crpName, allMemberClusters) + + // Delete the clusterStagedUpdateRun. + ensureClusterStagedUpdateRunDeletion(updateRunName) + + // Delete the clusterStagedUpdateStrategy. 
+ ensureClusterUpdateRunStrategyDeletion(strategyName) + }) + + It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedWorkResourcesFromAllMemberClustersConsistently) + + It("Should have the latest resource snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + }) + + It("Should successfully schedule the crp", func() { + validateLatestClusterSchedulingPolicySnapshot(crpName, policySnapshotIndex1st, 3) + }) + + It("Should update crp status as pending rollout", func() { + crpStatusUpdatedActual := crpStatusWithExternalStrategyActual(nil, "", false, allMemberClusterNames, []string{"", "", ""}, []bool{false, false, false}, nil, nil) + Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP %s status as expected", crpName) + }) + + It("Should create a cluster staged update run successfully", func() { + createClusterStagedUpdateRunSucceed(updateRunName, crpName, resourceSnapshotIndex1st, strategyName) + }) + + It("Should complete the cluster staged update run with all 3 clusters updated in parallel", func() { + // With maxConcurrency=3, all 3 clusters should be updated in parallel. + // Each round waits 15 seconds, so total time should be under 20s. + csurSucceededActual := clusterStagedUpdateRunStatusSucceededActual(updateRunName, resourceSnapshotIndex1st, policySnapshotIndex1st, len(allMemberClusters), defaultApplyStrategy, &strategy.Spec, [][]string{{allMemberClusterNames[0], allMemberClusterNames[1], allMemberClusterNames[2]}}, nil, nil, nil) + Eventually(csurSucceededActual, updateRunParallelEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to validate updateRun %s succeeded", updateRunName) + checkIfPlacedWorkResourcesOnMemberClustersInUpdateRun(allMemberClusters) + }) + + It("Should update crp status as completed", func() { + crpStatusUpdatedActual := crpStatusWithExternalStrategyActual(workResourceIdentifiers(), resourceSnapshotIndex1st, true, allMemberClusterNames, + []string{resourceSnapshotIndex1st, resourceSnapshotIndex1st, resourceSnapshotIndex1st}, []bool{true, true, true}, nil, nil) + Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP %s status as expected", crpName) + }) + }) + + Context("Test parallel cluster updates with maxConcurrency set to 70%", Ordered, func() { + var strategy *placementv1beta1.ClusterStagedUpdateStrategy + updateRunName := fmt.Sprintf(clusterStagedUpdateRunNameWithSubIndexTemplate, GinkgoParallelProcess(), 0) + + BeforeAll(func() { + // Create a test namespace and a configMap inside it on the hub cluster. + createWorkResources() + + // Create the CRP with external rollout strategy. + crp := &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: crpName, + // Add a custom finalizer; this would allow us to better observe + // the behavior of the controllers. + Finalizers: []string{customDeletionBlockerFinalizer}, + }, + Spec: placementv1beta1.PlacementSpec{ + ResourceSelectors: workResourceSelector(), + Strategy: placementv1beta1.RolloutStrategy{ + Type: placementv1beta1.ExternalRolloutStrategyType, + }, + }, + } + Expect(hubClient.Create(ctx, crp)).To(Succeed(), "Failed to create CRP") + + // Create a strategy with maxConcurrency set to 70%. + // With 3 clusters, 70% = 2.1, rounded down to 2 clusters. 
+ strategy = &placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: strategyName, + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "percentage", + // Pick all clusters in a single stage. + LabelSelector: &metav1.LabelSelector{}, + MaxConcurrency: &intstr.IntOrString{Type: intstr.String, StrVal: "70%"}, + }, + }, + }, + } + Expect(hubClient.Create(ctx, strategy)).To(Succeed(), "Failed to create ClusterStagedUpdateStrategy") + }) + + AfterAll(func() { + // Remove the custom deletion blocker finalizer from the CRP. + ensureCRPAndRelatedResourcesDeleted(crpName, allMemberClusters) + + // Delete the clusterStagedUpdateRun. + ensureClusterStagedUpdateRunDeletion(updateRunName) + + // Delete the clusterStagedUpdateStrategy. + ensureClusterUpdateRunStrategyDeletion(strategyName) + }) + + It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedWorkResourcesFromAllMemberClustersConsistently) + + It("Should have the latest resource snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + }) + + It("Should successfully schedule the crp", func() { + validateLatestClusterSchedulingPolicySnapshot(crpName, policySnapshotIndex1st, 3) + }) + + It("Should update crp status as pending rollout", func() { + crpStatusUpdatedActual := crpStatusWithExternalStrategyActual(nil, "", false, allMemberClusterNames, []string{"", "", ""}, []bool{false, false, false}, nil, nil) + Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP %s status as expected", crpName) + }) + + It("Should create a cluster staged update run successfully", func() { + createClusterStagedUpdateRunSucceed(updateRunName, crpName, resourceSnapshotIndex1st, strategyName) + }) + + It("Should complete the cluster staged update run with all 3 clusters", func() { + // Since maxConcurrency=70% each round we process 2 clusters in parallel, + // so all 3 clusters should be updated in 2 rounds. + // Each round waits 15 seconds, so total time should be under 40s. + csurSucceededActual := clusterStagedUpdateRunStatusSucceededActual(updateRunName, resourceSnapshotIndex1st, policySnapshotIndex1st, len(allMemberClusters), defaultApplyStrategy, &strategy.Spec, [][]string{{allMemberClusterNames[0], allMemberClusterNames[1], allMemberClusterNames[2]}}, nil, nil, nil) + Eventually(csurSucceededActual, updateRunParallelEventuallyDuration*2, eventuallyInterval).Should(Succeed(), "Failed to validate updateRun %s succeeded", updateRunName) + checkIfPlacedWorkResourcesOnMemberClustersInUpdateRun(allMemberClusters) + }) + + It("Should update crp status as completed", func() { + crpStatusUpdatedActual := crpStatusWithExternalStrategyActual(workResourceIdentifiers(), resourceSnapshotIndex1st, true, allMemberClusterNames, + []string{resourceSnapshotIndex1st, resourceSnapshotIndex1st, resourceSnapshotIndex1st}, []bool{true, true, true}, nil, nil) + Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP %s status as expected", crpName) + }) + }) }) // Note that this container cannot run in parallel with other containers. 
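The timing comments in the e2e cases above assume each update round takes roughly the 15-second between-cluster wait, so a stage of n clusters finishes in about ceil(n / effectiveConcurrency) rounds. The short standalone Go sketch below, which is not part of the test suite, simply spells out that arithmetic for the two scenarios (maxConcurrency=3 and maxConcurrency=70% of 3 clusters); the 15-second figure and round counts come from the test comments, and everything else is illustrative.

    package main

    import (
    	"fmt"
    	"math"
    	"time"
    )

    // expectedRounds computes how many rounds a stage of `clusters` clusters
    // needs when up to `effectiveConcurrency` clusters update in parallel.
    func expectedRounds(clusters, effectiveConcurrency int) int {
    	return int(math.Ceil(float64(clusters) / float64(effectiveConcurrency)))
    }

    func main() {
    	const roundWait = 15 * time.Second // per-round wait assumed by the e2e test comments
    	cases := []struct {
    		name        string
    		clusters    int
    		concurrency int
    	}{
    		{"maxConcurrency=3", 3, 3},   // 1 round, well under the 20s Eventually budget
    		{"maxConcurrency=70%", 3, 2}, // 70% of 3 resolves to 2, so 2 rounds, under the 40s budget
    	}
    	for _, tc := range cases {
    		rounds := expectedRounds(tc.clusters, tc.concurrency)
    		fmt.Printf("%s: %d round(s), ~%s of round waits\n", tc.name, rounds, time.Duration(rounds)*roundWait)
    	}
    }
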
diff --git a/test/e2e/staged_updaterun_test.go b/test/e2e/staged_updaterun_test.go index 277d0221a..2f03affa9 100644 --- a/test/e2e/staged_updaterun_test.go +++ b/test/e2e/staged_updaterun_test.go @@ -27,6 +27,7 @@ import ( "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" @@ -1266,6 +1267,184 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem } }) }) + + Context("Test parallel cluster updates with maxConcurrency set to 3", Ordered, func() { + var strategy *placementv1beta1.StagedUpdateStrategy + updateRunName := fmt.Sprintf(stagedUpdateRunNameWithSubIndexTemplate, GinkgoParallelProcess(), 0) + + BeforeAll(func() { + // Create the RP with external rollout strategy. + rp := &placementv1beta1.ResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: rpName, + Namespace: testNamespace, + // Add a custom finalizer; this would allow us to better observe + // the behavior of the controllers. + Finalizers: []string{customDeletionBlockerFinalizer}, + }, + Spec: placementv1beta1.PlacementSpec{ + ResourceSelectors: configMapSelector(), + Strategy: placementv1beta1.RolloutStrategy{ + Type: placementv1beta1.ExternalRolloutStrategyType, + }, + }, + } + Expect(hubClient.Create(ctx, rp)).To(Succeed(), "Failed to create RP") + + // Create a strategy with a single stage selecting all 3 clusters with maxConcurrency specified. + strategy = &placementv1beta1.StagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: strategyName, + Namespace: testNamespace, + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "parallel", + // Pick all clusters in a single stage. + LabelSelector: &metav1.LabelSelector{}, + MaxConcurrency: &intstr.IntOrString{Type: intstr.Int, IntVal: 3}, + }, + }, + }, + } + Expect(hubClient.Create(ctx, strategy)).To(Succeed(), "Failed to create StagedUpdateStrategy") + }) + + AfterAll(func() { + // Remove the custom deletion blocker finalizer from the RP. + ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: testNamespace}, allMemberClusters) + + // Delete the stagedUpdateRun. + ensureStagedUpdateRunDeletion(updateRunName, testNamespace) + + // Delete the stagedUpdateStrategy. 
+ ensureStagedUpdateRunStrategyDeletion(strategyName, testNamespace) + }) + + It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedConfigMapFromAllMemberClustersConsistently) + + It("Should have the latest resource snapshot", func() { + validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) + }) + + It("Should successfully schedule the rp", func() { + validateLatestSchedulingPolicySnapshot(rpName, testNamespace, policySnapshotIndex1st, 3) + }) + + It("Should update rp status as pending rollout", func() { + rpStatusUpdatedActual := rpStatusWithExternalStrategyActual(nil, "", false, allMemberClusterNames, []string{"", "", ""}, []bool{false, false, false}, nil, nil) + Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s/%s status as expected", testNamespace, rpName) + }) + + It("Should create a staged update run successfully", func() { + createStagedUpdateRunSucceed(updateRunName, testNamespace, rpName, resourceSnapshotIndex1st, strategyName) + }) + + It("Should complete the staged update run with all 3 clusters updated in parallel", func() { + // With maxConcurrency=3, all 3 clusters should be updated in parallel. + // Each round waits 15 seconds, so total time should be under 20s. + surSucceededActual := stagedUpdateRunStatusSucceededActual(updateRunName, testNamespace, resourceSnapshotIndex1st, policySnapshotIndex1st, len(allMemberClusters), defaultApplyStrategy, &strategy.Spec, [][]string{{allMemberClusterNames[0], allMemberClusterNames[1], allMemberClusterNames[2]}}, nil, nil, nil) + Eventually(surSucceededActual, updateRunParallelEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to validate updateRun %s/%s succeeded", testNamespace, updateRunName) + checkIfPlacedWorkResourcesOnMemberClustersInUpdateRun(allMemberClusters) + }) + + It("Should update rp status as completed", func() { + rpStatusUpdatedActual := rpStatusWithExternalStrategyActual(appConfigMapIdentifiers(), resourceSnapshotIndex1st, true, allMemberClusterNames, + []string{resourceSnapshotIndex1st, resourceSnapshotIndex1st, resourceSnapshotIndex1st}, []bool{true, true, true}, nil, nil) + Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s/%s status as expected", testNamespace, rpName) + }) + }) + + Context("Test parallel cluster updates with maxConcurrency set to 70%", Ordered, func() { + var strategy *placementv1beta1.StagedUpdateStrategy + updateRunName := fmt.Sprintf(stagedUpdateRunNameWithSubIndexTemplate, GinkgoParallelProcess(), 0) + + BeforeAll(func() { + // Create the RP with external rollout strategy. + rp := &placementv1beta1.ResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: rpName, + Namespace: testNamespace, + // Add a custom finalizer; this would allow us to better observe + // the behavior of the controllers. + Finalizers: []string{customDeletionBlockerFinalizer}, + }, + Spec: placementv1beta1.PlacementSpec{ + ResourceSelectors: configMapSelector(), + Strategy: placementv1beta1.RolloutStrategy{ + Type: placementv1beta1.ExternalRolloutStrategyType, + }, + }, + } + Expect(hubClient.Create(ctx, rp)).To(Succeed(), "Failed to create RP") + + // Create a strategy with maxConcurrency set to 70%. + // With 3 clusters, 70% = 2.1, rounded down to 2 clusters. 
+ strategy = &placementv1beta1.StagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: strategyName, + Namespace: testNamespace, + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "parallel", + // Pick all clusters in a single stage. + LabelSelector: &metav1.LabelSelector{}, + MaxConcurrency: &intstr.IntOrString{Type: intstr.String, StrVal: "70%"}, + }, + }, + }, + } + Expect(hubClient.Create(ctx, strategy)).To(Succeed(), "Failed to create StagedUpdateStrategy") + }) + + AfterAll(func() { + // Remove the custom deletion blocker finalizer from the RP. + ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: testNamespace}, allMemberClusters) + + // Delete the stagedUpdateRun. + ensureStagedUpdateRunDeletion(updateRunName, testNamespace) + + // Delete the stagedUpdateStrategy. + ensureStagedUpdateRunStrategyDeletion(strategyName, testNamespace) + }) + + It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedConfigMapFromAllMemberClustersConsistently) + + It("Should have the latest resource snapshot", func() { + validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) + }) + + It("Should successfully schedule the rp", func() { + validateLatestSchedulingPolicySnapshot(rpName, testNamespace, policySnapshotIndex1st, 3) + }) + + It("Should update rp status as pending rollout", func() { + rpStatusUpdatedActual := rpStatusWithExternalStrategyActual(nil, "", false, allMemberClusterNames, []string{"", "", ""}, []bool{false, false, false}, nil, nil) + Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s/%s status as expected", testNamespace, rpName) + }) + + It("Should create a staged update run successfully", func() { + createStagedUpdateRunSucceed(updateRunName, testNamespace, rpName, resourceSnapshotIndex1st, strategyName) + }) + + It("Should complete the staged update run with all 3 clusters", func() { + // Since maxConcurrency=70% each round we process 2 clusters in parallel, + // so all 3 clusters should be updated in 2 rounds. + // Each round waits 15 seconds, so total time should be under 40s. 
+ surSucceededActual := stagedUpdateRunStatusSucceededActual(updateRunName, testNamespace, resourceSnapshotIndex1st, policySnapshotIndex1st, len(allMemberClusters), defaultApplyStrategy, &strategy.Spec, [][]string{{allMemberClusterNames[0], allMemberClusterNames[1], allMemberClusterNames[2]}}, nil, nil, nil) + Eventually(surSucceededActual, updateRunParallelEventuallyDuration*2, eventuallyInterval).Should(Succeed(), "Failed to validate updateRun %s/%s succeeded", testNamespace, updateRunName) + checkIfPlacedWorkResourcesOnMemberClustersInUpdateRun(allMemberClusters) + }) + + It("Should update rp status as completed", func() { + rpStatusUpdatedActual := rpStatusWithExternalStrategyActual(appConfigMapIdentifiers(), resourceSnapshotIndex1st, true, allMemberClusterNames, + []string{resourceSnapshotIndex1st, resourceSnapshotIndex1st, resourceSnapshotIndex1st}, []bool{true, true, true}, nil, nil) + Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s/%s status as expected", testNamespace, rpName) + }) + }) }) func createStagedUpdateStrategySucceed(strategyName, namespace string) *placementv1beta1.StagedUpdateStrategy { From e4b6260478fa0f7511eb93754f7333e0cca0a138 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 10:26:37 -0500 Subject: [PATCH 04/13] chore: bump golang.org/x/crypto from 0.38.0 to 0.45.0 (#340) Bumps [golang.org/x/crypto](https://github.com/golang/crypto) from 0.38.0 to 0.45.0. - [Commits](https://github.com/golang/crypto/compare/v0.38.0...v0.45.0) --- updated-dependencies: - dependency-name: golang.org/x/crypto dependency-version: 0.45.0 dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 14 +++++++------- go.sum | 28 ++++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/go.mod b/go.mod index 46736368e..8be9f715d 100644 --- a/go.mod +++ b/go.mod @@ -21,7 +21,7 @@ require ( go.goms.io/fleet-networking v0.3.3 go.uber.org/atomic v1.11.0 go.uber.org/zap v1.27.0 - golang.org/x/sync v0.15.0 + golang.org/x/sync v0.18.0 golang.org/x/time v0.11.0 gomodules.xyz/jsonpatch/v2 v2.4.0 k8s.io/api v0.32.3 @@ -109,14 +109,14 @@ require ( go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/mock v0.5.1 // indirect go.uber.org/multierr v1.11.0 // indirect - golang.org/x/crypto v0.38.0 // indirect + golang.org/x/crypto v0.45.0 // indirect golang.org/x/exp v0.0.0-20250305212735-054e65f0b394 // indirect - golang.org/x/net v0.40.0 // indirect + golang.org/x/net v0.47.0 // indirect golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/term v0.32.0 // indirect - golang.org/x/text v0.25.0 // indirect - golang.org/x/tools v0.31.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/term v0.37.0 // indirect + golang.org/x/text v0.31.0 // indirect + golang.org/x/tools v0.38.0 // indirect google.golang.org/protobuf v1.36.6 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect diff --git a/go.sum b/go.sum index 9ec4bc7bb..99c67babe 100644 --- a/go.sum +++ b/go.sum @@ -324,8 +324,8 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod 
h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8= -golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw= +golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= +golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= golang.org/x/exp v0.0.0-20250305212735-054e65f0b394 h1:nDVHiLt8aIbd/VzvPWN6kSOPE7+F/fNFDSXLVYkE/Iw= golang.org/x/exp v0.0.0-20250305212735-054e65f0b394/go.mod h1:sIifuuw/Yco/y6yb6+bDNfyeQ/MdPUy/hKEMYQV17cM= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -334,35 +334,35 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY= -golang.org/x/net v0.40.0/go.mod h1:y0hY0exeL2Pku80/zKK7tpntoX23cqL3Oa6njdgRtds= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= -golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.25.0 
h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= -golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.31.0 h1:0EedkvKDbh+qistFTd0Bcwe/YLh4vHwWEkiI0toFIBU= -golang.org/x/tools v0.31.0/go.mod h1:naFTU+Cev749tSJRXJlna0T3WxKvb1kWEx15xA4SdmQ= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= From d7e9d019126bae7bcc6295fd673bbcb9299ae1a4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 10:12:08 +0800 Subject: [PATCH 05/13] chore: bump actions/checkout from 5 to 6 (#345) Bumps [actions/checkout](https://github.com/actions/checkout) from 5 to 6. - [Release notes](https://github.com/actions/checkout/releases) - [Commits](https://github.com/actions/checkout/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/checkout dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... 
--- .github/workflows/chart.yml | 2 +- .github/workflows/ci.yml | 4 ++-- .github/workflows/code-lint.yml | 4 ++-- .github/workflows/codeql-analysis.yml | 2 +- .github/workflows/codespell.yml | 2 +- .github/workflows/markdown-lint.yml | 2 +- .github/workflows/trivy.yml | 2 +- .github/workflows/upgrade.yml | 6 +++--- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/chart.yml b/.github/workflows/chart.yml index 9d16e834c..ee61c51bf 100644 --- a/.github/workflows/chart.yml +++ b/.github/workflows/chart.yml @@ -18,7 +18,7 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: submodules: true fetch-depth: 0 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 17e7e27f3..a72343b46 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v5 + uses: actions/checkout@v6 - name: Set up Ginkgo CLI run: | @@ -91,7 +91,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v5 + uses: actions/checkout@v6 - name: Install Ginkgo CLI run: | diff --git a/.github/workflows/code-lint.yml b/.github/workflows/code-lint.yml index 2f865a75f..b46d30ee3 100644 --- a/.github/workflows/code-lint.yml +++ b/.github/workflows/code-lint.yml @@ -43,7 +43,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Checkout - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: true @@ -64,7 +64,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v5 + uses: actions/checkout@v6 - name: golangci-lint run: make lint diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 67f30593d..c56c52856 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -38,7 +38,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index cd990a5e0..fc3936cb9 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -16,7 +16,7 @@ jobs: with: egress-policy: audit - - uses: actions/checkout@ff7abcd0c3c05ccf6adc123a8cd1fd4fb30fb493 # v4.1.7 + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v4.1.7 - uses: codespell-project/actions-codespell@8f01853be192eb0f849a5c7d721450e7a467c579 # master with: check_filenames: true diff --git a/.github/workflows/markdown-lint.yml b/.github/workflows/markdown-lint.yml index 337f8be6a..d0c13afe1 100644 --- a/.github/workflows/markdown-lint.yml +++ b/.github/workflows/markdown-lint.yml @@ -10,7 +10,7 @@ jobs: markdown-link-check: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - uses: tcort/github-action-markdown-link-check@v1 with: # this will only show errors in the output diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index e5c9e7f46..2dc6a40f6 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -44,7 +44,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Checkout code - uses: actions/checkout@v5 + uses: actions/checkout@v6 - name: Login to ${{ env.REGISTRY }} uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef diff --git a/.github/workflows/upgrade.yml b/.github/workflows/upgrade.yml index ee99472e2..e7d0e5125 100644 --- a/.github/workflows/upgrade.yml +++ b/.github/workflows/upgrade.yml @@ -44,7 +44,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: # Fetch the history of all branches and tags. # This is needed for the test suite to switch between releases. @@ -127,7 +127,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: # Fetch the history of all branches and tags. # This is needed for the test suite to switch between releases. @@ -210,7 +210,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: # Fetch the history of all branches and tags. # This is needed for the test suite to switch between releases. From db87d46c5db1f8f311f064ad51cf86a45bd8deb4 Mon Sep 17 00:00:00 2001 From: Zhiying Lin <54013513+zhiying-lin@users.noreply.github.com> Date: Tue, 25 Nov 2025 14:22:58 +0800 Subject: [PATCH 06/13] test: add e2e test using label to manage CRP and RP (#326) Signed-off-by: Zhiying Lin --- test/e2e/mixed_placement_test.go | 163 +++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) diff --git a/test/e2e/mixed_placement_test.go b/test/e2e/mixed_placement_test.go index 00096c64f..dc8dae38a 100644 --- a/test/e2e/mixed_placement_test.go +++ b/test/e2e/mixed_placement_test.go @@ -24,7 +24,9 @@ import ( . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" + clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" "github.com/kubefleet-dev/kubefleet/pkg/controllers/workapplier" ) @@ -373,3 +375,164 @@ var _ = Describe("mixed ClusterResourcePlacement and ResourcePlacement negative }) }) }) + +var _ = Describe("mixed ClusterResourcePlacement and ResourcePlacement positive test cases", Label("resourceplacement"), func() { + crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + workNamespaceName := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) + wantSelectedClusters := []string{memberCluster3WestProdName} + wantUnscheduledClusters := []string{memberCluster1EastProdName, memberCluster2EastCanaryName} + + Context("coupling CRP and RP using cluster labeling", Ordered, func() { + BeforeAll(func() { + By("creating work resources") + createWorkResources() + }) + + AfterAll(func() { + ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: workNamespaceName}, allMemberClusters) + ensureCRPAndRelatedResourcesDeleted(crpName, allMemberClusters) + + By("removing labels from member clusters") + Eventually(func() error { + for _, clusterName := range wantSelectedClusters { + mc := &clusterv1beta1.MemberCluster{} + if err := hubClient.Get(ctx, types.NamespacedName{Name: clusterName}, mc); err != nil { + return fmt.Errorf("failed to get member cluster %s: %w", clusterName, err) + } + + if mc.Labels != nil { + delete(mc.Labels, workNamespaceName) + if err := hubClient.Update(ctx, mc); err != nil { + return fmt.Errorf("failed to update member cluster %s: %w", clusterName, err) + } + } + } + return nil + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to remove labels from member clusters") + }) + + It("picking fixed cluster", func() { + crp := &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: crpName, + // Add a custom finalizer; this would allow us to better observe + // the behavior of the controllers. 
+ Finalizers: []string{customDeletionBlockerFinalizer}, + }, + Spec: placementv1beta1.PlacementSpec{ + ResourceSelectors: namespaceOnlySelector(), + Policy: &placementv1beta1.PlacementPolicy{ + PlacementType: placementv1beta1.PickFixedPlacementType, + ClusterNames: wantSelectedClusters, + }, + Strategy: placementv1beta1.RolloutStrategy{ + Type: placementv1beta1.RollingUpdateRolloutStrategyType, + RollingUpdate: &placementv1beta1.RollingUpdateConfig{ + UnavailablePeriodSeconds: ptr.To(2), + }, + }, + }, + } + Expect(hubClient.Create(ctx, crp)).To(Succeed(), "Failed to create CRP %s", crpName) + }) + + It("should update CRP status as expected", func() { + crpStatusUpdatedActual := crpStatusUpdatedActual(workNamespaceIdentifiers(), wantSelectedClusters, nil, "0") + Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") + }) + + It("add labels to member clusters based on CRP placement decisions", func() { + Eventually(func() error { + // Add labels to the selected clusters + for _, clusterName := range wantSelectedClusters { + mc := &clusterv1beta1.MemberCluster{} + if err := hubClient.Get(ctx, types.NamespacedName{Name: clusterName}, mc); err != nil { + return fmt.Errorf("failed to get member cluster %s: %w", clusterName, err) + } + + if mc.Labels == nil { + mc.Labels = make(map[string]string) + } + mc.Labels[workNamespaceName] = "true" + + if err := hubClient.Update(ctx, mc); err != nil { + return fmt.Errorf("failed to update member cluster %s: %w", clusterName, err) + } + } + return nil + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to add labels to member clusters") + }) + + It("should create an RP with pickN policy using cluster labels", func() { + rp := &placementv1beta1.ResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: rpName, + Namespace: workNamespaceName, + Finalizers: []string{customDeletionBlockerFinalizer}, + }, + Spec: placementv1beta1.PlacementSpec{ + ResourceSelectors: configMapSelector(), + Policy: &placementv1beta1.PlacementPolicy{ + PlacementType: placementv1beta1.PickNPlacementType, + NumberOfClusters: ptr.To(int32(3)), + Affinity: &placementv1beta1.Affinity{ + ClusterAffinity: &placementv1beta1.ClusterAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &placementv1beta1.ClusterSelector{ + ClusterSelectorTerms: []placementv1beta1.ClusterSelectorTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + workNamespaceName: "true", + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + Expect(hubClient.Create(ctx, rp)).To(Succeed(), "Failed to create RP %s", rpName) + }) + + It("should update RP status as expected", func() { + rpStatusUpdatedActual := func() error { + rpKey := types.NamespacedName{Name: rpName, Namespace: workNamespaceName} + return customizedPlacementStatusUpdatedActual(rpKey, appConfigMapIdentifiers(), wantSelectedClusters, wantUnscheduledClusters, "0", true)() + } + Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP status as expected") + }) + + It("should place resources on the picked clusters", func() { + resourcePlacedActual := workNamespaceAndConfigMapPlacedOnClusterActual(memberCluster3WestProd) + Eventually(resourcePlacedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to place resources on the picked clusters") + }) + + It("update RP to pick 1 cluster instead of 3", func() { + Eventually(func() error { + rp := 
&placementv1beta1.ResourcePlacement{} + if err := hubClient.Get(ctx, types.NamespacedName{Name: rpName, Namespace: workNamespaceName}, rp); err != nil { + return fmt.Errorf("failed to get RP %s: %w", rpName, err) + } + + rp.Spec.Policy.NumberOfClusters = ptr.To(int32(1)) + if err := hubClient.Update(ctx, rp); err != nil { + return fmt.Errorf("failed to update RP %s: %w", rpName, err) + } + return nil + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP to pick 1 cluster") + }) + + It("should update RP status as expected", func() { + rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), wantSelectedClusters, nil, "0") + Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP status as expected") + }) + + It("should still place resources only on the selected cluster", func() { + resourcePlacedActual := workNamespaceAndConfigMapPlacedOnClusterActual(memberCluster3WestProd) + Eventually(resourcePlacedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to place resources on the selected cluster after update") + }) + }) +}) From 283050183ef7d1ca5fbfe6abb893173c26809b91 Mon Sep 17 00:00:00 2001 From: michaelawyu Date: Tue, 25 Nov 2025 21:22:36 +0800 Subject: [PATCH 07/13] interface: progress drift/diff detection APIs to v1 (#298) * Progressed drift/diff APIs to v1 Signed-off-by: michaelawyu * Added API progression tests Signed-off-by: michaelawyu --------- Signed-off-by: michaelawyu Co-authored-by: Ryan Zhang --- apis/placement/v1/binding_types.go | 36 ++ .../v1/clusterresourceplacement_types.go | 422 ++++++++++++++- apis/placement/v1/work_types.go | 90 ++++ apis/placement/v1/zz_generated.deepcopy.go | 153 ++++++ ...etes-fleet.io_clusterresourcebindings.yaml | 435 +++++++++++++++- ...es-fleet.io_clusterresourceplacements.yaml | 439 +++++++++++++++- ...t.io_clusterresourceplacementstatuses.yaml | 231 +++++++++ .../placement.kubernetes-fleet.io_works.yaml | 342 ++++++++++++- pkg/utils/common.go | 55 ++ test/e2e/api_progression_test.go | 483 ++++++++++++++++++ test/e2e/setup_test.go | 9 + 11 files changed, 2663 insertions(+), 32 deletions(-) create mode 100644 test/e2e/api_progression_test.go diff --git a/apis/placement/v1/binding_types.go b/apis/placement/v1/binding_types.go index 2cd204f81..e11a48536 100644 --- a/apis/placement/v1/binding_types.go +++ b/apis/placement/v1/binding_types.go @@ -105,6 +105,30 @@ type ResourceBindingStatus struct { // +optional FailedPlacements []FailedResourcePlacement `json:"failedPlacements,omitempty"` + // DriftedPlacements is a list of resources that have drifted from their desired states + // kept in the hub cluster, as found by Fleet using the drift detection mechanism. + // + // To control the object size, only the first 100 drifted resources will be included. + // This field is only meaningful if the `ClusterName` is not empty. + // +kubebuilder:validation:Optional + // +kubebuilder:validation:MaxItems=100 + DriftedPlacements []DriftedResourcePlacement `json:"driftedPlacements,omitempty"` + + // DiffedPlacements is a list of resources that have configuration differences from their + // corresponding hub cluster manifests. 
Fleet will report such differences when: + // + // * The CRP uses the ReportDiff apply strategy, which instructs Fleet to compare the hub + // cluster manifests against the live resources without actually performing any apply op; or + // * Fleet finds a pre-existing resource on the member cluster side that does not match its + // hub cluster counterpart, and the CRP has been configured to only take over a resource if + // no configuration differences are found. + // + // To control the object size, only the first 100 diffed resources will be included. + // This field is only meaningful if the `ClusterName` is not empty. + // +kubebuilder:validation:Optional + // +kubebuilder:validation:MaxItems=100 + DiffedPlacements []DiffedResourcePlacement `json:"diffedPlacements,omitempty"` + // +patchMergeKey=type // +patchStrategy=merge // +listType=map @@ -156,6 +180,18 @@ const ( // - "False" means not all the resources are available in the target cluster yet. // - "Unknown" means we haven't finished the apply yet so that we cannot check the resource availability. ResourceBindingAvailable ResourceBindingConditionType = "Available" + + // ResourceBindingDiffReported indicates that Fleet has successfully reported configuration + // differences between the hub cluster and a specific member cluster for the given resources. + // + // This condition is added only when the ReportDiff apply strategy is used. + // + // It can have the following condition statuses: + // * True: Fleet has successfully reported configuration differences for all resources. + // * False: Fleet has not yet reported configuration differences for some resources, or an + // error has occurred. + // * Unknown: Fleet has not finished processing the diff reporting yet. + ResourceBindingDiffReported ResourceBindingConditionType = "DiffReported" ) // ClusterResourceBindingList is a collection of ClusterResourceBinding. diff --git a/apis/placement/v1/clusterresourceplacement_types.go b/apis/placement/v1/clusterresourceplacement_types.go index aa48ee012..5e0193464 100644 --- a/apis/placement/v1/clusterresourceplacement_types.go +++ b/apis/placement/v1/clusterresourceplacement_types.go @@ -432,27 +432,244 @@ type RolloutStrategy struct { // Note: If multiple CRPs try to place the same resource with different apply strategy, the later ones will fail with the // reason ApplyConflictBetweenPlacements. type ApplyStrategy struct { - // Type defines the type of strategy to use. Default to ClientSideApply. - // Server-side apply is a safer choice. Read more about the differences between server-side apply and client-side - // apply: https://kubernetes.io/docs/reference/using-api/server-side-apply/#comparison-with-client-side-apply. + // ComparisonOption controls how Fleet compares the desired state of a resource, as kept in + // a hub cluster manifest, with the current state of the resource (if applicable) in the + // member cluster. + // + // Available options are: + // + // * PartialComparison: with this option, Fleet will compare only fields that are managed by + // Fleet, i.e., the fields that are specified explicitly in the hub cluster manifest. + // Unmanaged fields are ignored. This is the default option. + // + // * FullComparison: with this option, Fleet will compare all fields of the resource, + // even if the fields are absent from the hub cluster manifest. 
+ // + // Consider using the PartialComparison option if you would like to: + // + // * use the default values for certain fields; or + // * let another agent, e.g., HPAs, VPAs, etc., on the member cluster side manage some fields; or + // * allow ad-hoc or cluster-specific settings on the member cluster side. + // + // To use the FullComparison option, it is recommended that you: + // + // * specify all fields as appropriate in the hub cluster, even if you are OK with using default + // values; + // * make sure that no fields are managed by agents other than Fleet on the member cluster + // side, such as HPAs, VPAs, or other controllers. + // + // See the Fleet documentation for further explanations and usage examples. + // + // +kubebuilder:default=PartialComparison + // +kubebuilder:validation:Enum=PartialComparison;FullComparison + // +kubebuilder:validation:Optional + ComparisonOption ComparisonOptionType `json:"comparisonOption,omitempty"` + + // WhenToApply controls when Fleet would apply the manifests on the hub cluster to the member + // clusters. + // + // Available options are: + // + // * Always: with this option, Fleet will periodically apply hub cluster manifests + // on the member cluster side; this will effectively overwrite any change in the fields + // managed by Fleet (i.e., specified in the hub cluster manifest). This is the default + // option. + // + // Note that this option would revert any ad-hoc changes made on the member cluster side in the + // managed fields; if you would like to make temporary edits on the member cluster side + // in the managed fields, switch to IfNotDrifted option. Note that changes in unmanaged + // fields will be left alone; if you use the FullDiff compare option, such changes will + // be reported as drifts. + // + // * IfNotDrifted: with this option, Fleet will stop applying hub cluster manifests on + // clusters that have drifted from the desired state; apply ops would still continue on + // the rest of the clusters. Drifts are calculated using the ComparisonOption, + // as explained in the corresponding field. + // + // Use this option if you would like Fleet to detect drifts in your multi-cluster setup. + // A drift occurs when an agent makes an ad-hoc change on the member cluster side that + // makes affected resources deviate from its desired state as kept in the hub cluster; + // and this option grants you an opportunity to view the drift details and take actions + // accordingly. The drift details will be reported in the CRP status. + // + // To fix a drift, you may: + // + // * revert the changes manually on the member cluster side + // * update the hub cluster manifest; this will trigger Fleet to apply the latest revision + // of the manifests, which will overwrite the drifted fields + // (if they are managed by Fleet) + // * switch to the Always option; this will trigger Fleet to apply the current revision + // of the manifests, which will overwrite the drifted fields (if they are managed by Fleet). + // * if applicable and necessary, delete the drifted resources on the member cluster side; Fleet + // will attempt to re-create them using the hub cluster manifests + // + // +kubebuilder:default=Always + // +kubebuilder:validation:Enum=Always;IfNotDrifted + // +kubebuilder:validation:Optional + WhenToApply WhenToApplyType `json:"whenToApply,omitempty"` + + // Type is the apply strategy to use; it determines how Fleet applies manifests from the + // hub cluster to a member cluster. 
+ // + // Available options are: + // + // * ClientSideApply: Fleet uses three-way merge to apply manifests, similar to how kubectl + // performs a client-side apply. This is the default option. + // + // Note that this strategy requires that Fleet keep the last applied configuration in the + // annotation of an applied resource. If the object gets so large that apply ops can no longer + // be executed, Fleet will switch to server-side apply. + // + // Use ComparisonOption and WhenToApply settings to control when an apply op can be executed. + // + // * ServerSideApply: Fleet uses server-side apply to apply manifests; Fleet itself will + // become the field manager for specified fields in the manifests. Specify + // ServerSideApplyConfig as appropriate if you would like Fleet to take over field + // ownership upon conflicts. This is the recommended option for most scenarios; it might + // help reduce object size and safely resolve conflicts between field values. For more + // information, please refer to the Kubernetes documentation + // (https://kubernetes.io/docs/reference/using-api/server-side-apply/#comparison-with-client-side-apply). + // + // Use ComparisonOption and WhenToApply settings to control when an apply op can be executed. + // + // * ReportDiff: Fleet will compare the desired state of a resource as kept in the hub cluster + // with its current state (if applicable) on the member cluster side, and report any + // differences. No actual apply ops would be executed, and resources will be left alone as they + // are on the member clusters. + // + // If configuration differences are found on a resource, Fleet will consider this as an apply + // error, which might block rollout depending on the specified rollout strategy. + // + // Use ComparisonOption setting to control how the difference is calculated. + // + // ClientSideApply and ServerSideApply apply strategies only work when Fleet can assume + // ownership of a resource (e.g., the resource is created by Fleet, or Fleet has taken over + // the resource). See the comments on the WhenToTakeOver field for more information. + // ReportDiff apply strategy, however, will function regardless of Fleet's ownership + // status. One may set up a CRP with the ReportDiff strategy and the Never takeover option, + // and this will turn Fleet into a detection tool that reports only configuration differences + // but do not touch any resources on the member cluster side. + // + // For a comparison between the different strategies and usage examples, refer to the + // Fleet documentation. + // // +kubebuilder:default=ClientSideApply - // +kubebuilder:validation:Enum=ClientSideApply;ServerSideApply - // +optional + // +kubebuilder:validation:Enum=ClientSideApply;ServerSideApply;ReportDiff + // +kubebuilder:validation:Optional Type ApplyStrategyType `json:"type,omitempty"` - // AllowCoOwnership defines whether to apply the resource if it already exists in the target cluster and is not - // solely owned by fleet (i.e., metadata.ownerReferences contains only fleet custom resources). - // If true, apply the resource and add fleet as a co-owner. - // If false, leave the resource unchanged and fail the apply. + // AllowCoOwnership controls whether co-ownership between Fleet and other agents are allowed + // on a Fleet-managed resource. If set to false, Fleet will refuse to apply manifests to + // a resource that has been owned by one or more non-Fleet agents. 
+ // + // Note that Fleet does not support the case where one resource is being placed multiple + // times by different CRPs on the same member cluster. An apply error will be returned if + // Fleet finds that a resource has been owned by another placement attempt by Fleet, even + // with the AllowCoOwnership setting set to true. AllowCoOwnership bool `json:"allowCoOwnership,omitempty"` // ServerSideApplyConfig defines the configuration for server side apply. It is honored only when type is ServerSideApply. // +optional ServerSideApplyConfig *ServerSideApplyConfig `json:"serverSideApplyConfig,omitempty"` + + // WhenToTakeOver determines the action to take when Fleet applies resources to a member + // cluster for the first time and finds out that the resource already exists in the cluster. + // + // This setting is most relevant in cases where you would like Fleet to manage pre-existing + // resources on a member cluster. + // + // Available options include: + // + // * Always: with this action, Fleet will apply the hub cluster manifests to the member + // clusters even if the affected resources already exist. This is the default action. + // + // Note that this might lead to fields being overwritten on the member clusters, if they + // are specified in the hub cluster manifests. + // + // * IfNoDiff: with this action, Fleet will apply the hub cluster manifests to the member + // clusters if (and only if) pre-existing resources look the same as the hub cluster manifests. + // + // This is a safer option as pre-existing resources that are inconsistent with the hub cluster + // manifests will not be overwritten; Fleet will ignore them until the inconsistencies + // are resolved properly: any change you make to the hub cluster manifests would not be + // applied, and if you delete the manifests or even the ClusterResourcePlacement itself + // from the hub cluster, these pre-existing resources would not be taken away. + // + // Fleet will check for inconsistencies in accordance with the ComparisonOption setting. See also + // the comments on the ComparisonOption field for more information. + // + // If a diff has been found in a field that is **managed** by Fleet (i.e., the field + // **is specified ** in the hub cluster manifest), consider one of the following actions: + // * set the field in the member cluster to be of the same value as that in the hub cluster + // manifest. + // * update the hub cluster manifest so that its field value matches with that in the member + // cluster. + // * switch to the Always action, which will allow Fleet to overwrite the field with the + // value in the hub cluster manifest. + // + // If a diff has been found in a field that is **not managed** by Fleet (i.e., the field + // **is not specified** in the hub cluster manifest), consider one of the following actions: + // * remove the field from the member cluster. + // * update the hub cluster manifest so that the field is included in the hub cluster manifest. + // + // If appropriate, you may also delete the object from the member cluster; Fleet will recreate + // it using the hub cluster manifest. + // + // * Never: with this action, Fleet will not apply a hub cluster manifest to the member + // clusters if there is a corresponding pre-existing resource. However, if a manifest + // has never been applied yet; or it has a corresponding resource which Fleet has assumed + // ownership, apply op will still be executed. 
+ // + // This is the safest option; one will have to remove the pre-existing resources (so that + // Fleet can re-create them) or switch to a different + // WhenToTakeOver option before Fleet starts processing the corresponding hub cluster + // manifests. + // + // If you prefer Fleet stop processing all manifests, use this option along with the + // ReportDiff apply strategy type. This setup would instruct Fleet to touch nothing + // on the member cluster side but still report configuration differences between the + // hub cluster and member clusters. Fleet will not give up ownership + // that it has already assumed though. + // + // +kubebuilder:default=Always + // +kubebuilder:validation:Enum=Always;IfNoDiff;Never + // +kubebuilder:validation:Optional + WhenToTakeOver WhenToTakeOverType `json:"whenToTakeOver,omitempty"` } -// ApplyStrategyType describes the type of the strategy used to resolve the conflict if the resource to be placed already -// exists in the target cluster and is owned by other appliers. +// ComparisonOptionType describes the compare option that Fleet uses to detect drifts and/or +// calculate differences. +// +enum +type ComparisonOptionType string + +const ( + // ComparisonOptionTypePartialComparison will compare only fields that are managed by Fleet, i.e., + // fields that are specified explicitly in the hub cluster manifest. Unmanaged fields + // are ignored. + ComparisonOptionTypePartialComparison ComparisonOptionType = "PartialComparison" + + // ComparisonOptionTypeFullDiff will compare all fields of the resource, even if the fields + // are absent from the hub cluster manifest. + ComparisonOptionTypeFullComparison ComparisonOptionType = "FullComparison" +) + +// WhenToApplyType describes when Fleet would apply the manifests on the hub cluster to +// the member clusters. +type WhenToApplyType string + +const ( + // WhenToApplyTypeAlways instructs Fleet to periodically apply hub cluster manifests + // on the member cluster side; this will effectively overwrite any change in the fields + // managed by Fleet (i.e., specified in the hub cluster manifest). + WhenToApplyTypeAlways WhenToApplyType = "Always" + + // WhenToApplyTypeIfNotDrifted instructs Fleet to stop applying hub cluster manifests on + // clusters that have drifted from the desired state; apply ops would still continue on + // the rest of the clusters. + WhenToApplyTypeIfNotDrifted WhenToApplyType = "IfNotDrifted" +) + +// ApplyStrategyType describes the type of the strategy used to apply the resource to the target cluster. // +enum type ApplyStrategyType string @@ -466,6 +683,11 @@ const ( // and the existing resource in the target cluster. // Details: https://kubernetes.io/docs/reference/using-api/server-side-apply ApplyStrategyTypeServerSideApply ApplyStrategyType = "ServerSideApply" + + // ApplyStrategyTypeReportDiff will report differences between the desired state of a + // resource as kept in the hub cluster and its current state (if applicable) on the member + // cluster side. No actual apply ops would be executed. + ApplyStrategyTypeReportDiff ApplyStrategyType = "ReportDiff" ) // ServerSideApplyConfig defines the configuration for server side apply. @@ -482,6 +704,43 @@ type ServerSideApplyConfig struct { ForceConflicts bool `json:"force"` } +// WhenToTakeOverType describes the type of the action to take when we first apply the +// resources to the member cluster. 
+// +enum +type WhenToTakeOverType string + +const ( + // WhenToTakeOverTypeIfNoDiff instructs Fleet to apply a manifest with a corresponding + // pre-existing resource on a member cluster if and only if the pre-existing resource + // looks the same as the manifest. Should there be any inconsistency, Fleet will skip + // the apply op; no change will be made on the resource and Fleet will not claim + // ownership on it. + // + // Note that this will not stop Fleet from processing other manifests in the same + // placement that do not concern the takeover process (e.g., the manifests that have + // not been created yet, or that are already under the management of Fleet). + WhenToTakeOverTypeIfNoDiff WhenToTakeOverType = "IfNoDiff" + + // WhenToTakeOverTypeAlways instructs Fleet to always apply manifests to a member cluster, + // even if there are some corresponding pre-existing resources. Some fields on these + // resources might be overwritten, and Fleet will claim ownership on them. + WhenToTakeOverTypeAlways WhenToTakeOverType = "Always" + + // WhenToTakeOverTypeNever instructs Fleet to never apply a manifest to a member cluster + // if there is a corresponding pre-existing resource. + // + // Note that this will not stop Fleet from processing other manifests in the same placement + // that do not concern the takeover process (e.g., the manifests that have not been created + // yet, or that are already under the management of Fleet). + // + // If you would like Fleet to stop processing manifests all together and do not assume + // ownership on any pre-existing resources, use this option along with the ReportDiff + // apply strategy type. This setup would instruct Fleet to touch nothing on the member + // cluster side but still report configuration differences between the hub cluster + // and member clusters. Fleet will not give up ownership that it has already assumed, though. + WhenToTakeOverTypeNever WhenToTakeOverType = "Never" +) + // +enum type RolloutStrategyType string @@ -657,6 +916,30 @@ type ResourcePlacementStatus struct { // +optional FailedPlacements []FailedResourcePlacement `json:"failedPlacements,omitempty"` + // DriftedPlacements is a list of resources that have drifted from their desired states + // kept in the hub cluster, as found by Fleet using the drift detection mechanism. + // + // To control the object size, only the first 100 drifted resources will be included. + // This field is only meaningful if the `ClusterName` is not empty. + // +kubebuilder:validation:Optional + // +kubebuilder:validation:MaxItems=100 + DriftedPlacements []DriftedResourcePlacement `json:"driftedPlacements,omitempty"` + + // DiffedPlacements is a list of resources that have configuration differences from their + // corresponding hub cluster manifests. Fleet will report such differences when: + // + // * The CRP uses the ReportDiff apply strategy, which instructs Fleet to compare the hub + // cluster manifests against the live resources without actually performing any apply op; or + // * Fleet finds a pre-existing resource on the member cluster side that does not match its + // hub cluster counterpart, and the CRP has been configured to only take over a resource if + // no configuration differences are found. + // + // To control the object size, only the first 100 diffed resources will be included. + // This field is only meaningful if the `ClusterName` is not empty. 
+ // +kubebuilder:validation:Optional + // +kubebuilder:validation:MaxItems=100 + DiffedPlacements []DiffedResourcePlacement `json:"diffedPlacements,omitempty"` + // Conditions is an array of current observed conditions for ResourcePlacementStatus. // +optional Conditions []metav1.Condition `json:"conditions,omitempty"` @@ -672,6 +955,101 @@ type FailedResourcePlacement struct { Condition metav1.Condition `json:"condition"` } +// PatchDetail describes a patch that explains an observed configuration drift or +// difference. +// +// A patch detail can be transcribed as a JSON patch operation, as specified in RFC 6902. +type PatchDetail struct { + // The JSON path that points to a field that has drifted or has configuration differences. + // +kubebuilder:validation:Required + Path string `json:"path"` + + // The value at the JSON path from the member cluster side. + // + // This field can be empty if the JSON path does not exist on the member cluster side; i.e., + // applying the manifest from the hub cluster side would add a new field. + // +kubebuilder:validation:Optional + ValueInMember string `json:"valueInMember,omitempty"` + + // The value at the JSON path from the hub cluster side. + // + // This field can be empty if the JSON path does not exist on the hub cluster side; i.e., + // applying the manifest from the hub cluster side would remove the field. + // +kubebuilder:validation:Optional + ValueInHub string `json:"valueInHub,omitempty"` +} + +// DriftedResourcePlacement contains the details of a resource with configuration drifts. +type DriftedResourcePlacement struct { + // The resource that has drifted. + ResourceIdentifier `json:",inline"` + + // ObservationTime is the time when we observe the configuration drifts for the resource. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Type=string + // +kubebuilder:validation:Format=date-time + ObservationTime metav1.Time `json:"observationTime"` + + // TargetClusterObservedGeneration is the generation of the resource on the target cluster + // that contains the configuration drifts. + // +kubebuilder:validation:Required + TargetClusterObservedGeneration int64 `json:"targetClusterObservedGeneration"` + + // FirstDriftedObservedTime is the first time the resource on the target cluster is + // observed to have configuration drifts. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Type=string + // +kubebuilder:validation:Format=date-time + FirstDriftedObservedTime metav1.Time `json:"firstDriftedObservedTime"` + + // ObservedDrifts are the details about the found configuration drifts. Note that + // Fleet might truncate the details as appropriate to control the object size. + // + // Each detail entry specifies how the live state (the state on the member + // cluster side) compares against the desired state (the state kept in the hub cluster manifest). + // + // An event about the details will be emitted as well. + // +kubebuilder:validation:Optional + ObservedDrifts []PatchDetail `json:"observedDrifts,omitempty"` +} + +// DiffedResourcePlacement contains the details of a resource with configuration differences. +type DiffedResourcePlacement struct { + // The resource that has drifted. + ResourceIdentifier `json:",inline"` + + // ObservationTime is the time when we observe the configuration differences for the resource. 
+ // +kubebuilder:validation:Required + // +kubebuilder:validation:Type=string + // +kubebuilder:validation:Format=date-time + ObservationTime metav1.Time `json:"observationTime"` + + // TargetClusterObservedGeneration is the generation of the resource on the target cluster + // that contains the configuration differences. + // + // This might be nil if the resource has not been created yet on the target cluster. + // + // +kubebuilder:validation:Optional + TargetClusterObservedGeneration *int64 `json:"targetClusterObservedGeneration"` + + // FirstDiffedObservedTime is the first time the resource on the target cluster is + // observed to have configuration differences. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Type=string + // +kubebuilder:validation:Format=date-time + FirstDiffedObservedTime metav1.Time `json:"firstDiffedObservedTime"` + + // ObservedDiffs are the details about the found configuration differences. Note that + // Fleet might truncate the details as appropriate to control the object size. + // + // Each detail entry specifies how the live state (the state on the member + // cluster side) compares against the desired state (the state kept in the hub cluster manifest). + // + // An event about the details will be emitted as well. + // +kubebuilder:validation:Optional + ObservedDiffs []PatchDetail `json:"observedDiffs,omitempty"` +} + // Toleration allows ClusterResourcePlacement to tolerate any taint that matches // the triple using the matching operator . type Toleration struct { @@ -758,6 +1136,17 @@ const ( // array. // - "Unknown" means we haven't finished the apply yet so that we cannot check the resource availability. ClusterResourcePlacementAvailableConditionType ClusterResourcePlacementConditionType = "ClusterResourcePlacementAvailable" + + // ClusterResourcePlacementDiffReportedConditionType indicates whether Fleet has reported + // configuration differences between the desired states of resources as kept in the hub cluster + // and the current states on the all member clusters. + // + // It can have the following condition statuses: + // * True: Fleet has reported complete sets of configuration differences on all member clusters. + // * False: Fleet has not yet reported complete sets of configuration differences on some member + // clusters, or an error has occurred. + // * Unknown: Fleet has not finished processing the diff reporting yet. + ClusterResourcePlacementDiffReportedConditionType ClusterResourcePlacementConditionType = "ClusterResourcePlacementDiffReported" ) // ResourcePlacementConditionType defines a specific condition of a resource placement. @@ -813,6 +1202,17 @@ const ( // - "False" means some of them are not available yet. // - "Unknown" means we haven't finished the apply yet so that we cannot check the resource availability. ResourcesAvailableConditionType ResourcePlacementConditionType = "Available" + + // ResourcePlacementDiffReportedConditionType indicates whether Fleet has reported + // configuration differences between the desired states of resources as kept in the hub cluster + // and the current states on the all member clusters. + // + // It can have the following condition statuses: + // * True: Fleet has reported complete sets of configuration differences on all member clusters. + // * False: Fleet has not yet reported complete sets of configuration differences on some member + // clusters, or an error has occurred. + // * Unknown: Fleet has not finished processing the diff reporting yet. 
+ ResourcePlacementDiffReportedConditionType ResourcePlacementConditionType = "ResourcePlacementDiffReported" ) // PlacementType identifies the type of placement. diff --git a/apis/placement/v1/work_types.go b/apis/placement/v1/work_types.go index 8910ff737..cef2700ea 100644 --- a/apis/placement/v1/work_types.go +++ b/apis/placement/v1/work_types.go @@ -110,6 +110,74 @@ type WorkResourceIdentifier struct { Name string `json:"name,omitempty"` } +// DriftDetails describes the observed configuration drifts. +type DriftDetails struct { + // ObservationTime is the timestamp when the drift was last detected. + // + // +kubebuilder:validation:Required + // +kubebuilder:validation:Type=string + // +kubebuilder:validation:Format=date-time + ObservationTime metav1.Time `json:"observationTime"` + + // ObservedInMemberClusterGeneration is the generation of the applied manifest on the member + // cluster side. + // +kubebuilder:validation:Required + ObservedInMemberClusterGeneration int64 `json:"observedInMemberClusterGeneration"` + + // FirstDriftedObservedTime is the timestamp when the drift was first detected. + // + // +kubebuilder:validation:Required + // +kubebuilder:validation:Type=string + // +kubebuilder:validation:Format=date-time + FirstDriftedObservedTime metav1.Time `json:"firstDriftedObservedTime"` + + // ObservedDrifts describes each drifted field found from the applied manifest. + // Fleet might truncate the details as appropriate to control object size. + // + // Each entry specifies how the live state (the state on the member cluster side) compares + // against the desired state (the state kept in the hub cluster manifest). + // + // +kubebuilder:validation:Optional + ObservedDrifts []PatchDetail `json:"observedDrifts,omitempty"` +} + +// DiffDetails describes the observed configuration differences. +type DiffDetails struct { + // ObservationTime is the timestamp when the configuration difference was last detected. + // + // +kubebuilder:validation:Required + // +kubebuilder:validation:Type=string + // +kubebuilder:validation:Format=date-time + ObservationTime metav1.Time `json:"observationTime"` + + // ObservedInMemberClusterGeneration is the generation of the applied manifest on the member + // cluster side. + // + // This might be nil if the resource has not been created yet in the member cluster. + // + // +kubebuilder:validation:Optional + ObservedInMemberClusterGeneration *int64 `json:"observedInMemberClusterGeneration"` + + // FirstDiffedObservedTime is the timestamp when the configuration difference + // was first detected. + // + // +kubebuilder:validation:Required + // +kubebuilder:validation:Type=string + // +kubebuilder:validation:Format=date-time + FirstDiffedObservedTime metav1.Time `json:"firstDiffedObservedTime"` + + // ObservedDiffs describes each field with configuration difference as found from the + // member cluster side. + // + // Fleet might truncate the details as appropriate to control object size. + // + // Each entry specifies how the live state (the state on the member cluster side) compares + // against the desired state (the state kept in the hub cluster manifest). + // + // +kubebuilder:validation:Optional + ObservedDiffs []PatchDetail `json:"observedDiffs,omitempty"` +} + // ManifestCondition represents the conditions of the resources deployed on // spoke cluster. 
type ManifestCondition struct { @@ -120,6 +188,28 @@ type ManifestCondition struct { // Conditions represents the conditions of this resource on spoke cluster // +required Conditions []metav1.Condition `json:"conditions"` + + // DriftDetails explains about the observed configuration drifts. + // Fleet might truncate the details as appropriate to control object size. + // + // Note that configuration drifts can only occur on a resource if it is currently owned by + // Fleet and its corresponding placement is set to use the ClientSideApply or ServerSideApply + // apply strategy. In other words, DriftDetails and DiffDetails will not be populated + // at the same time. + // + // +kubebuilder:validation:Optional + DriftDetails *DriftDetails `json:"driftDetails,omitempty"` + + // DiffDetails explains the details about the observed configuration differences. + // Fleet might truncate the details as appropriate to control object size. + // + // Note that configuration differences can only occur on a resource if it is not currently owned + // by Fleet (i.e., it is a pre-existing resource that needs to be taken over), or if its + // corresponding placement is set to use the ReportDiff apply strategy. In other words, + // DiffDetails and DriftDetails will not be populated at the same time. + // + // +kubebuilder:validation:Optional + DiffDetails *DiffDetails `json:"diffDetails,omitempty"` } // +genclient diff --git a/apis/placement/v1/zz_generated.deepcopy.go b/apis/placement/v1/zz_generated.deepcopy.go index 3f10e1e10..fc248dbd2 100644 --- a/apis/placement/v1/zz_generated.deepcopy.go +++ b/apis/placement/v1/zz_generated.deepcopy.go @@ -794,6 +794,106 @@ func (in *ClusterSelectorTerm) DeepCopy() *ClusterSelectorTerm { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DiffDetails) DeepCopyInto(out *DiffDetails) { + *out = *in + in.ObservationTime.DeepCopyInto(&out.ObservationTime) + if in.ObservedInMemberClusterGeneration != nil { + in, out := &in.ObservedInMemberClusterGeneration, &out.ObservedInMemberClusterGeneration + *out = new(int64) + **out = **in + } + in.FirstDiffedObservedTime.DeepCopyInto(&out.FirstDiffedObservedTime) + if in.ObservedDiffs != nil { + in, out := &in.ObservedDiffs, &out.ObservedDiffs + *out = make([]PatchDetail, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiffDetails. +func (in *DiffDetails) DeepCopy() *DiffDetails { + if in == nil { + return nil + } + out := new(DiffDetails) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DiffedResourcePlacement) DeepCopyInto(out *DiffedResourcePlacement) { + *out = *in + in.ResourceIdentifier.DeepCopyInto(&out.ResourceIdentifier) + in.ObservationTime.DeepCopyInto(&out.ObservationTime) + if in.TargetClusterObservedGeneration != nil { + in, out := &in.TargetClusterObservedGeneration, &out.TargetClusterObservedGeneration + *out = new(int64) + **out = **in + } + in.FirstDiffedObservedTime.DeepCopyInto(&out.FirstDiffedObservedTime) + if in.ObservedDiffs != nil { + in, out := &in.ObservedDiffs, &out.ObservedDiffs + *out = make([]PatchDetail, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiffedResourcePlacement. 
+func (in *DiffedResourcePlacement) DeepCopy() *DiffedResourcePlacement { + if in == nil { + return nil + } + out := new(DiffedResourcePlacement) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DriftDetails) DeepCopyInto(out *DriftDetails) { + *out = *in + in.ObservationTime.DeepCopyInto(&out.ObservationTime) + in.FirstDriftedObservedTime.DeepCopyInto(&out.FirstDriftedObservedTime) + if in.ObservedDrifts != nil { + in, out := &in.ObservedDrifts, &out.ObservedDrifts + *out = make([]PatchDetail, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DriftDetails. +func (in *DriftDetails) DeepCopy() *DriftDetails { + if in == nil { + return nil + } + out := new(DriftDetails) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DriftedResourcePlacement) DeepCopyInto(out *DriftedResourcePlacement) { + *out = *in + in.ResourceIdentifier.DeepCopyInto(&out.ResourceIdentifier) + in.ObservationTime.DeepCopyInto(&out.ObservationTime) + in.FirstDriftedObservedTime.DeepCopyInto(&out.FirstDriftedObservedTime) + if in.ObservedDrifts != nil { + in, out := &in.ObservedDrifts, &out.ObservedDrifts + *out = make([]PatchDetail, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DriftedResourcePlacement. +func (in *DriftedResourcePlacement) DeepCopy() *DriftedResourcePlacement { + if in == nil { + return nil + } + out := new(DriftedResourcePlacement) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvelopeIdentifier) DeepCopyInto(out *EnvelopeIdentifier) { *out = *in @@ -869,6 +969,16 @@ func (in *ManifestCondition) DeepCopyInto(out *ManifestCondition) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DriftDetails != nil { + in, out := &in.DriftDetails, &out.DriftDetails + *out = new(DriftDetails) + (*in).DeepCopyInto(*out) + } + if in.DiffDetails != nil { + in, out := &in.DiffDetails, &out.DiffDetails + *out = new(DiffDetails) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ManifestCondition. @@ -945,6 +1055,21 @@ func (in *OverrideRule) DeepCopy() *OverrideRule { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PatchDetail) DeepCopyInto(out *PatchDetail) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PatchDetail. +func (in *PatchDetail) DeepCopy() *PatchDetail { + if in == nil { + return nil + } + out := new(PatchDetail) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *PlacementPolicy) DeepCopyInto(out *PlacementPolicy) { *out = *in @@ -1116,6 +1241,20 @@ func (in *ResourceBindingStatus) DeepCopyInto(out *ResourceBindingStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DriftedPlacements != nil { + in, out := &in.DriftedPlacements, &out.DriftedPlacements + *out = make([]DriftedResourcePlacement, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.DiffedPlacements != nil { + in, out := &in.DiffedPlacements, &out.DiffedPlacements + *out = make([]DiffedResourcePlacement, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) @@ -1358,6 +1497,20 @@ func (in *ResourcePlacementStatus) DeepCopyInto(out *ResourcePlacementStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DriftedPlacements != nil { + in, out := &in.DriftedPlacements, &out.DriftedPlacements + *out = make([]DriftedResourcePlacement, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.DiffedPlacements != nil { + in, out := &in.DiffedPlacements, &out.DiffedPlacements + *out = make([]DiffedResourcePlacement, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml index 56a8608fb..0e12d6032 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml @@ -68,11 +68,49 @@ spec: properties: allowCoOwnership: description: |- - AllowCoOwnership defines whether to apply the resource if it already exists in the target cluster and is not - solely owned by fleet (i.e., metadata.ownerReferences contains only fleet custom resources). - If true, apply the resource and add fleet as a co-owner. - If false, leave the resource unchanged and fail the apply. + AllowCoOwnership controls whether co-ownership between Fleet and other agents are allowed + on a Fleet-managed resource. If set to false, Fleet will refuse to apply manifests to + a resource that has been owned by one or more non-Fleet agents. + + Note that Fleet does not support the case where one resource is being placed multiple + times by different CRPs on the same member cluster. An apply error will be returned if + Fleet finds that a resource has been owned by another placement attempt by Fleet, even + with the AllowCoOwnership setting set to true. type: boolean + comparisonOption: + default: PartialComparison + description: |- + ComparisonOption controls how Fleet compares the desired state of a resource, as kept in + a hub cluster manifest, with the current state of the resource (if applicable) in the + member cluster. + + Available options are: + + * PartialComparison: with this option, Fleet will compare only fields that are managed by + Fleet, i.e., the fields that are specified explicitly in the hub cluster manifest. + Unmanaged fields are ignored. This is the default option. + + * FullComparison: with this option, Fleet will compare all fields of the resource, + even if the fields are absent from the hub cluster manifest. 
+ + Consider using the PartialComparison option if you would like to: + + * use the default values for certain fields; or + * let another agent, e.g., HPAs, VPAs, etc., on the member cluster side manage some fields; or + * allow ad-hoc or cluster-specific settings on the member cluster side. + + To use the FullComparison option, it is recommended that you: + + * specify all fields as appropriate in the hub cluster, even if you are OK with using default + values; + * make sure that no fields are managed by agents other than Fleet on the member cluster + side, such as HPAs, VPAs, or other controllers. + + See the Fleet documentation for further explanations and usage examples. + enum: + - PartialComparison + - FullComparison + type: string serverSideApplyConfig: description: ServerSideApplyConfig defines the configuration for server side apply. It is honored only when type is ServerSideApply. @@ -91,12 +129,164 @@ spec: type: default: ClientSideApply description: |- - Type defines the type of strategy to use. Default to ClientSideApply. - Server-side apply is a safer choice. Read more about the differences between server-side apply and client-side - apply: https://kubernetes.io/docs/reference/using-api/server-side-apply/#comparison-with-client-side-apply. + Type is the apply strategy to use; it determines how Fleet applies manifests from the + hub cluster to a member cluster. + + Available options are: + + * ClientSideApply: Fleet uses three-way merge to apply manifests, similar to how kubectl + performs a client-side apply. This is the default option. + + Note that this strategy requires that Fleet keep the last applied configuration in the + annotation of an applied resource. If the object gets so large that apply ops can no longer + be executed, Fleet will switch to server-side apply. + + Use ComparisonOption and WhenToApply settings to control when an apply op can be executed. + + * ServerSideApply: Fleet uses server-side apply to apply manifests; Fleet itself will + become the field manager for specified fields in the manifests. Specify + ServerSideApplyConfig as appropriate if you would like Fleet to take over field + ownership upon conflicts. This is the recommended option for most scenarios; it might + help reduce object size and safely resolve conflicts between field values. For more + information, please refer to the Kubernetes documentation + (https://kubernetes.io/docs/reference/using-api/server-side-apply/#comparison-with-client-side-apply). + + Use ComparisonOption and WhenToApply settings to control when an apply op can be executed. + + * ReportDiff: Fleet will compare the desired state of a resource as kept in the hub cluster + with its current state (if applicable) on the member cluster side, and report any + differences. No actual apply ops would be executed, and resources will be left alone as they + are on the member clusters. + + If configuration differences are found on a resource, Fleet will consider this as an apply + error, which might block rollout depending on the specified rollout strategy. + + Use ComparisonOption setting to control how the difference is calculated. + + ClientSideApply and ServerSideApply apply strategies only work when Fleet can assume + ownership of a resource (e.g., the resource is created by Fleet, or Fleet has taken over + the resource). See the comments on the WhenToTakeOver field for more information. + ReportDiff apply strategy, however, will function regardless of Fleet's ownership + status. 
One may set up a CRP with the ReportDiff strategy and the Never takeover option, + and this will turn Fleet into a detection tool that reports only configuration differences + but do not touch any resources on the member cluster side. + + For a comparison between the different strategies and usage examples, refer to the + Fleet documentation. enum: - ClientSideApply - ServerSideApply + - ReportDiff + type: string + whenToApply: + default: Always + description: |- + WhenToApply controls when Fleet would apply the manifests on the hub cluster to the member + clusters. + + Available options are: + + * Always: with this option, Fleet will periodically apply hub cluster manifests + on the member cluster side; this will effectively overwrite any change in the fields + managed by Fleet (i.e., specified in the hub cluster manifest). This is the default + option. + + Note that this option would revert any ad-hoc changes made on the member cluster side in the + managed fields; if you would like to make temporary edits on the member cluster side + in the managed fields, switch to IfNotDrifted option. Note that changes in unmanaged + fields will be left alone; if you use the FullDiff compare option, such changes will + be reported as drifts. + + * IfNotDrifted: with this option, Fleet will stop applying hub cluster manifests on + clusters that have drifted from the desired state; apply ops would still continue on + the rest of the clusters. Drifts are calculated using the ComparisonOption, + as explained in the corresponding field. + + Use this option if you would like Fleet to detect drifts in your multi-cluster setup. + A drift occurs when an agent makes an ad-hoc change on the member cluster side that + makes affected resources deviate from its desired state as kept in the hub cluster; + and this option grants you an opportunity to view the drift details and take actions + accordingly. The drift details will be reported in the CRP status. + + To fix a drift, you may: + + * revert the changes manually on the member cluster side + * update the hub cluster manifest; this will trigger Fleet to apply the latest revision + of the manifests, which will overwrite the drifted fields + (if they are managed by Fleet) + * switch to the Always option; this will trigger Fleet to apply the current revision + of the manifests, which will overwrite the drifted fields (if they are managed by Fleet). + * if applicable and necessary, delete the drifted resources on the member cluster side; Fleet + will attempt to re-create them using the hub cluster manifests + enum: + - Always + - IfNotDrifted + type: string + whenToTakeOver: + default: Always + description: |- + WhenToTakeOver determines the action to take when Fleet applies resources to a member + cluster for the first time and finds out that the resource already exists in the cluster. + + This setting is most relevant in cases where you would like Fleet to manage pre-existing + resources on a member cluster. + + Available options include: + + * Always: with this action, Fleet will apply the hub cluster manifests to the member + clusters even if the affected resources already exist. This is the default action. + + Note that this might lead to fields being overwritten on the member clusters, if they + are specified in the hub cluster manifests. + + * IfNoDiff: with this action, Fleet will apply the hub cluster manifests to the member + clusters if (and only if) pre-existing resources look the same as the hub cluster manifests. 
+ + This is a safer option as pre-existing resources that are inconsistent with the hub cluster + manifests will not be overwritten; Fleet will ignore them until the inconsistencies + are resolved properly: any change you make to the hub cluster manifests would not be + applied, and if you delete the manifests or even the ClusterResourcePlacement itself + from the hub cluster, these pre-existing resources would not be taken away. + + Fleet will check for inconsistencies in accordance with the ComparisonOption setting. See also + the comments on the ComparisonOption field for more information. + + If a diff has been found in a field that is **managed** by Fleet (i.e., the field + **is specified ** in the hub cluster manifest), consider one of the following actions: + * set the field in the member cluster to be of the same value as that in the hub cluster + manifest. + * update the hub cluster manifest so that its field value matches with that in the member + cluster. + * switch to the Always action, which will allow Fleet to overwrite the field with the + value in the hub cluster manifest. + + If a diff has been found in a field that is **not managed** by Fleet (i.e., the field + **is not specified** in the hub cluster manifest), consider one of the following actions: + * remove the field from the member cluster. + * update the hub cluster manifest so that the field is included in the hub cluster manifest. + + If appropriate, you may also delete the object from the member cluster; Fleet will recreate + it using the hub cluster manifest. + + * Never: with this action, Fleet will not apply a hub cluster manifest to the member + clusters if there is a corresponding pre-existing resource. However, if a manifest + has never been applied yet; or it has a corresponding resource which Fleet has assumed + ownership, apply op will still be executed. + + This is the safest option; one will have to remove the pre-existing resources (so that + Fleet can re-create them) or switch to a different + WhenToTakeOver option before Fleet starts processing the corresponding hub cluster + manifests. + + If you prefer Fleet stop processing all manifests, use this option along with the + ReportDiff apply strategy type. This setup would instruct Fleet to touch nothing + on the member cluster side but still report configuration differences between the + hub cluster and member clusters. Fleet will not give up ownership + that it has already assumed though. + enum: + - Always + - IfNoDiff + - Never type: string type: object clusterDecision: @@ -257,6 +447,237 @@ spec: x-kubernetes-list-map-keys: - type x-kubernetes-list-type: map + diffedPlacements: + description: |- + DiffedPlacements is a list of resources that have configuration differences from their + corresponding hub cluster manifests. Fleet will report such differences when: + + * The CRP uses the ReportDiff apply strategy, which instructs Fleet to compare the hub + cluster manifests against the live resources without actually performing any apply op; or + * Fleet finds a pre-existing resource on the member cluster side that does not match its + hub cluster counterpart, and the CRP has been configured to only take over a resource if + no configuration differences are found. + + To control the object size, only the first 100 diffed resources will be included. + This field is only meaningful if the `ClusterName` is not empty. + items: + description: DiffedResourcePlacement contains the details of a resource + with configuration differences. 
+ properties: + envelope: + description: Envelope identifies the envelope object that contains + this resource. + properties: + name: + description: Name of the envelope object. + type: string + namespace: + description: Namespace is the namespace of the envelope + object. Empty if the envelope object is cluster scoped. + type: string + type: + default: ConfigMap + description: Type of the envelope object. + enum: + - ConfigMap + type: string + required: + - name + type: object + firstDiffedObservedTime: + description: |- + FirstDiffedObservedTime is the first time the resource on the target cluster is + observed to have configuration differences. + format: date-time + type: string + group: + description: Group is the group name of the selected resource. + type: string + kind: + description: Kind represents the Kind of the selected resources. + type: string + name: + description: Name of the target resource. + type: string + namespace: + description: Namespace is the namespace of the resource. Empty + if the resource is cluster scoped. + type: string + observationTime: + description: ObservationTime is the time when we observe the + configuration differences for the resource. + format: date-time + type: string + observedDiffs: + description: |- + ObservedDiffs are the details about the found configuration differences. Note that + Fleet might truncate the details as appropriate to control the object size. + + Each detail entry specifies how the live state (the state on the member + cluster side) compares against the desired state (the state kept in the hub cluster manifest). + + An event about the details will be emitted as well. + items: + description: |- + PatchDetail describes a patch that explains an observed configuration drift or + difference. + + A patch detail can be transcribed as a JSON patch operation, as specified in RFC 6902. + properties: + path: + description: The JSON path that points to a field that + has drifted or has configuration differences. + type: string + valueInHub: + description: |- + The value at the JSON path from the hub cluster side. + + This field can be empty if the JSON path does not exist on the hub cluster side; i.e., + applying the manifest from the hub cluster side would remove the field. + type: string + valueInMember: + description: |- + The value at the JSON path from the member cluster side. + + This field can be empty if the JSON path does not exist on the member cluster side; i.e., + applying the manifest from the hub cluster side would add a new field. + type: string + required: + - path + type: object + type: array + targetClusterObservedGeneration: + description: |- + TargetClusterObservedGeneration is the generation of the resource on the target cluster + that contains the configuration differences. + + This might be nil if the resource has not been created yet on the target cluster. + format: int64 + type: integer + version: + description: Version is the version of the selected resource. + type: string + required: + - firstDiffedObservedTime + - kind + - name + - observationTime + - version + type: object + maxItems: 100 + type: array + driftedPlacements: + description: |- + DriftedPlacements is a list of resources that have drifted from their desired states + kept in the hub cluster, as found by Fleet using the drift detection mechanism. + + To control the object size, only the first 100 drifted resources will be included. + This field is only meaningful if the `ClusterName` is not empty. 
+ items: + description: DriftedResourcePlacement contains the details of a + resource with configuration drifts. + properties: + envelope: + description: Envelope identifies the envelope object that contains + this resource. + properties: + name: + description: Name of the envelope object. + type: string + namespace: + description: Namespace is the namespace of the envelope + object. Empty if the envelope object is cluster scoped. + type: string + type: + default: ConfigMap + description: Type of the envelope object. + enum: + - ConfigMap + type: string + required: + - name + type: object + firstDriftedObservedTime: + description: |- + FirstDriftedObservedTime is the first time the resource on the target cluster is + observed to have configuration drifts. + format: date-time + type: string + group: + description: Group is the group name of the selected resource. + type: string + kind: + description: Kind represents the Kind of the selected resources. + type: string + name: + description: Name of the target resource. + type: string + namespace: + description: Namespace is the namespace of the resource. Empty + if the resource is cluster scoped. + type: string + observationTime: + description: ObservationTime is the time when we observe the + configuration drifts for the resource. + format: date-time + type: string + observedDrifts: + description: |- + ObservedDrifts are the details about the found configuration drifts. Note that + Fleet might truncate the details as appropriate to control the object size. + + Each detail entry specifies how the live state (the state on the member + cluster side) compares against the desired state (the state kept in the hub cluster manifest). + + An event about the details will be emitted as well. + items: + description: |- + PatchDetail describes a patch that explains an observed configuration drift or + difference. + + A patch detail can be transcribed as a JSON patch operation, as specified in RFC 6902. + properties: + path: + description: The JSON path that points to a field that + has drifted or has configuration differences. + type: string + valueInHub: + description: |- + The value at the JSON path from the hub cluster side. + + This field can be empty if the JSON path does not exist on the hub cluster side; i.e., + applying the manifest from the hub cluster side would remove the field. + type: string + valueInMember: + description: |- + The value at the JSON path from the member cluster side. + + This field can be empty if the JSON path does not exist on the member cluster side; i.e., + applying the manifest from the hub cluster side would add a new field. + type: string + required: + - path + type: object + type: array + targetClusterObservedGeneration: + description: |- + TargetClusterObservedGeneration is the generation of the resource on the target cluster + that contains the configuration drifts. + format: int64 + type: integer + version: + description: Version is the version of the selected resource. + type: string + required: + - firstDriftedObservedTime + - kind + - name + - observationTime + - targetClusterObservedGeneration + - version + type: object + maxItems: 100 + type: array failedPlacements: description: |- FailedPlacements is a list of all the resources failed to be placed to the given cluster or the resource is unavailable. 
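For illustration, the driftedPlacements and diffedPlacements status fields added above surface per-resource details as lists of DriftedResourcePlacement and DiffedResourcePlacement entries. The snippet below is a hypothetical sketch of one such entry as it could appear in a placement status once this change is in place; the cluster name, resource identifiers, timestamps, and values are made up, and the surrounding placementStatuses wrapper is assumed from the existing placement status layout rather than shown in this patch:

status:
  placementStatuses:
  - clusterName: member-1
    diffedPlacements:
    - group: apps
      version: v1
      kind: Deployment
      namespace: work
      name: web
      observationTime: "2025-11-19T20:00:00Z"
      firstDiffedObservedTime: "2025-11-19T19:55:00Z"
      targetClusterObservedGeneration: 2
      observedDiffs:
      # Each observedDiffs entry is a PatchDetail: a JSON path plus the value seen on the
      # member cluster and the value desired by the hub cluster manifest.
      - path: /spec/replicas
        valueInHub: "3"
        valueInMember: "5"

Each observedDiffs entry maps directly to the PatchDetail type introduced earlier in this patch, so it can be transcribed into an RFC 6902 JSON patch operation if needed.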
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml index ddba9f0e4..f49443d3e 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml @@ -630,11 +630,49 @@ spec: properties: allowCoOwnership: description: |- - AllowCoOwnership defines whether to apply the resource if it already exists in the target cluster and is not - solely owned by fleet (i.e., metadata.ownerReferences contains only fleet custom resources). - If true, apply the resource and add fleet as a co-owner. - If false, leave the resource unchanged and fail the apply. + AllowCoOwnership controls whether co-ownership between Fleet and other agents are allowed + on a Fleet-managed resource. If set to false, Fleet will refuse to apply manifests to + a resource that has been owned by one or more non-Fleet agents. + + Note that Fleet does not support the case where one resource is being placed multiple + times by different CRPs on the same member cluster. An apply error will be returned if + Fleet finds that a resource has been owned by another placement attempt by Fleet, even + with the AllowCoOwnership setting set to true. type: boolean + comparisonOption: + default: PartialComparison + description: |- + ComparisonOption controls how Fleet compares the desired state of a resource, as kept in + a hub cluster manifest, with the current state of the resource (if applicable) in the + member cluster. + + Available options are: + + * PartialComparison: with this option, Fleet will compare only fields that are managed by + Fleet, i.e., the fields that are specified explicitly in the hub cluster manifest. + Unmanaged fields are ignored. This is the default option. + + * FullComparison: with this option, Fleet will compare all fields of the resource, + even if the fields are absent from the hub cluster manifest. + + Consider using the PartialComparison option if you would like to: + + * use the default values for certain fields; or + * let another agent, e.g., HPAs, VPAs, etc., on the member cluster side manage some fields; or + * allow ad-hoc or cluster-specific settings on the member cluster side. + + To use the FullComparison option, it is recommended that you: + + * specify all fields as appropriate in the hub cluster, even if you are OK with using default + values; + * make sure that no fields are managed by agents other than Fleet on the member cluster + side, such as HPAs, VPAs, or other controllers. + + See the Fleet documentation for further explanations and usage examples. + enum: + - PartialComparison + - FullComparison + type: string serverSideApplyConfig: description: ServerSideApplyConfig defines the configuration for server side apply. It is honored only when type is ServerSideApply. @@ -653,12 +691,164 @@ spec: type: default: ClientSideApply description: |- - Type defines the type of strategy to use. Default to ClientSideApply. - Server-side apply is a safer choice. Read more about the differences between server-side apply and client-side - apply: https://kubernetes.io/docs/reference/using-api/server-side-apply/#comparison-with-client-side-apply. + Type is the apply strategy to use; it determines how Fleet applies manifests from the + hub cluster to a member cluster. 
+ + Available options are: + + * ClientSideApply: Fleet uses three-way merge to apply manifests, similar to how kubectl + performs a client-side apply. This is the default option. + + Note that this strategy requires that Fleet keep the last applied configuration in the + annotation of an applied resource. If the object gets so large that apply ops can no longer + be executed, Fleet will switch to server-side apply. + + Use ComparisonOption and WhenToApply settings to control when an apply op can be executed. + + * ServerSideApply: Fleet uses server-side apply to apply manifests; Fleet itself will + become the field manager for specified fields in the manifests. Specify + ServerSideApplyConfig as appropriate if you would like Fleet to take over field + ownership upon conflicts. This is the recommended option for most scenarios; it might + help reduce object size and safely resolve conflicts between field values. For more + information, please refer to the Kubernetes documentation + (https://kubernetes.io/docs/reference/using-api/server-side-apply/#comparison-with-client-side-apply). + + Use ComparisonOption and WhenToApply settings to control when an apply op can be executed. + + * ReportDiff: Fleet will compare the desired state of a resource as kept in the hub cluster + with its current state (if applicable) on the member cluster side, and report any + differences. No actual apply ops would be executed, and resources will be left alone as they + are on the member clusters. + + If configuration differences are found on a resource, Fleet will consider this as an apply + error, which might block rollout depending on the specified rollout strategy. + + Use ComparisonOption setting to control how the difference is calculated. + + ClientSideApply and ServerSideApply apply strategies only work when Fleet can assume + ownership of a resource (e.g., the resource is created by Fleet, or Fleet has taken over + the resource). See the comments on the WhenToTakeOver field for more information. + ReportDiff apply strategy, however, will function regardless of Fleet's ownership + status. One may set up a CRP with the ReportDiff strategy and the Never takeover option, + and this will turn Fleet into a detection tool that reports only configuration differences + but do not touch any resources on the member cluster side. + + For a comparison between the different strategies and usage examples, refer to the + Fleet documentation. enum: - ClientSideApply - ServerSideApply + - ReportDiff + type: string + whenToApply: + default: Always + description: |- + WhenToApply controls when Fleet would apply the manifests on the hub cluster to the member + clusters. + + Available options are: + + * Always: with this option, Fleet will periodically apply hub cluster manifests + on the member cluster side; this will effectively overwrite any change in the fields + managed by Fleet (i.e., specified in the hub cluster manifest). This is the default + option. + + Note that this option would revert any ad-hoc changes made on the member cluster side in the + managed fields; if you would like to make temporary edits on the member cluster side + in the managed fields, switch to IfNotDrifted option. Note that changes in unmanaged + fields will be left alone; if you use the FullDiff compare option, such changes will + be reported as drifts. 
+ + * IfNotDrifted: with this option, Fleet will stop applying hub cluster manifests on + clusters that have drifted from the desired state; apply ops would still continue on + the rest of the clusters. Drifts are calculated using the ComparisonOption, + as explained in the corresponding field. + + Use this option if you would like Fleet to detect drifts in your multi-cluster setup. + A drift occurs when an agent makes an ad-hoc change on the member cluster side that + makes affected resources deviate from its desired state as kept in the hub cluster; + and this option grants you an opportunity to view the drift details and take actions + accordingly. The drift details will be reported in the CRP status. + + To fix a drift, you may: + + * revert the changes manually on the member cluster side + * update the hub cluster manifest; this will trigger Fleet to apply the latest revision + of the manifests, which will overwrite the drifted fields + (if they are managed by Fleet) + * switch to the Always option; this will trigger Fleet to apply the current revision + of the manifests, which will overwrite the drifted fields (if they are managed by Fleet). + * if applicable and necessary, delete the drifted resources on the member cluster side; Fleet + will attempt to re-create them using the hub cluster manifests + enum: + - Always + - IfNotDrifted + type: string + whenToTakeOver: + default: Always + description: |- + WhenToTakeOver determines the action to take when Fleet applies resources to a member + cluster for the first time and finds out that the resource already exists in the cluster. + + This setting is most relevant in cases where you would like Fleet to manage pre-existing + resources on a member cluster. + + Available options include: + + * Always: with this action, Fleet will apply the hub cluster manifests to the member + clusters even if the affected resources already exist. This is the default action. + + Note that this might lead to fields being overwritten on the member clusters, if they + are specified in the hub cluster manifests. + + * IfNoDiff: with this action, Fleet will apply the hub cluster manifests to the member + clusters if (and only if) pre-existing resources look the same as the hub cluster manifests. + + This is a safer option as pre-existing resources that are inconsistent with the hub cluster + manifests will not be overwritten; Fleet will ignore them until the inconsistencies + are resolved properly: any change you make to the hub cluster manifests would not be + applied, and if you delete the manifests or even the ClusterResourcePlacement itself + from the hub cluster, these pre-existing resources would not be taken away. + + Fleet will check for inconsistencies in accordance with the ComparisonOption setting. See also + the comments on the ComparisonOption field for more information. + + If a diff has been found in a field that is **managed** by Fleet (i.e., the field + **is specified ** in the hub cluster manifest), consider one of the following actions: + * set the field in the member cluster to be of the same value as that in the hub cluster + manifest. + * update the hub cluster manifest so that its field value matches with that in the member + cluster. + * switch to the Always action, which will allow Fleet to overwrite the field with the + value in the hub cluster manifest. 
+ + If a diff has been found in a field that is **not managed** by Fleet (i.e., the field + **is not specified** in the hub cluster manifest), consider one of the following actions: + * remove the field from the member cluster. + * update the hub cluster manifest so that the field is included in the hub cluster manifest. + + If appropriate, you may also delete the object from the member cluster; Fleet will recreate + it using the hub cluster manifest. + + * Never: with this action, Fleet will not apply a hub cluster manifest to the member + clusters if there is a corresponding pre-existing resource. However, if a manifest + has never been applied yet; or it has a corresponding resource which Fleet has assumed + ownership, apply op will still be executed. + + This is the safest option; one will have to remove the pre-existing resources (so that + Fleet can re-create them) or switch to a different + WhenToTakeOver option before Fleet starts processing the corresponding hub cluster + manifests. + + If you prefer Fleet stop processing all manifests, use this option along with the + ReportDiff apply strategy type. This setup would instruct Fleet to touch nothing + on the member cluster side but still report configuration differences between the + hub cluster and member clusters. Fleet will not give up ownership + that it has already assumed though. + enum: + - Always + - IfNoDiff + - Never type: string type: object rollingUpdate: @@ -908,6 +1098,241 @@ spec: - type type: object type: array + diffedPlacements: + description: |- + DiffedPlacements is a list of resources that have configuration differences from their + corresponding hub cluster manifests. Fleet will report such differences when: + + * The CRP uses the ReportDiff apply strategy, which instructs Fleet to compare the hub + cluster manifests against the live resources without actually performing any apply op; or + * Fleet finds a pre-existing resource on the member cluster side that does not match its + hub cluster counterpart, and the CRP has been configured to only take over a resource if + no configuration differences are found. + + To control the object size, only the first 100 diffed resources will be included. + This field is only meaningful if the `ClusterName` is not empty. + items: + description: DiffedResourcePlacement contains the details + of a resource with configuration differences. + properties: + envelope: + description: Envelope identifies the envelope object that + contains this resource. + properties: + name: + description: Name of the envelope object. + type: string + namespace: + description: Namespace is the namespace of the envelope + object. Empty if the envelope object is cluster + scoped. + type: string + type: + default: ConfigMap + description: Type of the envelope object. + enum: + - ConfigMap + type: string + required: + - name + type: object + firstDiffedObservedTime: + description: |- + FirstDiffedObservedTime is the first time the resource on the target cluster is + observed to have configuration differences. + format: date-time + type: string + group: + description: Group is the group name of the selected resource. + type: string + kind: + description: Kind represents the Kind of the selected + resources. + type: string + name: + description: Name of the target resource. + type: string + namespace: + description: Namespace is the namespace of the resource. + Empty if the resource is cluster scoped. 
+ type: string + observationTime: + description: ObservationTime is the time when we observe + the configuration differences for the resource. + format: date-time + type: string + observedDiffs: + description: |- + ObservedDiffs are the details about the found configuration differences. Note that + Fleet might truncate the details as appropriate to control the object size. + + Each detail entry specifies how the live state (the state on the member + cluster side) compares against the desired state (the state kept in the hub cluster manifest). + + An event about the details will be emitted as well. + items: + description: |- + PatchDetail describes a patch that explains an observed configuration drift or + difference. + + A patch detail can be transcribed as a JSON patch operation, as specified in RFC 6902. + properties: + path: + description: The JSON path that points to a field + that has drifted or has configuration differences. + type: string + valueInHub: + description: |- + The value at the JSON path from the hub cluster side. + + This field can be empty if the JSON path does not exist on the hub cluster side; i.e., + applying the manifest from the hub cluster side would remove the field. + type: string + valueInMember: + description: |- + The value at the JSON path from the member cluster side. + + This field can be empty if the JSON path does not exist on the member cluster side; i.e., + applying the manifest from the hub cluster side would add a new field. + type: string + required: + - path + type: object + type: array + targetClusterObservedGeneration: + description: |- + TargetClusterObservedGeneration is the generation of the resource on the target cluster + that contains the configuration differences. + + This might be nil if the resource has not been created yet on the target cluster. + format: int64 + type: integer + version: + description: Version is the version of the selected resource. + type: string + required: + - firstDiffedObservedTime + - kind + - name + - observationTime + - version + type: object + maxItems: 100 + type: array + driftedPlacements: + description: |- + DriftedPlacements is a list of resources that have drifted from their desired states + kept in the hub cluster, as found by Fleet using the drift detection mechanism. + + To control the object size, only the first 100 drifted resources will be included. + This field is only meaningful if the `ClusterName` is not empty. + items: + description: DriftedResourcePlacement contains the details + of a resource with configuration drifts. + properties: + envelope: + description: Envelope identifies the envelope object that + contains this resource. + properties: + name: + description: Name of the envelope object. + type: string + namespace: + description: Namespace is the namespace of the envelope + object. Empty if the envelope object is cluster + scoped. + type: string + type: + default: ConfigMap + description: Type of the envelope object. + enum: + - ConfigMap + type: string + required: + - name + type: object + firstDriftedObservedTime: + description: |- + FirstDriftedObservedTime is the first time the resource on the target cluster is + observed to have configuration drifts. + format: date-time + type: string + group: + description: Group is the group name of the selected resource. + type: string + kind: + description: Kind represents the Kind of the selected + resources. + type: string + name: + description: Name of the target resource. 
+ type: string + namespace: + description: Namespace is the namespace of the resource. + Empty if the resource is cluster scoped. + type: string + observationTime: + description: ObservationTime is the time when we observe + the configuration drifts for the resource. + format: date-time + type: string + observedDrifts: + description: |- + ObservedDrifts are the details about the found configuration drifts. Note that + Fleet might truncate the details as appropriate to control the object size. + + Each detail entry specifies how the live state (the state on the member + cluster side) compares against the desired state (the state kept in the hub cluster manifest). + + An event about the details will be emitted as well. + items: + description: |- + PatchDetail describes a patch that explains an observed configuration drift or + difference. + + A patch detail can be transcribed as a JSON patch operation, as specified in RFC 6902. + properties: + path: + description: The JSON path that points to a field + that has drifted or has configuration differences. + type: string + valueInHub: + description: |- + The value at the JSON path from the hub cluster side. + + This field can be empty if the JSON path does not exist on the hub cluster side; i.e., + applying the manifest from the hub cluster side would remove the field. + type: string + valueInMember: + description: |- + The value at the JSON path from the member cluster side. + + This field can be empty if the JSON path does not exist on the member cluster side; i.e., + applying the manifest from the hub cluster side would add a new field. + type: string + required: + - path + type: object + type: array + targetClusterObservedGeneration: + description: |- + TargetClusterObservedGeneration is the generation of the resource on the target cluster + that contains the configuration drifts. + format: int64 + type: integer + version: + description: Version is the version of the selected resource. + type: string + required: + - firstDriftedObservedTime + - kind + - name + - observationTime + - targetClusterObservedGeneration + - version + type: object + maxItems: 100 + type: array failedPlacements: description: |- FailedPlacements is a list of all the resources failed to be placed to the given cluster or the resource is unavailable. diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml index a474e0244..2c42689db 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml @@ -204,6 +204,237 @@ spec: - type type: object type: array + diffedPlacements: + description: |- + DiffedPlacements is a list of resources that have configuration differences from their + corresponding hub cluster manifests. Fleet will report such differences when: + + * The CRP uses the ReportDiff apply strategy, which instructs Fleet to compare the hub + cluster manifests against the live resources without actually performing any apply op; or + * Fleet finds a pre-existing resource on the member cluster side that does not match its + hub cluster counterpart, and the CRP has been configured to only take over a resource if + no configuration differences are found. + + To control the object size, only the first 100 diffed resources will be included. + This field is only meaningful if the `ClusterName` is not empty. 
+ items: + description: DiffedResourcePlacement contains the details of a + resource with configuration differences. + properties: + envelope: + description: Envelope identifies the envelope object that + contains this resource. + properties: + name: + description: Name of the envelope object. + type: string + namespace: + description: Namespace is the namespace of the envelope + object. Empty if the envelope object is cluster scoped. + type: string + type: + default: ConfigMap + description: Type of the envelope object. + enum: + - ConfigMap + type: string + required: + - name + type: object + firstDiffedObservedTime: + description: |- + FirstDiffedObservedTime is the first time the resource on the target cluster is + observed to have configuration differences. + format: date-time + type: string + group: + description: Group is the group name of the selected resource. + type: string + kind: + description: Kind represents the Kind of the selected resources. + type: string + name: + description: Name of the target resource. + type: string + namespace: + description: Namespace is the namespace of the resource. Empty + if the resource is cluster scoped. + type: string + observationTime: + description: ObservationTime is the time when we observe the + configuration differences for the resource. + format: date-time + type: string + observedDiffs: + description: |- + ObservedDiffs are the details about the found configuration differences. Note that + Fleet might truncate the details as appropriate to control the object size. + + Each detail entry specifies how the live state (the state on the member + cluster side) compares against the desired state (the state kept in the hub cluster manifest). + + An event about the details will be emitted as well. + items: + description: |- + PatchDetail describes a patch that explains an observed configuration drift or + difference. + + A patch detail can be transcribed as a JSON patch operation, as specified in RFC 6902. + properties: + path: + description: The JSON path that points to a field that + has drifted or has configuration differences. + type: string + valueInHub: + description: |- + The value at the JSON path from the hub cluster side. + + This field can be empty if the JSON path does not exist on the hub cluster side; i.e., + applying the manifest from the hub cluster side would remove the field. + type: string + valueInMember: + description: |- + The value at the JSON path from the member cluster side. + + This field can be empty if the JSON path does not exist on the member cluster side; i.e., + applying the manifest from the hub cluster side would add a new field. + type: string + required: + - path + type: object + type: array + targetClusterObservedGeneration: + description: |- + TargetClusterObservedGeneration is the generation of the resource on the target cluster + that contains the configuration differences. + + This might be nil if the resource has not been created yet on the target cluster. + format: int64 + type: integer + version: + description: Version is the version of the selected resource. + type: string + required: + - firstDiffedObservedTime + - kind + - name + - observationTime + - version + type: object + maxItems: 100 + type: array + driftedPlacements: + description: |- + DriftedPlacements is a list of resources that have drifted from their desired states + kept in the hub cluster, as found by Fleet using the drift detection mechanism. + + To control the object size, only the first 100 drifted resources will be included. 
+ This field is only meaningful if the `ClusterName` is not empty. + items: + description: DriftedResourcePlacement contains the details of + a resource with configuration drifts. + properties: + envelope: + description: Envelope identifies the envelope object that + contains this resource. + properties: + name: + description: Name of the envelope object. + type: string + namespace: + description: Namespace is the namespace of the envelope + object. Empty if the envelope object is cluster scoped. + type: string + type: + default: ConfigMap + description: Type of the envelope object. + enum: + - ConfigMap + type: string + required: + - name + type: object + firstDriftedObservedTime: + description: |- + FirstDriftedObservedTime is the first time the resource on the target cluster is + observed to have configuration drifts. + format: date-time + type: string + group: + description: Group is the group name of the selected resource. + type: string + kind: + description: Kind represents the Kind of the selected resources. + type: string + name: + description: Name of the target resource. + type: string + namespace: + description: Namespace is the namespace of the resource. Empty + if the resource is cluster scoped. + type: string + observationTime: + description: ObservationTime is the time when we observe the + configuration drifts for the resource. + format: date-time + type: string + observedDrifts: + description: |- + ObservedDrifts are the details about the found configuration drifts. Note that + Fleet might truncate the details as appropriate to control the object size. + + Each detail entry specifies how the live state (the state on the member + cluster side) compares against the desired state (the state kept in the hub cluster manifest). + + An event about the details will be emitted as well. + items: + description: |- + PatchDetail describes a patch that explains an observed configuration drift or + difference. + + A patch detail can be transcribed as a JSON patch operation, as specified in RFC 6902. + properties: + path: + description: The JSON path that points to a field that + has drifted or has configuration differences. + type: string + valueInHub: + description: |- + The value at the JSON path from the hub cluster side. + + This field can be empty if the JSON path does not exist on the hub cluster side; i.e., + applying the manifest from the hub cluster side would remove the field. + type: string + valueInMember: + description: |- + The value at the JSON path from the member cluster side. + + This field can be empty if the JSON path does not exist on the member cluster side; i.e., + applying the manifest from the hub cluster side would add a new field. + type: string + required: + - path + type: object + type: array + targetClusterObservedGeneration: + description: |- + TargetClusterObservedGeneration is the generation of the resource on the target cluster + that contains the configuration drifts. + format: int64 + type: integer + version: + description: Version is the version of the selected resource. + type: string + required: + - firstDriftedObservedTime + - kind + - name + - observationTime + - targetClusterObservedGeneration + - version + type: object + maxItems: 100 + type: array failedPlacements: description: |- FailedPlacements is a list of all the resources failed to be placed to the given cluster or the resource is unavailable. 
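The applyStrategy fields described in the CRD updates above (comparisonOption, whenToApply, whenToTakeOver, and the ReportDiff type) can be combined into the detection-only setup that the field descriptions mention: ReportDiff together with the Never takeover option reports differences without touching resources on the member clusters. The following is a minimal, hypothetical ClusterResourcePlacement sketch of that combination; the placement name, the namespace selector, the apiVersion, and the spec layout outside of applyStrategy are assumptions based on the existing placement API rather than something this patch defines:

apiVersion: placement.kubernetes-fleet.io/v1
kind: ClusterResourcePlacement
metadata:
  name: detect-only-demo
spec:
  resourceSelectors:
  - group: ""
    version: v1
    kind: Namespace
    name: work
  policy:
    placementType: PickAll
  strategy:
    applyStrategy:
      # Report configuration differences only; never apply manifests or take over resources.
      type: ReportDiff
      whenToTakeOver: Never
      # Compare every field, not just the fields specified in the hub cluster manifest.
      comparisonOption: FullComparison

With this setup, any differences found are reported through the diffedPlacements status field and the DiffReported condition types added in this patch; adjust the apiVersion to whichever placement API version is actually served in your cluster.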
diff --git a/config/crd/bases/placement.kubernetes-fleet.io_works.yaml b/config/crd/bases/placement.kubernetes-fleet.io_works.yaml index e2220df73..9d6bcfb3b 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_works.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_works.yaml @@ -50,11 +50,49 @@ spec: properties: allowCoOwnership: description: |- - AllowCoOwnership defines whether to apply the resource if it already exists in the target cluster and is not - solely owned by fleet (i.e., metadata.ownerReferences contains only fleet custom resources). - If true, apply the resource and add fleet as a co-owner. - If false, leave the resource unchanged and fail the apply. + AllowCoOwnership controls whether co-ownership between Fleet and other agents are allowed + on a Fleet-managed resource. If set to false, Fleet will refuse to apply manifests to + a resource that has been owned by one or more non-Fleet agents. + + Note that Fleet does not support the case where one resource is being placed multiple + times by different CRPs on the same member cluster. An apply error will be returned if + Fleet finds that a resource has been owned by another placement attempt by Fleet, even + with the AllowCoOwnership setting set to true. type: boolean + comparisonOption: + default: PartialComparison + description: |- + ComparisonOption controls how Fleet compares the desired state of a resource, as kept in + a hub cluster manifest, with the current state of the resource (if applicable) in the + member cluster. + + Available options are: + + * PartialComparison: with this option, Fleet will compare only fields that are managed by + Fleet, i.e., the fields that are specified explicitly in the hub cluster manifest. + Unmanaged fields are ignored. This is the default option. + + * FullComparison: with this option, Fleet will compare all fields of the resource, + even if the fields are absent from the hub cluster manifest. + + Consider using the PartialComparison option if you would like to: + + * use the default values for certain fields; or + * let another agent, e.g., HPAs, VPAs, etc., on the member cluster side manage some fields; or + * allow ad-hoc or cluster-specific settings on the member cluster side. + + To use the FullComparison option, it is recommended that you: + + * specify all fields as appropriate in the hub cluster, even if you are OK with using default + values; + * make sure that no fields are managed by agents other than Fleet on the member cluster + side, such as HPAs, VPAs, or other controllers. + + See the Fleet documentation for further explanations and usage examples. + enum: + - PartialComparison + - FullComparison + type: string serverSideApplyConfig: description: ServerSideApplyConfig defines the configuration for server side apply. It is honored only when type is ServerSideApply. @@ -73,12 +111,164 @@ spec: type: default: ClientSideApply description: |- - Type defines the type of strategy to use. Default to ClientSideApply. - Server-side apply is a safer choice. Read more about the differences between server-side apply and client-side - apply: https://kubernetes.io/docs/reference/using-api/server-side-apply/#comparison-with-client-side-apply. + Type is the apply strategy to use; it determines how Fleet applies manifests from the + hub cluster to a member cluster. + + Available options are: + + * ClientSideApply: Fleet uses three-way merge to apply manifests, similar to how kubectl + performs a client-side apply. This is the default option. 
+ + Note that this strategy requires that Fleet keep the last applied configuration in the + annotation of an applied resource. If the object gets so large that apply ops can no longer + be executed, Fleet will switch to server-side apply. + + Use ComparisonOption and WhenToApply settings to control when an apply op can be executed. + + * ServerSideApply: Fleet uses server-side apply to apply manifests; Fleet itself will + become the field manager for specified fields in the manifests. Specify + ServerSideApplyConfig as appropriate if you would like Fleet to take over field + ownership upon conflicts. This is the recommended option for most scenarios; it might + help reduce object size and safely resolve conflicts between field values. For more + information, please refer to the Kubernetes documentation + (https://kubernetes.io/docs/reference/using-api/server-side-apply/#comparison-with-client-side-apply). + + Use ComparisonOption and WhenToApply settings to control when an apply op can be executed. + + * ReportDiff: Fleet will compare the desired state of a resource as kept in the hub cluster + with its current state (if applicable) on the member cluster side, and report any + differences. No actual apply ops would be executed, and resources will be left alone as they + are on the member clusters. + + If configuration differences are found on a resource, Fleet will consider this as an apply + error, which might block rollout depending on the specified rollout strategy. + + Use ComparisonOption setting to control how the difference is calculated. + + ClientSideApply and ServerSideApply apply strategies only work when Fleet can assume + ownership of a resource (e.g., the resource is created by Fleet, or Fleet has taken over + the resource). See the comments on the WhenToTakeOver field for more information. + ReportDiff apply strategy, however, will function regardless of Fleet's ownership + status. One may set up a CRP with the ReportDiff strategy and the Never takeover option, + and this will turn Fleet into a detection tool that reports only configuration differences + but do not touch any resources on the member cluster side. + + For a comparison between the different strategies and usage examples, refer to the + Fleet documentation. enum: - ClientSideApply - ServerSideApply + - ReportDiff + type: string + whenToApply: + default: Always + description: |- + WhenToApply controls when Fleet would apply the manifests on the hub cluster to the member + clusters. + + Available options are: + + * Always: with this option, Fleet will periodically apply hub cluster manifests + on the member cluster side; this will effectively overwrite any change in the fields + managed by Fleet (i.e., specified in the hub cluster manifest). This is the default + option. + + Note that this option would revert any ad-hoc changes made on the member cluster side in the + managed fields; if you would like to make temporary edits on the member cluster side + in the managed fields, switch to IfNotDrifted option. Note that changes in unmanaged + fields will be left alone; if you use the FullDiff compare option, such changes will + be reported as drifts. + + * IfNotDrifted: with this option, Fleet will stop applying hub cluster manifests on + clusters that have drifted from the desired state; apply ops would still continue on + the rest of the clusters. Drifts are calculated using the ComparisonOption, + as explained in the corresponding field. 
+ + Use this option if you would like Fleet to detect drifts in your multi-cluster setup. + A drift occurs when an agent makes an ad-hoc change on the member cluster side that + makes affected resources deviate from its desired state as kept in the hub cluster; + and this option grants you an opportunity to view the drift details and take actions + accordingly. The drift details will be reported in the CRP status. + + To fix a drift, you may: + + * revert the changes manually on the member cluster side + * update the hub cluster manifest; this will trigger Fleet to apply the latest revision + of the manifests, which will overwrite the drifted fields + (if they are managed by Fleet) + * switch to the Always option; this will trigger Fleet to apply the current revision + of the manifests, which will overwrite the drifted fields (if they are managed by Fleet). + * if applicable and necessary, delete the drifted resources on the member cluster side; Fleet + will attempt to re-create them using the hub cluster manifests + enum: + - Always + - IfNotDrifted + type: string + whenToTakeOver: + default: Always + description: |- + WhenToTakeOver determines the action to take when Fleet applies resources to a member + cluster for the first time and finds out that the resource already exists in the cluster. + + This setting is most relevant in cases where you would like Fleet to manage pre-existing + resources on a member cluster. + + Available options include: + + * Always: with this action, Fleet will apply the hub cluster manifests to the member + clusters even if the affected resources already exist. This is the default action. + + Note that this might lead to fields being overwritten on the member clusters, if they + are specified in the hub cluster manifests. + + * IfNoDiff: with this action, Fleet will apply the hub cluster manifests to the member + clusters if (and only if) pre-existing resources look the same as the hub cluster manifests. + + This is a safer option as pre-existing resources that are inconsistent with the hub cluster + manifests will not be overwritten; Fleet will ignore them until the inconsistencies + are resolved properly: any change you make to the hub cluster manifests would not be + applied, and if you delete the manifests or even the ClusterResourcePlacement itself + from the hub cluster, these pre-existing resources would not be taken away. + + Fleet will check for inconsistencies in accordance with the ComparisonOption setting. See also + the comments on the ComparisonOption field for more information. + + If a diff has been found in a field that is **managed** by Fleet (i.e., the field + **is specified ** in the hub cluster manifest), consider one of the following actions: + * set the field in the member cluster to be of the same value as that in the hub cluster + manifest. + * update the hub cluster manifest so that its field value matches with that in the member + cluster. + * switch to the Always action, which will allow Fleet to overwrite the field with the + value in the hub cluster manifest. + + If a diff has been found in a field that is **not managed** by Fleet (i.e., the field + **is not specified** in the hub cluster manifest), consider one of the following actions: + * remove the field from the member cluster. + * update the hub cluster manifest so that the field is included in the hub cluster manifest. + + If appropriate, you may also delete the object from the member cluster; Fleet will recreate + it using the hub cluster manifest. 
+ + * Never: with this action, Fleet will not apply a hub cluster manifest to the member + clusters if there is a corresponding pre-existing resource. However, if a manifest + has never been applied yet; or it has a corresponding resource which Fleet has assumed + ownership, apply op will still be executed. + + This is the safest option; one will have to remove the pre-existing resources (so that + Fleet can re-create them) or switch to a different + WhenToTakeOver option before Fleet starts processing the corresponding hub cluster + manifests. + + If you prefer Fleet stop processing all manifests, use this option along with the + ReportDiff apply strategy type. This setup would instruct Fleet to touch nothing + on the member cluster side but still report configuration differences between the + hub cluster and member clusters. Fleet will not give up ownership + that it has already assumed though. + enum: + - Always + - IfNoDiff + - Never type: string type: object workload: @@ -233,6 +423,144 @@ spec: - type type: object type: array + diffDetails: + description: |- + DiffDetails explains the details about the observed configuration differences. + Fleet might truncate the details as appropriate to control object size. + + Note that configuration differences can only occur on a resource if it is not currently owned + by Fleet (i.e., it is a pre-existing resource that needs to be taken over), or if its + corresponding placement is set to use the ReportDiff apply strategy. In other words, + DiffDetails and DriftDetails will not be populated at the same time. + properties: + firstDiffedObservedTime: + description: |- + FirstDiffedObservedTime is the timestamp when the configuration difference + was first detected. + format: date-time + type: string + observationTime: + description: ObservationTime is the timestamp when the configuration + difference was last detected. + format: date-time + type: string + observedDiffs: + description: |- + ObservedDiffs describes each field with configuration difference as found from the + member cluster side. + + Fleet might truncate the details as appropriate to control object size. + + Each entry specifies how the live state (the state on the member cluster side) compares + against the desired state (the state kept in the hub cluster manifest). + items: + description: |- + PatchDetail describes a patch that explains an observed configuration drift or + difference. + + A patch detail can be transcribed as a JSON patch operation, as specified in RFC 6902. + properties: + path: + description: The JSON path that points to a field + that has drifted or has configuration differences. + type: string + valueInHub: + description: |- + The value at the JSON path from the hub cluster side. + + This field can be empty if the JSON path does not exist on the hub cluster side; i.e., + applying the manifest from the hub cluster side would remove the field. + type: string + valueInMember: + description: |- + The value at the JSON path from the member cluster side. + + This field can be empty if the JSON path does not exist on the member cluster side; i.e., + applying the manifest from the hub cluster side would add a new field. + type: string + required: + - path + type: object + type: array + observedInMemberClusterGeneration: + description: |- + ObservedInMemberClusterGeneration is the generation of the applied manifest on the member + cluster side. + + This might be nil if the resource has not been created yet in the member cluster. 
+ format: int64 + type: integer + required: + - firstDiffedObservedTime + - observationTime + type: object + driftDetails: + description: |- + DriftDetails explains about the observed configuration drifts. + Fleet might truncate the details as appropriate to control object size. + + Note that configuration drifts can only occur on a resource if it is currently owned by + Fleet and its corresponding placement is set to use the ClientSideApply or ServerSideApply + apply strategy. In other words, DriftDetails and DiffDetails will not be populated + at the same time. + properties: + firstDriftedObservedTime: + description: FirstDriftedObservedTime is the timestamp when + the drift was first detected. + format: date-time + type: string + observationTime: + description: ObservationTime is the timestamp when the drift + was last detected. + format: date-time + type: string + observedDrifts: + description: |- + ObservedDrifts describes each drifted field found from the applied manifest. + Fleet might truncate the details as appropriate to control object size. + + Each entry specifies how the live state (the state on the member cluster side) compares + against the desired state (the state kept in the hub cluster manifest). + items: + description: |- + PatchDetail describes a patch that explains an observed configuration drift or + difference. + + A patch detail can be transcribed as a JSON patch operation, as specified in RFC 6902. + properties: + path: + description: The JSON path that points to a field + that has drifted or has configuration differences. + type: string + valueInHub: + description: |- + The value at the JSON path from the hub cluster side. + + This field can be empty if the JSON path does not exist on the hub cluster side; i.e., + applying the manifest from the hub cluster side would remove the field. + type: string + valueInMember: + description: |- + The value at the JSON path from the member cluster side. + + This field can be empty if the JSON path does not exist on the member cluster side; i.e., + applying the manifest from the hub cluster side would add a new field. + type: string + required: + - path + type: object + type: array + observedInMemberClusterGeneration: + description: |- + ObservedInMemberClusterGeneration is the generation of the applied manifest on the member + cluster side. + format: int64 + type: integer + required: + - firstDriftedObservedTime + - observationTime + - observedInMemberClusterGeneration + type: object identifier: description: resourceId represents a identity of a resource linking to manifests in spec. 
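
The applyStrategy descriptions above spell out how Type, ComparisonOption, WhenToApply, and WhenToTakeOver combine. As a sketch only (not part of this patch), the "report differences but never take over" combination called out above, plus a drift-detection variant, can be written with the placementv1 types that the e2e specs later in this series construct:

```go
package example

import (
	placementv1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1"
)

// detectOnlyStrategy: Fleet reports diffs between the hub manifests and the member
// clusters but does not apply or adopt any resource, as described in the strategy
// field comments above.
var detectOnlyStrategy = placementv1.ApplyStrategy{
	Type:             placementv1.ApplyStrategyTypeReportDiff,
	ComparisonOption: placementv1.ComparisonOptionTypePartialComparison,
	WhenToTakeOver:   placementv1.WhenToTakeOverTypeNever,
}

// stopOnDriftStrategy: keep applying Fleet-managed fields, but stop applying to
// clusters where an ad-hoc change to a managed field has been detected (see the
// WhenToApply description above).
var stopOnDriftStrategy = placementv1.ApplyStrategy{
	ComparisonOption: placementv1.ComparisonOptionTypePartialComparison,
	WhenToApply:      placementv1.WhenToApplyTypeIfNotDrifted,
}
```

These are the same field and constant names exercised by the api_progression e2e specs added later in this patch series; the variable names themselves are only for illustration.
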
diff --git a/pkg/utils/common.go b/pkg/utils/common.go index fd9a8088e..4938cd46d 100644 --- a/pkg/utils/common.go +++ b/pkg/utils/common.go @@ -47,6 +47,7 @@ import ( fleetnetworkingv1alpha1 "go.goms.io/fleet-networking/api/v1alpha1" clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" + placementv1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1" placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" "github.com/kubefleet-dev/kubefleet/pkg/utils/condition" "github.com/kubefleet-dev/kubefleet/pkg/utils/controller" @@ -589,6 +590,12 @@ var LessFuncResourceIdentifier = func(a, b placementv1beta1.ResourceIdentifier) return aStr < bStr } +var LessFuncResourceIdentifierV1 = func(a, b placementv1.ResourceIdentifier) bool { + aStr := fmt.Sprintf(ResourceIdentifierStringFormat, a.Group, a.Version, a.Kind, a.Namespace, a.Name) + bStr := fmt.Sprintf(ResourceIdentifierStringFormat, b.Group, b.Version, b.Kind, b.Namespace, b.Name) + return aStr < bStr +} + // LessFuncPatchDetail is a less function for sorting patch details var LessFuncPatchDetail = func(a, b placementv1beta1.PatchDetail) bool { if a.Path != b.Path { @@ -617,6 +624,22 @@ var LessFuncFailedResourcePlacements = func(a, b placementv1beta1.FailedResource return aStr < bStr } +var LessFuncFailedResourcePlacementsV1 = func(a, b placementv1.FailedResourcePlacement) bool { + var aStr, bStr string + if a.Envelope != nil { + aStr = fmt.Sprintf(ResourceIdentifierWithEnvelopeIdentifierStringFormat, a.Group, a.Version, a.Kind, a.Namespace, a.Name, a.Envelope.Type, a.Envelope.Namespace, a.Envelope.Name) + } else { + aStr = fmt.Sprintf(ResourceIdentifierStringFormat, a.Group, a.Version, a.Kind, a.Namespace, a.Name) + } + if b.Envelope != nil { + bStr = fmt.Sprintf(ResourceIdentifierWithEnvelopeIdentifierStringFormat, b.Group, b.Version, b.Kind, b.Namespace, b.Name, b.Envelope.Type, b.Envelope.Namespace, b.Envelope.Name) + } else { + bStr = fmt.Sprintf(ResourceIdentifierStringFormat, b.Group, b.Version, b.Kind, b.Namespace, b.Name) + + } + return aStr < bStr +} + func IsFailedResourcePlacementsEqual(oldFailedResourcePlacements, newFailedResourcePlacements []placementv1beta1.FailedResourcePlacement) bool { if len(oldFailedResourcePlacements) != len(newFailedResourcePlacements) { return false @@ -657,6 +680,22 @@ var LessFuncDriftedResourcePlacements = func(a, b placementv1beta1.DriftedResour return aStr < bStr } +var LessFuncDriftedResourcePlacementsV1 = func(a, b placementv1.DriftedResourcePlacement) bool { + var aStr, bStr string + if a.Envelope != nil { + aStr = fmt.Sprintf(ResourceIdentifierWithEnvelopeIdentifierStringFormat, a.Group, a.Version, a.Kind, a.Namespace, a.Name, a.Envelope.Type, a.Envelope.Namespace, a.Envelope.Name) + } else { + aStr = fmt.Sprintf(ResourceIdentifierStringFormat, a.Group, a.Version, a.Kind, a.Namespace, a.Name) + } + if b.Envelope != nil { + bStr = fmt.Sprintf(ResourceIdentifierWithEnvelopeIdentifierStringFormat, b.Group, b.Version, b.Kind, b.Namespace, b.Name, b.Envelope.Type, b.Envelope.Namespace, b.Envelope.Name) + } else { + bStr = fmt.Sprintf(ResourceIdentifierStringFormat, b.Group, b.Version, b.Kind, b.Namespace, b.Name) + + } + return aStr < bStr +} + // IsDriftedResourcePlacementsEqual returns true if the two set of drifted resource placements are equal. 
func IsDriftedResourcePlacementsEqual(oldDriftedResourcePlacements, newDriftedResourcePlacements []placementv1beta1.DriftedResourcePlacement) bool { if len(oldDriftedResourcePlacements) != len(newDriftedResourcePlacements) { @@ -698,6 +737,22 @@ var LessFuncDiffedResourcePlacements = func(a, b placementv1beta1.DiffedResource return aStr < bStr } +var LessFuncDiffedResourcePlacementsV1 = func(a, b placementv1.DiffedResourcePlacement) bool { + var aStr, bStr string + if a.Envelope != nil { + aStr = fmt.Sprintf(ResourceIdentifierWithEnvelopeIdentifierStringFormat, a.Group, a.Version, a.Kind, a.Namespace, a.Name, a.Envelope.Type, a.Envelope.Namespace, a.Envelope.Name) + } else { + aStr = fmt.Sprintf(ResourceIdentifierStringFormat, a.Group, a.Version, a.Kind, a.Namespace, a.Name) + } + if b.Envelope != nil { + bStr = fmt.Sprintf(ResourceIdentifierWithEnvelopeIdentifierStringFormat, b.Group, b.Version, b.Kind, b.Namespace, b.Name, b.Envelope.Type, b.Envelope.Namespace, b.Envelope.Name) + } else { + bStr = fmt.Sprintf(ResourceIdentifierStringFormat, b.Group, b.Version, b.Kind, b.Namespace, b.Name) + + } + return aStr < bStr +} + // LessFuncCondition is a less function for sorting conditions based on its types. var LessFuncConditionByType = func(a, b metav1.Condition) bool { return a.Type < b.Type diff --git a/test/e2e/api_progression_test.go b/test/e2e/api_progression_test.go new file mode 100644 index 000000000..815795e01 --- /dev/null +++ b/test/e2e/api_progression_test.go @@ -0,0 +1,483 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "fmt" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/ptr" + + placementv1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1" + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" + "github.com/kubefleet-dev/kubefleet/pkg/controllers/workapplier" + "github.com/kubefleet-dev/kubefleet/pkg/utils" +) + +var ( + placementStatusCmpOptionsV1 = cmp.Options{ + cmpopts.SortSlices(lessFuncCondition), + cmpopts.SortSlices(lessFuncPlacementStatusV1), + cmpopts.SortSlices(utils.LessFuncResourceIdentifierV1), + cmpopts.SortSlices(utils.LessFuncFailedResourcePlacementsV1), + cmpopts.SortSlices(utils.LessFuncDiffedResourcePlacementsV1), + cmpopts.SortSlices(utils.LessFuncDriftedResourcePlacementsV1), + utils.IgnoreConditionLTTAndMessageFields, + ignorePlacementStatusDriftedPlacementsTimestampFieldsV1, + ignorePlacementStatusDiffedPlacementsTimestampFieldsV1, + cmpopts.EquateEmpty(), + } +) + +func ensureCRPRemovalV1(crpName string) { + Eventually(func() error { + crp := &placementv1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: crpName, + }, + } + if err := hubClient.Delete(ctx, crp); err != nil && !errors.IsNotFound(err) { + return fmt.Errorf("failed to delete CRP object: %w", err) + } + + if err := hubClient.Get(ctx, types.NamespacedName{Name: crpName}, crp); err != nil && !errors.IsNotFound(err) { + return fmt.Errorf("failed to get CRP object: %w", err) + } + return nil + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to wait for CRP deletion") +} + +// Test specs in this file help verify the progression from one API version to another (e.g., v1beta1 to v1); +// the logic is more focuses on API compatibility and is less focused on behavioral correctness for simplicity reasons. + +// Note (chenyu1): in the test specs there are still sporadic references to the v1beta1 API package; this is needed +// as some of the constants (primarily condition types and reasons) are only available there. + +var _ = Describe("takeover, drift detection, and reportDiff mode (v1beta1 to v1)", func() { + Context("takeover with diff detection (CRP, read and write in v1)", Ordered, func() { + crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) + nsName := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) + + var existingNS *corev1.Namespace + + BeforeAll(func() { + ns := appNamespace() + // Add a label (managed field) to the namespace. + ns.Labels = map[string]string{ + managedDataFieldKey: managedDataFieldVal1, + workNamespaceLabelName: fmt.Sprintf("%d", GinkgoParallelProcess()), + } + existingNS = ns.DeepCopy() + + // Create the resources on the hub cluster. + Expect(hubClient.Create(ctx, &ns)).To(Succeed()) + + // Create the resources on one of the member clusters. 
+ existingNS.Labels[managedDataFieldKey] = managedDataFieldVal2 + Expect(memberCluster1EastProdClient.Create(ctx, existingNS)).To(Succeed()) + + crp := &placementv1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: crpName, + }, + Spec: placementv1.ClusterResourcePlacementSpec{ + ResourceSelectors: []placementv1.ClusterResourceSelector{ + { + Group: "", + Version: "v1", + Kind: "Namespace", + Name: nsName, + }, + }, + Policy: &placementv1.PlacementPolicy{ + PlacementType: placementv1.PickFixedPlacementType, + ClusterNames: []string{ + memberCluster1EastProdName, + }, + }, + Strategy: placementv1.RolloutStrategy{ + Type: placementv1.RollingUpdateRolloutStrategyType, + RollingUpdate: &placementv1.RollingUpdateConfig{ + MaxUnavailable: ptr.To(intstr.FromString("100%")), + UnavailablePeriodSeconds: ptr.To(1), + }, + ApplyStrategy: &placementv1.ApplyStrategy{ + ComparisonOption: placementv1.ComparisonOptionTypePartialComparison, + WhenToTakeOver: placementv1.WhenToTakeOverTypeIfNoDiff, + }, + }, + }, + } + Expect(hubClient.Create(ctx, crp)).To(Succeed()) + }) + + It("should update CRP status as expected", func() { + buildWantCRPStatus := func(crpGeneration int64) *placementv1.ClusterResourcePlacementStatus { + return &placementv1.ClusterResourcePlacementStatus{ + Conditions: crpAppliedFailedConditions(crpGeneration), + SelectedResources: []placementv1.ResourceIdentifier{ + { + Version: "v1", + Kind: "Namespace", + Name: nsName, + }, + }, + PlacementStatuses: []placementv1.ResourcePlacementStatus{ + { + ClusterName: memberCluster1EastProdName, + Conditions: perClusterApplyFailedConditions(crpGeneration), + FailedPlacements: []placementv1.FailedResourcePlacement{ + { + ResourceIdentifier: placementv1.ResourceIdentifier{ + Version: "v1", + Kind: "Namespace", + Name: nsName, + }, + Condition: metav1.Condition{ + Type: string(placementv1beta1.PerClusterAppliedConditionType), + Status: metav1.ConditionFalse, + ObservedGeneration: 0, + Reason: string(workapplier.ApplyOrReportDiffResTypeFailedToTakeOver), + }, + }, + }, + DiffedPlacements: []placementv1.DiffedResourcePlacement{ + { + ResourceIdentifier: placementv1.ResourceIdentifier{ + Version: "v1", + Kind: "Namespace", + Name: nsName, + }, + TargetClusterObservedGeneration: ptr.To(int64(0)), + ObservedDiffs: []placementv1.PatchDetail{ + { + Path: fmt.Sprintf("/metadata/labels/%s", managedDataFieldKey), + ValueInMember: managedDataFieldVal2, + ValueInHub: managedDataFieldVal1, + }, + }, + }, + }, + }, + }, + ObservedResourceIndex: "0", + } + } + + Eventually(func() error { + crp := &placementv1.ClusterResourcePlacement{} + if err := hubClient.Get(ctx, types.NamespacedName{Name: crpName}, crp); err != nil { + return err + } + wantCRPStatus := buildWantCRPStatus(crp.Generation) + + if diff := cmp.Diff(crp.Status, *wantCRPStatus, placementStatusCmpOptionsV1...); diff != "" { + return fmt.Errorf("CRP status diff (-got, +want): %s", diff) + } + return nil + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") + }) + + AfterAll(func() { + // Delete the CRP. + ensureCRPRemovalV1(crpName) + + // Delete the namespace from the hub cluster. + cleanupWorkResources() + + // Verify that all resources placed have been removed from the specified member clusters. 
+ cleanWorkResourcesOnCluster(memberCluster1EastProd) + }) + }) + + Context("apply with drift detection (CRP, read and write in v1)", Ordered, func() { + crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) + nsName := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) + + BeforeAll(func() { + ns := appNamespace() + // Add a label (managed field) to the namespace. + ns.Labels = map[string]string{ + managedDataFieldKey: managedDataFieldVal1, + workNamespaceLabelName: fmt.Sprintf("%d", GinkgoParallelProcess()), + } + + // Create the resources on the hub cluster. + Expect(hubClient.Create(ctx, &ns)).To(Succeed()) + + crp := &placementv1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: crpName, + }, + Spec: placementv1.ClusterResourcePlacementSpec{ + ResourceSelectors: []placementv1.ClusterResourceSelector{ + { + Group: "", + Version: "v1", + Kind: "Namespace", + Name: nsName, + }, + }, + Policy: &placementv1.PlacementPolicy{ + PlacementType: placementv1.PickFixedPlacementType, + ClusterNames: []string{ + memberCluster1EastProdName, + }, + }, + Strategy: placementv1.RolloutStrategy{ + Type: placementv1.RollingUpdateRolloutStrategyType, + RollingUpdate: &placementv1.RollingUpdateConfig{ + MaxUnavailable: ptr.To(intstr.FromString("100%")), + UnavailablePeriodSeconds: ptr.To(1), + }, + ApplyStrategy: &placementv1.ApplyStrategy{ + ComparisonOption: placementv1.ComparisonOptionTypePartialComparison, + WhenToApply: placementv1.WhenToApplyTypeIfNotDrifted, + }, + }, + }, + } + Expect(hubClient.Create(ctx, crp)).To(Succeed()) + }) + + It("can introduce a drift", func() { + Eventually(func() error { + ns := &corev1.Namespace{} + if err := memberCluster1EastProdClient.Get(ctx, types.NamespacedName{Name: nsName}, ns); err != nil { + return fmt.Errorf("failed to retrieve namespace: %w", err) + } + + if ns.Labels == nil { + ns.Labels = make(map[string]string) + } + ns.Labels[managedDataFieldKey] = managedDataFieldVal2 + if err := memberCluster1EastProdClient.Update(ctx, ns); err != nil { + return fmt.Errorf("failed to update namespace: %w", err) + } + return nil + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to introduce a drift") + }) + + It("should update CRP status as expected", func() { + buildWantCRPStatus := func(crpGeneration int64) *placementv1.ClusterResourcePlacementStatus { + return &placementv1.ClusterResourcePlacementStatus{ + Conditions: crpAppliedFailedConditions(crpGeneration), + SelectedResources: []placementv1.ResourceIdentifier{ + { + Version: "v1", + Kind: "Namespace", + Name: nsName, + }, + }, + PlacementStatuses: []placementv1.ResourcePlacementStatus{ + { + ClusterName: memberCluster1EastProdName, + Conditions: perClusterApplyFailedConditions(crpGeneration), + FailedPlacements: []placementv1.FailedResourcePlacement{ + { + ResourceIdentifier: placementv1.ResourceIdentifier{ + Version: "v1", + Kind: "Namespace", + Name: nsName, + }, + Condition: metav1.Condition{ + Type: string(placementv1beta1.PerClusterAppliedConditionType), + Status: metav1.ConditionFalse, + ObservedGeneration: 0, + Reason: string(workapplier.ApplyOrReportDiffResTypeFoundDrifts), + }, + }, + }, + DriftedPlacements: []placementv1.DriftedResourcePlacement{ + { + ResourceIdentifier: placementv1.ResourceIdentifier{ + Version: "v1", + Kind: "Namespace", + Name: nsName, + }, + TargetClusterObservedGeneration: 0, + ObservedDrifts: []placementv1.PatchDetail{ + { + Path: fmt.Sprintf("/metadata/labels/%s", managedDataFieldKey), + ValueInMember: 
managedDataFieldVal2, + ValueInHub: managedDataFieldVal1, + }, + }, + }, + }, + }, + }, + ObservedResourceIndex: "0", + } + } + + Eventually(func() error { + crp := &placementv1.ClusterResourcePlacement{} + if err := hubClient.Get(ctx, types.NamespacedName{Name: crpName}, crp); err != nil { + return err + } + wantCRPStatus := buildWantCRPStatus(crp.Generation) + + if diff := cmp.Diff(crp.Status, *wantCRPStatus, placementStatusCmpOptionsV1...); diff != "" { + return fmt.Errorf("CRP status diff (-got, +want): %s", diff) + } + return nil + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") + }) + + AfterAll(func() { + // Delete the CRP. + ensureCRPRemovalV1(crpName) + + // Delete the namespace from the hub cluster. + cleanupWorkResources() + + // Verify that all resources placed have been removed from the specified member clusters. + cleanWorkResourcesOnCluster(memberCluster1EastProd) + }) + }) + + Context("reportDiff mode (CRP, read and write in v1)", Ordered, func() { + crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) + nsName := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) + + var existingNS *corev1.Namespace + + BeforeAll(func() { + ns := appNamespace() + // Add a label (managed field) to the namespace. + ns.Labels = map[string]string{ + managedDataFieldKey: managedDataFieldVal1, + workNamespaceLabelName: fmt.Sprintf("%d", GinkgoParallelProcess()), + } + existingNS = ns.DeepCopy() + + // Create the resources on the hub cluster. + Expect(hubClient.Create(ctx, &ns)).To(Succeed()) + + // Create the resources on one of the member clusters. + existingNS.Labels[managedDataFieldKey] = managedDataFieldVal2 + Expect(memberCluster1EastProdClient.Create(ctx, existingNS)).To(Succeed()) + + crp := &placementv1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: crpName, + }, + Spec: placementv1.ClusterResourcePlacementSpec{ + ResourceSelectors: []placementv1.ClusterResourceSelector{ + { + Group: "", + Version: "v1", + Kind: "Namespace", + Name: nsName, + }, + }, + Policy: &placementv1.PlacementPolicy{ + PlacementType: placementv1.PickFixedPlacementType, + ClusterNames: []string{ + memberCluster1EastProdName, + }, + }, + Strategy: placementv1.RolloutStrategy{ + Type: placementv1.RollingUpdateRolloutStrategyType, + RollingUpdate: &placementv1.RollingUpdateConfig{ + MaxUnavailable: ptr.To(intstr.FromString("100%")), + UnavailablePeriodSeconds: ptr.To(1), + }, + ApplyStrategy: &placementv1.ApplyStrategy{ + Type: placementv1.ApplyStrategyTypeReportDiff, + ComparisonOption: placementv1.ComparisonOptionTypePartialComparison, + WhenToTakeOver: placementv1.WhenToTakeOverTypeNever, + }, + }, + }, + } + Expect(hubClient.Create(ctx, crp)).To(Succeed()) + }) + + It("should update CRP status as expected", func() { + buildWantCRPStatus := func(crpGeneration int64) *placementv1.ClusterResourcePlacementStatus { + return &placementv1.ClusterResourcePlacementStatus{ + Conditions: crpDiffReportedConditions(crpGeneration, false), + SelectedResources: []placementv1.ResourceIdentifier{ + { + Version: "v1", + Kind: "Namespace", + Name: nsName, + }, + }, + PlacementStatuses: []placementv1.ResourcePlacementStatus{ + { + ClusterName: memberCluster1EastProdName, + Conditions: perClusterDiffReportedConditions(crpGeneration), + DiffedPlacements: []placementv1.DiffedResourcePlacement{ + { + ResourceIdentifier: placementv1.ResourceIdentifier{ + Version: "v1", + Kind: "Namespace", + Name: nsName, + }, + 
TargetClusterObservedGeneration: ptr.To(int64(0)), + ObservedDiffs: []placementv1.PatchDetail{ + { + Path: fmt.Sprintf("/metadata/labels/%s", managedDataFieldKey), + ValueInMember: managedDataFieldVal2, + ValueInHub: managedDataFieldVal1, + }, + }, + }, + }, + }, + }, + ObservedResourceIndex: "0", + } + } + + Eventually(func() error { + crp := &placementv1.ClusterResourcePlacement{} + if err := hubClient.Get(ctx, types.NamespacedName{Name: crpName}, crp); err != nil { + return err + } + wantCRPStatus := buildWantCRPStatus(crp.Generation) + + if diff := cmp.Diff(crp.Status, *wantCRPStatus, placementStatusCmpOptionsV1...); diff != "" { + return fmt.Errorf("CRP status diff (-got, +want): %s", diff) + } + return nil + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") + }) + + AfterAll(func() { + // Delete the CRP. + ensureCRPRemovalV1(crpName) + + // Delete the namespace from the hub cluster. + cleanupWorkResources() + + // Verify that all resources placed have been removed from the specified member clusters. + cleanWorkResourcesOnCluster(memberCluster1EastProd) + }) + }) +}) diff --git a/test/e2e/setup_test.go b/test/e2e/setup_test.go index 01f0c30ef..051c56758 100644 --- a/test/e2e/setup_test.go +++ b/test/e2e/setup_test.go @@ -45,6 +45,7 @@ import ( fleetnetworkingv1alpha1 "go.goms.io/fleet-networking/api/v1alpha1" clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" + placementv1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1" placementv1alpha1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1alpha1" placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" "github.com/kubefleet-dev/kubefleet/pkg/propertyprovider/azure/trackers" @@ -195,6 +196,9 @@ var ( lessFuncPlacementStatus = func(a, b placementv1beta1.PerClusterPlacementStatus) bool { return a.ClusterName < b.ClusterName } + lessFuncPlacementStatusV1 = func(a, b placementv1.ResourcePlacementStatus) bool { + return a.ClusterName < b.ClusterName + } lessFuncPlacementStatusByConditions = func(a, b placementv1beta1.PerClusterPlacementStatus) bool { return len(a.Conditions) < len(b.Conditions) } @@ -221,7 +225,9 @@ var ( }) ignoreTimeTypeFields = cmpopts.IgnoreTypes(time.Time{}, metav1.Time{}) ignorePlacementStatusDriftedPlacementsTimestampFields = cmpopts.IgnoreFields(placementv1beta1.DriftedResourcePlacement{}, "ObservationTime", "FirstDriftedObservedTime") + ignorePlacementStatusDriftedPlacementsTimestampFieldsV1 = cmpopts.IgnoreFields(placementv1.DriftedResourcePlacement{}, "ObservationTime", "FirstDriftedObservedTime") ignorePlacementStatusDiffedPlacementsTimestampFields = cmpopts.IgnoreFields(placementv1beta1.DiffedResourcePlacement{}, "ObservationTime", "FirstDiffedObservedTime") + ignorePlacementStatusDiffedPlacementsTimestampFieldsV1 = cmpopts.IgnoreFields(placementv1.DiffedResourcePlacement{}, "ObservationTime", "FirstDiffedObservedTime") ignorePerClusterPlacementStatusObservedResourceIndexField = cmpopts.IgnoreFields(placementv1beta1.PerClusterPlacementStatus{}, "ObservedResourceIndex") ignorePlacementStatusObservedResourceIndexField = cmpopts.IgnoreFields(placementv1beta1.PlacementStatus{}, "ObservedResourceIndex") @@ -285,6 +291,9 @@ func TestMain(m *testing.M) { if err := placementv1beta1.AddToScheme(scheme); err != nil { log.Fatalf("failed to add custom APIs (placement) to the runtime scheme: %v", err) } + if err := placementv1.AddToScheme(scheme); err != nil { + log.Fatalf("failed to add custom APIs (placement v1) to the 
runtime scheme: %v", err) + } if err := fleetnetworkingv1alpha1.AddToScheme(scheme); err != nil { log.Fatalf("failed to add custom APIs (networking) to the runtime scheme: %v", err) } From 2ca743d50149d47a37e3ff5c999d9cc103819648 Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Tue, 25 Nov 2025 10:39:18 -0800 Subject: [PATCH 08/13] feat: add clusterProfile status fields (#321) * add k8version and CAData to the clusterProfile Signed-off-by: Ryan Zhang * resolve go.mod Signed-off-by: Ryan Zhang * fix UT Signed-off-by: Ryan Zhang * fix the UT and e2e Signed-off-by: Ryan Zhang * fix the race Signed-off-by: Ryan Zhang --------- Signed-off-by: Ryan Zhang --- ...05-0000-k8s-version-collection-with-ttl.md | 81 +++++++ .gitignore | 1 + CLAUDE.md | 4 + ...multicluster.x-k8s.io_clusterprofiles.yaml | 222 +++++++++++++++--- go.mod | 43 ++-- go.sum | 92 ++++---- .../clusterprofile/controller.go | 42 +++- pkg/controllers/placement/controller.go | 12 +- pkg/controllers/placement/controller_test.go | 18 +- pkg/controllers/workapplier/apply_test.go | 4 +- pkg/controllers/workapplier/utils.go | 9 +- pkg/propertyprovider/azure/provider.go | 98 +++++++- .../azure/provider_integration_test.go | 5 +- pkg/propertyprovider/azure/provider_test.go | 221 ++++++++++++++++- pkg/propertyprovider/commons.go | 9 + pkg/scheduler/scheduler.go | 2 + pkg/utils/controller/controller.go | 2 + test/e2e/join_and_leave_test.go | 70 +++++- test/e2e/utils_test.go | 9 +- 19 files changed, 820 insertions(+), 124 deletions(-) create mode 100644 .github/.copilot/breadcrumbs/2025-11-05-0000-k8s-version-collection-with-ttl.md diff --git a/.github/.copilot/breadcrumbs/2025-11-05-0000-k8s-version-collection-with-ttl.md b/.github/.copilot/breadcrumbs/2025-11-05-0000-k8s-version-collection-with-ttl.md new file mode 100644 index 000000000..d428d4122 --- /dev/null +++ b/.github/.copilot/breadcrumbs/2025-11-05-0000-k8s-version-collection-with-ttl.md @@ -0,0 +1,81 @@ +# Implementation: Kubernetes Version Collection with TTL Caching + +## Overview +Add a `collectK8sVersion` function to the Azure property provider that collects the Kubernetes server version using the discoveryClient with a 15-minute TTL cache to minimize API calls. 
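
As an aside on the approach described in this overview, a minimal sketch of a TTL-guarded server-version lookup might look like the following; the type and method names are hypothetical, and only client-go's discovery.ServerVersionInterface plus the 15-minute TTL mentioned above are assumed.

```go
package example

import (
	"sync"
	"time"

	"k8s.io/client-go/discovery"
)

// versionCache is a hypothetical illustration of the TTL caching described above; it is
// not the provider's actual implementation.
type versionCache struct {
	mu      sync.RWMutex
	client  discovery.ServerVersionInterface
	cached  string
	fetched time.Time
	ttl     time.Duration // e.g. 15 * time.Minute
}

// Get returns the cached Kubernetes server version while it is still fresh; otherwise it
// queries the API server once and refreshes the cache.
func (c *versionCache) Get() (string, error) {
	c.mu.RLock()
	if c.cached != "" && time.Since(c.fetched) < c.ttl {
		v := c.cached
		c.mu.RUnlock()
		return v, nil
	}
	c.mu.RUnlock()

	info, err := c.client.ServerVersion()
	if err != nil {
		return "", err
	}

	c.mu.Lock()
	c.cached = info.GitVersion
	c.fetched = time.Now()
	c.mu.Unlock()
	return info.GitVersion, nil
}
```
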
+ +## Plan + +### Phase 1: Add Cache Fields +**Task 1.1: Add cache-related fields to PropertyProvider struct** +- Add `cachedK8sVersion` string field to store the cached version +- Add `k8sVersionCacheTime` time.Time field to track when the cache was last updated +- Add `k8sVersionCacheTTL` time.Duration field set to 15 minutes +- Add a mutex for thread-safe access to cached values + +### Phase 2: Implement collectK8sVersion Function +**Task 2.1: Implement the collectK8sVersion function** +- Check if cached version exists and is still valid (within TTL) +- If cache is valid, return cached version +- If cache is expired or empty, call discoveryClient.ServerVersion() +- Update cache with new version and current timestamp +- Return the version as a property with observation time + +### Phase 3: Integrate into Collect Method +**Task 3.1: Call collectK8sVersion in Collect method** +- Add call to collectK8sVersion in the Collect method +- Store the k8s version in the properties map + +### Phase 4: Write Unit Tests +**Task 4.1: Create unit tests for collectK8sVersion** +- Test cache hit scenario (cached version within TTL) +- Test cache miss scenario (no cached version) +- Test cache expiration scenario (cached version expired) +- Test error handling from discoveryClient +- Test thread safety of cache access + +### Phase 5: Verify Tests Pass +**Task 5.1: Run unit tests** +- Execute `go test` for the provider package +- Verify all tests pass + +## Success Criteria +- [x] Cache fields added to PropertyProvider struct +- [x] collectK8sVersion function implemented with TTL logic +- [x] Function integrated into Collect method +- [x] Unit tests created and passing +- [x] Thread-safe implementation verified + +## Implementation Notes +- Using sync.RWMutex for thread-safe cache access +- TTL set to 15 minutes as specified +- Uses the standard `propertyprovider.K8sVersionProperty` constant instead of creating a new one +- Changed `discoveryClient` field type from `discovery.DiscoveryInterface` to `discovery.ServerVersionInterface` for better testability and to only depend on the interface we actually need +- Fixed test nil pointer issue by conditionally setting the discoveryClient field only when it's non-nil + +## Final Implementation Summary +All tasks completed successfully. The `collectK8sVersion` function now: +1. Caches the Kubernetes version with a 15-minute TTL +2. Uses thread-safe mutex locks for concurrent access +3. Properly handles nil discovery client cases +4. Returns early if cache is still valid to minimize API calls +5. Updates cache atomically when fetching new version +6. 
Has comprehensive unit tests covering all scenarios including cache hits, misses, expiration, errors, and concurrency + +## Integration Test Updates +Updated integration tests to ignore the new k8s version property in comparisons: +- Added `ignoreK8sVersionProperty` using `cmpopts.IgnoreMapEntries` to filter out the k8s version from test expectations +- Integration tests now pass successfully (33 specs all passed) +- The k8s version is being collected correctly from the test Kubernetes environment, validating the implementation works end-to-end + +## Test Results +- Unit tests: ✅ 8/8 passed (7 in TestCollectK8sVersion + 1 in TestCollectK8sVersionConcurrency) +- Integration tests: ✅ 33/33 specs passed +- All scenarios validated including cache behavior, TTL expiration, error handling, and thread safety + +## Refactoring +Simplified the implementation by removing the `k8sVersionCacheTTL` instance field from PropertyProvider: +- Removed the `k8sVersionCacheTTL time.Duration` field from the struct +- Updated `collectK8sVersion` to use the `K8sVersionCacheTTL` constant directly +- Removed field initialization from `New()` and `NewWithPricingProvider()` constructors +- Updated unit tests to remove the field from test PropertyProvider instances +- All tests still pass after refactoring ✅ diff --git a/.gitignore b/.gitignore index b8e3541cc..828916b76 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,4 @@ ut-coverage.xml *~ .vscode/ +.qoder/ diff --git a/CLAUDE.md b/CLAUDE.md index 05ff228b0..d128e5019 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -34,8 +34,12 @@ make e2e-tests # Run custom E2E tests with labels make e2e-tests-custom + +# Clean up E2E environment +make clean-e2e-tests ``` + ### Code Quality ```bash # Run linting (required before commits) diff --git a/config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml b/config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml index f8883b17d..078dac97b 100644 --- a/config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml +++ b/config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.14.0 + controller-gen.kubebuilder.io/version: v0.17.3 name: clusterprofiles.multicluster.x-k8s.io spec: group: multicluster.x-k8s.io @@ -62,20 +62,98 @@ spec: status: description: ClusterProfileStatus defines the observed state of ClusterProfile. properties: + accessProviders: + description: |- + AccessProviders is a list of cluster access providers that can provide access + information for clusters. + items: + description: |- + AccessProvider defines how to access the cluster. + It contains the name of the provider name and the cluster connection details. + The name is used to identify different access info types, such as "kubeconfig" or "oidc". + The Cluster field contains the actual cluster connection details, such as server address, + certificate authority data, and authentication information. + properties: + cluster: + description: Cluster contains information about how to communicate + with a kubernetes cluster + properties: + certificate-authority: + description: CertificateAuthority is the path to a cert + file for the certificate authority. + type: string + certificate-authority-data: + description: CertificateAuthorityData contains PEM-encoded + certificate authority certificates. 
Overrides CertificateAuthority + format: byte + type: string + disable-compression: + description: |- + DisableCompression allows client to opt-out of response compression for all requests to the server. This is useful + to speed up requests (specifically lists) when client-server network bandwidth is ample, by saving time on + compression (server-side) and decompression (client-side): https://github.com/kubernetes/kubernetes/issues/112296. + type: boolean + extensions: + description: Extensions holds additional information. This + is useful for extenders so that reads and writes don't + clobber unknown fields + items: + description: NamedExtension relates nicknames to extension + information + properties: + extension: + description: Extension holds the extension information + type: object + x-kubernetes-preserve-unknown-fields: true + name: + description: Name is the nickname for this Extension + type: string + required: + - extension + - name + type: object + type: array + insecure-skip-tls-verify: + description: InsecureSkipTLSVerify skips the validity check + for the server's certificate. This will make your HTTPS + connections insecure. + type: boolean + proxy-url: + description: |- + ProxyURL is the URL to the proxy to be used for all requests made by this + client. URLs with "http", "https", and "socks5" schemes are supported. If + this configuration is not provided or the empty string, the client + attempts to construct a proxy configuration from http_proxy and + https_proxy environment variables. If these environment variables are not + set, the client does not attempt to proxy requests. + + socks5 proxying does not currently support spdy streaming endpoints (exec, + attach, port forward). + type: string + server: + description: Server is the address of the kubernetes cluster + (https://hostname:port). + type: string + tls-server-name: + description: TLSServerName is used to check server certificate. + If TLSServerName is empty, the hostname used to contact + the server is used. + type: string + required: + - server + type: object + name: + type: string + required: + - name + type: object + type: array conditions: description: Conditions contains the different condition statuses for this cluster. items: - description: "Condition contains details for one aspect of the current - state of this API Resource.\n---\nThis struct is intended for - direct use as an array at the field path .status.conditions. For - example,\n\n\n\ttype FooStatus struct{\n\t // Represents the - observations of a foo's current state.\n\t // Known .status.conditions.type - are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // - +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t - \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" - patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t - \ // other fields\n\t}" + description: Condition contains details for one aspect of the current + state of this API Resource. properties: lastTransitionTime: description: |- @@ -116,12 +194,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. 
- The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -133,24 +206,115 @@ spec: - type type: object type: array + credentialProviders: + description: |- + CredentialProviders is a list of cluster access providers that can provide access + information for clusters. + Deprecated: Use AccessProviders instead. If both AccessProviders and CredentialProviders are provided, both are used. In case they specify a provider with the same name, the one in AccessProviders is preferred. + items: + description: |- + AccessProvider defines how to access the cluster. + It contains the name of the provider name and the cluster connection details. + The name is used to identify different access info types, such as "kubeconfig" or "oidc". + The Cluster field contains the actual cluster connection details, such as server address, + certificate authority data, and authentication information. + properties: + cluster: + description: Cluster contains information about how to communicate + with a kubernetes cluster + properties: + certificate-authority: + description: CertificateAuthority is the path to a cert + file for the certificate authority. + type: string + certificate-authority-data: + description: CertificateAuthorityData contains PEM-encoded + certificate authority certificates. Overrides CertificateAuthority + format: byte + type: string + disable-compression: + description: |- + DisableCompression allows client to opt-out of response compression for all requests to the server. This is useful + to speed up requests (specifically lists) when client-server network bandwidth is ample, by saving time on + compression (server-side) and decompression (client-side): https://github.com/kubernetes/kubernetes/issues/112296. + type: boolean + extensions: + description: Extensions holds additional information. This + is useful for extenders so that reads and writes don't + clobber unknown fields + items: + description: NamedExtension relates nicknames to extension + information + properties: + extension: + description: Extension holds the extension information + type: object + x-kubernetes-preserve-unknown-fields: true + name: + description: Name is the nickname for this Extension + type: string + required: + - extension + - name + type: object + type: array + insecure-skip-tls-verify: + description: InsecureSkipTLSVerify skips the validity check + for the server's certificate. This will make your HTTPS + connections insecure. + type: boolean + proxy-url: + description: |- + ProxyURL is the URL to the proxy to be used for all requests made by this + client. URLs with "http", "https", and "socks5" schemes are supported. If + this configuration is not provided or the empty string, the client + attempts to construct a proxy configuration from http_proxy and + https_proxy environment variables. If these environment variables are not + set, the client does not attempt to proxy requests. + + socks5 proxying does not currently support spdy streaming endpoints (exec, + attach, port forward). + type: string + server: + description: Server is the address of the kubernetes cluster + (https://hostname:port). + type: string + tls-server-name: + description: TLSServerName is used to check server certificate. + If TLSServerName is empty, the hostname used to contact + the server is used. 
+ type: string + required: + - server + type: object + name: + type: string + required: + - name + type: object + type: array properties: description: |- - Properties defines name/value pairs to represent properties of a cluster. - It could be a collection of ClusterProperty (KEP-2149) resources, - but could also be info based on other implementations. - The names of the properties can be predefined names from ClusterProperty resources - and is allowed to be customized by different cluster managers. + Properties defines cluster characteristics through a list of Property objects. + Each Property can be one of: + 1. A ClusterProperty resource (as defined in KEP-2149) + 2. Custom information from cluster manager implementations + Property names support both: + - Standard names from ClusterProperty resources + - Custom names defined by cluster managers items: description: |- - Property defines a name/value pair to represent a property of a cluster. - It could be a ClusterProperty (KEP-2149) resource, - but could also be info based on other implementations. - The name of the property can be predefined name from a ClusterProperty resource - and is allowed to be customized by different cluster managers. + Property defines the data structure to represent a property of a cluster. + It contains a name/value pair and the last observed time of the property on the cluster. This property can store various configurable details and metrics of a cluster, - which may include information such as the number of nodes, total and free CPU, - and total and free memory, among other potential attributes. + which may include information such as the entry point of the cluster, types of nodes, location, etc. according to KEP 4322. properties: + lastObservedTime: + description: |- + LastObservedTime is the last time the property was observed on the corresponding cluster. + The value is the timestamp when the property was observed not the time when the property was updated in the cluster-profile. + format: date-time + type: string name: description: |- Name is the name of a property resource on cluster. 
It's a well-known diff --git a/go.mod b/go.mod index 8be9f715d..d5eb73fbe 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0 github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.10.1 github.com/Azure/karpenter-provider-azure v1.5.1 - github.com/crossplane/crossplane-runtime v1.17.0 + github.com/crossplane/crossplane-runtime v1.20.0 github.com/evanphx/json-patch/v5 v5.9.11 github.com/google/go-cmp v0.7.0 github.com/onsi/ginkgo/v2 v2.23.4 @@ -24,20 +24,20 @@ require ( golang.org/x/sync v0.18.0 golang.org/x/time v0.11.0 gomodules.xyz/jsonpatch/v2 v2.4.0 - k8s.io/api v0.32.3 - k8s.io/apiextensions-apiserver v0.32.3 - k8s.io/apimachinery v0.32.3 - k8s.io/client-go v0.32.3 - k8s.io/component-base v0.32.3 + k8s.io/api v0.34.1 + k8s.io/apiextensions-apiserver v0.34.1 + k8s.io/apimachinery v0.34.1 + k8s.io/client-go v0.34.1 + k8s.io/component-base v0.34.1 k8s.io/component-helpers v0.32.3 k8s.io/klog/v2 v2.130.1 k8s.io/kubectl v0.32.3 k8s.io/metrics v0.32.3 - k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e + k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 sigs.k8s.io/cloud-provider-azure v1.32.4 sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.5.20 - sigs.k8s.io/cluster-inventory-api v0.0.0-20240730014211-ef0154379848 - sigs.k8s.io/controller-runtime v0.20.4 + sigs.k8s.io/cluster-inventory-api v0.0.0-20251028164203-2e3fabb46733 + sigs.k8s.io/controller-runtime v0.21.0 ) require ( @@ -62,22 +62,20 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/emicklei/go-restful/v3 v3.12.1 // indirect + github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect - github.com/fxamacker/cbor/v2 v2.7.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-errors/errors v1.4.2 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/zapr v1.3.0 // indirect - github.com/go-openapi/jsonpointer v0.21.0 // indirect + github.com/go-openapi/jsonpointer v0.21.1 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.1 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang-jwt/jwt/v5 v5.2.2 // indirect - github.com/golang/protobuf v1.5.4 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/gnostic-models v0.6.8 // indirect - github.com/google/gofuzz v1.2.0 // indirect + github.com/google/gnostic-models v0.7.0 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect github.com/google/uuid v1.6.0 // indirect @@ -88,7 +86,7 @@ require ( github.com/mailru/easyjson v0.9.0 // indirect github.com/mitchellh/hashstructure/v2 v2.0.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/patrickmn/go-cache v2.1.0+incompatible // indirect @@ -109,10 +107,12 @@ require ( go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/mock v0.5.1 // indirect go.uber.org/multierr v1.11.0 // indirect + 
go.yaml.in/yaml/v2 v2.4.2 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.45.0 // indirect golang.org/x/exp v0.0.0-20250305212735-054e65f0b394 // indirect golang.org/x/net v0.47.0 // indirect - golang.org/x/oauth2 v0.27.0 // indirect + golang.org/x/oauth2 v0.29.0 // indirect golang.org/x/sys v0.38.0 // indirect golang.org/x/term v0.37.0 // indirect golang.org/x/text v0.31.0 // indirect @@ -122,13 +122,14 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/cli-runtime v0.32.3 // indirect - k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect - sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect + k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect + sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect sigs.k8s.io/karpenter v1.5.0 // indirect sigs.k8s.io/kustomize/api v0.18.0 // indirect sigs.k8s.io/kustomize/kyaml v0.18.1 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect - sigs.k8s.io/yaml v1.4.0 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) replace ( diff --git a/go.sum b/go.sum index 99c67babe..5bb3b922e 100644 --- a/go.sum +++ b/go.sum @@ -97,16 +97,16 @@ github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2y github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= -github.com/crossplane/crossplane-runtime v1.17.0 h1:y+GvxPT1M9s8BKt2AeZJdd2d6pg2xZeCO6LiR+VxEF8= -github.com/crossplane/crossplane-runtime v1.17.0/go.mod h1:vtglCrnnbq2HurAk9yLHa4qS0bbnCxaKL7C21cQcB/0= +github.com/crossplane/crossplane-runtime v1.20.0 h1:I54uipRIecqZyms+vz1J/l62yjVQ7HV5w+Nh3RMrUtc= +github.com/crossplane/crossplane-runtime v1.20.0/go.mod h1:lfV1VJenDc9PNVLxDC80YjPoTm+JdSZ13xlS2h37Dvg= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= -github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU= -github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= +github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v5.9.11+incompatible h1:ixHHqfcGvxhWkniF1tWxBHA0yb4Z+d1UQi45df52xW8= github.com/evanphx/json-patch v5.9.11+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= @@ -115,8 +115,8 @@ github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2 github.com/felixge/httpsnoop v1.0.4/go.mod 
h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= -github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= -github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/gabriel-vasile/mimetype v1.4.8 h1:FfZ3gj38NjllZIeJAmMhr+qKL8Wu+nOoI3GqacKw1NM= github.com/gabriel-vasile/mimetype v1.4.8/go.mod h1:ByKUIKGjh1ODkGM1asKUbQZOLGrPjydw3hYPU2YU9t8= github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= @@ -133,8 +133,8 @@ github.com/go-openapi/analysis v0.23.0 h1:aGday7OWupfMs+LbmLZG4k0MYXIANxcuBTYUC0 github.com/go-openapi/analysis v0.23.0/go.mod h1:9mz9ZWaSlV8TvjQHLl2mUW2PbZtemkE8yA5v22ohupo= github.com/go-openapi/errors v0.22.1 h1:kslMRRnK7NCb/CvR1q1VWuEQCEIsBGn5GgKD9e+HYhU= github.com/go-openapi/errors v0.22.1/go.mod h1:+n/5UdIqdVnLIJ6Q9Se8HNGUXYaY6CN8ImWzfi/Gzp0= -github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= -github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= +github.com/go-openapi/jsonpointer v0.21.1 h1:whnzv/pNXtK2FbX/W9yJfRmE2gsmkfahjMKB0fZvcic= +github.com/go-openapi/jsonpointer v0.21.1/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= github.com/go-openapi/loads v0.22.0 h1:ECPGd4jX1U6NApCGG1We+uEozOAvXvJSF4nnwHZ8Aco= @@ -163,13 +163,10 @@ github.com/golang-jwt/jwt/v4 v4.5.2 h1:YtQM7lnr8iZ+j5q71MGKkNw9Mn7AjHM68uc9g5fXe github.com/golang-jwt/jwt/v4 v4.5.2/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeDy8= github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk= -github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= -github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= -github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -212,8 +209,9 @@ github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RR github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent 
v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/2gBQ3RWajuToeY6ZtZTIKv2v7ThUy5KKusIT0yc0= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= @@ -295,8 +293,8 @@ go.mongodb.org/mongo-driver v1.14.0 h1:P98w8egYRjYe3XDjxhYJagTokP/H6HzlsnojRgZRd go.mongodb.org/mongo-driver v1.14.0/go.mod h1:Vzb0Mk/pa7e6cWw85R4F/endUC3u0U9jGcNU603k65c= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= go.opentelemetry.io/otel/exporters/prometheus v0.57.0 h1:AHh/lAP1BHrY5gBwk8ncc25FXWm/gmmY3BX258z5nuk= @@ -321,6 +319,10 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -336,8 +338,8 @@ golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= -golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= -golang.org/x/oauth2 
v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= +golang.org/x/oauth2 v0.29.0 h1:WdYw2tdTK1S8olAzWHdgeqfy+Mtm9XNhv/xJsY65d98= +golang.org/x/oauth2 v0.29.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -369,8 +371,8 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/grpc v1.70.0 h1:pWFv03aZoHzlRKHWicjsZytKAiYCtNS0dHbXnIdq7jQ= -google.golang.org/grpc v1.70.0/go.mod h1:ofIJqVKDXx/JiXrwr2IG4/zwdH9txy3IlF40RmcJSQw= +google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA= +google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -387,53 +389,55 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.32.3 h1:Hw7KqxRusq+6QSplE3NYG4MBxZw1BZnq4aP4cJVINls= -k8s.io/api v0.32.3/go.mod h1:2wEDTXADtm/HA7CCMD8D8bK4yuBUptzaRhYcYEEYA3k= -k8s.io/apiextensions-apiserver v0.32.3 h1:4D8vy+9GWerlErCwVIbcQjsWunF9SUGNu7O7hiQTyPY= -k8s.io/apiextensions-apiserver v0.32.3/go.mod h1:8YwcvVRMVzw0r1Stc7XfGAzB/SIVLunqApySV5V7Dss= -k8s.io/apimachinery v0.32.3 h1:JmDuDarhDmA/Li7j3aPrwhpNBA94Nvk5zLeOge9HH1U= -k8s.io/apimachinery v0.32.3/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= +k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM= +k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk= +k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI= +k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc= +k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4= +k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= k8s.io/cli-runtime v0.32.3 h1:khLF2ivU2T6Q77H97atx3REY9tXiA3OLOjWJxUrdvss= k8s.io/cli-runtime v0.32.3/go.mod h1:vZT6dZq7mZAca53rwUfdFSZjdtLyfF61mkf/8q+Xjak= -k8s.io/client-go v0.32.3 h1:RKPVltzopkSgHS7aS98QdscAgtgah/+zmpAogooIqVU= -k8s.io/client-go v0.32.3/go.mod h1:3v0+3k4IcT9bXTc4V2rt+d2ZPPG700Xy6Oi0Gdl2PaY= +k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY= +k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8= k8s.io/cloud-provider v0.32.3 h1:WC7KhWrqXsU4b0E4tjS+nBectGiJbr1wuc1TpWXvtZM= k8s.io/cloud-provider v0.32.3/go.mod h1:/fwBfgRPuh16n8vLHT+PPT+Bc4LAEaJYj38opO2wsYY= -k8s.io/component-base v0.32.3 
h1:98WJvvMs3QZ2LYHBzvltFSeJjEx7t5+8s71P7M74u8k= -k8s.io/component-base v0.32.3/go.mod h1:LWi9cR+yPAv7cu2X9rZanTiFKB2kHA+JjmhkKjCZRpI= +k8s.io/component-base v0.34.1 h1:v7xFgG+ONhytZNFpIz5/kecwD+sUhVE6HU7qQUiRM4A= +k8s.io/component-base v0.34.1/go.mod h1:mknCpLlTSKHzAQJJnnHVKqjxR7gBeHRv0rPXA7gdtQ0= k8s.io/component-helpers v0.32.3 h1:9veHpOGTPLluqU4hAu5IPOwkOIZiGAJUhHndfVc5FT4= k8s.io/component-helpers v0.32.3/go.mod h1:utTBXk8lhkJewBKNuNf32Xl3KT/0VV19DmiXU/SV4Ao= k8s.io/csi-translation-lib v0.32.3 h1:fKdc9LMVEMk18xsgoPm1Ga8GjfhI7AM3UX8gnIeXZKs= k8s.io/csi-translation-lib v0.32.3/go.mod h1:VX6+hCKgQyFnUX3VrnXZAgYYBXkrqx4BZk9vxr9qRcE= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y= -k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4= +k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= +k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= k8s.io/kubectl v0.32.3 h1:VMi584rbboso+yjfv0d8uBHwwxbC438LKq+dXd5tOAI= k8s.io/kubectl v0.32.3/go.mod h1:6Euv2aso5GKzo/UVMacV6C7miuyevpfI91SvBvV9Zdg= k8s.io/metrics v0.32.3 h1:2vsBvw0v8rIIlczZ/lZ8Kcqk9tR6Fks9h+dtFNbc2a4= k8s.io/metrics v0.32.3/go.mod h1:9R1Wk5cb+qJpCQon9h52mgkVCcFeYxcY+YkumfwHVCU= -k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e h1:KqK5c/ghOm8xkHYhlodbp6i6+r+ChV2vuAuVRdFbLro= -k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= +k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/cloud-provider-azure v1.32.4 h1:v50uJzcE04w25Ra9EfWX/GHTTJKUC0+0Xpt+TOJ+D14= sigs.k8s.io/cloud-provider-azure v1.32.4/go.mod h1:FbBaQt7N6/UVtK/VmIuJMLGe0gKUJ6NwoGrvH+zEa9w= sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.5.20 h1:aVSc4LFdBVlrhlldIzPo4NrcTQRdnAlqTB31sOcPIrM= sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.5.20/go.mod h1:OkkCYstvomfIwV4rvVIegymcgMnt7ZQ3+1Wi9WZmP1s= sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.5.2 h1:jjFJF0PmS9IHLokD41mM6RVoqQF3BQtVDmQd6ZMnN6E= sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.5.2/go.mod h1:7DdZ9ipIsmPLpBlfT4gueejcUlJBZQKWhdljQE5SKvc= -sigs.k8s.io/cluster-inventory-api v0.0.0-20240730014211-ef0154379848 h1:WYPi2PdQyZwZkHG648v2jQl6deyCgyjJ0fkLYgUJ618= -sigs.k8s.io/cluster-inventory-api v0.0.0-20240730014211-ef0154379848/go.mod h1:/aN4e7RWOMHgT4xAjCNkV4YFcpKfpZCeumMIL7S+KNM= -sigs.k8s.io/controller-runtime v0.20.4 h1:X3c+Odnxz+iPTRobG4tp092+CvBU9UK0t/bRf+n0DGU= -sigs.k8s.io/controller-runtime v0.20.4/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY= -sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= -sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= +sigs.k8s.io/cluster-inventory-api v0.0.0-20251028164203-2e3fabb46733 h1:l90ANqblqFrE4L2QLLk+9iPjfmaLRvOFL51l/fgwUgg= +sigs.k8s.io/cluster-inventory-api v0.0.0-20251028164203-2e3fabb46733/go.mod h1:guwenlZ9iIfYlNxn7ExCfugOLTh6wjjRX3adC36YCmQ= +sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= +sigs.k8s.io/controller-runtime 
v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= +sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= +sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/karpenter v1.5.0 h1:3HaFtFvkteUJ+SjIViR1ImR0qR+GTqDulahauIuE4Qg= sigs.k8s.io/karpenter v1.5.0/go.mod h1:YuqGoQsLti+V7ugHQVGXuT4v1QwCMiKloHLcPDfwMbY= sigs.k8s.io/kustomize/api v0.18.0 h1:hTzp67k+3NEVInwz5BHyzc9rGxIauoXferXyjv5lWPo= sigs.k8s.io/kustomize/api v0.18.0/go.mod h1:f8isXnX+8b+SGLHQ6yO4JG1rdkZlvhaCf/uZbLVMb0U= sigs.k8s.io/kustomize/kyaml v0.18.1 h1:WvBo56Wzw3fjS+7vBjN6TeivvpbW9GmRaWZ9CIVmt4E= sigs.k8s.io/kustomize/kyaml v0.18.1/go.mod h1:C3L2BFVU1jgcddNBE1TxuVLgS46TjObMwW5FT9FcjYo= -sigs.k8s.io/structured-merge-diff/v4 v4.4.2 h1:MdmvkGuXi/8io6ixD5wud3vOLwc1rj0aNqRlpuvjmwA= -sigs.k8s.io/structured-merge-diff/v4 v4.4.2/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= -sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/pkg/controllers/clusterinventory/clusterprofile/controller.go b/pkg/controllers/clusterinventory/clusterprofile/controller.go index e3cb7a320..b1de43f80 100644 --- a/pkg/controllers/clusterinventory/clusterprofile/controller.go +++ b/pkg/controllers/clusterinventory/clusterprofile/controller.go @@ -37,6 +37,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" + "github.com/kubefleet-dev/kubefleet/pkg/propertyprovider" "github.com/kubefleet-dev/kubefleet/pkg/utils/controller" ) @@ -158,7 +159,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu return ctrl.Result{}, err } klog.V(2).InfoS("Cluster profile object is created or updated", "memberCluster", mcRef, "clusterProfile", klog.KObj(cp), "operation", createOrUpdateRes) - // sync the cluster profile condition from the member cluster condition + + // sync the cluster profile status/condition from the member cluster condition + r.fillInClusterStatus(mc, cp) r.syncClusterProfileCondition(mc, cp) if err = r.Status().Update(ctx, cp); err != nil { klog.ErrorS(err, "Failed to update cluster profile status", "memberCluster", mcRef, "clusterProfile", klog.KObj(cp)) @@ -167,10 +170,43 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu return ctrl.Result{}, nil } +// fillInClusterStatus fills in the ClusterProfile status fields from the MemberCluster status. +// Currently, it only fills in the Kubernetes version field. 
+func (r *Reconciler) fillInClusterStatus(mc *clusterv1beta1.MemberCluster, cp *clusterinventory.ClusterProfile) { + k8sversion, exists := mc.Status.Properties[propertyprovider.K8sVersionProperty] + if exists { + klog.V(3).InfoS("Get Kubernetes version from member cluster status", "kubernetesVersion", k8sversion.Value, "clusterProfile", klog.KObj(cp)) + cp.Status.Version = clusterinventory.ClusterVersion{ + Kubernetes: k8sversion.Value, + } + } + // Add the class access provider, we only have one so far + cp.Status.AccessProviders = []clusterinventory.AccessProvider{ + { + Name: controller.ClusterManagerName, + }, + } + // TODO throw and unexpected error if clusterEntryPoint is not found + // We don't have a way to get it yet + clusterEntry, exists := mc.Status.Properties[propertyprovider.ClusterEntryPointProperty] + if exists { + klog.V(3).InfoS("Get Kubernetes cluster entry point from member cluster status", "clusterEntryPoint", clusterEntry.Value, "clusterProfile", klog.KObj(cp)) + cp.Status.AccessProviders[0].Cluster.Server = clusterEntry.Value + } + // Get the CA Data + certificateAuthorityData, exists := mc.Status.Properties[propertyprovider.ClusterCertificateAuthorityProperty] + if exists { + klog.V(3).InfoS("Get Kubernetes cluster certificate authority data from member cluster status", "clusterProfile", klog.KObj(cp)) + cp.Status.AccessProviders[0].Cluster.CertificateAuthorityData = []byte(certificateAuthorityData.Value) + } else { + // throw an alert + _ = controller.NewUnexpectedBehaviorError(fmt.Errorf("cluster certificate authority data not found in member cluster %s status", mc.Name)) + cp.Status.AccessProviders[0].Cluster.InsecureSkipTLSVerify = true + } +} + // syncClusterProfileCondition syncs the ClusterProfile object's condition based on the MemberCluster object's condition. func (r *Reconciler) syncClusterProfileCondition(mc *clusterv1beta1.MemberCluster, cp *clusterinventory.ClusterProfile) { - // Update the cluster profile status. - // // For simplicity reasons, for now only the health check condition is populated, using // Fleet member agent's API server health check result. var mcHealthCond *metav1.Condition diff --git a/pkg/controllers/placement/controller.go b/pkg/controllers/placement/controller.go index fcdead2dc..0e7351994 100644 --- a/pkg/controllers/placement/controller.go +++ b/pkg/controllers/placement/controller.go @@ -241,7 +241,7 @@ func (r *Reconciler) handleUpdate(ctx context.Context, placementObj fleetv1beta1 } // We don't requeue the request here immediately so that placement can keep tracking the rollout status. - if createResourceSnapshotRes.Requeue { + if createResourceSnapshotRes.RequeueAfter > 0 { latestResourceSnapshotKObj := klog.KObj(latestResourceSnapshot) // We cannot create the resource snapshot immediately because of the resource snapshot creation interval. // Rebuild the seletedResourceIDs using the latestResourceSnapshot. 
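The Requeue-to-RequeueAfter changes in this controller (and the SA1019 suppressions on the scheduler and controller queue further down in this series) follow the controller-runtime v0.21 bump in go.mod, where setting Requeue: true is being retired in favor of returning a positive RequeueAfter and letting callers branch on it. A minimal sketch of that pattern, using hypothetical stand-in names rather than the real placement-controller types:

package main

import (
	"fmt"
	"time"
)

// result mirrors the two controller-runtime reconcile.Result fields relevant here.
type result struct {
	Requeue      bool          // legacy flag; the migration above stops setting it
	RequeueAfter time.Duration // preferred: any positive value implies a requeue
}

// nextSnapshotDelay is a stand-in for the snapshot-interval check: if the minimum
// creation interval has not elapsed yet, return only the remaining delay.
func nextSnapshotDelay(sinceLast, minInterval time.Duration) result {
	if sinceLast < minInterval {
		// Old style was result{Requeue: true, RequeueAfter: minInterval - sinceLast}.
		return result{RequeueAfter: minInterval - sinceLast}
	}
	return result{} // zero value: nothing to requeue
}

func main() {
	res := nextSnapshotDelay(10*time.Second, 30*time.Second)
	if res.RequeueAfter > 0 { // caller-side check, matching the hunks above
		fmt.Printf("requeue after %s\n", res.RequeueAfter)
	}
}

Callers then treat RequeueAfter > 0 as the single requeue signal, which is exactly the comparison the hunks above and the updated unit tests switch to.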
@@ -297,7 +297,7 @@ func (r *Reconciler) handleUpdate(ctx context.Context, placementObj fleetv1beta1 klog.V(2).InfoS("Placement has finished the rollout process and reached the desired status", "placement", placementKObj, "generation", placementObj.GetGeneration()) r.Recorder.Event(placementObj, corev1.EventTypeNormal, "PlacementRolloutCompleted", "Placement has finished the rollout process and reached the desired status") } - if createResourceSnapshotRes.Requeue { + if createResourceSnapshotRes.RequeueAfter > 0 { klog.V(2).InfoS("Requeue the request to handle the new resource snapshot", "placement", placementKObj, "generation", placementObj.GetGeneration()) // We requeue the request to handle the resource snapshot. return createResourceSnapshotRes, nil @@ -319,7 +319,7 @@ func (r *Reconciler) handleUpdate(ctx context.Context, placementObj fleetv1beta1 // Here we requeue the request to prevent a bug in the watcher. klog.V(2).InfoS("Scheduler has not scheduled any cluster yet and requeue the request as a backup", "placement", placementKObj, "scheduledCondition", placementObj.GetCondition(string(fleetv1beta1.ClusterResourcePlacementScheduledConditionType)), "generation", placementObj.GetGeneration()) - if createResourceSnapshotRes.Requeue { + if createResourceSnapshotRes.RequeueAfter > 0 { klog.V(2).InfoS("Requeue the request to handle the new resource snapshot", "placement", placementKObj, "generation", placementObj.GetGeneration()) // We requeue the request to handle the resource snapshot. return createResourceSnapshotRes, nil @@ -327,7 +327,7 @@ func (r *Reconciler) handleUpdate(ctx context.Context, placementObj fleetv1beta1 return ctrl.Result{RequeueAfter: controllerResyncPeriod}, nil } klog.V(2).InfoS("Placement rollout has not finished yet and requeue the request", "placement", placementKObj, "status", placementObj.GetPlacementStatus(), "generation", placementObj.GetGeneration()) - if createResourceSnapshotRes.Requeue { + if createResourceSnapshotRes.RequeueAfter > 0 { klog.V(2).InfoS("Requeue the request to handle the new resource snapshot", "placement", placementKObj, "generation", placementObj.GetGeneration()) // We requeue the request to handle the resource snapshot. return createResourceSnapshotRes, nil @@ -553,7 +553,7 @@ func (r *Reconciler) getOrCreateResourceSnapshot(ctx context.Context, placement if error != nil { return ctrl.Result{}, nil, error } - if res.Requeue { + if res.RequeueAfter > 0 { // If the latest resource snapshot is not ready to be updated, we requeue the request. 
return res, latestResourceSnapshot, nil } @@ -636,7 +636,7 @@ func (r *Reconciler) shouldCreateNewResourceSnapshotNow(ctx context.Context, lat "resourceSnapshot", snapshotKObj, "nextCreationTime", nextCreationTime, "latestResourceSnapshotCreationTime", latestResourceSnapshot.GetCreationTimestamp(), "resourceSnapshotCreationMinimumInterval", r.ResourceSnapshotCreationMinimumInterval, "resourceChangesCollectionDuration", r.ResourceChangesCollectionDuration, "afterDuration", nextCreationTime.Sub(now)) - return ctrl.Result{Requeue: true, RequeueAfter: nextCreationTime.Sub(now)}, nil + return ctrl.Result{RequeueAfter: nextCreationTime.Sub(now)}, nil } return ctrl.Result{}, nil } diff --git a/pkg/controllers/placement/controller_test.go b/pkg/controllers/placement/controller_test.go index 3bbfef28d..3aa9e0c02 100644 --- a/pkg/controllers/placement/controller_test.go +++ b/pkg/controllers/placement/controller_test.go @@ -2714,8 +2714,8 @@ func TestGetOrCreateClusterResourceSnapshot(t *testing.T) { if err != nil { t.Fatalf("failed to handle getOrCreateResourceSnapshot: %v", err) } - if res.Requeue != tc.wantRequeue { - t.Fatalf("getOrCreateResourceSnapshot() got Requeue %v, want %v", res.Requeue, tc.wantRequeue) + if (res.RequeueAfter > 0) != tc.wantRequeue { + t.Fatalf("getOrCreateResourceSnapshot() got Requeue %v, want %v", (res.RequeueAfter > 0), tc.wantRequeue) } options := []cmp.Option{ @@ -3106,7 +3106,7 @@ func TestGetOrCreateClusterResourceSnapshot_failure(t *testing.T) { if err == nil { // if error is nil t.Fatal("getOrCreateClusterResourceSnapshot() = nil, want err") } - if res.Requeue { + if res.RequeueAfter > 0 { t.Fatal("getOrCreateClusterResourceSnapshot() requeue = true, want false") } if !errors.Is(err, controller.ErrUnexpectedBehavior) { @@ -4476,7 +4476,7 @@ func TestShouldCreateNewResourceSnapshotNow(t *testing.T) { collectionDuration: 30 * time.Second, annotationValue: now.Add(-10 * time.Second).Format(time.RFC3339), wantAnnoation: true, - wantRequeue: ctrl.Result{Requeue: true, RequeueAfter: 20 * time.Second}, + wantRequeue: ctrl.Result{RequeueAfter: 20 * time.Second}, }, { name: "ResourceChangesCollectionDuration is 0", @@ -4486,7 +4486,7 @@ func TestShouldCreateNewResourceSnapshotNow(t *testing.T) { // no annotation → sets it and requeues annotationValue: "", wantAnnoation: true, - wantRequeue: ctrl.Result{Requeue: true, RequeueAfter: 295 * time.Second}, + wantRequeue: ctrl.Result{RequeueAfter: 295 * time.Second}, }, { name: "next detection time (now) + collection duration < latest resource snapshot creation time + creation interval", @@ -4496,7 +4496,7 @@ func TestShouldCreateNewResourceSnapshotNow(t *testing.T) { // no annotation → sets it and requeues annotationValue: "", wantAnnoation: true, - wantRequeue: ctrl.Result{Requeue: true, RequeueAfter: 295 * time.Second}, + wantRequeue: ctrl.Result{RequeueAfter: 295 * time.Second}, }, { name: "next detection time (annotation) + collection duration < latest resource snapshot creation time + creation interval", @@ -4505,7 +4505,7 @@ func TestShouldCreateNewResourceSnapshotNow(t *testing.T) { creationTime: now.Add(-10 * time.Second), annotationValue: now.Add(-5 * time.Second).Format(time.RFC3339), wantAnnoation: true, - wantRequeue: ctrl.Result{Requeue: true, RequeueAfter: 290 * time.Second}, + wantRequeue: ctrl.Result{RequeueAfter: 290 * time.Second}, }, { name: "last resource snapshot created long time before", @@ -4513,7 +4513,7 @@ func TestShouldCreateNewResourceSnapshotNow(t *testing.T) { collectionDuration: 30 * 
time.Second, creationTime: now.Add(-1 * time.Hour), wantAnnoation: true, - wantRequeue: ctrl.Result{Requeue: true, RequeueAfter: 30 * time.Second}, + wantRequeue: ctrl.Result{RequeueAfter: 30 * time.Second}, }, { name: "next detection time (now) + collection duration >= latest resource snapshot creation time + creation interval", @@ -4521,7 +4521,7 @@ func TestShouldCreateNewResourceSnapshotNow(t *testing.T) { collectionDuration: 60 * time.Second, creationTime: now.Add(-40 * time.Second), wantAnnoation: true, - wantRequeue: ctrl.Result{Requeue: true, RequeueAfter: 60 * time.Second}, + wantRequeue: ctrl.Result{RequeueAfter: 60 * time.Second}, }, } diff --git a/pkg/controllers/workapplier/apply_test.go b/pkg/controllers/workapplier/apply_test.go index 602d68526..39da8c5d6 100644 --- a/pkg/controllers/workapplier/apply_test.go +++ b/pkg/controllers/workapplier/apply_test.go @@ -384,7 +384,7 @@ func TestSetFleetLastAppliedAnnotation(t *testing.T) { nsManifestObj1 := ns.DeepCopy() wantNSManifestObj1 := ns.DeepCopy() wantNSManifestObj1.SetAnnotations(map[string]string{ - fleetv1beta1.LastAppliedConfigAnnotation: string("{\"apiVersion\":\"v1\",\"kind\":\"Namespace\",\"metadata\":{\"annotations\":{},\"creationTimestamp\":null,\"name\":\"ns-1\"},\"spec\":{},\"status\":{}}\n"), + fleetv1beta1.LastAppliedConfigAnnotation: string("{\"apiVersion\":\"v1\",\"kind\":\"Namespace\",\"metadata\":{\"annotations\":{},\"name\":\"ns-1\"},\"spec\":{},\"status\":{}}\n"), }) nsManifestObj2 := ns.DeepCopy() @@ -393,7 +393,7 @@ func TestSetFleetLastAppliedAnnotation(t *testing.T) { }) wantNSManifestObj2 := ns.DeepCopy() wantNSManifestObj2.SetAnnotations(map[string]string{ - fleetv1beta1.LastAppliedConfigAnnotation: string("{\"apiVersion\":\"v1\",\"kind\":\"Namespace\",\"metadata\":{\"annotations\":{},\"creationTimestamp\":null,\"name\":\"ns-1\"},\"spec\":{},\"status\":{}}\n"), + fleetv1beta1.LastAppliedConfigAnnotation: string("{\"apiVersion\":\"v1\",\"kind\":\"Namespace\",\"metadata\":{\"annotations\":{},\"name\":\"ns-1\"},\"spec\":{},\"status\":{}}\n"), }) // Annotation size limit is 262144 bytes. diff --git a/pkg/controllers/workapplier/utils.go b/pkg/controllers/workapplier/utils.go index 86f3b2b5b..1daec02d4 100644 --- a/pkg/controllers/workapplier/utils.go +++ b/pkg/controllers/workapplier/utils.go @@ -140,7 +140,6 @@ func discardFieldsIrrelevantInComparisonFrom(obj *unstructured.Unstructured) *un // Fields below are read-only fields in object meta. Fleet will ignore them in the comparison // process. - objCopy.SetCreationTimestamp(metav1.Time{}) // Deleted objects are handled separately in the apply process; for comparison purposes, // Fleet will ignore the deletion timestamp and grace period seconds. objCopy.SetDeletionTimestamp(nil) @@ -153,5 +152,13 @@ func discardFieldsIrrelevantInComparisonFrom(obj *unstructured.Unstructured) *un // Remove the status field. unstructured.RemoveNestedField(objCopy.Object, "status") + // Remove all creationTimestamp fields. + unstructured.RemoveNestedField(objCopy.Object, "metadata", "creationTimestamp") + // Resources that have .spec.template.metadata include: Deployment, Job, StatefulSet, + // DaemonSet, ReplicaSet, and CronJob. 
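	// unstructured.RemoveNestedField is a no-op when the nested path is absent, so the
	// template-level calls below are also safe for kinds that carry no pod template.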
+ unstructured.RemoveNestedField(objCopy.Object, "spec", "template", "metadata", "creationTimestamp") + // Also handle CronJob's .spec.jobTemplate.spec.template.metadata + unstructured.RemoveNestedField(objCopy.Object, "spec", "jobTemplate", "spec", "template", "metadata", "creationTimestamp") + return objCopy } diff --git a/pkg/propertyprovider/azure/provider.go b/pkg/propertyprovider/azure/provider.go index 0351fa242..a02fcb469 100644 --- a/pkg/propertyprovider/azure/provider.go +++ b/pkg/propertyprovider/azure/provider.go @@ -20,6 +20,9 @@ package azure import ( "context" "fmt" + "os" + "sync" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -27,6 +30,7 @@ import ( "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/selection" + "k8s.io/client-go/discovery" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" "k8s.io/klog/v2" @@ -58,6 +62,11 @@ const ( CostPrecisionTemplate = "%.3f" ) +var ( + // k8sVersionCacheTTL is the TTL for the cached Kubernetes version. + k8sVersionCacheTTL = 15 * time.Minute +) + const ( // The condition related values in use by the Azure property provider. CostPropertiesCollectionSucceededCondType = "AKSClusterCostPropertiesCollectionSucceeded" @@ -75,6 +84,9 @@ type PropertyProvider struct { podTracker *trackers.PodTracker nodeTracker *trackers.NodeTracker + // The discovery client to get k8s cluster version. + discoveryClient discovery.ServerVersionInterface + // The region where the Azure property provider resides. // // This is necessary as the pricing client requires that a region to be specified; it can @@ -92,6 +104,15 @@ type PropertyProvider struct { // to avoid name conflicts, though at this moment are mostly reserved for testing purposes. nodeControllerName string podControllerName string + + // Cache for Kubernetes version information with TTL. + k8sVersionMutex sync.Mutex + cachedK8sVersion string + cachedK8sVersionObservedTime time.Time + + // Cached cluster certificate authority data (base64 encoded). + clusterCertificateAuthority []byte + clusterCertificateAuthorityObservedTime time.Time } // Verify that the Azure property provider implements the MetricProvider interface at compile time. @@ -180,12 +201,11 @@ func (p *PropertyProvider) Start(ctx context.Context, config *rest.Config) error // in a passive manner with no need for any centralized state. LeaderElection: false, }) - p.mgr = mgr - if err != nil { klog.ErrorS(err, "Failed to start Azure property provider") return err } + p.mgr = mgr switch { case p.nodeTracker != nil: @@ -220,6 +240,33 @@ func (p *PropertyProvider) Start(ctx context.Context, config *rest.Config) error p.nodeTracker = trackers.NewNodeTracker(nil) } + p.discoveryClient = discovery.NewDiscoveryClientForConfigOrDie(config) + // Fetch the k8s version from the discovery client. + klog.V(2).Info("Fetching Kubernetes version from discovery client") + serverVersion, err := p.discoveryClient.ServerVersion() + if err != nil { + klog.ErrorS(err, "Failed to get Kubernetes server version from discovery client") + return err + } + // Update the cache with the new version. + p.cachedK8sVersion = serverVersion.GitVersion + p.cachedK8sVersionObservedTime = time.Now() + + // Cache the cluster certificate authority data (base64 encoded). 
+ if len(config.CAFile) > 0 { + cadata, err := os.ReadFile(config.CAFile) + if err != nil { + klog.ErrorS(err, "Failed to read cluster certificate authority data from file") + return err + } + p.clusterCertificateAuthority = cadata + p.clusterCertificateAuthorityObservedTime = time.Now() + klog.V(2).Info("Cached cluster certificate authority data") + } else { + err := fmt.Errorf("rest.Config CAFile empty: %s", config.CAFile) + klog.ErrorS(err, "No certificate authority data available in rest.Config") + } + // Set up the node reconciler. klog.V(2).Info("Setting up the node reconciler") nodeReconciler := &controllers.NodeReconciler{ @@ -291,6 +338,9 @@ func (p *PropertyProvider) Collect(ctx context.Context) propertyprovider.Propert // Collect node-count related properties. p.collectNodeCountRelatedProperties(ctx, properties) + // Collect the Kubernetes version. + p.collectK8sVersion(ctx, properties) + // Collect the cost properties (if enabled). if p.isCostCollectionEnabled { costConds := p.collectCosts(ctx, properties) @@ -309,6 +359,14 @@ func (p *PropertyProvider) Collect(ctx context.Context) propertyprovider.Propert p.collectAvailableResource(ctx, &resources) } + // insert the cluster certificate authority property (if available) + if len(p.clusterCertificateAuthority) > 0 { + properties[propertyprovider.ClusterCertificateAuthorityProperty] = clusterv1beta1.PropertyValue{ + Value: string(p.clusterCertificateAuthority), + ObservationTime: metav1.NewTime(p.clusterCertificateAuthorityObservedTime), + } + } + // Return the collection response. return propertyprovider.PropertyCollectionResponse{ Properties: properties, @@ -423,6 +481,42 @@ func (p *PropertyProvider) collectAvailableResource(_ context.Context, usage *cl usage.Available = available } +// collectK8sVersion collects the Kubernetes server version information. +// It uses a cache with a 15-minute TTL to minimize API calls to the discovery client. +func (p *PropertyProvider) collectK8sVersion(_ context.Context, properties map[clusterv1beta1.PropertyName]clusterv1beta1.PropertyValue) { + now := time.Now() + + // Check if we have a cached version that is still valid. + p.k8sVersionMutex.Lock() + defer p.k8sVersionMutex.Unlock() + if p.cachedK8sVersion != "" && now.Sub(p.cachedK8sVersionObservedTime) < k8sVersionCacheTTL { + // Cache is still valid, use the cached version. + properties[propertyprovider.K8sVersionProperty] = clusterv1beta1.PropertyValue{ + Value: p.cachedK8sVersion, + ObservationTime: metav1.NewTime(p.cachedK8sVersionObservedTime), + } + klog.V(2).InfoS("Using cached Kubernetes version", "version", p.cachedK8sVersion, "cacheAge", now.Sub(p.cachedK8sVersionObservedTime)) + return + } + + // Cache is expired or empty, fetch the version from the discovery client. + klog.V(2).Info("Fetching Kubernetes version from discovery client") + serverVersion, err := p.discoveryClient.ServerVersion() + if err != nil { + klog.ErrorS(err, "Failed to get Kubernetes server version from discovery client") + return + } + + // Update the cache with the new version. + p.cachedK8sVersion = serverVersion.GitVersion + p.cachedK8sVersionObservedTime = now + properties[propertyprovider.K8sVersionProperty] = clusterv1beta1.PropertyValue{ + Value: p.cachedK8sVersion, + ObservationTime: metav1.NewTime(now), + } + klog.V(2).InfoS("Collected Kubernetes version", "version", p.cachedK8sVersion) +} + // autoDiscoverRegionAndSetupTrackers auto-discovers the region of the AKS cluster. 
func (p *PropertyProvider) autoDiscoverRegionAndSetupTrackers(ctx context.Context, c client.Reader) (*string, error) { klog.V(2).Info("Auto-discover region for the Azure property provider") diff --git a/pkg/propertyprovider/azure/provider_integration_test.go b/pkg/propertyprovider/azure/provider_integration_test.go index 0d41a3e05..5601b6eab 100644 --- a/pkg/propertyprovider/azure/provider_integration_test.go +++ b/pkg/propertyprovider/azure/provider_integration_test.go @@ -39,6 +39,9 @@ import ( var ( ignoreObservationTimeFieldInPropertyValue = cmpopts.IgnoreFields(clusterv1beta1.PropertyValue{}, "ObservationTime") + ignoreNonDeterministicProperty = cmpopts.IgnoreMapEntries(func(k clusterv1beta1.PropertyName, v clusterv1beta1.PropertyValue) bool { + return k == propertyprovider.K8sVersionProperty || k == propertyprovider.ClusterCertificateAuthorityProperty + }) ) var ( @@ -311,7 +314,7 @@ var ( } res := p.Collect(ctx) - if diff := cmp.Diff(res, expectedRes, ignoreObservationTimeFieldInPropertyValue, cmpopts.EquateEmpty()); diff != "" { + if diff := cmp.Diff(res, expectedRes, ignoreObservationTimeFieldInPropertyValue, ignoreNonDeterministicProperty, cmpopts.EquateEmpty()); diff != "" { return fmt.Errorf("property collection response (-got, +want):\n%s", diff) } return nil diff --git a/pkg/propertyprovider/azure/provider_test.go b/pkg/propertyprovider/azure/provider_test.go index 30438b615..ade75592b 100644 --- a/pkg/propertyprovider/azure/provider_test.go +++ b/pkg/propertyprovider/azure/provider_test.go @@ -27,6 +27,10 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/version" + "k8s.io/client-go/discovery/fake" + k8stesting "k8s.io/client-go/testing" clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" "github.com/kubefleet-dev/kubefleet/pkg/propertyprovider" @@ -742,12 +746,17 @@ func TestCollect(t *testing.T) { for idx := range tc.pods { podTracker.AddOrUpdate(&tc.pods[idx]) } - + k8sversion := "1.35.5" p := &PropertyProvider{ nodeTracker: nodeTracker, podTracker: podTracker, isCostCollectionEnabled: true, isAvailableResourcesCollectionEnabled: true, + cachedK8sVersion: k8sversion, + cachedK8sVersionObservedTime: time.Now(), + } + tc.wantMetricCollectionResponse.Properties[propertyprovider.K8sVersionProperty] = clusterv1beta1.PropertyValue{ + Value: k8sversion, } res := p.Collect(ctx) if diff := cmp.Diff(res, tc.wantMetricCollectionResponse, ignoreObservationTimeFieldInPropertyValue); diff != "" { @@ -935,12 +944,21 @@ func TestCollectWithDisabledFeatures(t *testing.T) { podTracker.AddOrUpdate(&pods[idx]) } } - + k8sversion := "1.34.6" p := &PropertyProvider{ nodeTracker: nodeTracker, podTracker: podTracker, isCostCollectionEnabled: tc.isCostCollectionEnabled, isAvailableResourcesCollectionEnabled: tc.isAvailableResourcesCollectionEnabled, + cachedK8sVersion: k8sversion, + cachedK8sVersionObservedTime: time.Now(), + clusterCertificateAuthority: []byte("CADATA"), + } + tc.wantPropertyCollectionResponse.Properties[propertyprovider.K8sVersionProperty] = clusterv1beta1.PropertyValue{ + Value: k8sversion, + } + tc.wantPropertyCollectionResponse.Properties[propertyprovider.ClusterCertificateAuthorityProperty] = clusterv1beta1.PropertyValue{ + Value: "CADATA", } res := p.Collect(ctx) if diff := cmp.Diff(res, tc.wantPropertyCollectionResponse, ignoreObservationTimeFieldInPropertyValue); diff != "" { @@ -949,3 +967,202 @@ func 
TestCollectWithDisabledFeatures(t *testing.T) { }) } } + +func TestCollectK8sVersion(t *testing.T) { + ctx := context.Background() + + testCases := []struct { + name string + discoveryClient *fake.FakeDiscovery + cachedVersion string + cacheStartTime time.Time + cacheTTL time.Duration + wantVersion string + wantDiscoveryCallsMade bool + }{ + { + name: "cache miss - no cached version", + discoveryClient: &fake.FakeDiscovery{ + Fake: &k8stesting.Fake{}, + FakedServerVersion: &version.Info{ + GitVersion: "v1.28.0", + }, + }, + cachedVersion: "", + cacheStartTime: time.Now().Add(-1 * time.Hour), // 1 hour ago + cacheTTL: 15 * time.Minute, + wantVersion: "v1.28.0", + wantDiscoveryCallsMade: true, + }, + { + name: "cache hit - cached version still valid", + discoveryClient: &fake.FakeDiscovery{ + Fake: &k8stesting.Fake{}, + FakedServerVersion: &version.Info{ + GitVersion: "v1.28.0", + }, + }, + cachedVersion: "v1.27.0", + cacheStartTime: time.Now().Add(-5 * time.Minute), // 5 minutes ago + cacheTTL: 15 * time.Minute, + wantVersion: "v1.27.0", + wantDiscoveryCallsMade: false, + }, + { + name: "cache expired - cached version too old", + discoveryClient: &fake.FakeDiscovery{ + Fake: &k8stesting.Fake{}, + FakedServerVersion: &version.Info{ + GitVersion: "v1.28.0", + }, + }, + cachedVersion: "v1.27.0", + cacheStartTime: time.Now().Add(-20 * time.Minute), // 20 minutes ago + cacheTTL: 1 * time.Minute, + wantVersion: "v1.28.0", + wantDiscoveryCallsMade: true, + }, + { + name: "cache at TTL boundary - should be expired", + discoveryClient: &fake.FakeDiscovery{ + Fake: &k8stesting.Fake{}, + FakedServerVersion: &version.Info{ + GitVersion: "v1.28.0", + }, + }, + cachedVersion: "v1.27.0", + cacheStartTime: time.Now().Add(-5*time.Minute - time.Second), // Just over 15 minutes + cacheTTL: 5 * time.Minute, + wantVersion: "v1.28.0", + wantDiscoveryCallsMade: true, + }, + { + name: "discovery client error - no property set", + discoveryClient: func() *fake.FakeDiscovery { + f := &fake.FakeDiscovery{ + Fake: &k8stesting.Fake{}, + } + f.AddReactor("get", "version", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + return true, nil, fmt.Errorf("connection refused") + }) + return f + }(), + cachedVersion: "", + cacheStartTime: time.Time{}, + cacheTTL: 15 * time.Minute, + wantVersion: "", // No property should be set + wantDiscoveryCallsMade: true, + }, + { + name: "cache hit - cached version still valid even if the client errors", + discoveryClient: func() *fake.FakeDiscovery { + f := &fake.FakeDiscovery{ + Fake: &k8stesting.Fake{}, + } + f.AddReactor("get", "version", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + return true, nil, fmt.Errorf("connection refused") + }) + return f + }(), + cachedVersion: "v1.27.0", + cacheStartTime: time.Now().Add(-5 * time.Minute), // 5 minutes ago + cacheTTL: 15 * time.Minute, + wantVersion: "v1.27.0", + wantDiscoveryCallsMade: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + k8sVersionCacheTTL = tc.cacheTTL + p := &PropertyProvider{ + cachedK8sVersion: tc.cachedVersion, + cachedK8sVersionObservedTime: tc.cacheStartTime, + } + // Only set discoveryClient if it's not nil to avoid interface nil pointer issues + if tc.discoveryClient != nil { + p.discoveryClient = tc.discoveryClient + } + + properties := make(map[clusterv1beta1.PropertyName]clusterv1beta1.PropertyValue) + p.collectK8sVersion(ctx, properties) + + if tc.wantVersion == "" { + // No property should be set + if _, 
ok := properties[propertyprovider.K8sVersionProperty]; ok { + t.Errorf("Expected no Kubernetes version property to be set, but got one") + } + } else { + // Check that the property is set correctly + gotProperty, ok := properties[propertyprovider.K8sVersionProperty] + if !ok { + t.Fatalf("Expected Kubernetes version property to be set, but it was not") + } + if gotProperty.Value != tc.wantVersion { + t.Errorf("collectK8sVersion() version = %v, want %v", gotProperty.Value, tc.wantVersion) + } + } // Check if discovery client was called the expected number of times + if tc.discoveryClient != nil { + if tc.wantDiscoveryCallsMade && len(tc.discoveryClient.Actions()) == 0 { + t.Errorf("Expected discovery client to be called, but it was not") + } + if !tc.wantDiscoveryCallsMade && len(tc.discoveryClient.Actions()) > 0 { + t.Errorf("Expected discovery client not to be called, but it was called %d times", len(tc.discoveryClient.Actions())) + } + } + + // Verify cache was updated when discovery client was called successfully + if tc.wantDiscoveryCallsMade && tc.discoveryClient != nil && tc.discoveryClient.FakedServerVersion != nil { + if p.cachedK8sVersion != tc.wantVersion { + t.Errorf("Expected cached version to be %v, but got %v", tc.wantVersion, p.cachedK8sVersion) + } + if p.cachedK8sVersionObservedTime.IsZero() { + t.Errorf("Expected cache time to be updated, but it was not") + } + } + }) + } +} + +func TestCollectK8sVersionConcurrency(t *testing.T) { + ctx := context.Background() + + discoveryClient := &fake.FakeDiscovery{ + Fake: &k8stesting.Fake{}, + FakedServerVersion: &version.Info{ + GitVersion: "v1.28.0", + }, + } + + p := &PropertyProvider{ + discoveryClient: discoveryClient, + } + + // Run multiple concurrent calls to collectK8sVersion + const numGoroutines = 100 + k8sVersionCacheTTL = 1 * time.Second // Reduce cache TTL to test cache expire + done := make(chan bool, numGoroutines) + + for i := 0; i < numGoroutines; i++ { + go func() { + properties := make(map[clusterv1beta1.PropertyName]clusterv1beta1.PropertyValue) + p.collectK8sVersion(ctx, properties) + done <- true + }() + } + + // Wait for all goroutines to complete + for i := 0; i < numGoroutines; i++ { + <-done + } + + // Verify that the discovery client was called at least once + if len(discoveryClient.Actions()) == 0 { + t.Errorf("Expected discovery client to be called at least once, but it was not") + } + + // Verify that the cache was populated + if p.cachedK8sVersion != "v1.28.0" { + t.Errorf("Expected cached version to be v1.28.0, but got %v", p.cachedK8sVersion) + } +} diff --git a/pkg/propertyprovider/commons.go b/pkg/propertyprovider/commons.go index 2fabae11b..3d69486e4 100644 --- a/pkg/propertyprovider/commons.go +++ b/pkg/propertyprovider/commons.go @@ -24,6 +24,15 @@ const ( // NodeCountProperty is a property that describes the number of nodes in the cluster. NodeCountProperty = "kubernetes-fleet.io/node-count" + // K8sVersionProperty is a property that describes the Kubernetes version of the cluster. + K8sVersionProperty = "k8s.io/k8s-version" + + // ClusterEntryPointProperty is a property that describes the cluster entry point (API server endpoint). + ClusterEntryPointProperty = "k8s.io/cluster-entrypoint" + + // ClusterCertificateAuthorityProperty is a property that describes the cluster's certificate authority data (base64 encoded). + ClusterCertificateAuthorityProperty = "k8s.io/cluster-certificate-authority-data" + // The resource properties. // Total and allocatable CPU resource properties. 
TotalCPUCapacityProperty = "resources.kubernetes-fleet.io/total-cpu" diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 7f04bbe52..19e6e46cb 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -228,6 +228,8 @@ func (s *Scheduler) scheduleOnce(ctx context.Context, worker int) { } // Requeue if the scheduling cycle suggests so. + //nolint:staticcheck + //lint:ignore SA1019 we need more time to fully migrate to RequeueAfter as we used these two fields separately. if res.Requeue { if res.RequeueAfter > 0 { s.queue.AddAfter(placementKey, res.RequeueAfter) diff --git a/pkg/utils/controller/controller.go b/pkg/utils/controller/controller.go index 566cb5fd2..4dbeb1cd0 100644 --- a/pkg/utils/controller/controller.go +++ b/pkg/utils/controller/controller.go @@ -284,6 +284,8 @@ func (w *controller) reconcileHandler(ctx context.Context, key interface{}) { w.queue.Forget(key) w.queue.AddAfter(key, result.RequeueAfter) metrics.FleetReconcileTotal.WithLabelValues(w.name, labelRequeueAfter).Inc() + //nolint:staticcheck + //lint:ignore SA1019 we need more time to fully migrate to RequeueAfter as we used these two fields separately. case result.Requeue: w.queue.AddRateLimited(key) metrics.FleetReconcileTotal.WithLabelValues(w.name, labelRequeue).Inc() diff --git a/test/e2e/join_and_leave_test.go b/test/e2e/join_and_leave_test.go index c7f00259b..5b6e79f0c 100644 --- a/test/e2e/join_and_leave_test.go +++ b/test/e2e/join_and_leave_test.go @@ -29,6 +29,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/utils/ptr" + clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1" "sigs.k8s.io/controller-runtime/pkg/client" fleetnetworkingv1alpha1 "go.goms.io/fleet-networking/api/v1alpha1" @@ -36,6 +37,7 @@ import ( clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" "github.com/kubefleet-dev/kubefleet/pkg/utils" + "github.com/kubefleet-dev/kubefleet/pkg/utils/controller" ) const ( @@ -65,7 +67,6 @@ var _ = Describe("Test member cluster join and leave flow", Label("joinleave"), // Note that this container cannot run in parallel with other containers. Describe("Test member cluster join and leave flow for cluster resource placement", Ordered, Serial, func() { - BeforeAll(func() { // Create the test resources. 
wantCRPSelectedResources = []placementv1beta1.ResourceIdentifier{ @@ -442,7 +443,7 @@ var _ = Describe("Test member cluster join and leave flow", Label("joinleave"), }) }) -var _ = Describe("Test member cluster force delete flow", Label("joinleave"), Ordered, Serial, func() { +var _ = Describe("Test member cluster join and leave without placement", Label("joinleave"), Ordered, Serial, func() { Context("Test cluster join and leave flow with member agent down and force delete member cluster", Label("joinleave"), Ordered, Serial, func() { It("Simulate the member agent going down in member cluster", func() { updateMemberAgentDeploymentReplicas(memberCluster3WestProdClient, 0) @@ -476,6 +477,71 @@ var _ = Describe("Test member cluster force delete flow", Label("joinleave"), Or }) }) +var _ = Describe("Test member cluster join and leave with clusterProfile", Label("joinleave"), Ordered, Serial, func() { + clusterProfileList := &clusterinventory.ClusterProfileList{} + Context("Test cluster profile and ", Label("joinleave"), Ordered, Serial, func() { + It("Make sure we have all the cluster profiles", func() { + Eventually(func() error { + if err := hubClient.List(ctx, clusterProfileList, &client.ListOptions{Namespace: utils.FleetSystemNamespace}); err != nil { + return fmt.Errorf("failed to get cluster profiles: %w", err) + } + + // create a map for easy lookup + cpMap := make(map[string]clusterinventory.ClusterProfile) + for idx := range clusterProfileList.Items { + cp := clusterProfileList.Items[idx] + cpMap[cp.Name] = cp + } + // make sure all the member clusters have a cluster profile + for idx := range allMemberClusterNames { + cp, ok := cpMap[allMemberClusterNames[idx]] + if !ok { + return fmt.Errorf("cluster profile for member cluster %s not found", allMemberClusterNames[idx]) + } + if cp.Status.Version.Kubernetes == "" { + return fmt.Errorf("cluster profile %s Kubernetes version should not be empty", cp.Name) + } + if len(cp.Status.AccessProviders) != 1 { + return fmt.Errorf("cluster profile %s has no access providers %+v", cp.Name, cp.Status.AccessProviders) + } + if cp.Status.AccessProviders[0].Name != controller.ClusterManagerName { + return fmt.Errorf("cluster profile %s access provider name %s doesn't match expected %s", cp.Name, cp.Status.AccessProviders[0].Name, controller.ClusterManagerName) + } + if len(cp.Status.AccessProviders[0].Cluster.CertificateAuthorityData) == 0 { + return fmt.Errorf("cluster profile %s access provider certificate authority data should not be empty", allMemberClusterNames[idx]) + } + } + return nil + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to verify cluster profiles") + }) + + It("Delete member cluster CR associated to the member cluster to simulate member left", func() { + markMemberClusterAsLeft(memberCluster3WestProdName) + }) + + It("Make sure we delete the corresponding cluster profiles", func() { + Eventually(func() error { + if err := hubClient.List(ctx, clusterProfileList, &client.ListOptions{Namespace: utils.FleetSystemNamespace}); err != nil { + return fmt.Errorf("failed to get cluster profiles: %w", err) + } + for idx := range clusterProfileList.Items { + cp := clusterProfileList.Items[idx] + if cp.Name == memberCluster3WestProdName { + return fmt.Errorf("cluster profile for member cluster %s should be deleted", memberCluster3WestProdName) + } + } + return nil + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to verify cluster profiles") + }) + }) + + AfterAll(func() { + By("Add the member 
cluster back") + createMemberCluster(memberCluster3WestProd.ClusterName, memberCluster3WestProd.PresentingServiceAccountInHubClusterName, labelsByClusterName[memberCluster3WestProd.ClusterName], annotationsByClusterName[memberCluster3WestProd.ClusterName]) + checkIfMemberClusterHasJoined(memberCluster3WestProd) + }) +}) + func updateMemberAgentDeploymentReplicas(clusterClient client.Client, replicas int32) { Eventually(func() error { var d appsv1.Deployment diff --git a/test/e2e/utils_test.go b/test/e2e/utils_test.go index a80b20081..36df25cf1 100644 --- a/test/e2e/utils_test.go +++ b/test/e2e/utils_test.go @@ -85,7 +85,7 @@ func createMemberCluster(name, svcAccountName string, labels, annotations map[st HeartbeatPeriodSeconds: memberClusterHeartbeatPeriodSeconds, }, } - Expect(hubClient.Create(ctx, mcObj)).To(Succeed(), "Failed to create member cluster object %s", name) + Expect(hubClient.Create(ctx, mcObj)).To(SatisfyAny(&utils.AlreadyExistMatcher{}, Succeed()), "Failed to create member cluster object %s", name) } func updateMemberClusterDeleteOptions(name string, deleteOptions *clusterv1beta1.DeleteOptions) { @@ -312,10 +312,15 @@ func checkIfAzurePropertyProviderIsWorking() { ignoreCostProperties := cmpopts.IgnoreMapEntries(func(k clusterv1beta1.PropertyName, v clusterv1beta1.PropertyValue) bool { return k == azure.PerCPUCoreCostProperty || k == azure.PerGBMemoryCostProperty }) + // we don't know the exact value of k8s version and cluster entry point + ignoreClusterProperties := cmpopts.IgnoreMapEntries(func(k clusterv1beta1.PropertyName, v clusterv1beta1.PropertyValue) bool { + return k == propertyprovider.K8sVersionProperty || k == propertyprovider.ClusterCertificateAuthorityProperty + }) if diff := cmp.Diff( mcObj.Status.Properties, wantStatus.Properties, ignoreTimeTypeFields, ignoreCostProperties, + ignoreClusterProperties, ); diff != "" { return fmt.Errorf("member cluster status properties diff (-got, +want):\n%s", diff) } @@ -576,7 +581,7 @@ func cleanupInvalidClusters() { } Eventually(func() error { err := hubClient.Get(ctx, types.NamespacedName{Name: name}, mcObj) - if err != nil { + if err != nil && !k8serrors.IsNotFound(err) { return err } mcObj.Finalizers = []string{} From fbc2483499710b3ba614930ba5016c177bfdef99 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 25 Nov 2025 10:41:28 -0800 Subject: [PATCH 09/13] feat: allow pod and replica sets to be created in hub cluster (#334) * remove pod handler Signed-off-by: Wei Weng * remove replicaset webhook as well Signed-off-by: Wei Weng * update guard rail Signed-off-by: Wei Weng * add e2e tests Signed-off-by: Wei Weng * rename Signed-off-by: Wei Weng * do not propagate rs and pod Signed-off-by: Wei Weng * fix unit test Signed-off-by: Wei Weng * add guard rail test for pod and rs Signed-off-by: Wei Weng * only block propagation for replica set and pod Signed-off-by: Wei Weng * Revert "remove pod handler" This reverts commit 562b151fbea257ae31dd866305a9ef97d53b934e. Signed-off-by: Wei Weng * Revert "remove replicaset webhook as well" This reverts commit ee9b4252d493cb1a46c50b258718a87aabe4e6c6. 
Signed-off-by: Wei Weng * revert Signed-off-by: Wei Weng * add options to disable rs and pod validating webhooks Signed-off-by: Wei Weng * guard rail pod and replica set if validating webhooks disabled Signed-off-by: Wei Weng * remove some test because validating webhooks are on by default Signed-off-by: Wei Weng * remove unintentional change Signed-off-by: Wei Weng * remove empty line Signed-off-by: Wei Weng * remove one unintentional change and remove owner ref check for other resources Signed-off-by: Wei Weng * fix readme Signed-off-by: Wei Weng * combine into single flag Signed-off-by: Wei Weng * add guard rail test for pod and rs Signed-off-by: Wei Weng * verify workload running in hub Signed-off-by: Wei Weng * rename flag Signed-off-by: Wei Weng * update readme Signed-off-by: Wei Weng * fix tests Signed-off-by: Wei Weng * do not propagate controller revision Signed-off-by: Wei Weng * fix test Signed-off-by: Wei Weng * ignore some pvc annotations Signed-off-by: Wei Weng * enum for stateful set test yaml Signed-off-by: Wei Weng * strip volume name from PVC Signed-off-by: Wei Weng * fix test Signed-off-by: Wei Weng * Revert "ignore some pvc annotations" This reverts commit 9ba69dd493850823638c8cbd309a8a6122892d9d. Signed-off-by: Wei Weng * test job Signed-off-by: Wei Weng * address comment Signed-off-by: Wei Weng * fix unit test Signed-off-by: Wei Weng --------- Signed-off-by: Wei Weng Co-authored-by: Wei Weng --- charts/hub-agent/README.md | 3 +- charts/hub-agent/templates/deployment.yaml | 1 + charts/hub-agent/values.yaml | 1 + cmd/hubagent/main.go | 6 +- cmd/hubagent/options/options.go | 4 + pkg/utils/common.go | 14 + pkg/utils/common_test.go | 148 ++++++++++ pkg/webhook/webhook.go | 206 +++++++------- pkg/webhook/webhook_test.go | 15 +- test/e2e/fleet_guard_rail_test.go | 242 ++++++++++++++++ .../e2e/resource_placement_deployment_test.go | 162 ----------- .../resource_placement_hub_workload_test.go | 269 ++++++++++++++++++ test/e2e/setup.sh | 1 + 13 files changed, 800 insertions(+), 272 deletions(-) delete mode 100644 test/e2e/resource_placement_deployment_test.go create mode 100644 test/e2e/resource_placement_hub_workload_test.go diff --git a/charts/hub-agent/README.md b/charts/hub-agent/README.md index b72c810e2..3f44d29a4 100644 --- a/charts/hub-agent/README.md +++ b/charts/hub-agent/README.md @@ -40,4 +40,5 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen | `logFileMaxSize` | Max log file size before rotation | `1000000` | | `MaxFleetSizeSupported` | Max number of member clusters supported | `100` | | `resourceSnapshotCreationMinimumInterval` | The minimum interval at which resource snapshots could be created. | `30s` | -| `resourceChangesCollectionDuration` | The duration for collecting resource changes into one snapshot. | `15s` | \ No newline at end of file +| `resourceChangesCollectionDuration` | The duration for collecting resource changes into one snapshot. | `15s` | +| `enableWorkload` | Enable kubernetes builtin workload to run in hub cluster. 
| `false` | \ No newline at end of file diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml index c4e24fd50..7ed6ab7ba 100644 --- a/charts/hub-agent/templates/deployment.yaml +++ b/charts/hub-agent/templates/deployment.yaml @@ -24,6 +24,7 @@ spec: - --enable-webhook={{ .Values.enableWebhook }} - --webhook-service-name={{ .Values.webhookServiceName }} - --enable-guard-rail={{ .Values.enableGuardRail }} + - --enable-workload={{ .Values.enableWorkload }} - --whitelisted-users=system:serviceaccount:fleet-system:hub-agent-sa - --webhook-client-connection-type={{.Values.webhookClientConnectionType}} - --v={{ .Values.logVerbosity }} diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml index 7672e8048..a423c6270 100644 --- a/charts/hub-agent/values.yaml +++ b/charts/hub-agent/values.yaml @@ -16,6 +16,7 @@ enableWebhook: true webhookServiceName: fleetwebhook enableGuardRail: true webhookClientConnectionType: service +enableWorkload: false forceDeleteWaitTime: 15m0s clusterUnhealthyThreshold: 3m0s resourceSnapshotCreationMinimumInterval: 30s diff --git a/cmd/hubagent/main.go b/cmd/hubagent/main.go index 02dfcae09..f896ab71a 100644 --- a/cmd/hubagent/main.go +++ b/cmd/hubagent/main.go @@ -156,7 +156,7 @@ func main() { if opts.EnableWebhook { whiteListedUsers := strings.Split(opts.WhiteListedUsers, ",") - if err := SetupWebhook(mgr, options.WebhookClientConnectionType(opts.WebhookClientConnectionType), opts.WebhookServiceName, whiteListedUsers, opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels); err != nil { + if err := SetupWebhook(mgr, options.WebhookClientConnectionType(opts.WebhookClientConnectionType), opts.WebhookServiceName, whiteListedUsers, opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels, opts.EnableWorkload); err != nil { klog.ErrorS(err, "unable to set up webhook") exitWithErrorFunc() } @@ -188,9 +188,9 @@ func main() { } // SetupWebhook generates the webhook cert and then set up the webhook configurator. -func SetupWebhook(mgr manager.Manager, webhookClientConnectionType options.WebhookClientConnectionType, webhookServiceName string, whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool) error { +func SetupWebhook(mgr manager.Manager, webhookClientConnectionType options.WebhookClientConnectionType, webhookServiceName string, whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool, enableWorkload bool) error { // Generate self-signed key and crt files in FleetWebhookCertDir for the webhook server to start. 
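	// The enableWorkload value is threaded from the --enable-workload flag into the webhook
	// config; when it is true, the pod and replicaset validating webhooks are not registered,
	// allowing built-in workloads to run directly on the hub cluster.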
- w, err := webhook.NewWebhookConfig(mgr, webhookServiceName, FleetWebhookPort, &webhookClientConnectionType, FleetWebhookCertDir, enableGuardRail, denyModifyMemberClusterLabels) + w, err := webhook.NewWebhookConfig(mgr, webhookServiceName, FleetWebhookPort, &webhookClientConnectionType, FleetWebhookCertDir, enableGuardRail, denyModifyMemberClusterLabels, enableWorkload) if err != nil { klog.ErrorS(err, "fail to generate WebhookConfig") return err diff --git a/cmd/hubagent/options/options.go b/cmd/hubagent/options/options.go index 26487f503..6573789c1 100644 --- a/cmd/hubagent/options/options.go +++ b/cmd/hubagent/options/options.go @@ -107,6 +107,9 @@ type Options struct { PprofPort int // DenyModifyMemberClusterLabels indicates if the member cluster labels cannot be modified by groups (excluding system:masters) DenyModifyMemberClusterLabels bool + // EnableWorkload enables workload resources (pods and replicasets) to be created in the hub cluster. + // When set to true, the pod and replicaset validating webhooks are disabled. + EnableWorkload bool // ResourceSnapshotCreationMinimumInterval is the minimum interval at which resource snapshots could be created. // Whether the resource snapshot is created or not depends on the both ResourceSnapshotCreationMinimumInterval and ResourceChangesCollectionDuration. ResourceSnapshotCreationMinimumInterval time.Duration @@ -181,6 +184,7 @@ func (o *Options) AddFlags(flags *flag.FlagSet) { flags.BoolVar(&o.EnablePprof, "enable-pprof", false, "If set, the pprof profiling is enabled.") flags.IntVar(&o.PprofPort, "pprof-port", 6065, "The port for pprof profiling.") flags.BoolVar(&o.DenyModifyMemberClusterLabels, "deny-modify-member-cluster-labels", false, "If set, users not in the system:masters cannot modify member cluster labels.") + flags.BoolVar(&o.EnableWorkload, "enable-workload", false, "If set, workloads (pods and replicasets) can be created in the hub cluster. This disables the pod and replicaset validating webhooks.") flags.DurationVar(&o.ResourceSnapshotCreationMinimumInterval, "resource-snapshot-creation-minimum-interval", 30*time.Second, "The minimum interval at which resource snapshots could be created.") flags.DurationVar(&o.ResourceChangesCollectionDuration, "resource-changes-collection-duration", 15*time.Second, "The duration for collecting resource changes into one snapshot. The default is 15 seconds, which means that the controller will collect resource changes for 15 seconds before creating a resource snapshot.") diff --git a/pkg/utils/common.go b/pkg/utils/common.go index 4938cd46d..cf1899182 100644 --- a/pkg/utils/common.go +++ b/pkg/utils/common.go @@ -79,6 +79,8 @@ const ( ServiceKind = "Service" NamespaceKind = "Namespace" JobKind = "Job" + ReplicaSetKind = "ReplicaSet" + PodKind = "Pod" ) const ( @@ -507,6 +509,18 @@ func CheckCRDInstalled(discoveryClient discovery.DiscoveryInterface, gvk schema. 
func ShouldPropagateObj(informerManager informer.Manager, uObj *unstructured.Unstructured) (bool, error) { // TODO: add more special handling for different resource kind switch uObj.GroupVersionKind() { + case appv1.SchemeGroupVersion.WithKind(ReplicaSetKind): + // Skip ReplicaSets if they are managed by Deployments (have owner references) + // Standalone ReplicaSets (without owners) can be propagated + if len(uObj.GetOwnerReferences()) > 0 { + return false, nil + } + case appv1.SchemeGroupVersion.WithKind("ControllerRevision"): + // Skip ControllerRevisions if they are managed by DaemonSets/StatefulSets (have owner references) + // These are automatically created by controllers and will be recreated on member clusters + if len(uObj.GetOwnerReferences()) > 0 { + return false, nil + } case corev1.SchemeGroupVersion.WithKind(ConfigMapKind): // Skip the built-in custom CA certificate created in the namespace if uObj.GetName() == "kube-root-ca.crt" { diff --git a/pkg/utils/common_test.go b/pkg/utils/common_test.go index 483a06a42..35716885f 100644 --- a/pkg/utils/common_test.go +++ b/pkg/utils/common_test.go @@ -6,6 +6,7 @@ import ( "github.com/google/go-cmp/cmp" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/utils/ptr" fleetv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" @@ -1189,3 +1190,150 @@ func TestIsDiffedResourcePlacementEqual(t *testing.T) { }) } } + +func TestShouldPropagateObj_PodAndReplicaSet(t *testing.T) { + tests := []struct { + name string + obj map[string]interface{} + ownerReferences []metav1.OwnerReference + want bool + }{ + { + name: "standalone replicaset without ownerReferences should propagate", + obj: map[string]interface{}{ + "apiVersion": "apps/v1", + "kind": "ReplicaSet", + "metadata": map[string]interface{}{ + "name": "standalone-rs", + "namespace": "default", + }, + }, + ownerReferences: nil, + want: true, + }, + { + name: "standalone pod without ownerReferences should propagate", + obj: map[string]interface{}{ + "apiVersion": "v1", + "kind": "Pod", + "metadata": map[string]interface{}{ + "name": "standalone-pod", + "namespace": "default", + }, + }, + ownerReferences: nil, + want: true, + }, + { + name: "replicaset with deployment owner should NOT propagate", + obj: map[string]interface{}{ + "apiVersion": "apps/v1", + "kind": "ReplicaSet", + "metadata": map[string]interface{}{ + "name": "test-deploy-abc123", + "namespace": "default", + }, + }, + ownerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "test-deploy", + UID: "12345", + }, + }, + want: false, + }, + { + name: "pod owned by replicaset - passes ShouldPropagateObj but filtered by resource config", + obj: map[string]interface{}{ + "apiVersion": "v1", + "kind": "Pod", + "metadata": map[string]interface{}{ + "name": "test-deploy-abc123-xyz", + "namespace": "default", + }, + }, + ownerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: "test-deploy-abc123", + UID: "67890", + }, + }, + want: true, // ShouldPropagateObj doesn't filter Pods - they're filtered by NewResourceConfig + }, + { + name: "controllerrevision owned by daemonset should NOT propagate", + obj: map[string]interface{}{ + "apiVersion": "apps/v1", + "kind": "ControllerRevision", + "metadata": map[string]interface{}{ + "name": "test-ds-7b9848797f", + "namespace": "default", + }, + }, + ownerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "DaemonSet", + 
Name: "test-ds", + UID: "abcdef", + }, + }, + want: false, + }, + { + name: "controllerrevision owned by statefulset should NOT propagate", + obj: map[string]interface{}{ + "apiVersion": "apps/v1", + "kind": "ControllerRevision", + "metadata": map[string]interface{}{ + "name": "test-ss-7878b4b446", + "namespace": "default", + }, + }, + ownerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "StatefulSet", + Name: "test-ss", + UID: "fedcba", + }, + }, + want: false, + }, + { + name: "standalone controllerrevision without owner should propagate", + obj: map[string]interface{}{ + "apiVersion": "apps/v1", + "kind": "ControllerRevision", + "metadata": map[string]interface{}{ + "name": "custom-revision", + "namespace": "default", + }, + }, + ownerReferences: nil, + want: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + uObj := &unstructured.Unstructured{Object: tt.obj} + if tt.ownerReferences != nil { + uObj.SetOwnerReferences(tt.ownerReferences) + } + + got, err := ShouldPropagateObj(nil, uObj) + if err != nil { + t.Errorf("ShouldPropagateObj() error = %v", err) + return + } + if got != tt.want { + t.Errorf("ShouldPropagateObj() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 0b2ebe2de..781833e3e 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -159,9 +159,10 @@ type Config struct { enableGuardRail bool denyModifyMemberClusterLabels bool + enableWorkload bool } -func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32, clientConnectionType *options.WebhookClientConnectionType, certDir string, enableGuardRail bool, denyModifyMemberClusterLabels bool) (*Config, error) { +func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32, clientConnectionType *options.WebhookClientConnectionType, certDir string, enableGuardRail bool, denyModifyMemberClusterLabels bool, enableWorkload bool) (*Config, error) { // We assume the Pod namespace should be passed to env through downward API in the Pod spec. namespace := os.Getenv("POD_NAMESPACE") if namespace == "" { @@ -176,6 +177,7 @@ func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32 clientConnectionType: clientConnectionType, enableGuardRail: enableGuardRail, denyModifyMemberClusterLabels: denyModifyMemberClusterLabels, + enableWorkload: enableWorkload, } caPEM, err := w.genCertificate(certDir) if err != nil { @@ -302,8 +304,11 @@ func (w *Config) createValidatingWebhookConfiguration(ctx context.Context, webho // buildValidatingWebHooks returns a slice of fleet validating webhook objects. 
func (w *Config) buildFleetValidatingWebhooks() []admv1.ValidatingWebhook { - webHooks := []admv1.ValidatingWebhook{ - { + var webHooks []admv1.ValidatingWebhook + + // When enableWorkload is true, skip pod and replicaset validating webhooks to allow workloads + if !w.enableWorkload { + webHooks = append(webHooks, admv1.ValidatingWebhook{ Name: "fleet.pod.validating", ClientConfig: w.createClientConfig(pod.ValidationPath), FailurePolicy: &failFailurePolicy, @@ -311,32 +316,14 @@ func (w *Config) buildFleetValidatingWebhooks() []admv1.ValidatingWebhook { AdmissionReviewVersions: admissionReviewVersions, Rules: []admv1.RuleWithOperations{ { - Operations: []admv1.OperationType{ - admv1.Create, - }, - Rule: createRule([]string{corev1.SchemeGroupVersion.Group}, []string{corev1.SchemeGroupVersion.Version}, []string{podResourceName}, &namespacedScope), - }, - }, - TimeoutSeconds: longWebhookTimeout, - }, - { - Name: "fleet.clusterresourceplacementv1beta1.validating", - ClientConfig: w.createClientConfig(clusterresourceplacement.ValidationPath), - FailurePolicy: &failFailurePolicy, - SideEffects: &sideEffortsNone, - AdmissionReviewVersions: admissionReviewVersions, - Rules: []admv1.RuleWithOperations{ - { - Operations: []admv1.OperationType{ - admv1.Create, - admv1.Update, - }, - Rule: createRule([]string{placementv1beta1.GroupVersion.Group}, []string{placementv1beta1.GroupVersion.Version}, []string{placementv1beta1.ClusterResourcePlacementResource}, &clusterScope), + Operations: []admv1.OperationType{admv1.Create}, + Rule: createRule([]string{corev1.SchemeGroupVersion.Group}, []string{corev1.SchemeGroupVersion.Version}, []string{podResourceName}, &namespacedScope), }, }, TimeoutSeconds: longWebhookTimeout, - }, - { + }) + + webHooks = append(webHooks, admv1.ValidatingWebhook{ Name: "fleet.replicaset.validating", ClientConfig: w.createClientConfig(replicaset.ValidationPath), FailurePolicy: &failFailurePolicy, @@ -344,99 +331,91 @@ func (w *Config) buildFleetValidatingWebhooks() []admv1.ValidatingWebhook { AdmissionReviewVersions: admissionReviewVersions, Rules: []admv1.RuleWithOperations{ { - Operations: []admv1.OperationType{ - admv1.Create, - }, - Rule: createRule([]string{appsv1.SchemeGroupVersion.Group}, []string{appsv1.SchemeGroupVersion.Version}, []string{replicaSetResourceName}, &namespacedScope), + Operations: []admv1.OperationType{admv1.Create}, + Rule: createRule([]string{appsv1.SchemeGroupVersion.Group}, []string{appsv1.SchemeGroupVersion.Version}, []string{replicaSetResourceName}, &namespacedScope), }, }, TimeoutSeconds: longWebhookTimeout, + }) + } + + webHooks = append(webHooks, admv1.ValidatingWebhook{ + Name: "fleet.clusterresourceplacementv1beta1.validating", + ClientConfig: w.createClientConfig(clusterresourceplacement.ValidationPath), + FailurePolicy: &failFailurePolicy, + SideEffects: &sideEffortsNone, + AdmissionReviewVersions: admissionReviewVersions, + Rules: []admv1.RuleWithOperations{ + { + Operations: []admv1.OperationType{admv1.Create, admv1.Update}, + Rule: createRule([]string{placementv1beta1.GroupVersion.Group}, []string{placementv1beta1.GroupVersion.Version}, []string{placementv1beta1.ClusterResourcePlacementResource}, &clusterScope), + }, }, - { + TimeoutSeconds: longWebhookTimeout, + }) + + webHooks = append(webHooks, + admv1.ValidatingWebhook{ Name: "fleet.membercluster.validating", ClientConfig: w.createClientConfig(membercluster.ValidationPath), FailurePolicy: &failFailurePolicy, SideEffects: &sideEffortsNone, AdmissionReviewVersions: admissionReviewVersions, - 
Rules: []admv1.RuleWithOperations{ - { - Operations: []admv1.OperationType{ - admv1.Create, - admv1.Update, - admv1.Delete, - }, - Rule: createRule([]string{clusterv1beta1.GroupVersion.Group}, []string{clusterv1beta1.GroupVersion.Version}, []string{memberClusterResourceName}, &clusterScope), - }, - }, + Rules: []admv1.RuleWithOperations{{ + Operations: []admv1.OperationType{admv1.Create, admv1.Update, admv1.Delete}, + Rule: createRule([]string{clusterv1beta1.GroupVersion.Group}, []string{clusterv1beta1.GroupVersion.Version}, []string{memberClusterResourceName}, &clusterScope), + }}, TimeoutSeconds: longWebhookTimeout, }, - { + admv1.ValidatingWebhook{ Name: "fleet.clusterresourceoverride.validating", ClientConfig: w.createClientConfig(clusterresourceoverride.ValidationPath), FailurePolicy: &failFailurePolicy, SideEffects: &sideEffortsNone, AdmissionReviewVersions: admissionReviewVersions, - Rules: []admv1.RuleWithOperations{ - { - Operations: []admv1.OperationType{ - admv1.Create, - admv1.Update, - }, - Rule: createRule([]string{placementv1beta1.GroupVersion.Group}, []string{placementv1beta1.GroupVersion.Version}, []string{clusterResourceOverrideName}, &clusterScope), - }, - }, + Rules: []admv1.RuleWithOperations{{ + Operations: []admv1.OperationType{admv1.Create, admv1.Update}, + Rule: createRule([]string{placementv1beta1.GroupVersion.Group}, []string{placementv1beta1.GroupVersion.Version}, []string{clusterResourceOverrideName}, &clusterScope), + }}, TimeoutSeconds: longWebhookTimeout, }, - { + admv1.ValidatingWebhook{ Name: "fleet.resourceoverride.validating", ClientConfig: w.createClientConfig(resourceoverride.ValidationPath), FailurePolicy: &failFailurePolicy, SideEffects: &sideEffortsNone, AdmissionReviewVersions: admissionReviewVersions, - Rules: []admv1.RuleWithOperations{ - { - Operations: []admv1.OperationType{ - admv1.Create, - admv1.Update, - }, - Rule: createRule([]string{placementv1beta1.GroupVersion.Group}, []string{placementv1beta1.GroupVersion.Version}, []string{resourceOverrideName}, &namespacedScope), - }, - }, + Rules: []admv1.RuleWithOperations{{ + Operations: []admv1.OperationType{admv1.Create, admv1.Update}, + Rule: createRule([]string{placementv1beta1.GroupVersion.Group}, []string{placementv1beta1.GroupVersion.Version}, []string{resourceOverrideName}, &namespacedScope), + }}, TimeoutSeconds: longWebhookTimeout, }, - { + admv1.ValidatingWebhook{ Name: "fleet.clusterresourceplacementeviction.validating", ClientConfig: w.createClientConfig(clusterresourceplacementeviction.ValidationPath), FailurePolicy: &failFailurePolicy, SideEffects: &sideEffortsNone, AdmissionReviewVersions: admissionReviewVersions, - Rules: []admv1.RuleWithOperations{ - { - Operations: []admv1.OperationType{ - admv1.Create, - }, - Rule: createRule([]string{placementv1beta1.GroupVersion.Group}, []string{placementv1beta1.GroupVersion.Version}, []string{evictionName}, &clusterScope), - }, - }, + Rules: []admv1.RuleWithOperations{{ + Operations: []admv1.OperationType{admv1.Create}, + Rule: createRule([]string{placementv1beta1.GroupVersion.Group}, []string{placementv1beta1.GroupVersion.Version}, []string{evictionName}, &clusterScope), + }}, TimeoutSeconds: longWebhookTimeout, }, - { + admv1.ValidatingWebhook{ Name: "fleet.clusterresourceplacementdisruptionbudget.validating", ClientConfig: w.createClientConfig(clusterresourceplacementdisruptionbudget.ValidationPath), FailurePolicy: &failFailurePolicy, SideEffects: &sideEffortsNone, AdmissionReviewVersions: admissionReviewVersions, - Rules: 
[]admv1.RuleWithOperations{ - { - Operations: []admv1.OperationType{ - admv1.Create, admv1.Update, - }, - Rule: createRule([]string{placementv1beta1.GroupVersion.Group}, []string{placementv1beta1.GroupVersion.Version}, []string{disruptionBudgetName}, &clusterScope), - }, - }, + Rules: []admv1.RuleWithOperations{{ + Operations: []admv1.OperationType{admv1.Create, admv1.Update}, + Rule: createRule([]string{placementv1beta1.GroupVersion.Group}, []string{placementv1beta1.GroupVersion.Version}, []string{disruptionBudgetName}, &clusterScope), + }}, TimeoutSeconds: longWebhookTimeout, }, - } + ) return webHooks } @@ -487,65 +466,82 @@ func (w *Config) buildFleetGuardRailValidatingWebhooks() []admv1.ValidatingWebho Operations: []admv1.OperationType{admv1.Delete}, Rule: createRule([]string{"*"}, []string{"*"}, []string{"*/*"}, &namespacedScope), }, - // TODO(ArvindThiru): not handling pods, replicasets as part of the fleet guard rail since they have validating webhooks, need to remove validating webhooks before adding these resources to fleet guard rail. - { - Operations: cuOperations, - Rule: createRule([]string{corev1.SchemeGroupVersion.Group}, []string{corev1.SchemeGroupVersion.Version}, []string{bindingResourceName, configMapResourceName, endPointResourceName, - limitRangeResourceName, persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", podTemplateResourceName, - replicationControllerResourceName, replicationControllerResourceName + "/status", resourceQuotaResourceName, resourceQuotaResourceName + "/status", secretResourceName, - serviceAccountResourceName, servicesResourceName, servicesResourceName + "/status"}, &namespacedScope), - }, - { - Operations: cuOperations, - Rule: createRule([]string{appsv1.SchemeGroupVersion.Group}, []string{appsv1.SchemeGroupVersion.Version}, []string{controllerRevisionResourceName, daemonSetResourceName, daemonSetResourceName + "/status", - deploymentResourceName, deploymentResourceName + "/status", statefulSetResourceName, statefulSetResourceName + "/status"}, &namespacedScope), - }, - { + } + + // Build core v1 resources list, conditionally including pods if workload is enabled + coreV1Resources := []string{bindingResourceName, configMapResourceName, endPointResourceName, + limitRangeResourceName, persistentVolumeClaimsName, persistentVolumeClaimsName + "/status", podTemplateResourceName, + replicationControllerResourceName, replicationControllerResourceName + "/status", resourceQuotaResourceName, resourceQuotaResourceName + "/status", secretResourceName, + serviceAccountResourceName, servicesResourceName, servicesResourceName + "/status"} + if w.enableWorkload { + coreV1Resources = append(coreV1Resources, podResourceName, podResourceName+"/status") + } + + namespacedResourcesRules = append(namespacedResourcesRules, admv1.RuleWithOperations{ + Operations: cuOperations, + Rule: createRule([]string{corev1.SchemeGroupVersion.Group}, []string{corev1.SchemeGroupVersion.Version}, coreV1Resources, &namespacedScope), + }) + + // Build apps/v1 resources list, conditionally including replicasets if workload is enabled + appsV1Resources := []string{controllerRevisionResourceName, daemonSetResourceName, daemonSetResourceName + "/status", + deploymentResourceName, deploymentResourceName + "/status", statefulSetResourceName, statefulSetResourceName + "/status"} + if w.enableWorkload { + appsV1Resources = append(appsV1Resources, replicaSetResourceName, replicaSetResourceName+"/status") + } + + namespacedResourcesRules = append(namespacedResourcesRules, 
admv1.RuleWithOperations{ + Operations: cuOperations, + Rule: createRule([]string{appsv1.SchemeGroupVersion.Group}, []string{appsv1.SchemeGroupVersion.Version}, appsV1Resources, &namespacedScope), + }) + + namespacedResourcesRules = append(namespacedResourcesRules, + admv1.RuleWithOperations{ Operations: cuOperations, Rule: createRule([]string{authorizationv1.SchemeGroupVersion.Group}, []string{authorizationv1.SchemeGroupVersion.Version}, []string{localSubjectAccessReviewResourceName, localSubjectAccessReviewResourceName + "/status"}, &namespacedScope), }, - { + admv1.RuleWithOperations{ Operations: cuOperations, Rule: createRule([]string{autoscalingv1.SchemeGroupVersion.Group}, []string{autoscalingv1.SchemeGroupVersion.Version}, []string{horizontalPodAutoScalerResourceName, horizontalPodAutoScalerResourceName + "/status"}, &namespacedScope), }, - { + admv1.RuleWithOperations{ Operations: cuOperations, Rule: createRule([]string{batchv1.SchemeGroupVersion.Group}, []string{batchv1.SchemeGroupVersion.Version}, []string{cronJobResourceName, cronJobResourceName + "/status", jobResourceName, jobResourceName + "/status"}, &namespacedScope), }, - { + admv1.RuleWithOperations{ Operations: cuOperations, Rule: createRule([]string{discoveryv1.SchemeGroupVersion.Group}, []string{discoveryv1.SchemeGroupVersion.Version}, []string{endPointSlicesResourceName}, &namespacedScope), }, - { + admv1.RuleWithOperations{ Operations: cuOperations, Rule: createRule([]string{networkingv1.SchemeGroupVersion.Group}, []string{networkingv1.SchemeGroupVersion.Version}, []string{ingressResourceName, ingressResourceName + "/status", networkPolicyResourceName, networkPolicyResourceName + "/status"}, &namespacedScope), }, - { + admv1.RuleWithOperations{ Operations: cuOperations, Rule: createRule([]string{policyv1.SchemeGroupVersion.Group}, []string{policyv1.SchemeGroupVersion.Version}, []string{podDisruptionBudgetsResourceName, podDisruptionBudgetsResourceName + "/status"}, &namespacedScope), }, - { + admv1.RuleWithOperations{ Operations: cuOperations, Rule: createRule([]string{rbacv1.SchemeGroupVersion.Group}, []string{rbacv1.SchemeGroupVersion.Version}, []string{roleResourceName, roleBindingResourceName}, &namespacedScope), }, // rules for fleet namespaced resources. 
- { + admv1.RuleWithOperations{ Operations: cuOperations, Rule: createRule([]string{storagev1.SchemeGroupVersion.Group}, []string{storagev1.SchemeGroupVersion.Version}, []string{csiStorageCapacityResourceName}, &namespacedScope), }, - { + admv1.RuleWithOperations{ Operations: cuOperations, Rule: createRule([]string{clusterv1beta1.GroupVersion.Group}, []string{clusterv1beta1.GroupVersion.Version}, []string{internalMemberClusterResourceName, internalMemberClusterResourceName + "/status"}, &namespacedScope), }, - { + admv1.RuleWithOperations{ Operations: cuOperations, Rule: createRule([]string{placementv1beta1.GroupVersion.Group}, []string{placementv1beta1.GroupVersion.Version}, []string{workResourceName, workResourceName + "/status"}, &namespacedScope), }, - { + admv1.RuleWithOperations{ Operations: cuOperations, Rule: createRule([]string{fleetnetworkingv1alpha1.GroupVersion.Group}, []string{fleetnetworkingv1alpha1.GroupVersion.Version}, []string{endpointSliceExportResourceName, endpointSliceImportResourceName, internalServiceExportResourceName, internalServiceExportResourceName + "/status", internalServiceImportResourceName, internalServiceImportResourceName + "/status"}, &namespacedScope), }, - } + ) + guardRailWebhookConfigurations := []admv1.ValidatingWebhook{ { Name: "fleet.customresourcedefinition.guardrail.validating", diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index acc976825..f1975b30b 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -54,6 +54,16 @@ func TestBuildFleetValidatingWebhooks(t *testing.T) { }, wantLength: 8, }, + "enable workload": { + config: Config{ + serviceNamespace: "test-namespace", + servicePort: 8080, + serviceURL: "test-url", + clientConnectionType: &url, + enableWorkload: true, + }, + wantLength: 6, + }, } for testName, testCase := range testCases { @@ -99,6 +109,7 @@ func TestNewWebhookConfig(t *testing.T) { certDir string enableGuardRail bool denyModifyMemberClusterLabels bool + enableWorkload bool want *Config wantErr bool }{ @@ -111,6 +122,7 @@ func TestNewWebhookConfig(t *testing.T) { certDir: "/tmp/cert", enableGuardRail: true, denyModifyMemberClusterLabels: true, + enableWorkload: false, want: &Config{ serviceNamespace: "test-namespace", serviceName: "test-webhook", @@ -118,6 +130,7 @@ func TestNewWebhookConfig(t *testing.T) { clientConnectionType: nil, enableGuardRail: true, denyModifyMemberClusterLabels: true, + enableWorkload: false, }, wantErr: false, }, @@ -126,7 +139,7 @@ func TestNewWebhookConfig(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Setenv("POD_NAMESPACE", "test-namespace") defer t.Setenv("POD_NAMESPACE", "") - got, err := NewWebhookConfig(tt.mgr, tt.webhookServiceName, tt.port, tt.clientConnectionType, tt.certDir, tt.enableGuardRail, tt.denyModifyMemberClusterLabels) + got, err := NewWebhookConfig(tt.mgr, tt.webhookServiceName, tt.port, tt.clientConnectionType, tt.certDir, tt.enableGuardRail, tt.denyModifyMemberClusterLabels, tt.enableWorkload) if (err != nil) != tt.wantErr { t.Errorf("NewWebhookConfig() error = %v, wantErr %v", err, tt.wantErr) return diff --git a/test/e2e/fleet_guard_rail_test.go b/test/e2e/fleet_guard_rail_test.go index 3c633aa03..1b37871c0 100644 --- a/test/e2e/fleet_guard_rail_test.go +++ b/test/e2e/fleet_guard_rail_test.go @@ -32,6 +32,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" utilrand "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/utils/ptr" clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" 
placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" @@ -915,6 +916,247 @@ var _ = Describe("fleet guard rail networking E2Es", Serial, Ordered, func() { }) }) +var _ = Describe("fleet guard rail for pods and replicasets in fleet/kube namespaces", Serial, Ordered, func() { + var ( + podGVK = metav1.GroupVersionKind{Group: corev1.SchemeGroupVersion.Group, Version: corev1.SchemeGroupVersion.Version, Kind: "Pod"} + replicaSetGVK = metav1.GroupVersionKind{Group: appsv1.SchemeGroupVersion.Group, Version: appsv1.SchemeGroupVersion.Version, Kind: "ReplicaSet"} + ) + + Context("deny pod operations in fleet-system namespace", func() { + It("should deny CREATE operation on pod in fleet-system namespace for user not in system:masters", func() { + pod := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "fleet-system", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container", + Image: "nginx:latest", + }, + }, + }, + } + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, &pod), fmt.Sprintf(validation.ResourceDeniedFormat, testUser, utils.GenerateGroupString(testGroups), admissionv1.Create, &podGVK, "", types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}))).Should(Succeed()) + }) + + It("should deny UPDATE operation on pod in fleet-system namespace for user not in system:masters", func() { + // First create a pod as admin + pod := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod-update", + Namespace: "fleet-system", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container", + Image: "nginx:latest", + }, + }, + }, + } + Expect(hubClient.Create(ctx, &pod)).Should(Succeed()) + + // Try to update as non-admin + Eventually(func(g Gomega) error { + var p corev1.Pod + err := hubClient.Get(ctx, types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}, &p) + if err != nil { + return err + } + p.Labels = map[string]string{testKey: testValue} + err = impersonateHubClient.Update(ctx, &p) + if k8sErrors.IsConflict(err) { + return err + } + return checkIfStatusErrorWithMessage(err, fmt.Sprintf(validation.ResourceDeniedFormat, testUser, utils.GenerateGroupString(testGroups), admissionv1.Update, &podGVK, "", types.NamespacedName{Name: p.Name, Namespace: p.Namespace})) + }, eventuallyDuration, eventuallyInterval).Should(Succeed()) + + // Cleanup + Expect(hubClient.Delete(ctx, &pod)).Should(Succeed()) + }) + }) + + Context("deny replicaset operations in fleet-member namespace", func() { + var ( + mcName string + imcNamespace string + ) + + BeforeAll(func() { + mcName = fmt.Sprintf(mcNameTemplate, GinkgoParallelProcess()) + imcNamespace = fmt.Sprintf(utils.NamespaceNameFormat, mcName) + createMemberCluster(mcName, testIdentity, nil, map[string]string{fleetClusterResourceIDAnnotationKey: clusterID1}) + checkInternalMemberClusterExists(mcName, imcNamespace) + }) + + AfterAll(func() { + ensureMemberClusterAndRelatedResourcesDeletion(mcName) + }) + + It("should deny CREATE operation on replicaset in fleet-member namespace for user not in MC identity", func() { + rs := appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-replicaset", + Namespace: imcNamespace, + }, + Spec: appsv1.ReplicaSetSpec{ + Replicas: ptr.To(int32(1)), + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "test"}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "test"}, + }, + Spec: 
corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container", + Image: "nginx:latest", + }, + }, + }, + }, + }, + } + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, &rs), fmt.Sprintf(validation.ResourceDeniedFormat, testUser, utils.GenerateGroupString(testGroups), admissionv1.Create, &replicaSetGVK, "", types.NamespacedName{Name: rs.Name, Namespace: rs.Namespace}))).Should(Succeed()) + }) + + It("should deny UPDATE operation on replicaset in fleet-member namespace for user not in MC identity", func() { + // First create a replicaset as admin + rs := appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-replicaset-update", + Namespace: imcNamespace, + }, + Spec: appsv1.ReplicaSetSpec{ + Replicas: ptr.To(int32(1)), + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "test"}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "test"}, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container", + Image: "nginx:latest", + }, + }, + }, + }, + }, + } + Expect(hubClient.Create(ctx, &rs)).Should(Succeed()) + + // Try to update as non-admin + Eventually(func(g Gomega) error { + var r appsv1.ReplicaSet + err := hubClient.Get(ctx, types.NamespacedName{Name: rs.Name, Namespace: rs.Namespace}, &r) + if err != nil { + return err + } + r.Labels = map[string]string{testKey: testValue} + err = impersonateHubClient.Update(ctx, &r) + if k8sErrors.IsConflict(err) { + return err + } + return checkIfStatusErrorWithMessage(err, fmt.Sprintf(validation.ResourceDeniedFormat, testUser, utils.GenerateGroupString(testGroups), admissionv1.Update, &replicaSetGVK, "", types.NamespacedName{Name: r.Name, Namespace: r.Namespace})) + }, eventuallyDuration, eventuallyInterval).Should(Succeed()) + + // Cleanup + Expect(hubClient.Delete(ctx, &rs)).Should(Succeed()) + }) + + It("should deny DELETE operation on pod in fleet-member namespace for user not in MC identity", func() { + // First create a pod as admin + pod := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod-delete", + Namespace: imcNamespace, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container", + Image: "nginx:latest", + }, + }, + }, + } + Expect(hubClient.Create(ctx, &pod)).Should(Succeed()) + + // Try to delete as non-admin + Eventually(func() error { + var p corev1.Pod + err := hubClient.Get(ctx, types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}, &p) + if err != nil { + return err + } + err = impersonateHubClient.Delete(ctx, &p) + return checkIfStatusErrorWithMessage(err, fmt.Sprintf(validation.ResourceDeniedFormat, testUser, utils.GenerateGroupString(testGroups), admissionv1.Delete, &podGVK, "", types.NamespacedName{Name: p.Name, Namespace: p.Namespace})) + }, eventuallyDuration, eventuallyInterval).Should(Succeed()) + + // Cleanup by admin + Expect(hubClient.Delete(ctx, &pod)).Should(Succeed()) + }) + }) + + Context("deny pod/replicaset operations in kube-system namespace", func() { + It("should deny CREATE operation on pod in kube-system namespace for user not in system:masters", func() { + pod := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod-kube", + Namespace: "kube-system", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container", + Image: "nginx:latest", + }, + }, + }, + } + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, &pod), 
fmt.Sprintf(validation.ResourceDeniedFormat, testUser, utils.GenerateGroupString(testGroups), admissionv1.Create, &podGVK, "", types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}))).Should(Succeed()) + }) + + It("should deny CREATE operation on replicaset in kube-system namespace for user not in system:masters", func() { + rs := appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-replicaset-kube", + Namespace: "kube-system", + }, + Spec: appsv1.ReplicaSetSpec{ + Replicas: ptr.To(int32(1)), + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "test"}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "test"}, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container", + Image: "nginx:latest", + }, + }, + }, + }, + }, + } + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, &rs), fmt.Sprintf(validation.ResourceDeniedFormat, testUser, utils.GenerateGroupString(testGroups), admissionv1.Create, &replicaSetGVK, "", types.NamespacedName{Name: rs.Name, Namespace: rs.Namespace}))).Should(Succeed()) + }) + }) +}) + var _ = Describe("fleet guard rail restrict internal fleet resources from being created in fleet/kube pre-fixed namespaces", Serial, Ordered, func() { Context("deny request to CREATE IMC in fleet-system namespace", func() { It("should deny CREATE operation on internal member cluster resource in fleet-system namespace for invalid user", func() { diff --git a/test/e2e/resource_placement_deployment_test.go b/test/e2e/resource_placement_deployment_test.go deleted file mode 100644 index 45c36908c..000000000 --- a/test/e2e/resource_placement_deployment_test.go +++ /dev/null @@ -1,162 +0,0 @@ -/* -Copyright 2025 The KubeFleet Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package e2e - -import ( - "fmt" - - . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" - appsv1 "k8s.io/api/apps/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/utils/ptr" - - placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" - "github.com/kubefleet-dev/kubefleet/pkg/utils" -) - -var _ = Describe("placing a Deployment using a RP with PickAll policy", Label("resourceplacement"), Ordered, func() { - crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) - rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) - var testDeployment appsv1.Deployment - - BeforeAll(func() { - // Read the test deployment manifest - readDeploymentTestManifest(&testDeployment) - workNamespace := appNamespace() - - // Create namespace and deployment - By("creating namespace and deployment") - Expect(hubClient.Create(ctx, &workNamespace)).To(Succeed(), "Failed to create namespace %s", workNamespace.Name) - testDeployment.Namespace = workNamespace.Name - Expect(hubClient.Create(ctx, &testDeployment)).To(Succeed(), "Failed to create test deployment %s", testDeployment.Name) - - // Create the CRP with namespace-only selector - By("creating CRP with namespace selector") - crp := &placementv1beta1.ClusterResourcePlacement{ - ObjectMeta: metav1.ObjectMeta{ - Name: crpName, - Finalizers: []string{customDeletionBlockerFinalizer}, - }, - Spec: placementv1beta1.PlacementSpec{ - ResourceSelectors: namespaceOnlySelector(), - Policy: &placementv1beta1.PlacementPolicy{ - PlacementType: placementv1beta1.PickAllPlacementType, - }, - Strategy: placementv1beta1.RolloutStrategy{ - Type: placementv1beta1.RollingUpdateRolloutStrategyType, - RollingUpdate: &placementv1beta1.RollingUpdateConfig{ - UnavailablePeriodSeconds: ptr.To(2), - }, - }, - }, - } - Expect(hubClient.Create(ctx, crp)).To(Succeed(), "Failed to create CRP") - - By("waiting for CRP status to update") - crpStatusUpdatedActual := crpStatusUpdatedActual(workNamespaceIdentifiers(), allMemberClusterNames, nil, "0") - Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") - }) - - AfterAll(func() { - By("cleaning up resources") - ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: testDeployment.Namespace}, allMemberClusters, &testDeployment) - ensureCRPAndRelatedResourcesDeleted(crpName, allMemberClusters) - }) - - Context("with PickAll placement type", Ordered, func() { - It("creating the RP should succeed", func() { - By("creating RP that selects the deployment") - rp := &placementv1beta1.ResourcePlacement{ - ObjectMeta: metav1.ObjectMeta{ - Name: rpName, - Namespace: testDeployment.Namespace, - Finalizers: []string{customDeletionBlockerFinalizer}, - }, - Spec: placementv1beta1.PlacementSpec{ - ResourceSelectors: []placementv1beta1.ResourceSelectorTerm{ - { - Group: appsv1.SchemeGroupVersion.Group, - Version: appsv1.SchemeGroupVersion.Version, - Kind: utils.DeploymentKind, - Name: testDeployment.Name, - }, - }, - Policy: &placementv1beta1.PlacementPolicy{ - PlacementType: placementv1beta1.PickAllPlacementType, - }, - Strategy: placementv1beta1.RolloutStrategy{ - Type: placementv1beta1.RollingUpdateRolloutStrategyType, - RollingUpdate: &placementv1beta1.RollingUpdateConfig{ - UnavailablePeriodSeconds: ptr.To(2), - }, - }, - }, - } - Expect(hubClient.Create(ctx, rp)).To(Succeed(), "Failed to create RP") - }) - - It("should update RP status as expected", func() { - By("verifying RP status update") - wantSelectedResources := 
[]placementv1beta1.ResourceIdentifier{ - { - Group: appsv1.SchemeGroupVersion.Group, - Version: appsv1.SchemeGroupVersion.Version, - Kind: utils.DeploymentKind, - Name: testDeployment.Name, - Namespace: testDeployment.Namespace, - }, - } - rpStatusUpdatedActual := rpStatusUpdatedActual(wantSelectedResources, allMemberClusterNames, nil, "0") - Eventually(rpStatusUpdatedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP status as expected") - }) - - It("should place the deployment on all member clusters", func() { - By("verifying deployment is placed and ready on all member clusters") - for idx := range allMemberClusters { - memberCluster := allMemberClusters[idx] - deploymentPlacedActual := waitForDeploymentPlacementToReady(memberCluster, &testDeployment) - Eventually(deploymentPlacedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to place deployment on member cluster %s", memberCluster.ClusterName) - } - }) - - It("should verify deployment replicas are ready on all clusters", func() { - By("checking deployment status on each cluster") - for _, cluster := range allMemberClusters { - Eventually(func() error { - var deployed appsv1.Deployment - if err := cluster.KubeClient.Get(ctx, types.NamespacedName{ - Name: testDeployment.Name, - Namespace: testDeployment.Namespace, - }, &deployed); err != nil { - return err - } - // Verify deployment is ready - if deployed.Status.ReadyReplicas != *deployed.Spec.Replicas { - return fmt.Errorf("deployment not ready: %d/%d replicas ready", deployed.Status.ReadyReplicas, *deployed.Spec.Replicas) - } - if deployed.Status.UpdatedReplicas != *deployed.Spec.Replicas { - return fmt.Errorf("deployment not updated: %d/%d replicas updated", deployed.Status.UpdatedReplicas, *deployed.Spec.Replicas) - } - return nil - }, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), - "Deployment should be ready on cluster %s", cluster.ClusterName) - } - }) - }) -}) diff --git a/test/e2e/resource_placement_hub_workload_test.go b/test/e2e/resource_placement_hub_workload_test.go new file mode 100644 index 000000000..c248319d0 --- /dev/null +++ b/test/e2e/resource_placement_hub_workload_test.go @@ -0,0 +1,269 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "fmt" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" + + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" +) + +var _ = Describe("placing workloads using a CRP with PickAll policy", Label("resourceplacement"), Ordered, func() { + crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) + var testDeployment appsv1.Deployment + var testDaemonSet appsv1.DaemonSet + var testJob batchv1.Job + + BeforeAll(func() { + // Read the test manifests + readDeploymentTestManifest(&testDeployment) + readDaemonSetTestManifest(&testDaemonSet) + readJobTestManifest(&testJob) + workNamespace := appNamespace() + + // Create namespace and workloads + By("creating namespace and workloads") + Expect(hubClient.Create(ctx, &workNamespace)).To(Succeed(), "Failed to create namespace %s", workNamespace.Name) + testDeployment.Namespace = workNamespace.Name + testDaemonSet.Namespace = workNamespace.Name + testJob.Namespace = workNamespace.Name + Expect(hubClient.Create(ctx, &testDeployment)).To(Succeed(), "Failed to create test deployment %s", testDeployment.Name) + Expect(hubClient.Create(ctx, &testDaemonSet)).To(Succeed(), "Failed to create test daemonset %s", testDaemonSet.Name) + Expect(hubClient.Create(ctx, &testJob)).To(Succeed(), "Failed to create test job %s", testJob.Name) + + // Create the CRP that selects the namespace + By("creating CRP that selects the namespace") + crp := &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: crpName, + Finalizers: []string{customDeletionBlockerFinalizer}, + }, + Spec: placementv1beta1.PlacementSpec{ + ResourceSelectors: workResourceSelector(), + Policy: &placementv1beta1.PlacementPolicy{ + PlacementType: placementv1beta1.PickAllPlacementType, + }, + Strategy: placementv1beta1.RolloutStrategy{ + Type: placementv1beta1.RollingUpdateRolloutStrategyType, + RollingUpdate: &placementv1beta1.RollingUpdateConfig{ + UnavailablePeriodSeconds: ptr.To(2), + }, + }, + }, + } + Expect(hubClient.Create(ctx, crp)).To(Succeed(), "Failed to create CRP") + + By("waiting for CRP status to update") + wantSelectedResources := []placementv1beta1.ResourceIdentifier{ + { + Kind: "Namespace", + Name: workNamespace.Name, + Version: "v1", + }, + { + Group: "apps", + Version: "v1", + Kind: "Deployment", + Name: testDeployment.Name, + Namespace: workNamespace.Name, + }, + { + Group: "apps", + Version: "v1", + Kind: "DaemonSet", + Name: testDaemonSet.Name, + Namespace: workNamespace.Name, + }, + { + Group: "batch", + Version: "v1", + Kind: "Job", + Name: testJob.Name, + Namespace: workNamespace.Name, + }, + } + // Use customizedPlacementStatusUpdatedActual with resourceIsTrackable=false + // because Jobs don't have availability tracking like Deployments/DaemonSets do + crpKey := types.NamespacedName{Name: crpName} + crpStatusUpdatedActual := customizedPlacementStatusUpdatedActual(crpKey, wantSelectedResources, allMemberClusterNames, nil, "0", false) + Eventually(crpStatusUpdatedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") + }) + + AfterAll(func() { + By("cleaning up resources") + ensureCRPAndRelatedResourcesDeleted(crpName, allMemberClusters) + }) + + Context("with PickAll placement type", Ordered, func() { + It("should verify hub deployment is ready", 
func() { + By("checking hub deployment status") + Eventually(func() error { + var hubDeployment appsv1.Deployment + if err := hubClient.Get(ctx, types.NamespacedName{ + Name: testDeployment.Name, + Namespace: testDeployment.Namespace, + }, &hubDeployment); err != nil { + return err + } + // Verify deployment is ready in hub cluster + if hubDeployment.Status.ReadyReplicas != *hubDeployment.Spec.Replicas { + return fmt.Errorf("hub deployment not ready: %d/%d replicas ready", hubDeployment.Status.ReadyReplicas, *hubDeployment.Spec.Replicas) + } + if hubDeployment.Status.UpdatedReplicas != *hubDeployment.Spec.Replicas { + return fmt.Errorf("hub deployment not updated: %d/%d replicas updated", hubDeployment.Status.UpdatedReplicas, *hubDeployment.Spec.Replicas) + } + return nil + }, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), + "Hub deployment should be ready before placement") + }) + + It("should verify hub daemonset is ready", func() { + By("checking hub daemonset status") + Eventually(func() error { + var hubDaemonSet appsv1.DaemonSet + if err := hubClient.Get(ctx, types.NamespacedName{ + Name: testDaemonSet.Name, + Namespace: testDaemonSet.Namespace, + }, &hubDaemonSet); err != nil { + return err + } + // Verify daemonset is ready in hub cluster + if hubDaemonSet.Status.NumberReady == 0 { + return fmt.Errorf("hub daemonset has no ready pods") + } + if hubDaemonSet.Status.NumberReady != hubDaemonSet.Status.DesiredNumberScheduled { + return fmt.Errorf("hub daemonset not ready: %d/%d pods ready", hubDaemonSet.Status.NumberReady, hubDaemonSet.Status.DesiredNumberScheduled) + } + return nil + }, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), + "Hub daemonset should be ready before placement") + }) + + It("should verify hub job completes successfully", func() { + By("checking hub job completion status") + jobCompletedActual := waitForJobToComplete(hubClient, &testJob) + Eventually(jobCompletedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), + "Hub job should complete successfully") + }) + + It("should place the deployment on all member clusters", func() { + By("verifying deployment is placed and ready on all member clusters") + for idx := range allMemberClusters { + memberCluster := allMemberClusters[idx] + deploymentPlacedActual := waitForDeploymentPlacementToReady(memberCluster, &testDeployment) + Eventually(deploymentPlacedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to place deployment on member cluster %s", memberCluster.ClusterName) + } + }) + + It("should place the daemonset on all member clusters", func() { + By("verifying daemonset is placed and ready on all member clusters") + for idx := range allMemberClusters { + memberCluster := allMemberClusters[idx] + daemonsetPlacedActual := waitForDaemonSetPlacementToReady(memberCluster, &testDaemonSet) + Eventually(daemonsetPlacedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to place daemonset on member cluster %s", memberCluster.ClusterName) + } + }) + + It("should place the job on all member clusters", func() { + By("verifying job is placed on all member clusters") + for idx := range allMemberClusters { + memberCluster := allMemberClusters[idx] + jobPlacedActual := waitForJobToBePlaced(memberCluster, &testJob) + Eventually(jobPlacedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to place job on member cluster %s", memberCluster.ClusterName) + } + }) + + It("should verify job 
completes successfully on all clusters", func() { + By("checking job completion status on each cluster") + for _, cluster := range allMemberClusters { + jobCompletedActual := waitForJobToComplete(cluster.KubeClient, &testJob) + Eventually(jobCompletedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), + "Job should complete successfully on cluster %s", cluster.ClusterName) + } + }) + + It("should verify deployment replicas are ready on all clusters", func() { + By("checking deployment status on each cluster") + for _, cluster := range allMemberClusters { + Eventually(func() error { + var deployed appsv1.Deployment + if err := cluster.KubeClient.Get(ctx, types.NamespacedName{ + Name: testDeployment.Name, + Namespace: testDeployment.Namespace, + }, &deployed); err != nil { + return err + } + // Verify deployment is ready + if deployed.Status.ReadyReplicas != *deployed.Spec.Replicas { + return fmt.Errorf("deployment not ready: %d/%d replicas ready", deployed.Status.ReadyReplicas, *deployed.Spec.Replicas) + } + if deployed.Status.UpdatedReplicas != *deployed.Spec.Replicas { + return fmt.Errorf("deployment not updated: %d/%d replicas updated", deployed.Status.UpdatedReplicas, *deployed.Spec.Replicas) + } + return nil + }, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), + "Deployment should be ready on cluster %s", cluster.ClusterName) + } + }) + }) +}) + +func waitForJobToComplete(kubeClient client.Client, testJob *batchv1.Job) func() error { + return func() error { + var job batchv1.Job + if err := kubeClient.Get(ctx, types.NamespacedName{ + Name: testJob.Name, + Namespace: testJob.Namespace, + }, &job); err != nil { + return err + } + + // Check if job has completed successfully + if job.Status.Succeeded == 0 { + return fmt.Errorf("job not completed: %d succeeded", job.Status.Succeeded) + } + + // Verify all job pods completed successfully + podList := &corev1.PodList{} + if err := kubeClient.List(ctx, podList, client.InNamespace(testJob.Namespace), + client.MatchingLabels{"job-name": testJob.Name}); err != nil { + return fmt.Errorf("failed to list job pods: %w", err) + } + + if len(podList.Items) == 0 { + return fmt.Errorf("no pods found for job %s", testJob.Name) + } + + for _, pod := range podList.Items { + if pod.Status.Phase != corev1.PodSucceeded { + return fmt.Errorf("pod %s not succeeded: phase=%s", pod.Name, pod.Status.Phase) + } + } + + return nil + } +} diff --git a/test/e2e/setup.sh b/test/e2e/setup.sh index cdb47442b..d4b6b8a2a 100755 --- a/test/e2e/setup.sh +++ b/test/e2e/setup.sh @@ -123,6 +123,7 @@ helm install hub-agent ../../charts/hub-agent/ \ --set namespace=fleet-system \ --set logVerbosity=5 \ --set enableWebhook=true \ + --set enableWorkload=true \ --set webhookClientConnectionType=service \ --set forceDeleteWaitTime="1m0s" \ --set clusterUnhealthyThreshold="3m0s" \ From 6bf6d62927eb5c100b54b7b21a1b64b155176c93 Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Wed, 26 Nov 2025 10:37:28 -0800 Subject: [PATCH 10/13] feat: rename update run state (#348) * rename updaterun state Signed-off-by: Ryan Zhang * fix api test Signed-off-by: Ryan Zhang --------- Signed-off-by: Ryan Zhang Co-authored-by: Ryan Zhang --- apis/placement/v1beta1/stageupdate_types.go | 32 +++++------ ...etes-fleet.io_clusterstagedupdateruns.yaml | 48 ++++++++--------- ....kubernetes-fleet.io_stagedupdateruns.yaml | 48 ++++++++--------- .../api_validation_integration_test.go | 54 +++++++++---------- 4 files changed, 91 insertions(+), 91 deletions(-) diff --git 
a/apis/placement/v1beta1/stageupdate_types.go b/apis/placement/v1beta1/stageupdate_types.go index 9491f3ec6..588e38136 100644 --- a/apis/placement/v1beta1/stageupdate_types.go +++ b/apis/placement/v1beta1/stageupdate_types.go @@ -154,27 +154,27 @@ type State string const ( // StateNotStarted describes user intent to initialize but not execute the update run. // This is the default state when an update run is created. - StateNotStarted State = "NotStarted" + StateNotStarted State = "Initialize" // StateStarted describes user intent to execute (or resume execution if paused). - // Users can subsequently set the state to Stopped or Abandoned. - StateStarted State = "Started" + // Users can subsequently set the state to Pause or Abandon. + StateStarted State = "Execute" // StateStopped describes user intent to pause the update run. - // Users can subsequently set the state to Started or Abandoned. - StateStopped State = "Stopped" + // Users can subsequently set the state to Execute or Abandon. + StateStopped State = "Pause" // StateAbandoned describes user intent to abandon the update run. // This is a terminal state; once set, it cannot be changed. - StateAbandoned State = "Abandoned" + StateAbandoned State = "Abandon" ) // UpdateRunSpec defines the desired rollout strategy and the snapshot indices of the resources to be updated. // It specifies a stage-by-stage update process across selected clusters for the given ResourcePlacement object. -// +kubebuilder:validation:XValidation:rule="!has(oldSelf.state) || oldSelf.state != 'NotStarted' || self.state != 'Stopped'",message="invalid state transition: cannot transition from NotStarted to Stopped" -// +kubebuilder:validation:XValidation:rule="!has(oldSelf.state) || oldSelf.state != 'Started' || self.state != 'NotStarted'",message="invalid state transition: cannot transition from Started to NotStarted" -// +kubebuilder:validation:XValidation:rule="!has(oldSelf.state) || oldSelf.state != 'Stopped' || self.state != 'NotStarted'",message="invalid state transition: cannot transition from Stopped to NotStarted" -// +kubebuilder:validation:XValidation:rule="!has(oldSelf.state) || oldSelf.state != 'Abandoned' || self.state == 'Abandoned'",message="invalid state transition: Abandoned is a terminal state and cannot transition to any other state" +// +kubebuilder:validation:XValidation:rule="!(has(oldSelf.state) && oldSelf.state == 'Initialize' && self.state == 'Pause')",message="invalid state transition: cannot transition from Initialize to Pause" +// +kubebuilder:validation:XValidation:rule="!(has(oldSelf.state) && oldSelf.state == 'Execute' && self.state == 'Initialize')",message="invalid state transition: cannot transition from Execute to Initialize" +// +kubebuilder:validation:XValidation:rule="!(has(oldSelf.state) && oldSelf.state == 'Pause' && self.state == 'Initialize')",message="invalid state transition: cannot transition from Pause to Initialize" +// +kubebuilder:validation:XValidation:rule="!has(oldSelf.state) || oldSelf.state != 'Abandon' || self.state == 'Abandon'",message="invalid state transition: Abandon is a terminal state and cannot transition to any other state" type UpdateRunSpec struct { // PlacementName is the name of placement that this update run is applied to. // There can be multiple active update runs for each placement, but @@ -199,13 +199,13 @@ type UpdateRunSpec struct { StagedUpdateStrategyName string `json:"stagedRolloutStrategyName"` // State indicates the desired state of the update run. 
- // NotStarted: The update run is initialized but execution has not started (default). - // Started: The update run should execute or resume execution. - // Stopped: The update run should pause execution. - // Abandoned: The update run should be abandoned and terminated. + // Initialize: The update run should be initialized but execution should not start (default). + // Execute: The update run should execute or resume execution. + // Pause: The update run should pause execution. + // Abandon: The update run should be abandoned and terminated. // +kubebuilder:validation:Optional - // +kubebuilder:default=NotStarted - // +kubebuilder:validation:Enum=NotStarted;Started;Stopped;Abandoned + // +kubebuilder:default=Initialize + // +kubebuilder:validation:Enum=Initialize;Execute;Pause;Abandon State State `json:"state,omitempty"` } diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml index 0bf83da28..c95748724 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml @@ -1185,40 +1185,40 @@ spec: - message: stagedRolloutStrategyName is immutable rule: self == oldSelf state: - default: NotStarted + default: Initialize description: |- State indicates the desired state of the update run. - NotStarted: The update run is initialized but execution has not started (default). - Started: The update run should execute or resume execution. - Stopped: The update run should pause execution. - Abandoned: The update run should be abandoned and terminated. + Initialize: The update run should be initialized but execution should not start (default). + Execute: The update run should execute or resume execution. + Pause: The update run should pause execution. + Abandon: The update run should be abandoned and terminated. 
enum: - - NotStarted - - Started - - Stopped - - Abandoned + - Initialize + - Execute + - Pause + - Abandon type: string required: - placementName - stagedRolloutStrategyName type: object x-kubernetes-validations: - - message: 'invalid state transition: cannot transition from NotStarted - to Stopped' - rule: '!has(oldSelf.state) || oldSelf.state != ''NotStarted'' || self.state - != ''Stopped''' - - message: 'invalid state transition: cannot transition from Started to - NotStarted' - rule: '!has(oldSelf.state) || oldSelf.state != ''Started'' || self.state - != ''NotStarted''' - - message: 'invalid state transition: cannot transition from Stopped to - NotStarted' - rule: '!has(oldSelf.state) || oldSelf.state != ''Stopped'' || self.state - != ''NotStarted''' - - message: 'invalid state transition: Abandoned is a terminal state and + - message: 'invalid state transition: cannot transition from Initialize + to Pause' + rule: '!(has(oldSelf.state) && oldSelf.state == ''Initialize'' && self.state + == ''Pause'')' + - message: 'invalid state transition: cannot transition from Execute to + Initialize' + rule: '!(has(oldSelf.state) && oldSelf.state == ''Execute'' && self.state + == ''Initialize'')' + - message: 'invalid state transition: cannot transition from Pause to + Initialize' + rule: '!(has(oldSelf.state) && oldSelf.state == ''Pause'' && self.state + == ''Initialize'')' + - message: 'invalid state transition: Abandon is a terminal state and cannot transition to any other state' - rule: '!has(oldSelf.state) || oldSelf.state != ''Abandoned'' || self.state - == ''Abandoned''' + rule: '!has(oldSelf.state) || oldSelf.state != ''Abandon'' || self.state + == ''Abandon''' status: description: The observed status of ClusterStagedUpdateRun. properties: diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml index 979c73e99..abfa39f46 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml @@ -105,40 +105,40 @@ spec: - message: stagedRolloutStrategyName is immutable rule: self == oldSelf state: - default: NotStarted + default: Initialize description: |- State indicates the desired state of the update run. - NotStarted: The update run is initialized but execution has not started (default). - Started: The update run should execute or resume execution. - Stopped: The update run should pause execution. - Abandoned: The update run should be abandoned and terminated. + Initialize: The update run should be initialized but execution should not start (default). + Execute: The update run should execute or resume execution. + Pause: The update run should pause execution. + Abandon: The update run should be abandoned and terminated. 
enum: - - NotStarted - - Started - - Stopped - - Abandoned + - Initialize + - Execute + - Pause + - Abandon type: string required: - placementName - stagedRolloutStrategyName type: object x-kubernetes-validations: - - message: 'invalid state transition: cannot transition from NotStarted - to Stopped' - rule: '!has(oldSelf.state) || oldSelf.state != ''NotStarted'' || self.state - != ''Stopped''' - - message: 'invalid state transition: cannot transition from Started to - NotStarted' - rule: '!has(oldSelf.state) || oldSelf.state != ''Started'' || self.state - != ''NotStarted''' - - message: 'invalid state transition: cannot transition from Stopped to - NotStarted' - rule: '!has(oldSelf.state) || oldSelf.state != ''Stopped'' || self.state - != ''NotStarted''' - - message: 'invalid state transition: Abandoned is a terminal state and + - message: 'invalid state transition: cannot transition from Initialize + to Pause' + rule: '!(has(oldSelf.state) && oldSelf.state == ''Initialize'' && self.state + == ''Pause'')' + - message: 'invalid state transition: cannot transition from Execute to + Initialize' + rule: '!(has(oldSelf.state) && oldSelf.state == ''Execute'' && self.state + == ''Initialize'')' + - message: 'invalid state transition: cannot transition from Pause to + Initialize' + rule: '!(has(oldSelf.state) && oldSelf.state == ''Pause'' && self.state + == ''Initialize'')' + - message: 'invalid state transition: Abandon is a terminal state and cannot transition to any other state' - rule: '!has(oldSelf.state) || oldSelf.state != ''Abandoned'' || self.state - == ''Abandoned''' + rule: '!has(oldSelf.state) || oldSelf.state != ''Abandon'' || self.state + == ''Abandon''' status: description: The observed status of StagedUpdateRun. properties: diff --git a/test/apis/placement/v1beta1/api_validation_integration_test.go b/test/apis/placement/v1beta1/api_validation_integration_test.go index 9d14e54f9..03aa7895e 100644 --- a/test/apis/placement/v1beta1/api_validation_integration_test.go +++ b/test/apis/placement/v1beta1/api_validation_integration_test.go @@ -1813,7 +1813,7 @@ var _ = Describe("Test placement v1beta1 API validation", func() { }) }) - Context("Test ClusterStagedUpdateRun State API validation - valid NotStarted state transitions", func() { + Context("Test ClusterStagedUpdateRun State API validation - valid Initialize state transitions", func() { var updateRun *placementv1beta1.ClusterStagedUpdateRun updateRunName := fmt.Sprintf(validupdateRunNameTemplate, GinkgoParallelProcess()) @@ -1839,7 +1839,7 @@ var _ = Describe("Test placement v1beta1 API validation", func() { Name: "unspecfied-state-update-run-" + fmt.Sprintf("%d", GinkgoParallelProcess()), }, Spec: placementv1beta1.UpdateRunSpec{ - // State not specified - should default to NotStarted + // State not specified - should default to Initialize }, } Expect(hubClient.Create(ctx, updateRunWithDefaultState)).Should(Succeed()) @@ -1847,7 +1847,7 @@ var _ = Describe("Test placement v1beta1 API validation", func() { Expect(hubClient.Delete(ctx, updateRunWithDefaultState)).Should(Succeed()) }) - It("should allow creation of ClusterStagedUpdateRun with empty state (defaults to NotStarted)", func() { + It("should allow creation of ClusterStagedUpdateRun with empty state (defaults to Initialize)", func() { updateRun := &placementv1beta1.ClusterStagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ Name: "empty-state-update-run-" + fmt.Sprintf("%d", GinkgoParallelProcess()), @@ -1861,18 +1861,18 @@ var _ = Describe("Test placement v1beta1 API 
validation", func() { Expect(hubClient.Delete(ctx, updateRun)).Should(Succeed()) }) - It("should allow transition from NotStarted to Started", func() { + It("should allow transition from Initialize to Execute", func() { updateRun.Spec.State = placementv1beta1.StateStarted Expect(hubClient.Update(ctx, updateRun)).Should(Succeed()) }) - It("should allow transition from NotStarted to Abandoned", func() { + It("should allow transition from Initialize to Abandon", func() { updateRun.Spec.State = placementv1beta1.StateAbandoned Expect(hubClient.Update(ctx, updateRun)).Should(Succeed()) }) }) - Context("Test ClusterStagedUpdateRun State API validation - valid Started state transitions", func() { + Context("Test ClusterStagedUpdateRun State API validation - valid Execute state transitions", func() { var updateRun *placementv1beta1.ClusterStagedUpdateRun updateRunName := fmt.Sprintf(validupdateRunNameTemplate, GinkgoParallelProcess()) @@ -1892,18 +1892,18 @@ var _ = Describe("Test placement v1beta1 API validation", func() { Expect(hubClient.Delete(ctx, updateRun)).Should(Succeed()) }) - It("should allow transition from Started to Stopped", func() { + It("should allow transition from Execute to Pause", func() { updateRun.Spec.State = placementv1beta1.StateStopped Expect(hubClient.Update(ctx, updateRun)).Should(Succeed()) }) - It("should allow transition from Started to Abandoned", func() { + It("should allow transition from Execute to Abandon", func() { updateRun.Spec.State = placementv1beta1.StateAbandoned Expect(hubClient.Update(ctx, updateRun)).Should(Succeed()) }) }) - Context("Test ClusterStagedUpdateRun State API validation - valid Stopped state transitions", func() { + Context("Test ClusterStagedUpdateRun State API validation - valid Pause state transitions", func() { var updateRun *placementv1beta1.ClusterStagedUpdateRun updateRunName := fmt.Sprintf(validupdateRunNameTemplate, GinkgoParallelProcess()) @@ -1917,7 +1917,7 @@ var _ = Describe("Test placement v1beta1 API validation", func() { }, } Expect(hubClient.Create(ctx, updateRun)).Should(Succeed()) - // Transition to Stopped state first + // Transition to Pause state first updateRun.Spec.State = placementv1beta1.StateStopped Expect(hubClient.Update(ctx, updateRun)).Should(Succeed()) }) @@ -1926,12 +1926,12 @@ var _ = Describe("Test placement v1beta1 API validation", func() { Expect(hubClient.Delete(ctx, updateRun)).Should(Succeed()) }) - It("should allow transition from Stopped to Started", func() { + It("should allow transition from Pause to Execute", func() { updateRun.Spec.State = placementv1beta1.StateStarted Expect(hubClient.Update(ctx, updateRun)).Should(Succeed()) }) - It("should allow transition from Stopped to Abandoned", func() { + It("should allow transition from Pause to Abandon", func() { updateRun.Spec.State = placementv1beta1.StateAbandoned Expect(hubClient.Update(ctx, updateRun)).Should(Succeed()) }) @@ -1947,7 +1947,7 @@ var _ = Describe("Test placement v1beta1 API validation", func() { } }) - It("should deny transition from NotStarted to Stopped", func() { + It("should deny transition from Initialize to Pause", func() { updateRun = &placementv1beta1.ClusterStagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ Name: updateRunName, @@ -1962,10 +1962,10 @@ var _ = Describe("Test placement v1beta1 API validation", func() { err := hubClient.Update(ctx, updateRun) var statusErr *k8sErrors.StatusError Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Update ClusterStagedUpdateRun call produced error %s. 
Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) - Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: cannot transition from NotStarted to Stopped")) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: cannot transition from Initialize to Pause")) }) - It("should deny transition from Started to NotStarted", func() { + It("should deny transition from Execute to Initialize", func() { updateRun = &placementv1beta1.ClusterStagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ Name: updateRunName, @@ -1980,10 +1980,10 @@ var _ = Describe("Test placement v1beta1 API validation", func() { err := hubClient.Update(ctx, updateRun) var statusErr *k8sErrors.StatusError Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Update ClusterStagedUpdateRun call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) - Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: cannot transition from Started to NotStarted")) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: cannot transition from Execute to Initialize")) }) - It("should deny transition from Stopped to NotStarted", func() { + It("should deny transition from Pause to Initialize", func() { updateRun = &placementv1beta1.ClusterStagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ Name: updateRunName, @@ -1994,19 +1994,19 @@ var _ = Describe("Test placement v1beta1 API validation", func() { } Expect(hubClient.Create(ctx, updateRun)).Should(Succeed()) - // Transition to Stopped first + // Transition to Pause first updateRun.Spec.State = placementv1beta1.StateStopped Expect(hubClient.Update(ctx, updateRun)).Should(Succeed()) - // Try to transition back to NotStarted + // Try to transition back to Initialize updateRun.Spec.State = placementv1beta1.StateNotStarted err := hubClient.Update(ctx, updateRun) var statusErr *k8sErrors.StatusError Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Update ClusterStagedUpdateRun call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) - Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: cannot transition from Stopped to NotStarted")) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: cannot transition from Pause to Initialize")) }) - It("should deny transition from Abandoned to NotStarted", func() { + It("should deny transition from Abandon to Initialize", func() { updateRun = &placementv1beta1.ClusterStagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ Name: updateRunName, @@ -2021,10 +2021,10 @@ var _ = Describe("Test placement v1beta1 API validation", func() { err := hubClient.Update(ctx, updateRun) var statusErr *k8sErrors.StatusError Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Update ClusterStagedUpdateRun call produced error %s. 
Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) - Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: Abandoned is a terminal state and cannot transition to any other state")) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: Abandon is a terminal state and cannot transition to any other state")) }) - It("should deny transition from Abandoned to Started", func() { + It("should deny transition from Abandon to Execute", func() { updateRun = &placementv1beta1.ClusterStagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ Name: updateRunName, @@ -2039,10 +2039,10 @@ var _ = Describe("Test placement v1beta1 API validation", func() { err := hubClient.Update(ctx, updateRun) var statusErr *k8sErrors.StatusError Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Update ClusterStagedUpdateRun call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) - Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: Abandoned is a terminal state and cannot transition to any other state")) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: Abandon is a terminal state and cannot transition to any other state")) }) - It("should deny transition from Abandoned to Stopped", func() { + It("should deny transition from Abandon to Pause", func() { updateRun = &placementv1beta1.ClusterStagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ Name: updateRunName, @@ -2057,7 +2057,7 @@ var _ = Describe("Test placement v1beta1 API validation", func() { err := hubClient.Update(ctx, updateRun) var statusErr *k8sErrors.StatusError Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Update ClusterStagedUpdateRun call produced error %s. Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) - Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: Abandoned is a terminal state and cannot transition to any other state")) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("invalid state transition: Abandon is a terminal state and cannot transition to any other state")) }) }) @@ -2077,7 +2077,7 @@ var _ = Describe("Test placement v1beta1 API validation", func() { err := hubClient.Create(ctx, updateRun) var statusErr *k8sErrors.StatusError Expect(errors.As(err, &statusErr)).To(BeTrue(), fmt.Sprintf("Create ClusterStagedUpdateRun call produced error %s. 
Error type wanted is %s.", reflect.TypeOf(err), reflect.TypeOf(&k8sErrors.StatusError{}))) - Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("supported values: \"NotStarted\", \"Started\", \"Stopped\", \"Abandoned\"")) + Expect(statusErr.ErrStatus.Message).Should(MatchRegexp("supported values: \"Initialize\", \"Execute\", \"Pause\", \"Abandon\"")) }) }) From 3881da50f820bf6e44aa369d86147e683eeff0b0 Mon Sep 17 00:00:00 2001 From: michaelawyu Date: Fri, 28 Nov 2025 07:25:39 +0800 Subject: [PATCH 11/13] fix: abandon the work applier reconciliation loop when the main context exits (#343) * Minor changes Signed-off-by: michaelawyu * Minor fixes Signed-off-by: michaelawyu * Minor fixes Signed-off-by: michaelawyu --------- Signed-off-by: michaelawyu --- .../workapplier/availability_tracker.go | 15 +++++++++-- .../workapplier/availability_tracker_test.go | 5 +++- pkg/controllers/workapplier/controller.go | 12 +++++++-- pkg/controllers/workapplier/process.go | 25 +++++++++++++++++-- pkg/utils/parallelizer/parallelizer.go | 9 +++++++ 5 files changed, 59 insertions(+), 7 deletions(-) diff --git a/pkg/controllers/workapplier/availability_tracker.go b/pkg/controllers/workapplier/availability_tracker.go index 021c7babd..725ca3a9e 100644 --- a/pkg/controllers/workapplier/availability_tracker.go +++ b/pkg/controllers/workapplier/availability_tracker.go @@ -35,8 +35,8 @@ import ( "github.com/kubefleet-dev/kubefleet/pkg/utils/controller" ) -// trackInMemberClusterObjAvailability tracks the availability of an applied objects in the member cluster. -func (r *Reconciler) trackInMemberClusterObjAvailability(ctx context.Context, bundles []*manifestProcessingBundle, workRef klog.ObjectRef) { +// trackInMemberClusterObjAvailability tracks the availability of applied objects in the member cluster. +func (r *Reconciler) trackInMemberClusterObjAvailability(ctx context.Context, bundles []*manifestProcessingBundle, workRef klog.ObjectRef) error { // Track the availability of all the applied objects in the member cluster in parallel. // // This is concurrency-safe as the bundles slice has been pre-allocated. @@ -83,6 +83,17 @@ func (r *Reconciler) trackInMemberClusterObjAvailability(ctx context.Context, bu // Run the availability check in parallel. r.parallelizer.ParallelizeUntil(childCtx, len(bundles), doWork, "trackInMemberClusterObjAvailability") + + // Unlike some other steps in the reconciliation loop, the availability checking step does not end + // with a contextual API call; consequently, if the context has been cancelled during this step, + // some checks might not run at all, and passing such bundles to the next step may trigger + // unexpected behaviors. To address this, at the end of this step the work applier checks for context + // cancellation directly. 
+ if err := ctx.Err(); err != nil { + klog.V(2).InfoS("availability checking has been interrupted as the main context has been cancelled") + return fmt.Errorf("availability checking has been interrupted: %w", err) + } + return nil } // trackInMemberClusterObjAvailabilityByGVR tracks the availability of an object in the member cluster based diff --git a/pkg/controllers/workapplier/availability_tracker_test.go b/pkg/controllers/workapplier/availability_tracker_test.go index 03a212f83..1bc43c328 100644 --- a/pkg/controllers/workapplier/availability_tracker_test.go +++ b/pkg/controllers/workapplier/availability_tracker_test.go @@ -1126,7 +1126,10 @@ func TestTrackInMemberClusterObjAvailability(t *testing.T) { parallelizer: parallelizer.NewParallelizer(2), } - r.trackInMemberClusterObjAvailability(ctx, tc.bundles, workRef) + if err := r.trackInMemberClusterObjAvailability(ctx, tc.bundles, workRef); err != nil { + // Normally this would never occur. + t.Fatalf("trackInMemberClusterObjAvailability() = %v, want no error", err) + } // A special less func to sort the bundles by their ordinal. lessFuncManifestProcessingBundle := func(i, j *manifestProcessingBundle) bool { diff --git a/pkg/controllers/workapplier/controller.go b/pkg/controllers/workapplier/controller.go index 393867554..c4e4df323 100644 --- a/pkg/controllers/workapplier/controller.go +++ b/pkg/controllers/workapplier/controller.go @@ -479,18 +479,26 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu // c) report configuration differences if applicable; // d) check for configuration drifts if applicable; // e) apply each manifest. - r.processManifests(ctx, bundles, work, expectedAppliedWorkOwnerRef) + if err := r.processManifests(ctx, bundles, work, expectedAppliedWorkOwnerRef); err != nil { + klog.ErrorS(err, "Failed to process the manifests", "work", workRef) + return ctrl.Result{}, err + } // Track the availability information. - r.trackInMemberClusterObjAvailability(ctx, bundles, workRef) + if err := r.trackInMemberClusterObjAvailability(ctx, bundles, workRef); err != nil { + klog.ErrorS(err, "Failed to check for object availability", "work", workRef) + return ctrl.Result{}, err + } // Refresh the status of the Work object. if err := r.refreshWorkStatus(ctx, work, bundles); err != nil { + klog.ErrorS(err, "Failed to refresh work object status", "work", workRef) return ctrl.Result{}, err } // Refresh the status of the AppliedWork object. if err := r.refreshAppliedWorkStatus(ctx, appliedWork, bundles); err != nil { + klog.ErrorS(err, "Failed to refresh appliedWork object status", "appliedWork", klog.KObj(appliedWork)) return ctrl.Result{}, err } diff --git a/pkg/controllers/workapplier/process.go b/pkg/controllers/workapplier/process.go index 82dc5ce18..1dba64f55 100644 --- a/pkg/controllers/workapplier/process.go +++ b/pkg/controllers/workapplier/process.go @@ -36,7 +36,7 @@ func (r *Reconciler) processManifests( bundles []*manifestProcessingBundle, work *fleetv1beta1.Work, expectedAppliedWorkOwnerRef *metav1.OwnerReference, -) { +) error { // Process all manifests in parallel. 
// // There are cases where certain groups of manifests should not be processed in parallel with @@ -58,7 +58,17 @@ func (r *Reconciler) processManifests( } r.parallelizer.ParallelizeUntil(ctx, len(bundles), doWork, "processingManifestsInReportDiffMode") - return + + // Unlike some other steps in the reconciliation loop, the manifest processing step does not end + // with a contextual API call; consequently, if the context has been cancelled during this step, + // some manifest might not get processed at all, and passing such bundles to the next step may trigger + // unexpected behaviors. To address this, at the end of this step the work applier checks for context + // cancellation directly. + if err := ctx.Err(); err != nil { + klog.V(2).InfoS("manifest processing has been interrupted as the main context has been cancelled") + return fmt.Errorf("manifest processing has been interrupted: %w", err) + } + return nil } // Organize the bundles into different waves of bundles for parallel processing based on their @@ -83,7 +93,18 @@ func (r *Reconciler) processManifests( } r.parallelizer.ParallelizeUntil(ctx, len(bundlesInWave), doWork, fmt.Sprintf("processingManifestsInWave%d", idx)) + + // Unlike some other steps in the reconciliation loop, the manifest processing step does not end + // with a contextual API call; consequently, if the context has been cancelled during this step, + // some manifest might not get processed at all, and passing such bundles to the next step may trigger + // unexpected behaviors. To address this, at the end of this step the work applier checks for context + // cancellation directly. + if err := ctx.Err(); err != nil { + klog.V(2).InfoS("manifest processing has been interrupted as the main context has been cancelled") + return fmt.Errorf("manifest processing has been interrupted: %w", err) + } } + return nil } // processOneManifest processes a manifest (in the JSON format) embedded in the Work object. diff --git a/pkg/utils/parallelizer/parallelizer.go b/pkg/utils/parallelizer/parallelizer.go index 4c26fb43d..ca473e171 100644 --- a/pkg/utils/parallelizer/parallelizer.go +++ b/pkg/utils/parallelizer/parallelizer.go @@ -56,4 +56,13 @@ func (p *parallelizer) ParallelizeUntil(ctx context.Context, pieces int, doWork } workqueue.ParallelizeUntil(ctx, p.numOfWorkers, pieces, doWorkWithLogs) + + // Note (chenyu1): the ParallelizeUntil method is essentially a thin wrapper around the + // workqueue.ParallelizeUntil method. Note that the workqueue.ParallelizeUntil method + // right now does not return any error; it returns when the context is cancelled, possibly + // in a willingly manner. Some of the KubeFleet code makes use of this to facilitate a + // fail-fast pattern (i.e., pass in a child context to the parallelizer; if one worker + // has exited, cancel the child context in the worker and consequently the whole parallelization). + // As only the caller knows why a context is cancelled (willingly by a worker or not), we leave it to the + // caller to inspect the context after this method returns rather than trying to do it here. } From b70a24ec33372da926b3685f75895a44c0921712 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Dec 2025 10:04:20 +0800 Subject: [PATCH 12/13] chore: bump actions/checkout from 5.0.1 to 6.0.0 (#349) Bumps [actions/checkout](https://github.com/actions/checkout) from 5.0.1 to 6.0.0. 
- [Release notes](https://github.com/actions/checkout/releases) - [Commits](https://github.com/actions/checkout/compare/v5.0.1...v6) --- updated-dependencies: - dependency-name: actions/checkout dependency-version: 6.0.0 dependency-type: direct:production update-type: version-update:semver-major ... --- .github/workflows/chart.yml | 2 +- .github/workflows/ci.yml | 4 ++-- .github/workflows/code-lint.yml | 4 ++-- .github/workflows/codeql-analysis.yml | 2 +- .github/workflows/codespell.yml | 2 +- .github/workflows/markdown-lint.yml | 2 +- .github/workflows/trivy.yml | 2 +- .github/workflows/upgrade.yml | 6 +++--- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/chart.yml b/.github/workflows/chart.yml index ee61c51bf..f1d2a197c 100644 --- a/.github/workflows/chart.yml +++ b/.github/workflows/chart.yml @@ -18,7 +18,7 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v6.0.0 with: submodules: true fetch-depth: 0 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a72343b46..45193c1f5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v6 + uses: actions/checkout@v6.0.0 - name: Set up Ginkgo CLI run: | @@ -91,7 +91,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v6 + uses: actions/checkout@v6.0.0 - name: Install Ginkgo CLI run: | diff --git a/.github/workflows/code-lint.yml b/.github/workflows/code-lint.yml index b46d30ee3..a49324aed 100644 --- a/.github/workflows/code-lint.yml +++ b/.github/workflows/code-lint.yml @@ -43,7 +43,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@v6.0.0 with: submodules: true @@ -64,7 +64,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v6 + uses: actions/checkout@v6.0.0 - name: golangci-lint run: make lint diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index c56c52856..2536446da 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -38,7 +38,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v6 + uses: actions/checkout@v6.0.0 # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index fc3936cb9..c91f5d2d8 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -16,7 +16,7 @@ jobs: with: egress-policy: audit - - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v4.1.7 + - uses: actions/checkout@c2d88d3ecc89a9ef08eebf45d9637801dcee7eb5 # v4.1.7 - uses: codespell-project/actions-codespell@8f01853be192eb0f849a5c7d721450e7a467c579 # master with: check_filenames: true diff --git a/.github/workflows/markdown-lint.yml b/.github/workflows/markdown-lint.yml index d0c13afe1..7c8815f5a 100644 --- a/.github/workflows/markdown-lint.yml +++ b/.github/workflows/markdown-lint.yml @@ -10,7 +10,7 @@ jobs: markdown-link-check: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v6.0.0 - uses: tcort/github-action-markdown-link-check@v1 with: # this will only show errors in the output diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 2dc6a40f6..ab45f089a 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -44,7 +44,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Checkout code - uses: actions/checkout@v6 + uses: actions/checkout@v6.0.0 - name: Login to ${{ env.REGISTRY }} uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef diff --git a/.github/workflows/upgrade.yml b/.github/workflows/upgrade.yml index e7d0e5125..365dc1c33 100644 --- a/.github/workflows/upgrade.yml +++ b/.github/workflows/upgrade.yml @@ -44,7 +44,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v6 + uses: actions/checkout@v6.0.0 with: # Fetch the history of all branches and tags. # This is needed for the test suite to switch between releases. @@ -127,7 +127,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v6 + uses: actions/checkout@v6.0.0 with: # Fetch the history of all branches and tags. # This is needed for the test suite to switch between releases. @@ -210,7 +210,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Check out code into the Go module directory - uses: actions/checkout@v6 + uses: actions/checkout@v6.0.0 with: # Fetch the history of all branches and tags. # This is needed for the test suite to switch between releases. From b2198c0385c77a59dd09df07276d21c535381e61 Mon Sep 17 00:00:00 2001 From: Zhiying Lin <54013513+zhiying-lin@users.noreply.github.com> Date: Mon, 1 Dec 2025 22:41:55 +0800 Subject: [PATCH 13/13] fix: handle the case when master resource snapshot is not found (#350) Signed-off-by: Zhiying Lin --- pkg/controllers/rollout/controller.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/controllers/rollout/controller.go b/pkg/controllers/rollout/controller.go index f85da707b..e383508c2 100644 --- a/pkg/controllers/rollout/controller.go +++ b/pkg/controllers/rollout/controller.go @@ -151,6 +151,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim "placement", placementObjRef) return runtime.Result{}, err } + if masterResourceSnapshot == nil { + klog.V(2).InfoS("No masterResourceSnapshot found for the placement, stop rolling", "placement", placementObjRef) + // New masterResourceSnapshot creation should trigger the rollout controller. 
+		return runtime.Result{}, nil
+	}
 	klog.V(2).InfoS("Found the masterResourceSnapshot for the placement", "placement", placementObjRef, "masterResourceSnapshot", klog.KObj(masterResourceSnapshot))
 	// Note: there is a corner case that an override is in-between snapshots (the old one is marked as not the latest while the new one is not created yet)
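A minimal, hypothetical sketch of the fail-fast pattern that the parallelizer comment in the work-applier patch above describes, assuming only k8s.io/client-go's workqueue.ParallelizeUntil (which that parallelizer is described as wrapping); the processAll helper, the items slice, and the error bookkeeping are illustrative and not KubeFleet code:

    package main

    import (
    	"context"
    	"fmt"

    	"k8s.io/client-go/util/workqueue"
    )

    // processAll fans the pieces out over two workers sharing a cancellable child context.
    // A failing worker cancels that child context so no new pieces are handed out; afterwards
    // the caller inspects the parent context (external cancellation) and the recorded per-piece
    // errors, because workqueue.ParallelizeUntil itself never returns an error.
    func processAll(parent context.Context, items []string) error {
    	childCtx, cancel := context.WithCancel(parent)
    	defer cancel()

    	// Pre-allocated slice: each worker writes only to its own index, so no lock is needed.
    	errs := make([]error, len(items))
    	doWork := func(i int) {
    		if items[i] == "bad" {
    			errs[i] = fmt.Errorf("cannot process item %q", items[i])
    			cancel() // fail fast: stop handing out further pieces
    			return
    		}
    		// Process items[i] here.
    	}

    	workqueue.ParallelizeUntil(childCtx, 2, len(items), doWork)

    	// Distinguish an external cancellation of the main context from the willing
    	// fail-fast cancellation of the child context above.
    	if err := parent.Err(); err != nil {
    		return fmt.Errorf("processing interrupted: %w", err)
    	}
    	for _, err := range errs {
    		if err != nil {
    			return err
    		}
    	}
    	return nil
    }

    func main() {
    	fmt.Println(processAll(context.Background(), []string{"a", "bad", "c"}))
    }

Pre-allocating errs keeps the workers index-disjoint, the same concurrency-safety argument the availability tracker relies on when it notes that the bundles slice has been pre-allocated.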