Merge pull request #29505 from kargakis/debug-recreate-flake

Automatic merge from submit-queue

controller: enhance timeout error message for Recreate deployments

Makes the error message from https://github.com/kubernetes/kubernetes/issues/29197 more obvious

@kubernetes/deployment
Merged by Kubernetes Submit Queue on 2016-09-21 01:45:47 -07:00 (committed by GitHub)
commit 02605106a6
3 changed files with 29 additions and 27 deletions
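
The visible payload of the PR is the new timeout branch in waitForInactiveReplicaSets below: instead of surfacing the bare wait.ErrWaitTimeout ("timed out waiting for the condition"), the controller now reports which replica set never went inactive and the last state it observed. A minimal sketch of the resulting message, using made-up values (the format string is the one from the diff; the replica set name and counts are hypothetical):

package main

import "fmt"

func main() {
	// Hypothetical last-observed state at the moment the backoff gave up.
	name := "nginx-3137573019"
	synced := false // i.e. observedGeneration >= desiredGeneration was still false
	specReplicas, statusReplicas := int32(0), int32(2)

	err := fmt.Errorf("replica set %q never became inactive: synced=%t, spec.replicas=%d, status.replicas=%d",
		name, synced, specReplicas, statusReplicas)
	fmt.Println(err)
	// replica set "nginx-3137573019" never became inactive: synced=false, spec.replicas=0, status.replicas=2
}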


@@ -174,6 +174,7 @@ pkg/util/json
 pkg/util/limitwriter
 pkg/util/logs
 pkg/util/maps
+pkg/util/replicaset
 pkg/util/validation/field
 pkg/util/workqueue
 pkg/version/prometheus


@@ -17,10 +17,11 @@ limitations under the License.
 package deployment
 
 import (
+	"fmt"
+
 	"k8s.io/kubernetes/pkg/apis/extensions"
 	unversionedclient "k8s.io/kubernetes/pkg/client/unversioned"
 	"k8s.io/kubernetes/pkg/controller"
-	rsutil "k8s.io/kubernetes/pkg/util/replicaset"
 	"k8s.io/kubernetes/pkg/util/wait"
 )
@@ -78,16 +79,18 @@ func (dc *DeploymentController) rolloutRecreate(deployment *extensions.Deploymen
 // scaleDownOldReplicaSetsForRecreate scales down old replica sets when deployment strategy is "Recreate"
 func (dc *DeploymentController) scaleDownOldReplicaSetsForRecreate(oldRSs []*extensions.ReplicaSet, deployment *extensions.Deployment) (bool, error) {
 	scaled := false
-	for _, rs := range oldRSs {
+	for i := range oldRSs {
+		rs := oldRSs[i]
 		// Scaling not required.
 		if rs.Spec.Replicas == 0 {
 			continue
 		}
-		scaledRS, _, err := dc.scaleReplicaSetAndRecordEvent(rs, 0, deployment)
+		scaledRS, updatedRS, err := dc.scaleReplicaSetAndRecordEvent(rs, 0, deployment)
 		if err != nil {
 			return false, err
 		}
 		if scaledRS {
+			oldRSs[i] = updatedRS
 			scaled = true
 		}
 	}
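
A side fix in the hunk above: the loop is switched from for-range to index form so that the updated object returned by scaleReplicaSetAndRecordEvent can be written back into oldRSs; the subsequent wait in waitForInactiveReplicaSets then starts from the post-scale generation instead of a stale copy. A minimal sketch of that write-back pattern, using a stand-in struct rather than the real extensions.ReplicaSet and a hypothetical scaleToZero in place of scaleReplicaSetAndRecordEvent:

package main

import "fmt"

// replicaSet is a stand-in keeping only the fields the diff cares about.
type replicaSet struct {
	Name       string
	Generation int64
	Replicas   int32
}

// scaleToZero mimics a scale call: the server bumps the object's generation,
// and the updated object is returned alongside a "did we scale?" flag.
func scaleToZero(rs *replicaSet) (bool, *replicaSet) {
	if rs.Replicas == 0 {
		return false, rs
	}
	updated := *rs
	updated.Replicas = 0
	updated.Generation++
	return true, &updated
}

func main() {
	oldRSs := []*replicaSet{{Name: "rs-1", Generation: 3, Replicas: 2}}
	for i := range oldRSs {
		if scaled, updated := scaleToZero(oldRSs[i]); scaled {
			// Write back, so later consumers of oldRSs see the new generation.
			oldRSs[i] = updated
		}
	}
	fmt.Println(oldRSs[0].Generation) // 4 — without the write-back it would still read 3
}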
@@ -99,9 +102,29 @@ func (dc *DeploymentController) scaleDownOldReplicaSetsForRecreate(oldRSs []*ext
 func (dc *DeploymentController) waitForInactiveReplicaSets(oldRSs []*extensions.ReplicaSet) error {
 	for i := range oldRSs {
 		rs := oldRSs[i]
+		desiredGeneration := rs.Generation
+		observedGeneration := rs.Status.ObservedGeneration
+		specReplicas := rs.Spec.Replicas
+		statusReplicas := rs.Status.Replicas
-		condition := rsutil.ReplicaSetIsInactive(dc.client.Extensions(), rs)
-		if err := wait.ExponentialBackoff(unversionedclient.DefaultRetry, condition); err != nil {
+		if err := wait.ExponentialBackoff(unversionedclient.DefaultRetry, func() (bool, error) {
+			replicaSet, err := dc.rsStore.ReplicaSets(rs.Namespace).Get(rs.Name)
+			if err != nil {
+				return false, err
+			}
+			specReplicas = replicaSet.Spec.Replicas
+			statusReplicas = replicaSet.Status.Replicas
+			observedGeneration = replicaSet.Status.ObservedGeneration
+			// TODO: We also need to wait for terminating replicas to actually terminate.
+			// See https://github.com/kubernetes/kubernetes/issues/32567
+			return observedGeneration >= desiredGeneration && replicaSet.Spec.Replicas == 0 && replicaSet.Status.Replicas == 0, nil
+		}); err != nil {
+			if err == wait.ErrWaitTimeout {
+				err = fmt.Errorf("replica set %q never became inactive: synced=%t, spec.replicas=%d, status.replicas=%d",
+					rs.Name, observedGeneration >= desiredGeneration, specReplicas, statusReplicas)
+			}
 			return err
 		}
 	}
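
The deleted rsutil.ReplicaSetIsInactive helper is inlined above so the condition closure can copy the last-seen spec.replicas, status.replicas, and observedGeneration into variables that outlive the wait; when wait.ExponentialBackoff gives up, those captured values turn an opaque timeout into a diagnosable error. A self-contained sketch of the same capture-for-diagnostics pattern, with a toy pollUntil standing in for wait.ExponentialBackoff:

package main

import (
	"errors"
	"fmt"
	"time"
)

// errWaitTimeout mirrors wait.ErrWaitTimeout.
var errWaitTimeout = errors.New("timed out waiting for the condition")

// pollUntil is a toy stand-in for wait.ExponentialBackoff: retry the
// condition a fixed number of times, doubling the delay between attempts.
func pollUntil(steps int, delay time.Duration, condition func() (bool, error)) error {
	for i := 0; i < steps; i++ {
		if ok, err := condition(); ok || err != nil {
			return err
		}
		time.Sleep(delay)
		delay *= 2
	}
	return errWaitTimeout
}

func main() {
	// The outer variable records the last state the condition observed, so a
	// timeout can report why the wait failed rather than just that it failed.
	statusReplicas := int32(2)
	err := pollUntil(3, time.Millisecond, func() (bool, error) {
		// In the controller this re-reads the ReplicaSet from dc.rsStore.
		return statusReplicas == 0, nil
	})
	if err == errWaitTimeout {
		err = fmt.Errorf("never became inactive: status.replicas=%d", statusReplicas)
	}
	fmt.Println(err) // never became inactive: status.replicas=2
}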


@@ -108,25 +108,3 @@ func MatchingPodsFunc(rs *extensions.ReplicaSet) (func(api.Pod) bool, error) {
 		return selector.Matches(podLabelsSelector)
 	}, nil
 }
-
-// ReplicaSetIsInactive returns a condition that will be true when a replica set is inactive ie.
-// it has zero running replicas.
-func ReplicaSetIsInactive(c unversionedextensions.ExtensionsInterface, replicaSet *extensions.ReplicaSet) wait.ConditionFunc {
-	// If we're given a ReplicaSet where the status lags the spec, it either means that the
-	// ReplicaSet is stale, or that the ReplicaSet manager hasn't noticed the update yet.
-	// Polling status.Replicas is not safe in the latter case.
-	desiredGeneration := replicaSet.Generation
-	return func() (bool, error) {
-		rs, err := c.ReplicaSets(replicaSet.Namespace).Get(replicaSet.Name)
-		if err != nil {
-			return false, err
-		}
-		return rs.Status.ObservedGeneration >= desiredGeneration &&
-			rs.Spec.Replicas == 0 &&
-			rs.Status.Replicas == 0 &&
-			rs.Status.FullyLabeledReplicas == 0, nil
-	}
-}
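
The doc comment on the deleted helper states the invariant that the inlined condition preserves: replica counts in status are only trustworthy once status.observedGeneration has caught up with metadata.generation, because a lagging controller may still be reporting counts for an older spec. A minimal illustration of that guard, with a stand-in struct for the two fields involved (on the real API object they live on extensions.ReplicaSet):

package main

import "fmt"

// rsView is a stand-in carrying just the two fields the guard compares.
type rsView struct {
	Generation         int64 // bumped by the API server on every spec update
	ObservedGeneration int64 // written by the controller after it syncs that spec
}

// statusIsFresh reports whether status describes the current spec. If the
// controller lags, the replica counts in status may belong to an older spec,
// so callers must not act on them yet.
func statusIsFresh(rs rsView) bool {
	return rs.ObservedGeneration >= rs.Generation
}

func main() {
	fmt.Println(statusIsFresh(rsView{Generation: 4, ObservedGeneration: 3})) // false: status is stale
	fmt.Println(statusIsFresh(rsView{Generation: 4, ObservedGeneration: 4})) // true: safe to read counts
}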