Fix retry logic in DisruptionController

This changes the retry logic in DisruptionController so that update
conflicts are reconciled instead of blindly retried. In the old behavior,
any PDB status update failure was retried with the same status, regardless
of the kind of error.

Now there is no retry logic around the status update itself. The error is
passed up the stack, where the PDB can be requeued for processing.

If the PDB status update error is a conflict error, two new special cases
apply:
- failSafe is not triggered, since a conflict is considered a retryable error
- the PDB is requeued immediately (ignoring the rate limiter), because we
  assume the conflict can be resolved by fetching the latest version
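
This commit only touches sync() and writePdbStatus() (diffs below); the
worker loop itself is unchanged here. As an illustration only, this is a
minimal sketch of how a controller worker can realize "requeue immediately
on conflict" with a standard client-go workqueue. The controller type, its
queue and sync fields, and this processNextWorkItem body are stand-ins,
not code from this commit.

package main

import (
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/client-go/util/workqueue"
)

// controller is a minimal stand-in with only the fields the sketch needs.
type controller struct {
	queue workqueue.RateLimitingInterface // standard client-go workqueue
	sync  func(key string) error          // stand-in for a sync handler
}

func (c *controller) processNextWorkItem() bool {
	key, quit := c.queue.Get()
	if quit {
		return false
	}
	defer c.queue.Done(key)

	switch err := c.sync(key.(string)); {
	case err == nil:
		c.queue.Forget(key) // success: reset this key's backoff
	case apierrors.IsConflict(err):
		// The cached copy was stale; the next sync will fetch the latest
		// object, so requeue right away instead of backing off.
		c.queue.Add(key)
	default:
		c.queue.AddRateLimited(key) // other failures back off as usual
	}
	return true
}

func main() {
	c := &controller{
		queue: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
		sync:  func(key string) error { return nil },
	}
	c.queue.Add("default/my-pdb")
	c.processNextWorkItem()
}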
Jonathan Basseri 2019-08-29 16:19:45 -07:00
parent afa979c295
commit 8835f6bb00
2 changed files with 11 additions and 29 deletions

pkg/controller/disruption/BUILD

@@ -32,7 +32,6 @@ go_library(
         "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes/scheme:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library",
-        "//staging/src/k8s.io/client-go/kubernetes/typed/policy/v1beta1:go_default_library",
         "//staging/src/k8s.io/client-go/listers/apps/v1:go_default_library",
         "//staging/src/k8s.io/client-go/listers/core/v1:go_default_library",
         "//staging/src/k8s.io/client-go/listers/policy/v1beta1:go_default_library",

pkg/controller/disruption/disruption.go

@@ -21,7 +21,7 @@ import (
     "time"

     apps "k8s.io/api/apps/v1beta1"
-    "k8s.io/api/core/v1"
+    v1 "k8s.io/api/core/v1"
     "k8s.io/api/extensions/v1beta1"
     policy "k8s.io/api/policy/v1beta1"
     apiequality "k8s.io/apimachinery/pkg/api/equality"
@@ -39,7 +39,6 @@ import (
     clientset "k8s.io/client-go/kubernetes"
     "k8s.io/client-go/kubernetes/scheme"
     v1core "k8s.io/client-go/kubernetes/typed/core/v1"
-    policyclientset "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
     appsv1listers "k8s.io/client-go/listers/apps/v1"
     corelisters "k8s.io/client-go/listers/core/v1"
     policylisters "k8s.io/client-go/listers/policy/v1beta1"
@@ -53,8 +52,6 @@ import (
     "k8s.io/klog"
 )

-const statusUpdateRetries = 2
-
 // DeletionTimeout sets maximum time from the moment a pod is added to DisruptedPods in PDB.Status
 // to the time when the pod is expected to be seen by PDB controller as having been marked for deletion.
 // If the pod was not marked for deletion during that time it is assumed that it won't be deleted at
@@ -532,7 +529,13 @@ func (dc *DisruptionController) sync(key string) error {
         return err
     }

-    if err := dc.trySync(pdb); err != nil {
+    err = dc.trySync(pdb)
+    // If the reason for failure was a conflict, then allow this PDB update to be
+    // requeued without triggering the failSafe logic.
+    if errors.IsConflict(err) {
+        return err
+    }
+    if err != nil {
         klog.Errorf("Failed to sync pdb %s/%s: %v", pdb.Namespace, pdb.Name, err)
         return dc.failSafe(pdb)
     }
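
For reference, errors here is k8s.io/apimachinery/pkg/api/errors;
IsConflict matches the HTTP 409 StatusError the API server returns when an
update carries a stale resourceVersion. A tiny self-contained example (the
group/resource and name below are made up for illustration):

package main

import (
	"fmt"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

func main() {
	// NewConflict builds the same error shape that UpdateStatus returns
	// on a resourceVersion conflict (HTTP 409, reason "Conflict").
	gr := schema.GroupResource{Group: "policy", Resource: "poddisruptionbudgets"}
	err := apierrors.NewConflict(gr, "my-pdb", fmt.Errorf("object was modified"))

	fmt.Println(apierrors.IsConflict(err)) // true
}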
@@ -773,29 +776,9 @@ func (dc *DisruptionController) updatePdbStatus(pdb *policy.PodDisruptionBudget,
     return dc.getUpdater()(newPdb)
 }

-// refresh tries to re-GET the given PDB. If there are any errors, it just
-// returns the old PDB. Intended to be used in a retry loop where it runs a
-// bounded number of times.
-func refresh(pdbClient policyclientset.PodDisruptionBudgetInterface, pdb *policy.PodDisruptionBudget) *policy.PodDisruptionBudget {
-    newPdb, err := pdbClient.Get(pdb.Name, metav1.GetOptions{})
-    if err == nil {
-        return newPdb
-    } else {
-        return pdb
-    }
-}
-
 func (dc *DisruptionController) writePdbStatus(pdb *policy.PodDisruptionBudget) error {
-    pdbClient := dc.kubeClient.PolicyV1beta1().PodDisruptionBudgets(pdb.Namespace)
-    st := pdb.Status
-
-    var err error
-    for i, pdb := 0, pdb; i < statusUpdateRetries; i, pdb = i+1, refresh(pdbClient, pdb) {
-        pdb.Status = st
-        if _, err = pdbClient.UpdateStatus(pdb); err == nil {
-            break
-        }
-    }
-
+    // If this update fails, don't retry it. Allow the failure to get handled &
+    // retried in `processNextWorkItem()`.
+    _, err := dc.kubeClient.PolicyV1beta1().PodDisruptionBudgets(pdb.Namespace).UpdateStatus(pdb)
     return err
 }
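
A note on the design choice: client-go also offers retry.RetryOnConflict
(k8s.io/client-go/util/retry) for retrying a write in place with bounded
backoff. This commit deliberately avoids in-place retries; the conflict
instead propagates to the workqueue, which re-runs sync against a freshly
fetched PDB. A sketch of the alternative that was not taken, where doUpdate
is a hypothetical stand-in for the status write:

package main

import (
	"k8s.io/client-go/util/retry"
)

func main() {
	// doUpdate stands in for a call like UpdateStatus; on a real conflict
	// it would re-GET the object, reapply the status, and try again.
	doUpdate := func() error { return nil }

	// RetryOnConflict retries doUpdate with DefaultRetry's backoff for as
	// long as it keeps returning conflict errors.
	_ = retry.RetryOnConflict(retry.DefaultRetry, doUpdate)
}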