Fix retry logic in DisruptionController

This changes the retry logic in DisruptionController so that update
conflicts are reconciled instead of blindly retried. In the old behavior,
any PDB status update failure was retried with the same status, regardless
of the kind of error.

Now there is no retry logic around the status update itself. The error is
passed up the stack, where the PDB can be requeued for processing.

If the PDB status update error is a conflict error, two new special cases
apply:
- failSafe is not triggered, since a conflict is considered a retryable error
- the PDB is requeued immediately (ignoring the rate limiter), because we
  assume the conflict can be resolved by fetching the latest version
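
This commit only touches sync() and writePdbStatus() (diffs below); the
worker loop itself is unchanged here. As an illustration only, this is a
minimal sketch of how a controller worker can realize "requeue immediately
on conflict" with a standard client-go workqueue. The controller type, its
queue and sync fields, and this processNextWorkItem body are stand-ins,
not code from this commit.

package main

import (
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/client-go/util/workqueue"
)

// controller is a minimal stand-in with only the fields the sketch needs.
type controller struct {
	queue workqueue.RateLimitingInterface // standard client-go workqueue
	sync  func(key string) error          // stand-in for a sync handler
}

func (c *controller) processNextWorkItem() bool {
	key, quit := c.queue.Get()
	if quit {
		return false
	}
	defer c.queue.Done(key)

	switch err := c.sync(key.(string)); {
	case err == nil:
		c.queue.Forget(key) // success: reset this key's backoff
	case apierrors.IsConflict(err):
		// The cached copy was stale; the next sync will fetch the latest
		// object, so requeue right away instead of backing off.
		c.queue.Add(key)
	default:
		c.queue.AddRateLimited(key) // other failures back off as usual
	}
	return true
}

func main() {
	c := &controller{
		queue: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
		sync:  func(key string) error { return nil },
	}
	c.queue.Add("default/my-pdb")
	c.processNextWorkItem()
}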
Jonathan Basseri 2019-08-29 16:19:45 -07:00
parent afa979c295
commit 8835f6bb00
2 changed files with 11 additions and 29 deletions

pkg/controller/disruption/BUILD

@@ -32,7 +32,6 @@ go_library(
         "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes/scheme:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library",
-        "//staging/src/k8s.io/client-go/kubernetes/typed/policy/v1beta1:go_default_library",
         "//staging/src/k8s.io/client-go/listers/apps/v1:go_default_library",
         "//staging/src/k8s.io/client-go/listers/core/v1:go_default_library",
         "//staging/src/k8s.io/client-go/listers/policy/v1beta1:go_default_library",

pkg/controller/disruption/disruption.go

@@ -21,7 +21,7 @@ import (
     "time"

     apps "k8s.io/api/apps/v1beta1"
-    "k8s.io/api/core/v1"
+    v1 "k8s.io/api/core/v1"
     "k8s.io/api/extensions/v1beta1"
     policy "k8s.io/api/policy/v1beta1"
     apiequality "k8s.io/apimachinery/pkg/api/equality"
@@ -39,7 +39,6 @@ import (
     clientset "k8s.io/client-go/kubernetes"
     "k8s.io/client-go/kubernetes/scheme"
     v1core "k8s.io/client-go/kubernetes/typed/core/v1"
-    policyclientset "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
     appsv1listers "k8s.io/client-go/listers/apps/v1"
     corelisters "k8s.io/client-go/listers/core/v1"
     policylisters "k8s.io/client-go/listers/policy/v1beta1"
@@ -53,8 +52,6 @@ import (
     "k8s.io/klog"
 )

-const statusUpdateRetries = 2
-
 // DeletionTimeout sets maximum time from the moment a pod is added to DisruptedPods in PDB.Status
 // to the time when the pod is expected to be seen by PDB controller as having been marked for deletion.
 // If the pod was not marked for deletion during that time it is assumed that it won't be deleted at
@@ -532,7 +529,13 @@ func (dc *DisruptionController) sync(key string) error {
         return err
     }

-    if err := dc.trySync(pdb); err != nil {
+    err = dc.trySync(pdb)
+    // If the reason for failure was a conflict, then allow this PDB update to be
+    // requeued without triggering the failSafe logic.
+    if errors.IsConflict(err) {
+        return err
+    }
+    if err != nil {
         klog.Errorf("Failed to sync pdb %s/%s: %v", pdb.Namespace, pdb.Name, err)
         return dc.failSafe(pdb)
     }
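
For reference, errors here is k8s.io/apimachinery/pkg/api/errors;
IsConflict matches the HTTP 409 StatusError the API server returns when an
update carries a stale resourceVersion. A tiny self-contained example (the
group/resource and name below are made up for illustration):

package main

import (
	"fmt"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

func main() {
	// NewConflict builds the same error shape that UpdateStatus returns
	// on a resourceVersion conflict (HTTP 409, reason "Conflict").
	gr := schema.GroupResource{Group: "policy", Resource: "poddisruptionbudgets"}
	err := apierrors.NewConflict(gr, "my-pdb", fmt.Errorf("object was modified"))

	fmt.Println(apierrors.IsConflict(err)) // true
}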
@@ -773,29 +776,9 @@ func (dc *DisruptionController) updatePdbStatus(pdb *policy.PodDisruptionBudget,
     return dc.getUpdater()(newPdb)
 }

-// refresh tries to re-GET the given PDB. If there are any errors, it just
-// returns the old PDB. Intended to be used in a retry loop where it runs a
-// bounded number of times.
-func refresh(pdbClient policyclientset.PodDisruptionBudgetInterface, pdb *policy.PodDisruptionBudget) *policy.PodDisruptionBudget {
-    newPdb, err := pdbClient.Get(pdb.Name, metav1.GetOptions{})
-    if err == nil {
-        return newPdb
-    } else {
-        return pdb
-    }
-}
-
 func (dc *DisruptionController) writePdbStatus(pdb *policy.PodDisruptionBudget) error {
-    pdbClient := dc.kubeClient.PolicyV1beta1().PodDisruptionBudgets(pdb.Namespace)
-    st := pdb.Status
-
-    var err error
-    for i, pdb := 0, pdb; i < statusUpdateRetries; i, pdb = i+1, refresh(pdbClient, pdb) {
-        pdb.Status = st
-        if _, err = pdbClient.UpdateStatus(pdb); err == nil {
-            break
-        }
-    }
-
+    // If this update fails, don't retry it. Allow the failure to get handled &
+    // retried in `processNextWorkItem()`.
+    _, err := dc.kubeClient.PolicyV1beta1().PodDisruptionBudgets(pdb.Namespace).UpdateStatus(pdb)
     return err
 }
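
A note on the design choice: client-go also offers retry.RetryOnConflict
(k8s.io/client-go/util/retry) for retrying a write in place with bounded
backoff. This commit deliberately avoids in-place retries; the conflict
instead propagates to the workqueue, which re-runs sync against a freshly
fetched PDB. A sketch of the alternative that was not taken, where doUpdate
is a hypothetical stand-in for the status write:

package main

import (
	"k8s.io/client-go/util/retry"
)

func main() {
	// doUpdate stands in for a call like UpdateStatus; on a real conflict
	// it would re-GET the object, reapply the status, and try again.
	doUpdate := func() error { return nil }

	// RetryOnConflict retries doUpdate with DefaultRetry's backoff for as
	// long as it keeps returning conflict errors.
	_ = retry.RetryOnConflict(retry.DefaultRetry, doUpdate)
}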