[disruptioncontroller] Don't error for unmanaged pods

As of now, we allow PDBs to be applied to pods via
selectors, so there can be unmanaged pods (pods that
don't have a backing controller) that still have a PDB
associated with them. Such pods are to be logged instead
of immediately throwing a sync error. This ensures the
disruption controller is not frequently updating the
status subresource, preventing excessive and expensive
writes to etcd.
ravisantoshgudimetla 2021-07-01 16:10:17 -04:00
parent 1861e4756d
commit 2c116055f7
3 changed files with 105 additions and 10 deletions
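For context, here is a minimal sketch of what "unmanaged" means in this commit: a pod with no controller OwnerReference, which metav1.GetControllerOf reports as nil even though a PDB selector can still match it. This snippet is not part of the commit; the pod name and labels are made up for illustration.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// A bare pod created directly, with no workload controller behind it.
	// A PDB whose selector matches app=demo would still cover it.
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "demo-pod",
			Labels: map[string]string{"app": "demo"},
			// No OwnerReferences, hence no backing controller.
		},
	}
	if metav1.GetControllerOf(pod) == nil {
		fmt.Printf("pod %q is unmanaged; after this change it is logged instead of causing a sync error\n", pod.Name)
	}
}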


@@ -595,11 +595,16 @@ func (dc *DisruptionController) trySync(pdb *policy.PodDisruptionBudget) error {
dc.recorder.Eventf(pdb, v1.EventTypeNormal, "NoPods", "No matching pods found")
}
expectedCount, desiredHealthy, err := dc.getExpectedPodCount(pdb, pods)
expectedCount, desiredHealthy, unmanagedPods, err := dc.getExpectedPodCount(pdb, pods)
if err != nil {
dc.recorder.Eventf(pdb, v1.EventTypeWarning, "CalculateExpectedPodCountFailed", "Failed to calculate the number of expected pods: %v", err)
return err
}
// We have unmanaged pods; instead of erroring and hot-looping in the disruption controller, log and continue.
if len(unmanagedPods) > 0 {
klog.V(4).Infof("found unmanaged pods associated with this PDB: %v",
strings.Join(unmanagedPods, ","))
}
currentTime := time.Now()
disruptedPods, recheckTime := dc.buildDisruptedPodMap(pods, pdb, currentTime)
@@ -615,7 +620,7 @@ func (dc *DisruptionController) trySync(pdb *policy.PodDisruptionBudget) error {
return err
}
func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount, desiredHealthy int32, err error) {
func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount, desiredHealthy int32, unmanagedPods []string, err error) {
err = nil
// TODO(davidopp): consider making the way expectedCount and rules about
// permitted controller configurations (specifically, considering it an error
@@ -623,7 +628,7 @@ func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBud
// handled the same way for integer and percentage minAvailable
if pdb.Spec.MaxUnavailable != nil {
expectedCount, err = dc.getExpectedScale(pdb, pods)
expectedCount, unmanagedPods, err = dc.getExpectedScale(pdb, pods)
if err != nil {
return
}
@@ -641,7 +646,7 @@ func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBud
desiredHealthy = pdb.Spec.MinAvailable.IntVal
expectedCount = int32(len(pods))
} else if pdb.Spec.MinAvailable.Type == intstr.String {
expectedCount, err = dc.getExpectedScale(pdb, pods)
expectedCount, unmanagedPods, err = dc.getExpectedScale(pdb, pods)
if err != nil {
return
}
@@ -657,7 +662,7 @@ func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBud
return
}
func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount int32, err error) {
func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount int32, unmanagedPods []string, err error) {
// When the user specifies a fraction of pods that must be available, we
// use as the fraction's denominator
// SUM_{all c in C} scale(c)
@@ -672,13 +677,19 @@ func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget
// A mapping from controllers to their scale.
controllerScale := map[types.UID]int32{}
// 1. Find the controller for each pod. If any pod has 0 controllers,
// that's an error. With ControllerRef, a pod can only have 1 controller.
// 1. Find the controller for each pod.
// As of now, we allow PDBs to be applied to pods via selectors, so there
// can be unmanaged pods (pods that don't have a backing controller) that
// still have a PDB associated with them. Such pods are to be collected and
// reported back to the caller, which logs them, instead of immediately
// throwing a sync error. This ensures the disruption controller is not
// frequently updating the status subresource and thus prevents excessive
// and expensive writes to etcd.
// With ControllerRef, a pod can only have 1 controller.
for _, pod := range pods {
controllerRef := metav1.GetControllerOf(pod)
if controllerRef == nil {
err = fmt.Errorf("found no controller ref for pod %q", pod.Name)
return
unmanagedPods = append(unmanagedPods, pod.Name)
continue
}
// If we already know the scale of the controller there is no need to do anything.
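To summarize the controller change above, here is a small self-contained sketch (not taken from the commit; the controller name and scale are made up) of the new bookkeeping: pods without a controller ref are collected into unmanagedPods and skipped, so they no longer fail the sync and never contribute to the scale-summed expectedCount.

package main

import "fmt"

func main() {
	// Pretend the PDB selector matched three pods: two owned by a
	// ReplicaSet scaled to 2, plus one naked pod with no owner.
	controllerScale := map[string]int32{"rs-demo": 2}
	unmanagedPods := []string{"naked-pod"}

	var expectedCount int32
	for _, scale := range controllerScale {
		expectedCount += scale
	}
	// Previously the naked pod made the whole sync return an error;
	// now it is only reported back to trySync, which logs it at V(4).
	fmt.Printf("expectedCount=%d unmanaged=%v\n", expectedCount, unmanagedPods)
}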


@@ -22,6 +22,7 @@ import (
"fmt"
"os"
"runtime/debug"
"strings"
"sync"
"testing"
"time"
@@ -115,6 +116,15 @@ func (ps *pdbStates) VerifyDisruptionAllowed(t *testing.T, key string, disruptio
}
}
func (ps *pdbStates) VerifyNoStatusError(t *testing.T, key string) {
pdb := ps.Get(key)
for _, condition := range pdb.Status.Conditions {
if strings.Contains(condition.Message, "found no controller ref") && condition.Reason == policy.SyncFailedReason {
t.Fatalf("PodDisruption Controller should not error when unmanaged pods are found but it failed for %q", key)
}
}
}
type disruptionController struct {
*DisruptionController
@@ -534,6 +544,47 @@ func TestNakedPod(t *testing.T) {
ps.VerifyDisruptionAllowed(t, pdbName, 0)
}
// Verify that the disruption controller does not error out when unmanaged pods are found
func TestStatusForUnmanagedPod(t *testing.T) {
dc, ps := newFakeDisruptionController()
pdb, pdbName := newMinAvailablePodDisruptionBudget(t, intstr.FromString("28%"))
add(t, dc.pdbStore, pdb)
dc.sync(pdbName)
// This verifies that when a PDB has 0 pods, disruptions are not allowed.
ps.VerifyDisruptionAllowed(t, pdbName, 0)
pod, _ := newPod(t, "unmanaged")
add(t, dc.podStore, pod)
dc.sync(pdbName)
ps.VerifyNoStatusError(t, pdbName)
}
// Check that unmanaged pods are correctly collected
func TestTotalUnmanagedPods(t *testing.T) {
dc, ps := newFakeDisruptionController()
pdb, pdbName := newMinAvailablePodDisruptionBudget(t, intstr.FromString("28%"))
add(t, dc.pdbStore, pdb)
dc.sync(pdbName)
// This verifies that when a PDB has 0 pods, disruptions are not allowed.
ps.VerifyDisruptionAllowed(t, pdbName, 0)
pod, _ := newPod(t, "unmanaged")
add(t, dc.podStore, pod)
dc.sync(pdbName)
var pods []*v1.Pod
pods = append(pods, pod)
_, unmanagedPods, _ := dc.getExpectedScale(pdb, pods)
if len(unmanagedPods) != 1 {
t.Fatalf("expected one pod to be unmanaged pod but found %d", len(unmanagedPods))
}
ps.VerifyNoStatusError(t, pdbName)
}
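For reference, one way to run just the two new unit tests locally would be a targeted go test invocation; the package path below assumes the usual in-tree location of the disruption controller and is not part of this commit.

go test ./pkg/controller/disruption/... -run 'TestStatusForUnmanagedPod|TestTotalUnmanagedPods'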
// Verify that we count the scale of a ReplicaSet even when it has no Deployment.
func TestReplicaSet(t *testing.T) {
dc, ps := newFakeDisruptionController()
@@ -752,7 +803,7 @@ func TestReplicationController(t *testing.T) {
rogue, _ := newPod(t, "rogue")
add(t, dc.podStore, rogue)
dc.sync(pdbName)
ps.VerifyDisruptionAllowed(t, pdbName, 0)
ps.VerifyDisruptionAllowed(t, pdbName, 2)
}
func TestStatefulSetController(t *testing.T) {


@@ -19,6 +19,8 @@ package apps
import (
"context"
"fmt"
"github.com/onsi/gomega"
"strings"
"time"
jsonpatch "github.com/evanphx/json-patch"
@@ -185,6 +187,25 @@ var _ = SIGDescribe("DisruptionController", func() {
framework.ExpectEmpty(patched.Status.DisruptedPods, "Expecting the PodDisruptionBudget's be empty")
})
// PDB shouldn't error out when there are unmanaged pods
ginkgo.It("should observe that the PodDisruptionBudget status is not updated for unmanaged pods",
func() {
createPDBMinAvailableOrDie(cs, ns, defaultName, intstr.FromInt(1), defaultLabels)
createPodsOrDie(cs, ns, 3)
waitForPodsOrDie(cs, ns, 3)
// Since we allow unmanaged pods to be associated with a PDB, we should not see any error
gomega.Consistently(func() (bool, error) {
pdb, err := cs.PolicyV1().PodDisruptionBudgets(ns).Get(context.TODO(), defaultName, metav1.GetOptions{})
if err != nil {
return false, err
}
return isPDBErroring(pdb), nil
}, 1*time.Minute, 1*time.Second).ShouldNot(gomega.BeTrue(), "the PDB status shouldn't error for "+
"unmanaged pods")
})
evictionCases := []struct {
description string
minAvailable intstr.IntOrString
@@ -677,3 +698,15 @@ func unstructuredToPDB(obj *unstructured.Unstructured) (*policyv1.PodDisruptionB
pdb.APIVersion = ""
return pdb, err
}
// isPDBErroring checks whether the PDB status reports a sync failure caused by unmanaged pods
func isPDBErroring(pdb *policyv1.PodDisruptionBudget) bool {
hasFailed := false
for _, condition := range pdb.Status.Conditions {
if strings.Contains(condition.Reason, "SyncFailed") &&
strings.Contains(condition.Message, "found no controller ref for pod") {
hasFailed = true
}
}
return hasFailed
}
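To make the helper above concrete, here is a self-contained sketch of the condition shape isPDBErroring looks for, built from the policy/v1 constants. It is not part of the commit and the pod name is illustrative; with this change the disruption controller should no longer produce such a condition for unmanaged pods, which is exactly what the Consistently assertion verifies.

package main

import (
	"fmt"
	"strings"

	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	pdb := &policyv1.PodDisruptionBudget{
		Status: policyv1.PodDisruptionBudgetStatus{
			Conditions: []metav1.Condition{{
				Type:    policyv1.DisruptionAllowedCondition,
				Status:  metav1.ConditionFalse,
				Reason:  policyv1.SyncFailedReason,
				Message: `found no controller ref for pod "demo-pod"`,
			}},
		},
	}
	for _, c := range pdb.Status.Conditions {
		if strings.Contains(c.Reason, "SyncFailed") &&
			strings.Contains(c.Message, "found no controller ref for pod") {
			fmt.Println("PDB is reporting the sync failure this commit avoids")
		}
	}
}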