[disruptioncontroller] Don't error for unmanaged pods

As of now, we allow PDBs to be applied to pods via
selectors, so there can be unmanaged pods (pods that
don't have a backing controller) that still have a PDB
associated with them. Such pods are to be logged instead
of immediately throwing a sync error. This ensures the
disruption controller is not frequently updating the
status subresource, preventing excessive and expensive
writes to etcd.
ravisantoshgudimetla 2021-07-01 16:10:17 -04:00
parent 1861e4756d
commit 2c116055f7
3 changed files with 105 additions and 10 deletions
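For context, here is a minimal sketch of what "unmanaged" means in this commit: a pod with no controller OwnerReference, which metav1.GetControllerOf reports as nil even though a PDB selector can still match it. This snippet is not part of the commit; the pod name and labels are made up for illustration.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// A bare pod created directly, with no workload controller behind it.
	// A PDB whose selector matches app=demo would still cover it.
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "demo-pod",
			Labels: map[string]string{"app": "demo"},
			// No OwnerReferences, hence no backing controller.
		},
	}
	if metav1.GetControllerOf(pod) == nil {
		fmt.Printf("pod %q is unmanaged; after this change it is logged instead of causing a sync error\n", pod.Name)
	}
}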


@@ -595,11 +595,16 @@ func (dc *DisruptionController) trySync(pdb *policy.PodDisruptionBudget) error {
dc.recorder.Eventf(pdb, v1.EventTypeNormal, "NoPods", "No matching pods found")
}
expectedCount, desiredHealthy, err := dc.getExpectedPodCount(pdb, pods)
expectedCount, desiredHealthy, unmanagedPods, err := dc.getExpectedPodCount(pdb, pods)
if err != nil {
dc.recorder.Eventf(pdb, v1.EventTypeWarning, "CalculateExpectedPodCountFailed", "Failed to calculate the number of expected pods: %v", err)
return err
}
// We have unmanaged pods; instead of erroring and hot-looping in the disruption controller, log and continue.
if len(unmanagedPods) > 0 {
klog.V(4).Infof("found unmanaged pods associated with this PDB: %v",
strings.Join(unmanagedPods, ","))
}
currentTime := time.Now()
disruptedPods, recheckTime := dc.buildDisruptedPodMap(pods, pdb, currentTime)
@@ -615,7 +620,7 @@ func (dc *DisruptionController) trySync(pdb *policy.PodDisruptionBudget) error {
return err
}
func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount, desiredHealthy int32, err error) {
func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount, desiredHealthy int32, unmanagedPods []string, err error) {
err = nil
// TODO(davidopp): consider making the way expectedCount and rules about
// permitted controller configurations (specifically, considering it an error
@@ -623,7 +628,7 @@ func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBud
// handled the same way for integer and percentage minAvailable
if pdb.Spec.MaxUnavailable != nil {
expectedCount, err = dc.getExpectedScale(pdb, pods)
expectedCount, unmanagedPods, err = dc.getExpectedScale(pdb, pods)
if err != nil {
return
}
@@ -641,7 +646,7 @@ func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBud
desiredHealthy = pdb.Spec.MinAvailable.IntVal
expectedCount = int32(len(pods))
} else if pdb.Spec.MinAvailable.Type == intstr.String {
expectedCount, err = dc.getExpectedScale(pdb, pods)
expectedCount, unmanagedPods, err = dc.getExpectedScale(pdb, pods)
if err != nil {
return
}
@@ -657,7 +662,7 @@ func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBud
return
}
func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount int32, err error) {
func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount int32, unmanagedPods []string, err error) {
// When the user specifies a fraction of pods that must be available, we
// use as the fraction's denominator
// SUM_{all c in C} scale(c)
@@ -672,13 +677,19 @@ func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget
// A mapping from controllers to their scale.
controllerScale := map[types.UID]int32{}
// 1. Find the controller for each pod. If any pod has 0 controllers,
// that's an error. With ControllerRef, a pod can only have 1 controller.
// 1. Find the controller for each pod.
// As of now, we allow PDBs to be applied to pods via selectors, so there
// can be unmanaged pods (pods that don't have a backing controller) that
// still have a PDB associated with them. Such pods are to be collected and
// reported back to the caller, which logs them, instead of immediately
// throwing a sync error. This ensures the disruption controller is not
// frequently updating the status subresource and thus prevents excessive
// and expensive writes to etcd.
// With ControllerRef, a pod can only have 1 controller.
for _, pod := range pods {
controllerRef := metav1.GetControllerOf(pod)
if controllerRef == nil {
err = fmt.Errorf("found no controller ref for pod %q", pod.Name)
return
unmanagedPods = append(unmanagedPods, pod.Name)
continue
}
// If we already know the scale of the controller there is no need to do anything.
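To summarize the controller change above, here is a small self-contained sketch (not taken from the commit; the controller name and scale are made up) of the new bookkeeping: pods without a controller ref are collected into unmanagedPods and skipped, so they no longer fail the sync and never contribute to the scale-summed expectedCount.

package main

import "fmt"

func main() {
	// Pretend the PDB selector matched three pods: two owned by a
	// ReplicaSet scaled to 2, plus one naked pod with no owner.
	controllerScale := map[string]int32{"rs-demo": 2}
	unmanagedPods := []string{"naked-pod"}

	var expectedCount int32
	for _, scale := range controllerScale {
		expectedCount += scale
	}
	// Previously the naked pod made the whole sync return an error;
	// now it is only reported back to trySync, which logs it at V(4).
	fmt.Printf("expectedCount=%d unmanaged=%v\n", expectedCount, unmanagedPods)
}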


@@ -22,6 +22,7 @@ import (
"fmt"
"os"
"runtime/debug"
"strings"
"sync"
"testing"
"time"
@@ -115,6 +116,15 @@ func (ps *pdbStates) VerifyDisruptionAllowed(t *testing.T, key string, disruptio
}
}
func (ps *pdbStates) VerifyNoStatusError(t *testing.T, key string) {
pdb := ps.Get(key)
for _, condition := range pdb.Status.Conditions {
if strings.Contains(condition.Message, "found no controller ref") && condition.Reason == policy.SyncFailedReason {
t.Fatalf("PodDisruption Controller should not error when unmanaged pods are found but it failed for %q", key)
}
}
}
type disruptionController struct {
*DisruptionController
@@ -534,6 +544,47 @@ func TestNakedPod(t *testing.T) {
ps.VerifyDisruptionAllowed(t, pdbName, 0)
}
// Verify that the disruption controller does not error out when unmanaged pods are found
func TestStatusForUnmanagedPod(t *testing.T) {
dc, ps := newFakeDisruptionController()
pdb, pdbName := newMinAvailablePodDisruptionBudget(t, intstr.FromString("28%"))
add(t, dc.pdbStore, pdb)
dc.sync(pdbName)
// This verifies that when a PDB has 0 pods, disruptions are not allowed.
ps.VerifyDisruptionAllowed(t, pdbName, 0)
pod, _ := newPod(t, "unmanaged")
add(t, dc.podStore, pod)
dc.sync(pdbName)
ps.VerifyNoStatusError(t, pdbName)
}
// Check that unmanaged pods are correctly collected
func TestTotalUnmanagedPods(t *testing.T) {
dc, ps := newFakeDisruptionController()
pdb, pdbName := newMinAvailablePodDisruptionBudget(t, intstr.FromString("28%"))
add(t, dc.pdbStore, pdb)
dc.sync(pdbName)
// This verifies that when a PDB has 0 pods, disruptions are not allowed.
ps.VerifyDisruptionAllowed(t, pdbName, 0)
pod, _ := newPod(t, "unmanaged")
add(t, dc.podStore, pod)
dc.sync(pdbName)
var pods []*v1.Pod
pods = append(pods, pod)
_, unmanagedPods, _ := dc.getExpectedScale(pdb, pods)
if len(unmanagedPods) != 1 {
t.Fatalf("expected one pod to be unmanaged pod but found %d", len(unmanagedPods))
}
ps.VerifyNoStatusError(t, pdbName)
}
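For reference, one way to run just the two new unit tests locally would be a targeted go test invocation; the package path below assumes the usual in-tree location of the disruption controller and is not part of this commit.

go test ./pkg/controller/disruption/... -run 'TestStatusForUnmanagedPod|TestTotalUnmanagedPods'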
// Verify that we count the scale of a ReplicaSet even when it has no Deployment.
func TestReplicaSet(t *testing.T) {
dc, ps := newFakeDisruptionController()
@@ -752,7 +803,7 @@ func TestReplicationController(t *testing.T) {
rogue, _ := newPod(t, "rogue")
add(t, dc.podStore, rogue)
dc.sync(pdbName)
ps.VerifyDisruptionAllowed(t, pdbName, 0)
ps.VerifyDisruptionAllowed(t, pdbName, 2)
}
func TestStatefulSetController(t *testing.T) {


@@ -19,6 +19,8 @@ package apps
import (
"context"
"fmt"
"github.com/onsi/gomega"
"strings"
"time"
jsonpatch "github.com/evanphx/json-patch"
@@ -185,6 +187,25 @@ var _ = SIGDescribe("DisruptionController", func() {
framework.ExpectEmpty(patched.Status.DisruptedPods, "Expecting the PodDisruptionBudget's be empty")
})
// PDB shouldn't error out when there are unmanaged pods
ginkgo.It("should observe that the PodDisruptionBudget status is not updated for unmanaged pods",
func() {
createPDBMinAvailableOrDie(cs, ns, defaultName, intstr.FromInt(1), defaultLabels)
createPodsOrDie(cs, ns, 3)
waitForPodsOrDie(cs, ns, 3)
// Since we allow unmanaged pods to be associated with a PDB, we should not see any error
gomega.Consistently(func() (bool, error) {
pdb, err := cs.PolicyV1().PodDisruptionBudgets(ns).Get(context.TODO(), defaultName, metav1.GetOptions{})
if err != nil {
return false, err
}
return isPDBErroring(pdb), nil
}, 1*time.Minute, 1*time.Second).ShouldNot(gomega.BeTrue(), "the PDB status shouldn't error for "+
"unmanaged pods")
})
evictionCases := []struct {
description string
minAvailable intstr.IntOrString
@@ -677,3 +698,15 @@ func unstructuredToPDB(obj *unstructured.Unstructured) (*policyv1.PodDisruptionB
pdb.APIVersion = ""
return pdb, err
}
// isPDBErroring checks whether the PDB status reports a sync failure caused by unmanaged pods
func isPDBErroring(pdb *policyv1.PodDisruptionBudget) bool {
hasFailed := false
for _, condition := range pdb.Status.Conditions {
if strings.Contains(condition.Reason, "SyncFailed") &&
strings.Contains(condition.Message, "found no controller ref for pod") {
hasFailed = true
}
}
return hasFailed
}
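To make the helper above concrete, here is a self-contained sketch of the condition shape isPDBErroring looks for, built from the policy/v1 constants. It is not part of the commit and the pod name is illustrative; with this change the disruption controller should no longer produce such a condition for unmanaged pods, which is exactly what the Consistently assertion verifies.

package main

import (
	"fmt"
	"strings"

	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	pdb := &policyv1.PodDisruptionBudget{
		Status: policyv1.PodDisruptionBudgetStatus{
			Conditions: []metav1.Condition{{
				Type:    policyv1.DisruptionAllowedCondition,
				Status:  metav1.ConditionFalse,
				Reason:  policyv1.SyncFailedReason,
				Message: `found no controller ref for pod "demo-pod"`,
			}},
		},
	}
	for _, c := range pdb.Status.Conditions {
		if strings.Contains(c.Reason, "SyncFailed") &&
			strings.Contains(c.Message, "found no controller ref for pod") {
			fmt.Println("PDB is reporting the sync failure this commit avoids")
		}
	}
}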