diff --git a/pkg/controller/disruption/disruption.go b/pkg/controller/disruption/disruption.go index 24266da425c..8e2b6c99d8b 100644 --- a/pkg/controller/disruption/disruption.go +++ b/pkg/controller/disruption/disruption.go @@ -595,11 +595,16 @@ func (dc *DisruptionController) trySync(pdb *policy.PodDisruptionBudget) error { dc.recorder.Eventf(pdb, v1.EventTypeNormal, "NoPods", "No matching pods found") } - expectedCount, desiredHealthy, err := dc.getExpectedPodCount(pdb, pods) + expectedCount, desiredHealthy, unmanagedPods, err := dc.getExpectedPodCount(pdb, pods) if err != nil { dc.recorder.Eventf(pdb, v1.EventTypeWarning, "CalculateExpectedPodCountFailed", "Failed to calculate the number of expected pods: %v", err) return err } + // We have unmanaged pods; instead of erroring and hot-looping in the disruption controller, log and continue. + if len(unmanagedPods) > 0 { + klog.V(4).Infof("found unmanaged pods associated with this PDB: %v", + strings.Join(unmanagedPods, ",")) + } currentTime := time.Now() disruptedPods, recheckTime := dc.buildDisruptedPodMap(pods, pdb, currentTime) @@ -615,7 +620,7 @@ func (dc *DisruptionController) trySync(pdb *policy.PodDisruptionBudget) error { return err } -func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount, desiredHealthy int32, err error) { +func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount, desiredHealthy int32, unmanagedPods []string, err error) { err = nil // TODO(davidopp): consider making the way expectedCount and rules about // permitted controller configurations (specifically, considering it an error // handled the same way for integer and percentage minAvailable if pdb.Spec.MaxUnavailable != nil { - expectedCount, err = dc.getExpectedScale(pdb, pods) + expectedCount, unmanagedPods, err = 
dc.getExpectedScale(pdb, pods) if err != nil { return } @@ -641,7 +646,7 @@ func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBud desiredHealthy = pdb.Spec.MinAvailable.IntVal expectedCount = int32(len(pods)) } else if pdb.Spec.MinAvailable.Type == intstr.String { - expectedCount, err = dc.getExpectedScale(pdb, pods) + expectedCount, unmanagedPods, err = dc.getExpectedScale(pdb, pods) if err != nil { return } @@ -657,7 +662,7 @@ func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBud return } -func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount int32, err error) { +func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount int32, unmanagedPods []string, err error) { // When the user specifies a fraction of pods that must be available, we // use as the fraction's denominator // SUM_{all c in C} scale(c) @@ -672,13 +677,19 @@ func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget // A mapping from controllers to their scale. controllerScale := map[types.UID]int32{} - // 1. Find the controller for each pod. If any pod has 0 controllers, - // that's an error. With ControllerRef, a pod can only have 1 controller. + // 1. Find the controller for each pod. + + // As of now, we allow PDBs to be applied to pods via selectors, so there + // can be unmanaged pods (pods that don't have backing controllers) but still have PDBs associated. + // Such pods are to be collected and the PDB backing them should be enqueued instead of immediately throwing + // a sync error. This ensures the disruption controller is not frequently updating the status subresource and thus + // preventing excessive and expensive writes to etcd. + // With ControllerRef, a pod can only have 1 controller. 
for _, pod := range pods { controllerRef := metav1.GetControllerOf(pod) if controllerRef == nil { - err = fmt.Errorf("found no controller ref for pod %q", pod.Name) - return + unmanagedPods = append(unmanagedPods, pod.Name) + continue } // If we already know the scale of the controller there is no need to do anything. diff --git a/pkg/controller/disruption/disruption_test.go b/pkg/controller/disruption/disruption_test.go index d542bef235f..f29433c7b89 100644 --- a/pkg/controller/disruption/disruption_test.go +++ b/pkg/controller/disruption/disruption_test.go @@ -22,6 +22,7 @@ import ( "fmt" "os" "runtime/debug" + "strings" "sync" "testing" "time" @@ -115,6 +116,15 @@ func (ps *pdbStates) VerifyDisruptionAllowed(t *testing.T, key string, disruptio } } +func (ps *pdbStates) VerifyNoStatusError(t *testing.T, key string) { + pdb := ps.Get(key) + for _, condition := range pdb.Status.Conditions { + if strings.Contains(condition.Message, "found no controller ref") && condition.Reason == policy.SyncFailedReason { + t.Fatalf("PodDisruption Controller should not error when unmanaged pods are found but it failed for %q", key) + } + } +} + type disruptionController struct { *DisruptionController @@ -534,6 +544,47 @@ func TestNakedPod(t *testing.T) { ps.VerifyDisruptionAllowed(t, pdbName, 0) } +// Verify that disruption controller is not erroring when unmanaged pods are found +func TestStatusForUnmanagedPod(t *testing.T) { + dc, ps := newFakeDisruptionController() + + pdb, pdbName := newMinAvailablePodDisruptionBudget(t, intstr.FromString("28%")) + add(t, dc.pdbStore, pdb) + dc.sync(pdbName) + // This verifies that when a PDB has 0 pods, disruptions are not allowed. 
+ ps.VerifyDisruptionAllowed(t, pdbName, 0) + + pod, _ := newPod(t, "unmanaged") + add(t, dc.podStore, pod) + dc.sync(pdbName) + + ps.VerifyNoStatusError(t, pdbName) + +} + +// Check if the unmanaged pods are correctly collected or not +func TestTotalUnmanagedPods(t *testing.T) { + dc, ps := newFakeDisruptionController() + + pdb, pdbName := newMinAvailablePodDisruptionBudget(t, intstr.FromString("28%")) + add(t, dc.pdbStore, pdb) + dc.sync(pdbName) + // This verifies that when a PDB has 0 pods, disruptions are not allowed. + ps.VerifyDisruptionAllowed(t, pdbName, 0) + + pod, _ := newPod(t, "unmanaged") + add(t, dc.podStore, pod) + dc.sync(pdbName) + var pods []*v1.Pod + pods = append(pods, pod) + _, unmanagedPods, _ := dc.getExpectedScale(pdb, pods) + if len(unmanagedPods) != 1 { + t.Fatalf("expected one pod to be unmanaged pod but found %d", len(unmanagedPods)) + } + ps.VerifyNoStatusError(t, pdbName) + +} + // Verify that we count the scale of a ReplicaSet even when it has no Deployment. 
func TestReplicaSet(t *testing.T) { dc, ps := newFakeDisruptionController() @@ -752,7 +803,7 @@ func TestReplicationController(t *testing.T) { rogue, _ := newPod(t, "rogue") add(t, dc.podStore, rogue) dc.sync(pdbName) - ps.VerifyDisruptionAllowed(t, pdbName, 0) + ps.VerifyDisruptionAllowed(t, pdbName, 2) } func TestStatefulSetController(t *testing.T) { diff --git a/test/e2e/apps/disruption.go b/test/e2e/apps/disruption.go index 46ce07e5d46..c10d0588da5 100644 --- a/test/e2e/apps/disruption.go +++ b/test/e2e/apps/disruption.go @@ -19,6 +19,8 @@ package apps import ( "context" "fmt" + "github.com/onsi/gomega" + "strings" "time" jsonpatch "github.com/evanphx/json-patch" @@ -185,6 +187,25 @@ var _ = SIGDescribe("DisruptionController", func() { framework.ExpectEmpty(patched.Status.DisruptedPods, "Expecting the PodDisruptionBudget's be empty") }) + // PDB shouldn't error out when there are unmanaged pods + ginkgo.It("should observe that the PodDisruptionBudget status is not updated for unmanaged pods", + func() { + createPDBMinAvailableOrDie(cs, ns, defaultName, intstr.FromInt(1), defaultLabels) + + createPodsOrDie(cs, ns, 3) + waitForPodsOrDie(cs, ns, 3) + + // Since we allow unmanaged pods to be associated with a PDB, we should not see any error + gomega.Consistently(func() (bool, error) { + pdb, err := cs.PolicyV1().PodDisruptionBudgets(ns).Get(context.TODO(), defaultName, metav1.GetOptions{}) + if err != nil { + return false, err + } + return isPDBErroring(pdb), nil + }, 1*time.Minute, 1*time.Second).ShouldNot(gomega.BeTrue(), "pod shouldn't error for "+ + "unmanaged pod") + }) + evictionCases := []struct { description string minAvailable intstr.IntOrString @@ -677,3 +698,15 @@ func unstructuredToPDB(obj *unstructured.Unstructured) (*policyv1.PodDisruptionB pdb.APIVersion = "" return pdb, err } + +// isPDBErroring checks if the PDB is erroring on when there are unmanaged pods +func isPDBErroring(pdb *policyv1.PodDisruptionBudget) bool { + hasFailed := false + for _, 
condition := range pdb.Status.Conditions { + if strings.Contains(condition.Reason, "SyncFailed") && + strings.Contains(condition.Message, "found no controller ref for pod") { + hasFailed = true + } + } + return hasFailed +}