From 2c116055f7eb4c0a917010008c092a21dc4b47dc Mon Sep 17 00:00:00 2001
From: ravisantoshgudimetla
Date: Thu, 1 Jul 2021 16:10:17 -0400
Subject: [PATCH] [disruptioncontroller] Don't error for unmanaged pods

As of now, we allow PDBs to be applied to pods via selectors, so there
can be unmanaged pods (pods that don't have a backing controller) that
still have a PDB associated with them. Such pods are now logged instead
of immediately raising a sync error. This keeps the disruption
controller from frequently updating the status subresource and thus
prevents excessive and expensive writes to etcd.
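To illustrate the distinction the controller now makes, here is a
minimal, self-contained sketch (not part of the patch) of the underlying
check: metav1.GetControllerOf returns nil for a pod that has no
controlling owner reference, and such pods are now collected instead of
being treated as a sync error. The pod and ReplicaSet names below are
made up.

    package main

    import (
    	"fmt"

    	v1 "k8s.io/api/core/v1"
    	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    func main() {
    	isController := true

    	// A pod owned by a controller (hypothetical ReplicaSet "example-rs").
    	managed := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
    		Name: "managed-pod",
    		OwnerReferences: []metav1.OwnerReference{{
    			APIVersion: "apps/v1",
    			Kind:       "ReplicaSet",
    			Name:       "example-rs",
    			Controller: &isController,
    		}},
    	}}
    	// A bare pod with no owner references at all.
    	unmanaged := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "bare-pod"}}

    	var unmanagedPods []string
    	for _, pod := range []*v1.Pod{managed, unmanaged} {
    		if metav1.GetControllerOf(pod) == nil {
    			// No backing controller: remember the pod instead of failing the sync.
    			unmanagedPods = append(unmanagedPods, pod.Name)
    		}
    	}
    	fmt.Println("unmanaged pods:", unmanagedPods) // prints: unmanaged pods: [bare-pod]
    }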
---
 pkg/controller/disruption/disruption.go      | 29 +++++++----
 pkg/controller/disruption/disruption_test.go | 53 +++++++++++++++++++-
 test/e2e/apps/disruption.go                  | 33 ++++++++++++
 3 files changed, 105 insertions(+), 10 deletions(-)

diff --git a/pkg/controller/disruption/disruption.go b/pkg/controller/disruption/disruption.go
index 24266da425c..8e2b6c99d8b 100644
--- a/pkg/controller/disruption/disruption.go
+++ b/pkg/controller/disruption/disruption.go
@@ -595,11 +595,16 @@ func (dc *DisruptionController) trySync(pdb *policy.PodDisruptionBudget) error {
 		dc.recorder.Eventf(pdb, v1.EventTypeNormal, "NoPods", "No matching pods found")
 	}
 
-	expectedCount, desiredHealthy, err := dc.getExpectedPodCount(pdb, pods)
+	expectedCount, desiredHealthy, unmanagedPods, err := dc.getExpectedPodCount(pdb, pods)
 	if err != nil {
 		dc.recorder.Eventf(pdb, v1.EventTypeWarning, "CalculateExpectedPodCountFailed", "Failed to calculate the number of expected pods: %v", err)
 		return err
 	}
+	// We have unmanaged pods; instead of erroring and hot-looping in the disruption controller, log and continue.
+	if len(unmanagedPods) > 0 {
+		klog.V(4).Infof("found unmanaged pods associated with this PDB: %v",
+			strings.Join(unmanagedPods, ", "))
+	}
 
 	currentTime := time.Now()
 	disruptedPods, recheckTime := dc.buildDisruptedPodMap(pods, pdb, currentTime)
@@ -615,7 +620,7 @@ func (dc *DisruptionController) trySync(pdb *policy.PodDisruptionBudget) error {
 	return err
 }
 
-func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount, desiredHealthy int32, err error) {
+func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount, desiredHealthy int32, unmanagedPods []string, err error) {
 	err = nil
 	// TODO(davidopp): consider making the way expectedCount and rules about
 	// permitted controller configurations (specifically, considering it an error
@@ -623,7 +628,7 @@ func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBud
 	// handled the same way for integer and percentage minAvailable
 
 	if pdb.Spec.MaxUnavailable != nil {
-		expectedCount, err = dc.getExpectedScale(pdb, pods)
+		expectedCount, unmanagedPods, err = dc.getExpectedScale(pdb, pods)
 		if err != nil {
 			return
 		}
@@ -641,7 +646,7 @@ func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBud
 			desiredHealthy = pdb.Spec.MinAvailable.IntVal
 			expectedCount = int32(len(pods))
 		} else if pdb.Spec.MinAvailable.Type == intstr.String {
-			expectedCount, err = dc.getExpectedScale(pdb, pods)
+			expectedCount, unmanagedPods, err = dc.getExpectedScale(pdb, pods)
 			if err != nil {
 				return
 			}
@@ -657,7 +662,7 @@ func (dc *DisruptionController) getExpectedPodCount(pdb *policy.PodDisruptionBud
 	return
 }
 
-func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount int32, err error) {
+func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget, pods []*v1.Pod) (expectedCount int32, unmanagedPods []string, err error) {
 	// When the user specifies a fraction of pods that must be available, we
 	// use as the fraction's denominator
 	// SUM_{all c in C} scale(c)
@@ -672,13 +677,19 @@ func (dc *DisruptionController) getExpectedScale(pdb *policy.PodDisruptionBudget
 	// A mapping from controllers to their scale.
 	controllerScale := map[types.UID]int32{}
 
-	// 1. Find the controller for each pod. If any pod has 0 controllers,
-	// that's an error. With ControllerRef, a pod can only have 1 controller.
+	// 1. Find the controller for each pod.
+
+	// As of now, we allow PDBs to be applied to pods via selectors, so there
+	// can be unmanaged pods (pods without a backing controller) that still
+	// have a PDB associated with them. Such pods are collected and logged by
+	// the caller instead of immediately returning a sync error, so the
+	// disruption controller does not write the status subresource to etcd excessively.
+	// With ControllerRef, a pod can only have 1 controller.
 	for _, pod := range pods {
 		controllerRef := metav1.GetControllerOf(pod)
 		if controllerRef == nil {
-			err = fmt.Errorf("found no controller ref for pod %q", pod.Name)
-			return
+			unmanagedPods = append(unmanagedPods, pod.Name)
+			continue
 		}
 
 		// If we already know the scale of the controller there is no need to do anything.
diff --git a/pkg/controller/disruption/disruption_test.go b/pkg/controller/disruption/disruption_test.go
index d542bef235f..f29433c7b89 100644
--- a/pkg/controller/disruption/disruption_test.go
+++ b/pkg/controller/disruption/disruption_test.go
@@ -22,6 +22,7 @@ import (
 	"fmt"
 	"os"
 	"runtime/debug"
+	"strings"
 	"sync"
 	"testing"
 	"time"
@@ -115,6 +116,15 @@ func (ps *pdbStates) VerifyDisruptionAllowed(t *testing.T, key string, disruptio
 	}
 }
 
+func (ps *pdbStates) VerifyNoStatusError(t *testing.T, key string) {
+	pdb := ps.Get(key)
+	for _, condition := range pdb.Status.Conditions {
+		if strings.Contains(condition.Message, "found no controller ref") && condition.Reason == policy.SyncFailedReason {
+			t.Fatalf("the disruption controller should not error when unmanaged pods are found, but it failed for %q", key)
+		}
+	}
+}
+
 type disruptionController struct {
 	*DisruptionController
 
@@ -534,6 +544,47 @@ func TestNakedPod(t *testing.T) {
 	ps.VerifyDisruptionAllowed(t, pdbName, 0)
 }
 
+// Verify that the disruption controller does not error when unmanaged pods are found.
+func TestStatusForUnmanagedPod(t *testing.T) {
+	dc, ps := newFakeDisruptionController()
+
+	pdb, pdbName := newMinAvailablePodDisruptionBudget(t, intstr.FromString("28%"))
+	add(t, dc.pdbStore, pdb)
+	dc.sync(pdbName)
+	// This verifies that when a PDB has 0 pods, disruptions are not allowed.
+	ps.VerifyDisruptionAllowed(t, pdbName, 0)
+
+	pod, _ := newPod(t, "unmanaged")
+	add(t, dc.podStore, pod)
+	dc.sync(pdbName)
+
+	ps.VerifyNoStatusError(t, pdbName)
+
+}
+
+// Verify that unmanaged pods are correctly collected.
+func TestTotalUnmanagedPods(t *testing.T) {
+	dc, ps := newFakeDisruptionController()
+
+	pdb, pdbName := newMinAvailablePodDisruptionBudget(t, intstr.FromString("28%"))
+	add(t, dc.pdbStore, pdb)
+	dc.sync(pdbName)
+	// This verifies that when a PDB has 0 pods, disruptions are not allowed.
+	ps.VerifyDisruptionAllowed(t, pdbName, 0)
+
+	pod, _ := newPod(t, "unmanaged")
+	add(t, dc.podStore, pod)
+	dc.sync(pdbName)
+	var pods []*v1.Pod
+	pods = append(pods, pod)
+	_, unmanagedPods, _ := dc.getExpectedScale(pdb, pods)
+	if len(unmanagedPods) != 1 {
+		t.Fatalf("expected exactly one unmanaged pod, but found %d", len(unmanagedPods))
+	}
+	ps.VerifyNoStatusError(t, pdbName)
+
+}
+
 // Verify that we count the scale of a ReplicaSet even when it has no Deployment.
 func TestReplicaSet(t *testing.T) {
 	dc, ps := newFakeDisruptionController()
@@ -752,7 +803,7 @@ func TestReplicationController(t *testing.T) {
 	rogue, _ := newPod(t, "rogue")
 	add(t, dc.podStore, rogue)
 	dc.sync(pdbName)
-	ps.VerifyDisruptionAllowed(t, pdbName, 0)
+	ps.VerifyDisruptionAllowed(t, pdbName, 2)
 }
 
 func TestStatefulSetController(t *testing.T) {
diff --git a/test/e2e/apps/disruption.go b/test/e2e/apps/disruption.go
index 46ce07e5d46..c10d0588da5 100644
--- a/test/e2e/apps/disruption.go
+++ b/test/e2e/apps/disruption.go
@@ -19,6 +19,8 @@ package apps
 import (
 	"context"
 	"fmt"
+	"github.com/onsi/gomega"
+	"strings"
 	"time"
 
 	jsonpatch "github.com/evanphx/json-patch"
@@ -185,6 +187,25 @@ var _ = SIGDescribe("DisruptionController", func() {
 		framework.ExpectEmpty(patched.Status.DisruptedPods, "Expecting the PodDisruptionBudget's be empty")
 	})
 
+	// The PDB status should not report a sync error when there are unmanaged pods.
+	ginkgo.It("should observe that the PodDisruptionBudget status is not updated for unmanaged pods",
+		func() {
+			createPDBMinAvailableOrDie(cs, ns, defaultName, intstr.FromInt(1), defaultLabels)
+
+			createPodsOrDie(cs, ns, 3)
+			waitForPodsOrDie(cs, ns, 3)
+
+			// Since we allow unmanaged pods to be associated with a PDB, we should not see any error.
+			gomega.Consistently(func() (bool, error) {
+				pdb, err := cs.PolicyV1().PodDisruptionBudgets(ns).Get(context.TODO(), defaultName, metav1.GetOptions{})
+				if err != nil {
+					return false, err
+				}
+				return isPDBErroring(pdb), nil
+			}, 1*time.Minute, 1*time.Second).ShouldNot(gomega.BeTrue(), "the PDB should not report a sync error for "+
+				"unmanaged pods")
+		})
+
 	evictionCases := []struct {
 		description    string
 		minAvailable   intstr.IntOrString
@@ -677,3 +698,15 @@ func unstructuredToPDB(obj *unstructured.Unstructured) (*policyv1.PodDisruptionB
 	pdb.APIVersion = ""
 	return pdb, err
 }
+
+// isPDBErroring reports whether the PDB has a SyncFailed condition caused by unmanaged pods.
+func isPDBErroring(pdb *policyv1.PodDisruptionBudget) bool {
+	hasFailed := false
+	for _, condition := range pdb.Status.Conditions {
+		if strings.Contains(condition.Reason, "SyncFailed") &&
+			strings.Contains(condition.Message, "found no controller ref for pod") {
+			hasFailed = true
+		}
+	}
+	return hasFailed
+}
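For context on what the e2e check above guards against: the failure mode
removed by this patch surfaces as a SyncFailed condition on the PDB
status. A minimal client-go sketch (not part of the patch; the
kubeconfig path, namespace, and PDB name are placeholders) that performs
the same condition check from outside the test framework:

    package main

    import (
    	"context"
    	"fmt"
    	"strings"

    	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    	"k8s.io/client-go/kubernetes"
    	"k8s.io/client-go/tools/clientcmd"
    )

    func main() {
    	// Placeholder kubeconfig path; namespace and PDB name below are also placeholders.
    	config, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
    	if err != nil {
    		panic(err)
    	}
    	cs, err := kubernetes.NewForConfig(config)
    	if err != nil {
    		panic(err)
    	}

    	pdb, err := cs.PolicyV1().PodDisruptionBudgets("default").Get(context.TODO(), "foo", metav1.GetOptions{})
    	if err != nil {
    		panic(err)
    	}

    	// With this patch, a PDB that merely selects unmanaged pods should no longer
    	// carry a SyncFailed condition mentioning a missing controller ref.
    	for _, condition := range pdb.Status.Conditions {
    		if condition.Reason == "SyncFailed" && strings.Contains(condition.Message, "found no controller ref") {
    			fmt.Printf("PDB %s/%s is failing to sync: %s\n", pdb.Namespace, pdb.Name, condition.Message)
    		}
    	}
    }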