From 17d32ed0b8441901a994bce0cc4ecea7fb035e34 Mon Sep 17 00:00:00 2001
From: Clayton Coleman
Date: Tue, 7 Sep 2021 11:49:20 -0400
Subject: [PATCH] kubelet: Rejected pods should be filtered from admission

A pod that has been rejected by admission will have the status manager
set the phase to Failed locally, which may take some time to propagate
to the apiserver. The rejected pod will be included in admission until
the apiserver propagates the change back, an unintended regression
introduced when pod worker state became authoritative. A pod that is
terminal in the API may still be consuming resources on the system, so
it should still be included in admission.
---
 pkg/kubelet/kubelet.go      |  2 +-
 pkg/kubelet/kubelet_pods.go | 34 ++++++++++++++++++++++++----------
 pkg/kubelet/kubelet_test.go | 34 +++++++++++++++++++++++++---------
 3 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index 9498eb47279..58aec52cbfd 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -2217,7 +2217,7 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
 		if !kl.podWorkers.IsPodTerminationRequested(pod.UID) {
 			// We failed pods that we rejected, so activePods include all admitted
 			// pods that are alive.
-			activePods := kl.filterOutTerminatedPods(existingPods)
+			activePods := kl.filterOutInactivePods(existingPods)
 
 			// Check if we can admit the pod; if not, reject it.
 			if ok, reason, message := kl.canAdmitPod(activePods, pod); !ok {
diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go
index 9a8b8cc8a30..1edbbb5113b 100644
--- a/pkg/kubelet/kubelet_pods.go
+++ b/pkg/kubelet/kubelet_pods.go
@@ -92,16 +92,17 @@ func (kl *Kubelet) listPodsFromDisk() ([]types.UID, error) {
 	return pods, nil
 }
 
-// GetActivePods returns pods that may have a running container (a
-// terminated pod is one that is known to have no running containers and
-// will not get any more).
+// GetActivePods returns pods that have been admitted to the kubelet that
+// are not fully terminated. This is mapped to the "desired state" of the
+// kubelet - what pods should be running.
 //
-// TODO: This method must include pods that have been force deleted from
-// the config source (and thus removed from the pod manager) but are still
-// terminating.
+// WARNING: Currently this list does not include pods that have been force
+// deleted but may still be terminating, which means resources assigned to
+// those pods during admission may still be in use. See
+// https://github.com/kubernetes/kubernetes/issues/104824
 func (kl *Kubelet) GetActivePods() []*v1.Pod {
 	allPods := kl.podManager.GetPods()
-	activePods := kl.filterOutTerminatedPods(allPods)
+	activePods := kl.filterOutInactivePods(allPods)
 	return activePods
 }
 
@@ -968,19 +969,32 @@ func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
 	return kl.PodResourcesAreReclaimed(pod, status)
 }
 
-// filterOutTerminatedPods returns pods that are not in a terminal phase
+// filterOutInactivePods returns pods that are not in a terminal phase
 // or are known to be fully terminated. This method should only be used
 // when the set of pods being filtered is upstream of the pod worker, i.e.
 // the pods the pod manager is aware of.
-func (kl *Kubelet) filterOutTerminatedPods(pods []*v1.Pod) []*v1.Pod {
+func (kl *Kubelet) filterOutInactivePods(pods []*v1.Pod) []*v1.Pod {
 	filteredPods := make([]*v1.Pod, 0, len(pods))
 	for _, p := range pods {
+		// if a pod is fully terminated by UID, it should be excluded from the
+		// list of pods
 		if kl.podWorkers.IsPodKnownTerminated(p.UID) {
 			continue
 		}
-		if p.Status.Phase == v1.PodSucceeded || p.Status.Phase == v1.PodFailed {
+
+		// terminal pods are considered inactive UNLESS they are actively terminating
+		isTerminal := p.Status.Phase == v1.PodSucceeded || p.Status.Phase == v1.PodFailed
+		if !isTerminal {
+			// a pod that has been marked terminal within the Kubelet is considered
+			// inactive (may have been rejected by Kubelet admission)
+			if status, ok := kl.statusManager.GetPodStatus(p.UID); ok {
+				isTerminal = status.Phase == v1.PodSucceeded || status.Phase == v1.PodFailed
+			}
+		}
+		if isTerminal && !kl.podWorkers.IsPodTerminationRequested(p.UID) {
 			continue
 		}
+
 		filteredPods = append(filteredPods, p)
 	}
 	return filteredPods
diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go
index b95d82f743d..2c072216ee3 100644
--- a/pkg/kubelet/kubelet_test.go
+++ b/pkg/kubelet/kubelet_test.go
@@ -1418,15 +1418,18 @@ func TestNetworkErrorsWithoutHostNetwork(t *testing.T) {
 	assert.NoError(t, err, "expected pod with hostNetwork=true to succeed when network in error")
 }
 
-func TestFilterOutTerminatedPods(t *testing.T) {
+func TestFilterOutInactivePods(t *testing.T) {
 	testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
 	defer testKubelet.Cleanup()
 	kubelet := testKubelet.kubelet
-	pods := newTestPods(5)
+	pods := newTestPods(8)
 	now := metav1.NewTime(time.Now())
+
+	// terminal pods are excluded
 	pods[0].Status.Phase = v1.PodFailed
 	pods[1].Status.Phase = v1.PodSucceeded
-	// The pod is terminating, should not filter out.
+
+	// deleted pod is included unless it's known to be terminated
 	pods[2].Status.Phase = v1.PodRunning
 	pods[2].DeletionTimestamp = &now
 	pods[2].Status.ContainerStatuses = []v1.ContainerStatus{
@@ -1436,18 +1439,31 @@ func TestFilterOutTerminatedPods(t *testing.T) {
 			},
 		}},
 	}
+
+	// pending and running pods are included
 	pods[3].Status.Phase = v1.PodPending
 	pods[4].Status.Phase = v1.PodRunning
 
-	kubelet.podWorkers.(*fakePodWorkers).running = map[types.UID]bool{
-		pods[2].UID: true,
-		pods[3].UID: true,
-		pods[4].UID: true,
+	// pod that is running but has been rejected by admission is excluded
+	pods[5].Status.Phase = v1.PodRunning
+	kubelet.statusManager.SetPodStatus(pods[5], v1.PodStatus{Phase: v1.PodFailed})
+
+	// pod that is running according to the api but is known terminated is excluded
+	pods[6].Status.Phase = v1.PodRunning
+	kubelet.podWorkers.(*fakePodWorkers).terminated = map[types.UID]bool{
+		pods[6].UID: true,
 	}
 
-	expected := []*v1.Pod{pods[2], pods[3], pods[4]}
+	// pod that is failed but still terminating is included (it may still be consuming
+	// resources)
+	pods[7].Status.Phase = v1.PodFailed
+	kubelet.podWorkers.(*fakePodWorkers).terminationRequested = map[types.UID]bool{
+		pods[7].UID: true,
+	}
+
+	expected := []*v1.Pod{pods[2], pods[3], pods[4], pods[7]}
 	kubelet.podManager.SetPods(pods)
-	actual := kubelet.filterOutTerminatedPods(pods)
+	actual := kubelet.filterOutInactivePods(pods)
 	assert.Equal(t, expected, actual)
 }
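
Reviewer note (not part of the patch): below is a minimal, self-contained
sketch of the decision table filterOutInactivePods now implements. The pod
struct and its fields are hypothetical stand-ins for *v1.Pod plus the pod
worker and status manager state the kubelet consults; only the filtering
rules themselves are taken from the change above.

package main

import "fmt"

// phase mirrors the v1.PodPhase values the filter cares about.
type phase string

const (
	running   phase = "Running"
	succeeded phase = "Succeeded"
	failed    phase = "Failed"
)

// pod is a hypothetical stand-in for *v1.Pod together with the pod worker
// and status manager state consulted by filterOutInactivePods.
type pod struct {
	name                 string
	apiPhase             phase // phase recorded in the apiserver
	localPhase           phase // phase set locally by the status manager ("" if unset)
	knownTerminated      bool  // pod worker has fully terminated the pod
	terminationRequested bool  // pod worker is still tearing the pod down
}

// isActive applies the same rules as the patched filterOutInactivePods:
//  1. pods the pod worker knows are fully terminated are never active;
//  2. a pod terminal in the API, or terminal locally (e.g. rejected by
//     admission before the apiserver has caught up), is inactive UNLESS
//     it is still terminating and may be consuming resources.
func isActive(p pod) bool {
	if p.knownTerminated {
		return false
	}
	isTerminal := p.apiPhase == succeeded || p.apiPhase == failed
	if !isTerminal && p.localPhase != "" {
		isTerminal = p.localPhase == succeeded || p.localPhase == failed
	}
	return !isTerminal || p.terminationRequested
}

func main() {
	for _, p := range []pod{
		{name: "running", apiPhase: running},
		{name: "rejected-by-admission", apiPhase: running, localPhase: failed},
		{name: "failed-still-terminating", apiPhase: failed, terminationRequested: true},
		{name: "fully-terminated", apiPhase: running, knownTerminated: true},
	} {
		fmt.Printf("%-26s active=%v\n", p.name, isActive(p))
	}
}

The four cases mirror pods[4], pods[5], pods[7], and pods[6] from the test:
only "running" and "failed-still-terminating" print active=true, so a pod
rejected by admission drops out of the active set immediately, while a
failed pod that is still terminating stays in it until its resources are
reclaimed.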