From a2ca66d280df83c5d32e058ef97a3d117a8309ed Mon Sep 17 00:00:00 2001
From: Clayton Coleman
Date: Wed, 25 Aug 2021 11:10:10 -0400
Subject: [PATCH] kubelet: Admission must exclude completed pods and avoid races

Fixes two issues with how the pod worker refactor calculated the pods
that admission could see (GetActivePods() and filterOutTerminatedPods()):

First, completed pods must be filtered from the "desired" state for
admission, which arguably should be happening earlier in config.
Exclude the two terminal pod states from GetActivePods().

Second, the previous check introduced with the pod worker lifecycle
ownership changes was subtly wrong for the admission use case.
Admission has to include pods that haven't yet hit the pod worker,
which CouldHaveRunningContainers was filtering out (because the pod
worker hasn't seen them). Introduce a weaker check -
IsPodKnownTerminated() - that returns true only if the pod is in a
known terminated state (no running containers AND known to the pod
worker). This weaker check may only be called from components that
need admitted pods, not other kubelet subsystems.

This commit does not fix the long-standing bug that force-deleted
pods are omitted from admission checks, which must be fixed by having
GetActivePods() also include pods "still terminating".
---
 pkg/kubelet/kubelet_pods.go     | 17 +++++++++++++----
 pkg/kubelet/pod_workers.go      | 18 ++++++++++++++++++
 pkg/kubelet/pod_workers_test.go |  6 ++++++
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go
index 1d78950e77a..9a8b8cc8a30 100644
--- a/pkg/kubelet/kubelet_pods.go
+++ b/pkg/kubelet/kubelet_pods.go
@@ -95,6 +95,10 @@ func (kl *Kubelet) listPodsFromDisk() ([]types.UID, error) {
 // GetActivePods returns pods that may have a running container (a
 // terminated pod is one that is known to have no running containers and
 // will not get any more).
+//
+// TODO: This method must include pods that have been force deleted from
+// the config source (and thus removed from the pod manager) but are still
+// terminating.
 func (kl *Kubelet) GetActivePods() []*v1.Pod {
 	allPods := kl.podManager.GetPods()
 	activePods := kl.filterOutTerminatedPods(allPods)
@@ -964,12 +968,17 @@ func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
 	return kl.PodResourcesAreReclaimed(pod, status)
 }
 
-// filterOutTerminatedPods returns the pods that could still have running
-// containers
+// filterOutTerminatedPods returns pods that are not in a terminal phase
+// and are not known to be fully terminated. This method should only be
+// used when the set of pods being filtered is upstream of the pod worker,
+// i.e. the pods the pod manager is aware of.
 func (kl *Kubelet) filterOutTerminatedPods(pods []*v1.Pod) []*v1.Pod {
-	var filteredPods []*v1.Pod
+	filteredPods := make([]*v1.Pod, 0, len(pods))
 	for _, p := range pods {
-		if !kl.podWorkers.CouldHaveRunningContainers(p.UID) {
+		if kl.podWorkers.IsPodKnownTerminated(p.UID) {
+			continue
+		}
+		if p.Status.Phase == v1.PodSucceeded || p.Status.Phase == v1.PodFailed {
 			continue
 		}
 		filteredPods = append(filteredPods, p)
diff --git a/pkg/kubelet/pod_workers.go b/pkg/kubelet/pod_workers.go
index 297900cca1c..9c855fef8d5 100644
--- a/pkg/kubelet/pod_workers.go
+++ b/pkg/kubelet/pod_workers.go
@@ -130,6 +130,14 @@ type PodWorkers interface {
 	// true.
 	SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkType
 
+	// IsPodKnownTerminated returns true if the provided pod UID is known by the pod
+	// worker to be terminated. If the pod has been force deleted and the pod worker
+	// has completed termination, this method will return false, so this method should
+	// only be used to filter out pods from the desired set, such as in admission.
+	//
+	// Intended for use by the kubelet config loops, but not subsystems, which should
+	// use ShouldPod*().
+	IsPodKnownTerminated(uid types.UID) bool
 	// CouldHaveRunningContainers returns true before the pod workers have synced,
 	// once the pod workers see the pod (syncPod could be called), and returns false
 	// after the pod has been terminated (running containers guaranteed stopped).
@@ -394,6 +402,16 @@ func newPodWorkers(
 	}
 }
 
+func (p *podWorkers) IsPodKnownTerminated(uid types.UID) bool {
+	p.podLock.Lock()
+	defer p.podLock.Unlock()
+	if status, ok := p.podSyncStatuses[uid]; ok {
+		return status.IsTerminated()
+	}
+	// if the pod is not known, return false (the pod worker is not aware of it)
+	return false
+}
+
 func (p *podWorkers) CouldHaveRunningContainers(uid types.UID) bool {
 	p.podLock.Lock()
 	defer p.podLock.Unlock()
diff --git a/pkg/kubelet/pod_workers_test.go b/pkg/kubelet/pod_workers_test.go
index 50705275ecd..bc08aae88da 100644
--- a/pkg/kubelet/pod_workers_test.go
+++ b/pkg/kubelet/pod_workers_test.go
@@ -49,6 +49,7 @@ type fakePodWorkers struct {
 	statusLock           sync.Mutex
 	running              map[types.UID]bool
 	terminating          map[types.UID]bool
+	terminated           map[types.UID]bool
 	terminationRequested map[types.UID]bool
 	removeRuntime        map[types.UID]bool
 	removeContent        map[types.UID]bool
@@ -85,6 +86,11 @@ func (f *fakePodWorkers) SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodW
 	return nil
 }
 
+func (f *fakePodWorkers) IsPodKnownTerminated(uid types.UID) bool {
+	f.statusLock.Lock()
+	defer f.statusLock.Unlock()
+	return f.terminated[uid]
+}
 func (f *fakePodWorkers) CouldHaveRunningContainers(uid types.UID) bool {
 	f.statusLock.Lock()
 	defer f.statusLock.Unlock()
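
A minimal, self-contained sketch of the race the commit message describes.
This is illustrative only and not part of the patch: toyWorker and its maps
are hypothetical stand-ins for the kubelet's podWorkers, reduced to the two
semantics that matter here. A pod the worker has never seen returns false
from both checks, so an admission filter built on !CouldHaveRunningContainers
drops it, while one built on IsPodKnownTerminated keeps it:

package main

import "fmt"

type UID string

// toyWorker is a hypothetical stand-in for the pod worker, tracking only
// whether a pod has been observed and whether its termination completed.
type toyWorker struct {
	observed   map[UID]bool
	terminated map[UID]bool
}

// CouldHaveRunningContainers: true once the worker has seen the pod, until
// termination completes; false for pods the worker has never seen.
func (w *toyWorker) CouldHaveRunningContainers(uid UID) bool {
	return w.observed[uid] && !w.terminated[uid]
}

// IsPodKnownTerminated: the weaker check, true only when the worker has
// seen the pod AND knows it terminated.
func (w *toyWorker) IsPodKnownTerminated(uid UID) bool {
	return w.observed[uid] && w.terminated[uid]
}

func main() {
	w := &toyWorker{
		observed:   map[UID]bool{"seen-running": true, "seen-terminated": true},
		terminated: map[UID]bool{"seen-terminated": true},
	}
	for _, uid := range []UID{"not-yet-seen", "seen-running", "seen-terminated"} {
		// "not-yet-seen" models a freshly admitted pod racing ahead of the
		// pod worker: both checks return false, so only the weaker check
		// keeps it in the admission set.
		oldKept := w.CouldHaveRunningContainers(uid) // pre-patch filter keeps the pod iff true
		newKept := !w.IsPodKnownTerminated(uid)      // patched filter keeps the pod iff true
		fmt.Printf("%-15s keptByOldFilter=%-5v keptByNewFilter=%v\n", uid, oldKept, newKept)
	}
}

Note that in the patched kubelet, filterOutTerminatedPods additionally drops
pods whose status phase is already Succeeded or Failed, independent of what
the pod worker knows.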